Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ Runtime configuration is loaded from `.envs/.local/` or `.envs/.production/` thr
| `OPENSEARCH_INDEX_NAME` | `usage` | OpenSearch index prefix |
| `OPENSEARCH_BASIC_AUTH` | `admin:admin` | OpenSearch basic auth credentials |
| `OPENSEARCH_VERIFY_CERTS` | `False` | Verify SSL certificates for OpenSearch connections |
| `COUNTER_ROBOTS_URL` | `https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json` | COUNTER robot user-agent list URL used by the resources loader |
| `MMDB_URL_TEMPLATE` | `https://download.db-ip.com/free/dbip-country-lite-{year}-{month:02d}.mmdb.gz` | DB-IP GeoIP MMDB gzip URL template; the `{year}`/`{month}` placeholders are filled with the current month first, then with the previous month as a fallback |
| `USE_LOCAL_SCIELO_LIBS` | `0` | Mount local `scielo_log_validator` and `scielo_usage_counter` repos for development |
| `DJANGO_SETTINGS_MODULE` | `config.settings.local` | Django settings module |
| `REDIS_URL` | — | Redis connection URL for Celery |
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.2
2.0.3
3 changes: 1 addition & 2 deletions collection/tasks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from django.contrib.auth import get_user_model
from django.utils.translation import gettext as _

from core.utils.request_utils import _get_user
from collection.models import Collection
Expand All @@ -8,7 +7,7 @@
User = get_user_model()


@celery_app.task(bind=True, name=_('[Collection] Load Collection Data'))
@celery_app.task(bind=True, name='[Collection] Load Collection Data')
def task_load_collections(self, user_id=None, username=None):
    """
    Celery task: resolve the requesting user and load collection data.

    Args:
        user_id (int, optional): primary key of the requesting user.
        username (str, optional): username of the requesting user.
    """
    requesting_user = _get_user(self.request, username=username, user_id=user_id)
    Collection.load(requesting_user)
17 changes: 17 additions & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,23 @@
default=False,
)

# Resources
# ------------------------------------------------------------------------------
COUNTER_ROBOTS_URL = env(
"COUNTER_ROBOTS_URL",
default=(
"https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/"
"COUNTER_Robots_list.json"
),
)
MMDB_URL_TEMPLATE = env(
"MMDB_URL_TEMPLATE",
default=(
"https://download.db-ip.com/free/"
"dbip-country-lite-{year}-{month:02d}.mmdb.gz"
),
)

# Collectors configuration
# ------------------------------------------------------------------------------
# ArticleMeta
Expand Down
10 changes: 7 additions & 3 deletions django_celery_beat/views.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import inspect
import json

from celery import current_app
from django.shortcuts import get_object_or_404, redirect
from django.urls import reverse
from django.utils.translation import gettext as _
from wagtail.admin import messages

Expand All @@ -26,10 +28,12 @@ def task_run(request):
request,
_("Task '{0}' not found in the Celery registry.").format(p_task.task),
)
return redirect(request.META.get("HTTP_REFERER"))
return redirect(request.META.get("HTTP_REFERER") or reverse("wagtailadmin_home"))

kwargs = json.loads(p_task.kwargs)
kwargs["user_id"] = request.user.id
sig = inspect.signature(task.run)
if "user_id" in sig.parameters:
kwargs["user_id"] = request.user.id

task.apply_async(
args=json.loads(p_task.args),
Expand All @@ -40,4 +44,4 @@ def task_run(request):

messages.success(request, _("Task {0} was successfully run").format(p_task.name))

return redirect(request.META.get("HTTP_REFERER"))
return redirect(request.META.get("HTTP_REFERER") or reverse("wagtailadmin_home"))
9 changes: 4 additions & 5 deletions log_manager/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from celery import chord
from django.conf import settings
from django.utils.translation import gettext as _

from collection.models import Collection
from config import celery_app
Expand All @@ -19,7 +18,7 @@


@celery_app.task(
bind=True, name=_("[Log Pipeline] 1. Search Logs (Manual)"), queue="load"
bind=True, name="[Log Pipeline] 1. Search Logs (Manual)", queue="load"
)
def task_search_log_files(
self,
Expand Down Expand Up @@ -100,7 +99,7 @@ def task_search_log_files(

@celery_app.task(
bind=True,
name=_("[Log Pipeline] 2. Validate Logs (Manual)"),
name="[Log Pipeline] 2. Validate Logs (Manual)",
timelimit=-1,
queue="load",
)
Expand Down Expand Up @@ -183,7 +182,7 @@ def task_validate_log_files(

@celery_app.task(
bind=True,
name=_("[Log Pipeline] Validate Single Log File (Auto)"),
name="[Log Pipeline] Validate Single Log File (Auto)",
timelimit=-1,
queue="load",
)
Expand Down Expand Up @@ -219,7 +218,7 @@ def task_validate_log_file(self, log_file_hash, user_id=None, username=None):
log_file.save()


@celery_app.task(bind=True, name=_("[Log Pipeline] Daily Routine (Auto)"), queue="load")
@celery_app.task(bind=True, name="[Log Pipeline] Daily Routine (Auto)", queue="load")
def task_daily_log_ingestion_pipeline(self):
"""
Start the daily Search -> Validate -> Parse chain with default parameters.
Expand Down
3 changes: 1 addition & 2 deletions log_manager_config/tasks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from django.conf import settings
from django.utils.translation import gettext as _

from config import celery_app
from config.collections import COLLECTION_SIZE_SAMPLE_MAP, LOG_MANAGER_SEED_DATA
Expand All @@ -8,7 +7,7 @@
from . import models


@celery_app.task(bind=True, name=_('[Log Pipeline] Load Log Manager Settings (Seed)'))
@celery_app.task(bind=True, name='[Log Pipeline] Load Log Manager Settings (Seed)')
def task_load_log_manager_collection_settings(self, data=None, user_id=None, username=None):
user = _get_user(self.request, username=username, user_id=user_id)

Expand Down
4 changes: 2 additions & 2 deletions reports/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def _extract_date_from_log_file(lf):
return None


@celery_app.task(bind=True, name=_("[Reports] Populate All Reports"))
@celery_app.task(bind=True, name="[Reports] Populate All Reports")
def task_populate_all_reports(self, year=None, collection_acron=None):
qs = LogFile.objects.select_related("collection")
if collection_acron:
Expand Down Expand Up @@ -120,7 +120,7 @@ def _upsert_reports(model_class, data):

@celery_app.task(
bind=True,
name=_("[Reports] Generate Log Report Summary (Manual)"),
name="[Reports] Generate Log Report Summary (Manual)",
queue="load",
)
def task_log_files_count_status_report(
Expand Down
2 changes: 0 additions & 2 deletions resources/constants.py

This file was deleted.

63 changes: 41 additions & 22 deletions resources/tasks.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import logging

from django.utils.translation import gettext as _
from django.conf import settings

from config import celery_app

from . import constants, models, utils
from . import models, utils

@celery_app.task(bind=True, name=_('[Resources] Load Robots Data'))

@celery_app.task(bind=True, name='[Resources] Load Robots Data')
def task_load_robots(self, url_robots=None):
"""
Load robots from a given URL and save them to the database.
Expand All @@ -27,7 +28,7 @@ def task_load_robots(self, url_robots=None):
- Debug information for each robot saved.
"""
if not url_robots:
url_robots = constants.DEFAULT_COUNTER_ROBOTS_URL
url_robots = settings.COUNTER_ROBOTS_URL
logging.warning(f'No robots URL provided. Using default: {url_robots}')

try:
Expand Down Expand Up @@ -82,27 +83,45 @@ def task_load_robots(self, url_robots=None):
return False


@celery_app.task(bind=True, name=_('[Resources] Load Geolocation Data'))
@celery_app.task(bind=True, name='[Resources] Load Geolocation Data')
def task_load_geoip(self, url_geoip=None, validate=True):
"""
Load GeoIP data from a specified URL, validate it, and save it to the database.

When ``url_geoip`` is not provided the task resolves the URL automatically:
it tries the current month first and, if the file is not yet available,
falls back to the previous month.

Args:
url_geoip (str, optional): The URL to download the GeoIP data from. Defaults to None.
url_geoip (str, optional): Explicit URL to download. Defaults to None
(auto-resolved for the current/previous month).
validate (bool, optional): Whether to validate the GeoIP data. Defaults to True.
Returns:
bool: True if the GeoIP data was successfully loaded and saved, False otherwise.
Raises:
Exception: If there is an error downloading, decompressing, or validating the GeoIP data.
"""

if not url_geoip:
url_geoip = constants.DEFAULT_MMDB_URL
logging.warning(f'No GeoIP URL provided. Using default: {url_geoip}')

try:
data = utils.fetch_data(url_geoip, data_type='content')
except Exception as e:
logging.error(f'Error downloading GeoIP: {e}')
if url_geoip:
candidates = [url_geoip]
else:
candidates = utils.resolve_mmdb_url()
logging.info('No GeoIP URL provided. Will try candidates: %s', candidates)

data = None
resolved_url = None
for url in candidates:
try:
data = utils.fetch_data(url, data_type='content')
resolved_url = url
logging.info('GeoIP data downloaded from: %s', url)
break
except Exception as e:
logging.warning(
'Failed to download GeoIP from %s: %s. Trying next candidate.', url, e
)

if data is None:
logging.error(
'Could not download GeoIP data from any candidate URL: %s', candidates
)
return False

try:
Expand All @@ -119,16 +138,16 @@ def task_load_geoip(self, url_geoip=None, validate=True):
return False

mmdb_hash = models.MMDB.compute_hash(mmdb_data)
try:

try:
mmdb_obj = models.MMDB.objects.get(id=mmdb_hash)
logging.debug(f'GeoIP data already exists: {mmdb_obj}')

except models.MMDB.DoesNotExist:
mmdb_obj = models.MMDB.objects.create(id=mmdb_hash, data=mmdb_data)
mmdb_obj.url = url_geoip or constants.DEFAULT_MMDB_URL
mmdb_obj.url = resolved_url

mmdb_obj.save()
logging.debug(f'GeoIP data has been saved: {mmdb_obj}')
logging.info('GeoIP data saved (url=%s, hash=%s)', resolved_url, mmdb_hash)

return True
49 changes: 47 additions & 2 deletions resources/tests/test_robots.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from datetime import date
from unittest.mock import patch

from django.test import TestCase
from django.test import TestCase, override_settings

from resources import models, tasks
from resources import models, tasks, utils


class RobotUserAgentModelTests(TestCase):
Expand Down Expand Up @@ -66,6 +67,30 @@ def test_get_patterns_rejects_invalid_source(self):

class LoadRobotsTaskTests(TestCase):

    @patch("resources.tasks.utils.fetch_data")
    @override_settings(COUNTER_ROBOTS_URL="https://settings.example.org/robots.json")
    def test_task_load_robots_uses_settings_url_when_not_provided(
        self,
        mock_fetch_data,
    ):
        """task_load_robots falls back to settings.COUNTER_ROBOTS_URL when no URL arg is given."""
        # One well-formed robot entry is enough for the task to report success.
        mock_fetch_data.return_value = [
            {"pattern": "CounterBot", "last_changed": "2025-01-15"},
        ]

        # Call with no url_robots argument to exercise the settings fallback.
        result = tasks.task_load_robots.run()

        self.assertTrue(result)
        # The overridden settings URL must be the one actually fetched.
        mock_fetch_data.assert_called_once_with(
            "https://settings.example.org/robots.json",
            data_type="json",
        )

        # The saved robot records the URL it was loaded from.
        counter_bot = models.RobotUserAgent.objects.get(pattern="CounterBot")
        self.assertEqual(
            counter_bot.source_url,
            "https://settings.example.org/robots.json",
        )

@patch("resources.tasks.utils.fetch_data")
def test_task_load_robots_marks_counter_source_and_deactivates_stale_counter_entries(
self,
Expand Down Expand Up @@ -111,3 +136,23 @@ def test_task_load_robots_marks_counter_source_and_deactivates_stale_counter_ent
self.assertFalse(stale_counter.is_active)
self.assertIsNone(stale_counter.source_url)
self.assertIsNone(stale_counter.last_changed)


class GeoIPUtilsTests(TestCase):
    """Tests for resources.utils GeoIP URL helpers."""

    @override_settings(
        MMDB_URL_TEMPLATE="https://example.org/dbip-{year}-{month:02d}.mmdb.gz"
    )
    @patch("resources.utils.date")
    def test_resolve_mmdb_url_returns_current_and_previous_month_from_settings(
        self,
        mock_date,
    ):
        """resolve_mmdb_url builds [current-month, previous-month] URLs from settings."""
        # Freeze "today" so the current/previous month pair is deterministic.
        mock_date.today.return_value = date(2026, 5, 5)

        expected = [
            "https://example.org/dbip-2026-05.mmdb.gz",
            "https://example.org/dbip-2026-04.mmdb.gz",
        ]
        self.assertEqual(utils.resolve_mmdb_url(), expected)
32 changes: 29 additions & 3 deletions resources/utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import logging
import gzip
import io
import requests
import logging
import tempfile

from datetime import date
from time import sleep

import geoip2.database
import requests
from django.conf import settings


def fetch_data(url, data_type='json', max_retries=5, sleep_time=30):
Expand Down Expand Up @@ -99,3 +100,28 @@ def validate_geoip_data(data):
else:
reader.close()
return True


def resolve_mmdb_url():
    """
    Build the ordered list of DB-IP MMDB candidate URLs.

    The DB-IP free database is published monthly, so the result is:
    [current_month_url, previous_month_url]

    Callers should try each URL in order and keep the first download that
    succeeds; the previous-month entry is the natural fallback for the
    window before the current-month file has been published.
    """
    today = date.today()

    # Previous calendar month, rolling the year back across a January boundary.
    if today.month == 1:
        prev_year, prev_month = today.year - 1, 12
    else:
        prev_year, prev_month = today.year, today.month - 1

    template = settings.MMDB_URL_TEMPLATE
    return [
        template.format(year=today.year, month=today.month),
        template.format(year=prev_year, month=prev_month),
    ]
Loading
Loading