From 59e36b4e99a9187fbbbc8a5f2e2af1b1ef99db4a Mon Sep 17 00:00:00 2001 From: Rafael JPD Date: Tue, 5 May 2026 10:29:51 -0300 Subject: [PATCH 1/4] Usa nomes literais nas tasks Celery --- collection/tasks.py | 3 +-- log_manager/tasks.py | 9 ++++----- log_manager_config/tasks.py | 3 +-- reports/tasks.py | 4 ++-- source/tasks.py | 5 ++--- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/collection/tasks.py b/collection/tasks.py index 19372de..221e8bc 100644 --- a/collection/tasks.py +++ b/collection/tasks.py @@ -1,5 +1,4 @@ from django.contrib.auth import get_user_model -from django.utils.translation import gettext as _ from core.utils.request_utils import _get_user from collection.models import Collection @@ -8,7 +7,7 @@ User = get_user_model() -@celery_app.task(bind=True, name=_('[Collection] Load Collection Data')) +@celery_app.task(bind=True, name='[Collection] Load Collection Data') def task_load_collections(self, user_id=None, username=None): user = _get_user(self.request, username=username, user_id=user_id) Collection.load(user) diff --git a/log_manager/tasks.py b/log_manager/tasks.py index 08da275..614106d 100644 --- a/log_manager/tasks.py +++ b/log_manager/tasks.py @@ -3,7 +3,6 @@ from celery import chord from django.conf import settings -from django.utils.translation import gettext as _ from collection.models import Collection from config import celery_app @@ -19,7 +18,7 @@ @celery_app.task( - bind=True, name=_("[Log Pipeline] 1. Search Logs (Manual)"), queue="load" + bind=True, name="[Log Pipeline] 1. Search Logs (Manual)", queue="load" ) def task_search_log_files( self, @@ -100,7 +99,7 @@ def task_search_log_files( @celery_app.task( bind=True, - name=_("[Log Pipeline] 2. Validate Logs (Manual)"), + name="[Log Pipeline] 2. Validate Logs (Manual)", timelimit=-1, queue="load", ) @@ -183,7 +182,7 @@ def task_validate_log_files( @celery_app.task( bind=True, - name=_("[Log Pipeline] Validate Single Log File (Auto)"), + name="[Log Pipeline] Validate Single Log File (Auto)", timelimit=-1, queue="load", ) @@ -219,7 +218,7 @@ def task_validate_log_file(self, log_file_hash, user_id=None, username=None): log_file.save() -@celery_app.task(bind=True, name=_("[Log Pipeline] Daily Routine (Auto)"), queue="load") +@celery_app.task(bind=True, name="[Log Pipeline] Daily Routine (Auto)", queue="load") def task_daily_log_ingestion_pipeline(self): """ Start the daily Search -> Validate -> Parse chain with default parameters. diff --git a/log_manager_config/tasks.py b/log_manager_config/tasks.py index c4ff399..415dbf9 100644 --- a/log_manager_config/tasks.py +++ b/log_manager_config/tasks.py @@ -1,5 +1,4 @@ from django.conf import settings -from django.utils.translation import gettext as _ from config import celery_app from config.collections import COLLECTION_SIZE_SAMPLE_MAP, LOG_MANAGER_SEED_DATA @@ -8,7 +7,7 @@ from . import models -@celery_app.task(bind=True, name=_('[Log Pipeline] Load Log Manager Settings (Seed)')) +@celery_app.task(bind=True, name='[Log Pipeline] Load Log Manager Settings (Seed)') def task_load_log_manager_collection_settings(self, data=None, user_id=None, username=None): user = _get_user(self.request, username=username, user_id=user_id) diff --git a/reports/tasks.py b/reports/tasks.py index d4dde0b..6a70048 100644 --- a/reports/tasks.py +++ b/reports/tasks.py @@ -34,7 +34,7 @@ def _extract_date_from_log_file(lf): return None -@celery_app.task(bind=True, name=_("[Reports] Populate All Reports")) +@celery_app.task(bind=True, name="[Reports] Populate All Reports") def task_populate_all_reports(self, year=None, collection_acron=None): qs = LogFile.objects.select_related("collection") if collection_acron: @@ -120,7 +120,7 @@ def _upsert_reports(model_class, data): @celery_app.task( bind=True, - name=_("[Reports] Generate Log Report Summary (Manual)"), + name="[Reports] Generate Log Report Summary (Manual)", queue="load", ) def task_log_files_count_status_report( diff --git a/source/tasks.py b/source/tasks.py index eb1633b..6b7eeb2 100644 --- a/source/tasks.py +++ b/source/tasks.py @@ -1,6 +1,5 @@ import logging -from django.utils.translation import gettext as _ from django.conf import settings from collection.models import Collection @@ -104,7 +103,7 @@ def load_sources_from_scielo_books( return True -@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (Article Meta)"), queue="load") +@celery_app.task(bind=True, name="[Metadata] Sync Sources (Article Meta)", queue="load") def task_load_sources_from_article_meta( self, collections=None, @@ -122,7 +121,7 @@ def task_load_sources_from_article_meta( ) -@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (SciELO Books)"), queue="load") +@celery_app.task(bind=True, name="[Metadata] Sync Sources (SciELO Books)", queue="load") def task_load_sources_from_scielo_books( self, collection="books", From a41612a90b75bf4c4baf6501dce5d8f74c4a4408 Mon Sep 17 00:00:00 2001 From: Rafael JPD Date: Tue, 5 May 2026 10:29:59 -0300 Subject: [PATCH 2/4] Evita user_id em tasks sem parametro --- django_celery_beat/views.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/django_celery_beat/views.py b/django_celery_beat/views.py index b5cff84..86327ec 100644 --- a/django_celery_beat/views.py +++ b/django_celery_beat/views.py @@ -1,7 +1,9 @@ +import inspect import json from celery import current_app from django.shortcuts import get_object_or_404, redirect +from django.urls import reverse from django.utils.translation import gettext as _ from wagtail.admin import messages @@ -26,10 +28,12 @@ def task_run(request): request, _("Task '{0}' not found in the Celery registry.").format(p_task.task), ) - return redirect(request.META.get("HTTP_REFERER")) + return redirect(request.META.get("HTTP_REFERER") or reverse("wagtailadmin_home")) kwargs = json.loads(p_task.kwargs) - kwargs["user_id"] = request.user.id + sig = inspect.signature(task.run) + if "user_id" in sig.parameters: + kwargs["user_id"] = request.user.id task.apply_async( args=json.loads(p_task.args), @@ -40,4 +44,4 @@ def task_run(request): messages.success(request, _("Task {0} was successfully run").format(p_task.name)) - return redirect(request.META.get("HTTP_REFERER")) + return redirect(request.META.get("HTTP_REFERER") or reverse("wagtailadmin_home")) From 81a817a22403104cddda959ab8ecb8a3eec370ed Mon Sep 17 00:00:00 2001 From: Rafael JPD Date: Tue, 5 May 2026 10:30:06 -0300 Subject: [PATCH 3/4] Move defaults de resources para settings --- README.md | 2 ++ config/settings/base.py | 17 +++++++++ resources/constants.py | 2 -- resources/tasks.py | 63 ++++++++++++++++++++++------------ resources/tests/test_robots.py | 49 ++++++++++++++++++++++++-- resources/utils.py | 32 +++++++++++++++-- 6 files changed, 136 insertions(+), 29 deletions(-) delete mode 100644 resources/constants.py diff --git a/README.md b/README.md index 87734e0..2433fa8 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,8 @@ Runtime configuration is loaded from `.envs/.local/` or `.envs/.production/` thr | `OPENSEARCH_INDEX_NAME` | `usage` | OpenSearch index prefix | | `OPENSEARCH_BASIC_AUTH` | `admin:admin` | OpenSearch basic auth credentials | | `OPENSEARCH_VERIFY_CERTS` | `False` | Verify SSL certificates for OpenSearch connections | +| `COUNTER_ROBOTS_URL` | `https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json` | COUNTER robot user-agent list URL used by the resources loader | +| `MMDB_URL_TEMPLATE` | `https://download.db-ip.com/free/dbip-country-lite-{year}-{month:02d}.mmdb.gz` | DB-IP GeoIP MMDB gzip URL template; `{year}` and `{month}` are filled from the current and previous month | | `USE_LOCAL_SCIELO_LIBS` | `0` | Mount local `scielo_log_validator` and `scielo_usage_counter` repos for development | | `DJANGO_SETTINGS_MODULE` | `config.settings.local` | Django settings module | | `REDIS_URL` | — | Redis connection URL for Celery | diff --git a/config/settings/base.py b/config/settings/base.py index e4a99fa..62aa17a 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -420,6 +420,23 @@ default=False, ) +# Resources +# ------------------------------------------------------------------------------ +COUNTER_ROBOTS_URL = env( + "COUNTER_ROBOTS_URL", + default=( + "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/" + "COUNTER_Robots_list.json" + ), +) +MMDB_URL_TEMPLATE = env( + "MMDB_URL_TEMPLATE", + default=( + "https://download.db-ip.com/free/" + "dbip-country-lite-{year}-{month:02d}.mmdb.gz" + ), +) + # Collectors configuration # ------------------------------------------------------------------------------ # ArticleMeta diff --git a/resources/constants.py b/resources/constants.py deleted file mode 100644 index 2ce64da..0000000 --- a/resources/constants.py +++ /dev/null @@ -1,2 +0,0 @@ -DEFAULT_COUNTER_ROBOTS_URL = 'https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json' -DEFAULT_MMDB_URL = 'https://download.db-ip.com/free/dbip-country-lite-2026-03.mmdb.gz' diff --git a/resources/tasks.py b/resources/tasks.py index 4df60a9..0a87600 100644 --- a/resources/tasks.py +++ b/resources/tasks.py @@ -1,12 +1,13 @@ import logging -from django.utils.translation import gettext as _ +from django.conf import settings from config import celery_app -from . import constants, models, utils +from . import models, utils -@celery_app.task(bind=True, name=_('[Resources] Load Robots Data')) + +@celery_app.task(bind=True, name='[Resources] Load Robots Data') def task_load_robots(self, url_robots=None): """ Load robots from a given URL and save them to the database. @@ -27,7 +28,7 @@ def task_load_robots(self, url_robots=None): - Debug information for each robot saved. """ if not url_robots: - url_robots = constants.DEFAULT_COUNTER_ROBOTS_URL + url_robots = settings.COUNTER_ROBOTS_URL logging.warning(f'No robots URL provided. Using default: {url_robots}') try: @@ -82,27 +83,45 @@ def task_load_robots(self, url_robots=None): return False -@celery_app.task(bind=True, name=_('[Resources] Load Geolocation Data')) +@celery_app.task(bind=True, name='[Resources] Load Geolocation Data') def task_load_geoip(self, url_geoip=None, validate=True): """ Load GeoIP data from a specified URL, validate it, and save it to the database. + + When ``url_geoip`` is not provided the task resolves the URL automatically: + it tries the current month first and, if the file is not yet available, + falls back to the previous month. + Args: - url_geoip (str, optional): The URL to download the GeoIP data from. Defaults to None. + url_geoip (str, optional): Explicit URL to download. Defaults to None + (auto-resolved for the current/previous month). validate (bool, optional): Whether to validate the GeoIP data. Defaults to True. Returns: bool: True if the GeoIP data was successfully loaded and saved, False otherwise. - Raises: - Exception: If there is an error downloading, decompressing, or validating the GeoIP data. """ - - if not url_geoip: - url_geoip = constants.DEFAULT_MMDB_URL - logging.warning(f'No GeoIP URL provided. Using default: {url_geoip}') - - try: - data = utils.fetch_data(url_geoip, data_type='content') - except Exception as e: - logging.error(f'Error downloading GeoIP: {e}') + if url_geoip: + candidates = [url_geoip] + else: + candidates = utils.resolve_mmdb_url() + logging.info('No GeoIP URL provided. Will try candidates: %s', candidates) + + data = None + resolved_url = None + for url in candidates: + try: + data = utils.fetch_data(url, data_type='content') + resolved_url = url + logging.info('GeoIP data downloaded from: %s', url) + break + except Exception as e: + logging.warning( + 'Failed to download GeoIP from %s: %s. Trying next candidate.', url, e + ) + + if data is None: + logging.error( + 'Could not download GeoIP data from any candidate URL: %s', candidates + ) return False try: @@ -119,16 +138,16 @@ def task_load_geoip(self, url_geoip=None, validate=True): return False mmdb_hash = models.MMDB.compute_hash(mmdb_data) - - try: + + try: mmdb_obj = models.MMDB.objects.get(id=mmdb_hash) logging.debug(f'GeoIP data already exists: {mmdb_obj}') except models.MMDB.DoesNotExist: mmdb_obj = models.MMDB.objects.create(id=mmdb_hash, data=mmdb_data) - mmdb_obj.url = url_geoip or constants.DEFAULT_MMDB_URL + mmdb_obj.url = resolved_url mmdb_obj.save() - logging.debug(f'GeoIP data has been saved: {mmdb_obj}') - + logging.info('GeoIP data saved (url=%s, hash=%s)', resolved_url, mmdb_hash) + return True diff --git a/resources/tests/test_robots.py b/resources/tests/test_robots.py index 4d6bf74..330d4db 100644 --- a/resources/tests/test_robots.py +++ b/resources/tests/test_robots.py @@ -1,8 +1,9 @@ +from datetime import date from unittest.mock import patch -from django.test import TestCase +from django.test import TestCase, override_settings -from resources import models, tasks +from resources import models, tasks, utils class RobotUserAgentModelTests(TestCase): @@ -66,6 +67,30 @@ def test_get_patterns_rejects_invalid_source(self): class LoadRobotsTaskTests(TestCase): + @patch("resources.tasks.utils.fetch_data") + @override_settings(COUNTER_ROBOTS_URL="https://settings.example.org/robots.json") + def test_task_load_robots_uses_settings_url_when_not_provided( + self, + mock_fetch_data, + ): + mock_fetch_data.return_value = [ + {"pattern": "CounterBot", "last_changed": "2025-01-15"}, + ] + + result = tasks.task_load_robots.run() + + self.assertTrue(result) + mock_fetch_data.assert_called_once_with( + "https://settings.example.org/robots.json", + data_type="json", + ) + + counter_bot = models.RobotUserAgent.objects.get(pattern="CounterBot") + self.assertEqual( + counter_bot.source_url, + "https://settings.example.org/robots.json", + ) + @patch("resources.tasks.utils.fetch_data") def test_task_load_robots_marks_counter_source_and_deactivates_stale_counter_entries( self, @@ -111,3 +136,23 @@ def test_task_load_robots_marks_counter_source_and_deactivates_stale_counter_ent self.assertFalse(stale_counter.is_active) self.assertIsNone(stale_counter.source_url) self.assertIsNone(stale_counter.last_changed) + + +class GeoIPUtilsTests(TestCase): + @override_settings( + MMDB_URL_TEMPLATE="https://example.org/dbip-{year}-{month:02d}.mmdb.gz" + ) + @patch("resources.utils.date") + def test_resolve_mmdb_url_returns_current_and_previous_month_from_settings( + self, + mock_date, + ): + mock_date.today.return_value = date(2026, 5, 5) + + self.assertEqual( + utils.resolve_mmdb_url(), + [ + "https://example.org/dbip-2026-05.mmdb.gz", + "https://example.org/dbip-2026-04.mmdb.gz", + ], + ) diff --git a/resources/utils.py b/resources/utils.py index 41df3f6..c8d58fe 100644 --- a/resources/utils.py +++ b/resources/utils.py @@ -1,12 +1,13 @@ -import logging import gzip import io -import requests +import logging import tempfile - +from datetime import date from time import sleep import geoip2.database +import requests +from django.conf import settings def fetch_data(url, data_type='json', max_retries=5, sleep_time=30): @@ -99,3 +100,28 @@ def validate_geoip_data(data): else: reader.close() return True + + +def resolve_mmdb_url(): + """ + Return DB-IP MMDB candidate URLs for the current and previous month. + + The DB-IP free database is published monthly. This helper returns: + [current_month_url, previous_month_url] + + The caller should try each URL in order and use the first one that + succeeds, providing a natural fallback when the current-month file + has not yet been published. + """ + today = date.today() + current_url = settings.MMDB_URL_TEMPLATE.format(year=today.year, month=today.month) + + if today.month == 1: + prev_url = settings.MMDB_URL_TEMPLATE.format(year=today.year - 1, month=12) + else: + prev_url = settings.MMDB_URL_TEMPLATE.format( + year=today.year, + month=today.month - 1, + ) + + return [current_url, prev_url] From 8e9fab75d341681bd75bd60c960078af69caa6a0 Mon Sep 17 00:00:00 2001 From: Rafael JPD Date: Tue, 5 May 2026 10:30:10 -0300 Subject: [PATCH 4/4] Atualiza versao para 2.0.3 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index e9307ca..50ffc5a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.2 +2.0.3