Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
exclude: "^docs/|/migrations/"
exclude: "^docs/|/migrations/|.*/migrations/.*"
default_stages: [commit]

repos:
Expand All @@ -10,7 +10,7 @@ repos:
- id: check-yaml

- repo: https://github.com/psf/black
rev: 23.1.0
rev: 23.12.0
hooks:
- id: black

Expand All @@ -20,11 +20,11 @@ repos:
- id: isort

- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
rev: 6.1.0
hooks:
- id: flake8
args: ["--config=setup.cfg"]
additional_dependencies: [flake8-isort]
additional_dependencies: [flake8-isort==6.1.1]

# sets up .pre-commit-ci.yaml to ensure pre-commit dependencies stay up to date
ci:
Expand Down
33 changes: 16 additions & 17 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ COMPOSE_FILE_DEV = local.yml

compose = ${COMPOSE_FILE_DEV}

export SCMS_BUILD_DATE=$(shell date -u +"%Y-%m-%dT%H:%M:%SZ")
export SCMS_VCS_REF=$(strip $(shell git rev-parse --short HEAD))
export SCMS_WEBAPP_VERSION=$(strip $(shell cat VERSION))
# Build metadata exported to every recipe's environment.
# Simple (:=) assignment evaluates each $(shell ...) exactly once at parse time;
# with recursive (=) assignment the commands would be re-run on every expansion
# (including once per recipe, because the variables are exported), and the
# build date could differ between recipes of the same make invocation.
export SCIELO_USAGE_BUILD_DATE := $(shell date -u +"%Y-%m-%dT%H:%M:%SZ")
export SCIELO_USAGE_VCS_REF := $(strip $(shell git rev-parse --short HEAD))
export SCIELO_USAGE_WEBAPP_VERSION := $(strip $(shell cat VERSION))

help: ## Show this help
@echo 'Usage: make [target] [argument] ...'
Expand All @@ -23,13 +23,13 @@ help: ## Show this help
@echo "\t Type 'make up' is the same of type 'make up compose=local.yml'"

app_version: ## Show version of webapp
@echo "Version: " $(SCMS_WEBAPP_VERSION)
@echo "Version: " $(SCIELO_USAGE_WEBAPP_VERSION)

latest_commit: ## Show last commit ref
@echo "Latest commit: " $(SCMS_VCS_REF)
@echo "Latest commit: " $(SCIELO_USAGE_VCS_REF)

build_date: ## Show build date
@echo "Build date: " $(SCMS_BUILD_DATE)
@echo "Build date: " $(SCIELO_USAGE_BUILD_DATE)

############################################
## atalhos docker compose desenvolvimento ##
Expand Down Expand Up @@ -75,10 +75,10 @@ django_bash: ## Open a bash terminar from django container using $(compose)
@docker compose -f $(compose) run --rm django bash

django_test: ## Run tests from django container using $(compose)
@docker compose -f $(compose) run --rm django python manage.py test
@docker compose -f $(compose) run --rm django pytest

# pytest does not accept --failfast (that flag belongs to `manage.py test`);
# its equivalent is -x / --exitfirst, which stops at the first failing test.
django_fast: ## Run tests fast from django container using $(compose)
@docker compose -f $(compose) run --rm django python manage.py test --failfast
@docker compose -f $(compose) run --rm django pytest -x

django_makemigrations: ## Run makemigrations from django container using $(compose)
@docker compose -f $(compose) run --rm django python manage.py makemigrations
Expand All @@ -99,32 +99,31 @@ django_load_auth: ## Run manage.py dumpdata auth --indent=2 $(compose)
@docker compose -f $(compose) run --rm django python manage.py loaddata --database=default fixtures/auth.json

dump_data: ## Dump database into .sql $(compose)
docker exec -t scielo_core_local_postgres pg_dumpall -c -U debug > dump_`date +%d-%m-%Y"_"%H_%M_%S`.sql
@docker compose -f $(compose) exec -T postgres sh -c 'pg_dumpall -c -U "$$POSTGRES_USER"' > dump_`date +%d-%m-%Y"_"%H_%M_%S`.sql

restore_data: ## Restore database into from latest.sql file $(compose)
cat backup/latest.sql | docker exec -i scielo_core_local_postgres psql -U debug
@docker compose -f $(compose) exec -T postgres sh -c 'psql -U "$$POSTGRES_USER"' < backup/latest.sql

############################################
## Atalhos Úteis ##
############################################

clean_container: ## Remove all containers
@docker rm $$(docker ps -a -q --no-trunc)
@docker compose -f $(compose) rm -sf

clean_dangling_images: ## Remove all dangling images
@docker rmi -f $$(docker images --filter 'dangling=true' -q --no-trunc)

clean_dangling_volumes: ## Remove all dangling volumes
@docker volume rm $$(docker volume ls -f dangling=true -q)

clean_project_images: ## Remove all images with "core" on name
@docker rmi -f $$(docker images --filter=reference='*scielo_core*' -q)
clean_project_images: ## Remove all images with "scielo_usage" on name
@docker rmi -f $$(docker images --filter=reference='*scielo_usage*' -q)

volume_down: ## Remove all volume
@docker compose -f $(compose) down -v

clean_migrations: ## Remove all migrations
@echo "Cleaning migrations..."
@find . -path "*/migrations/*.py" -not -name "__init__.py" -not -path "./django_celery_beat/migrations*" -not -path "./core_settings/migrations*" -not -path "./core/contrib/sites/migrations*" -not -path "./core/users/migrations*" -delete
clean_migrations: ## Remove generated migration bytecode only
@echo "Cleaning migration bytecode..."
@find . -path "*/migrations/*.pyc" -delete
@echo "Migrations cleaned successfully."
@echo "Migration bytecode cleaned successfully."
53 changes: 44 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

A modernized platform for processing and indexing SciELO usage logs into OpenSearch, adhering to COUNTER R5.1 standards.

**Version**: 2.0.0

## Quick Start (Dev Installation)

To build and run the application locally:
Expand All @@ -30,6 +28,10 @@ make django_fast # tests with --failfast
make django_migrate # apply migrations
make django_makemigrations # generate new migrations
make django_createsuperuser # create Wagtail admin user
make logs # follow all service logs
make ps # list compose services
make django_bash # open a bash shell in the django container
make django_compilemessages # compile translation files
```

**Run a single test file/path:**
Expand Down Expand Up @@ -86,21 +88,48 @@ Metadata is kept in sync with SciELO sources (ArticleMeta, OPAC, Books, etc.) vi

## Environment Variables

Runtime configuration is loaded from `.envs/.local/` or `.envs/.production/` through the Compose files.

### Core Services

| Variable | Default | Description |
|---|---|---|
| `OPENSEARCH_URL` | — | OpenSearch cluster URL |
| `OPENSEARCH_BASIC_AUTH` | — | OpenSearch basic auth credentials (`user:pass`) |
| `OPENSEARCH_URL` | `http://localhost:9200/` | OpenSearch cluster URL |
| `OPENSEARCH_INDEX_NAME` | `usage` | OpenSearch index prefix |
| `OPENSEARCH_BASIC_AUTH` | `admin:admin` | OpenSearch basic auth credentials |
| `OPENSEARCH_VERIFY_CERTS` | `False` | Verify SSL certificates for OpenSearch connections |
| `USE_LOCAL_SCIELO_LIBS` | `0` | Mount local `scielo_log_validator` and `scielo_usage_counter` repos for development |
| `DJANGO_SETTINGS_MODULE` | `config.settings.local` | Django settings module |
| `REDIS_URL` | — | Redis connection URL for Celery |

## OpenSearch Storage Strategy (Hybrid Monthly)

To optimize storage and performance, this system employs a **Hybrid Granularity** approach in OpenSearch:
### Collector Endpoints

- **Monthly Partitioning**: Indices are partitioned by month (e.g., `usage_monthly_books_2026`).
- **One Document per Month**: Each article/PID has exactly **one document per month**, drastically reducing the total document count (up to 30x reduction).
| Variable | Default | Description |
|---|---|---|
| `ARTICLEMETA_COLLECT_URL` | `http://articlemeta.scielo.org/api/v1/article/counter_dict` | ArticleMeta counter metadata endpoint |
| `ARTICLEMETA_MAX_RETRIES` | `5` | ArticleMeta retry attempts |
| `ARTICLEMETA_SLEEP_TIME` | `30` | Delay between ArticleMeta retries, in seconds |
| `OPAC_ENDPOINT` | `https://www.scielo.br/api/v1/counter_dict` | OPAC counter metadata endpoint |
| `OPAC_MAX_RETRIES` | `5` | OPAC retry attempts |
| `OPAC_SLEEP_TIME` | `30` | Delay between OPAC retries, in seconds |
| `OAI_PMH_PREPRINT_ENDPOINT` | `https://preprints.scielo.org/index.php/scielo/oai` | SciELO Preprints OAI-PMH endpoint |
| `OAI_METADATA_PREFIX` | `oai_dc` | OAI-PMH metadata prefix |
| `OAI_PMH_MAX_RETRIES` | `5` | OAI-PMH retry attempts |
| `DATAVERSE_ENDPOINT` | `https://data.scielo.org/api` | SciELO Data Dataverse API endpoint |
| `DATAVERSE_ROOT_COLLECTION` | `scielodata` | Dataverse root collection alias |
| `DATAVERSE_SLEEP_TIME` | `30` | Dataverse request timeout/retry delay, in seconds |
| `SCIELO_BOOKS_BASE_URL` | `http://localhost:5984` | SciELO Books CouchDB base URL |
| `SCIELO_BOOKS_DB_NAME` | `scielobooks_1a` | SciELO Books CouchDB database name |
| `SCIELO_BOOKS_TIMEOUT` | `60` | SciELO Books request timeout, in seconds |
| `SCIELO_BOOKS_LIMIT` | `1000` | SciELO Books changes-feed page size |

## OpenSearch Storage Strategy

The OpenSearch export keeps monthly usage documents with nested daily metrics, while index names depend on collection size:

- **Large and xlarge collections**: annual indices, such as `usage_monthly_scl_2024` and `usage_yearly_scl_2024`.
- **Small collections**: stable collection indices, such as `usage_monthly_books` and `usage_yearly_books`.
- **One Document per Month**: Each document/PID has one monthly document per metric scope.
- **Daily Nested Metrics**: Daily granularity is preserved inside each monthly document using a `daily_metrics` object.
- **Atomic Upserts**: Data is merged using OpenSearch **Painless Scripts**, allowing multiple logs for the same day/month to be processed without data duplication or loss.

Expand All @@ -112,9 +141,15 @@ All pipelines can be monitored through the **Wagtail Admin**:
- **Daily Metric Jobs**: Track the history of daily processing and OpenSearch export attempts.
- **Log Config**: Manage collection-specific settings, log paths, and notification emails.

Internally, log file statuses are stored as short codes such as `QUE`, `PAR`, and `PRO`, with labels displayed in the admin.

### Useful Commands

- `make django_shell`: Access the Django interactive shell.
- `make django_bash`: Open a bash shell in the Django container.
- `make logs`: Follow Docker Compose logs.
- `make ps`: Show running services.
- `docker compose -f local.yml run --rm django pytest path/to/test_file.py`: Run a single test file or path.
- `docker logs -f scielo_usage_local_celeryworker`: Monitor real-time task execution.

## Dependencies
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.0
2.0.1
21 changes: 20 additions & 1 deletion collection/wagtail_hooks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from django.utils.translation import gettext as _
from django.utils.translation import gettext_lazy
from wagtail.snippets.views.snippets import SnippetViewSet
from wagtail.snippets.models import register_snippet
from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup

from config.menu import get_menu_order
from document.wagtail_hooks import DocumentSnippetViewSet
from source.wagtail_hooks import SourceSnippetViewSet
from .models import Collection


Expand Down Expand Up @@ -52,3 +56,18 @@ class CollectionSnippetViewSet(SnippetViewSet):
"updated_by",
)
export_filename = "collections"


class MetadataSnippetViewSetGroup(SnippetViewSetGroup):
    """Group the collection, source and document snippet viewsets under one menu entry.

    The group is labelled "Metadata" and lists its member viewsets in the
    order given by ``items``.
    """

    menu_name = "metadata"
    # Class attributes are evaluated once, at import time. A lazy translation
    # defers the lookup until the label is rendered, so it follows the active
    # language instead of being frozen to the one active during import.
    menu_label = gettext_lazy("Metadata")
    menu_icon = "folder-open-inverse"
    # Position is resolved by the project's menu helper.
    # NOTE(review): get_menu_order is defined in config.menu, not visible here.
    menu_order = get_menu_order("metadata")
    items = (
        CollectionSnippetViewSet,
        SourceSnippetViewSet,
        DocumentSnippetViewSet,
    )


register_snippet(MetadataSnippetViewSetGroup)
16 changes: 16 additions & 0 deletions core/wagtail_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@
from wagtail import hooks


# Names of main-menu entries that hide_generic_main_menu_items() filters out
# of the admin main menu (matched against each menu item's ``name``).
HIDDEN_MAIN_MENU_ITEMS = {
"documents",
"explorer",
"images",
"reports",
"snippets",
}


@hooks.register("insert_global_admin_css", order=100)
def global_admin_css():
"""Add /static/css/custom.css to the admin."""
Expand All @@ -24,3 +33,10 @@ def global_admin_js():
@hooks.register("construct_homepage_summary_items", order=1)
def remove_all_summary_items(request, items):
items.clear()


@hooks.register("construct_main_menu")
def hide_generic_main_menu_items(request, menu_items):
    """Drop every menu item whose ``name`` is listed in HIDDEN_MAIN_MENU_ITEMS.

    ``menu_items`` is rewritten in place (slice assignment), so the caller's
    list object reflects the filtering; nothing is returned. ``request`` is
    accepted for the hook signature but not used.
    """
    visible_items = []
    for entry in menu_items:
        if entry.name in HIDDEN_MAIN_MENU_ITEMS:
            continue
        visible_items.append(entry)
    menu_items[:] = visible_items
6 changes: 1 addition & 5 deletions production.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
version: '3'

services:
django: &django
build:
Expand All @@ -11,12 +9,10 @@ services:
- redis
- postgres
- mailhog
- solr
- pgbouncer
links:
- pgbouncer
- solr
volumes:
volumes:
- .:/app:z
- ../scms_data/scielo_usage/data/logs:/data/logs
- ../scms_data/scielo_usage/data/supplies:/data/supplies
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ force_grid_wrap = 0
use_parentheses = true

[mypy]
python_version = 3.9
python_version = 3.11
check_untyped_defs = True
ignore_missing_imports = True
warn_unused_ignores = True
Expand Down
Loading