diff --git a/core/tests/test_datetime_parsing.py b/core/tests/test_datetime_parsing.py index 2bea4fc..6f4fd08 100644 --- a/core/tests/test_datetime_parsing.py +++ b/core/tests/test_datetime_parsing.py @@ -1,11 +1,18 @@ """Tests for core.utils.datetime_parsing.""" +import re from datetime import datetime, timedelta, timezone import pytest from django.utils import timezone as django_timezone -from core.utils.datetime_parsing import ensure_aware_utc, parse_iso_datetime +from core.utils.datetime_parsing import ( + CANONICAL_INSTANT_UTC_Z_PATTERN, + ensure_aware_utc, + format_instant_iso_z, + parse_iso_datetime, + parse_iso_datetime_lenient, +) def test_ensure_aware_utc_none(): @@ -67,3 +74,58 @@ def test_parse_iso_datetime_with_offset_strips_tz_to_naive_utc(): def test_parse_iso_datetime_invalid_raises(): with pytest.raises(ValueError, match="Invalid ISO datetime"): parse_iso_datetime("not-a-date") + + +def test_format_instant_iso_z_empty(): + assert format_instant_iso_z(None) == "" + assert format_instant_iso_z("") == "" + assert format_instant_iso_z(" ") == "" + + +def test_format_instant_iso_z_z_suffix_utc(): + assert format_instant_iso_z("2024-03-15T10:30:00Z") == "2024-03-15T10:30:00Z" + + +def test_format_instant_iso_z_offset_to_z(): + assert format_instant_iso_z("2024-01-01T00:00:00+05:00") == "2023-12-31T19:00:00Z" + + +def test_format_instant_iso_z_invalid_returns_original(): + assert format_instant_iso_z("not-a-date") == "not-a-date" + + +def test_parse_iso_datetime_lenient_empty(): + assert parse_iso_datetime_lenient(None) is None + assert parse_iso_datetime_lenient("") is None + assert parse_iso_datetime_lenient(" ") is None + + +def test_parse_iso_datetime_lenient_z_utc_aware(): + dt = parse_iso_datetime_lenient("2024-01-15T10:30:00Z") + assert dt == datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + + +def test_parse_iso_datetime_lenient_invalid_returns_none(): + assert parse_iso_datetime_lenient("not-a-date") is None + + +@pytest.mark.parametrize( + "date_str,expected_year", + [ + ("2023-06-01T00:00:00Z", 2023), + ("2025-12-31T23:59:59Z", 2025), + ], +) +def test_parse_iso_datetime_lenient_parametrized(date_str, expected_year): + result = parse_iso_datetime_lenient(date_str) + assert result is not None + assert result.year == expected_year + + +def test_canonical_instant_utc_z_pattern(): + pat = re.compile(CANONICAL_INSTANT_UTC_Z_PATTERN) + assert pat.fullmatch("2026-01-01T00:00:00Z") + assert pat.fullmatch("2026-01-01T00:00:00.5Z") + assert pat.fullmatch("2026-01-01T00:00:00.123456Z") + assert pat.fullmatch("2026-01-01T00:00:00+00:00") is None + assert pat.fullmatch("") is None diff --git a/core/tests/test_text_processing.py b/core/tests/test_text_processing.py index af03acb..60dc109 100644 --- a/core/tests/test_text_processing.py +++ b/core/tests/test_text_processing.py @@ -1,9 +1,12 @@ """Tests for core.utils.text_processing.""" +import pytest + from core.utils.text_processing import ( SLACK_GREETING_WORDS, clean_text, filter_sentence, + truncate_content, validate_content_length, ) @@ -67,3 +70,23 @@ def test_validate_content_length(): def test_slack_constants_non_empty(): assert "hello" in SLACK_GREETING_WORDS + + +def test_truncate_content_short_unchanged(): + assert truncate_content("hi", max_length=100) == "hi" + + +def test_truncate_content_long_adds_ellipsis(): + s = "x" * 50 + out = truncate_content(s, max_length=10) + assert out.endswith("...") + assert len(out) == 10 + + +def test_truncate_content_max_length_at_most_three(): + assert truncate_content("abcdef", max_length=2) == "ab" + + +def test_truncate_content_negative_max_length_raises(): + with pytest.raises(ValueError, match="max_length must be non-negative"): + truncate_content("hello", max_length=-1) diff --git a/core/utils/datetime_parsing.py b/core/utils/datetime_parsing.py index 47de949..8d2f75b 100644 --- a/core/utils/datetime_parsing.py +++ b/core/utils/datetime_parsing.py @@ -2,10 +2,16 @@ from __future__ import annotations +import logging from datetime import datetime, timezone from django.utils import timezone as django_timezone +logger = logging.getLogger(__name__) + +# ISO 8601 UTC instant with ``Z`` (optional fractional seconds). Used by JSON Schema and Pydantic. +CANONICAL_INSTANT_UTC_Z_PATTERN = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z$" + def ensure_aware_utc(dt: datetime | None) -> datetime | None: """ @@ -20,15 +26,34 @@ def ensure_aware_utc(dt: datetime | None) -> datetime | None: return dt.astimezone(timezone.utc) +def parse_iso_datetime_lenient(raw: str | None) -> datetime | None: + """ + Parse ISO-like date/datetime strings from APIs (GitHub, Discord, etc.). + + Returns ``None`` for empty/whitespace input or on parse failure (logs at DEBUG). + ``Z`` is normalized to ``+00:00`` for :meth:`datetime.fromisoformat`. Preserves + timezone awareness when present (unlike :func:`parse_iso_datetime`, which returns + naive UTC). + + :func:`parse_iso_datetime` delegates here for the actual parse, then applies + strict error handling and naive-UTC normalization. + """ + if not raw or not str(raw).strip(): + return None + s = str(raw).strip().replace("Z", "+00:00") + try: + return datetime.fromisoformat(s) + except ValueError as e: + logger.debug("Failed to parse datetime %r: %s", s, e) + return None + + def parse_iso_datetime(raw: str | None) -> datetime | None: """ Parse a date or datetime string using ``datetime.fromisoformat``. - Accepts common ISO-style forms (e.g. ``YYYY-MM-DD``, ``YYYY-MM-DDTHH:MM:SS``, - ``YYYY-MM-DD HH:MM:SS`` on Python 3.11+, optional fractional seconds and offsets). - If the string ends with ``Z`` and contains ``T``, ``Z`` is treated as UTC before parsing. - - Empty or whitespace-only input returns ``None``. + Delegates to :func:`parse_iso_datetime_lenient` for parsing. Empty or + whitespace-only input returns ``None``. Raises: ValueError: If the string is non-empty but cannot be parsed. @@ -39,12 +64,32 @@ def parse_iso_datetime(raw: str | None) -> datetime | None: if not raw or not str(raw).strip(): return None s = str(raw).strip() - if s.endswith("Z") and "T" in s: - s = s[:-1] + "+00:00" - try: - dt = datetime.fromisoformat(s) - except ValueError as e: - raise ValueError(f"Invalid ISO datetime ({s!r}): {e}") from e + dt = parse_iso_datetime_lenient(raw) + if dt is None: + raise ValueError(f"Invalid ISO datetime ({s!r})") if dt.tzinfo: return dt.astimezone(timezone.utc).replace(tzinfo=None) return dt + + +def format_instant_iso_z(raw: str | None) -> str: + """ + Normalize a date/datetime string to an ISO 8601 **instant** in UTC with a ``Z`` suffix. + + Uses :func:`parse_iso_datetime` for parsing. Empty or whitespace-only input returns + ``""``. If the string is non-empty but cannot be parsed, returns the stripped + original string (lenient handling for odd exporter payloads). + + Naive datetimes from parsing are interpreted as UTC wall clock before formatting. + """ + text = (raw or "").strip() + if not text: + return "" + try: + dt = parse_iso_datetime(text) + except ValueError: + return text + if dt is None: + return text + aware = dt.replace(tzinfo=timezone.utc) + return aware.astimezone(timezone.utc).isoformat().replace("+00:00", "Z") diff --git a/core/utils/text_processing.py b/core/utils/text_processing.py index 405d163..0b8a763 100644 --- a/core/utils/text_processing.py +++ b/core/utils/text_processing.py @@ -246,6 +246,17 @@ def filter_sentence( return sentence_lower.strip() +def truncate_content(content: str, max_length: int = 100) -> str: + """Return ``content`` truncated to ``max_length`` characters with ``...`` when longer.""" + if max_length < 0: + raise ValueError("max_length must be non-negative") + if len(content) <= max_length: + return content + if max_length <= 3: + return content[:max_length] + return content[: max_length - 3] + "..." + + def validate_content_length(content: str | None, min_length: int = 50) -> bool: """ Validate that content meets minimum length requirement. diff --git a/discord_activity_tracker/admin.py b/discord_activity_tracker/admin.py index 0f4fb87..8dd4f52 100644 --- a/discord_activity_tracker/admin.py +++ b/discord_activity_tracker/admin.py @@ -24,8 +24,6 @@ class DiscordChannelAdmin(admin.ModelAdmin): "server", "channel_type", "category_name", - "last_synced_at", - "last_activity_at", ) list_filter = ("channel_type", "server") search_fields = ("channel_name", "channel_id", "category_name") diff --git a/discord_activity_tracker/management/commands/backfill_discord_activity_tracker.py b/discord_activity_tracker/management/commands/backfill_discord_activity_tracker.py index aab5f99..5535ce4 100644 --- a/discord_activity_tracker/management/commands/backfill_discord_activity_tracker.py +++ b/discord_activity_tracker/management/commands/backfill_discord_activity_tracker.py @@ -22,8 +22,10 @@ from discord_activity_tracker.services import ( get_or_create_discord_channel, get_or_create_discord_server, - update_channel_last_activity, - update_channel_last_synced, +) +from discord_activity_tracker.staging_schema import ( + validate_envelope, + validate_normalized_message, ) from discord_activity_tracker.sync.chat_exporter import ( convert_exporter_message_to_dict, @@ -32,7 +34,6 @@ _safe_int, ) from discord_activity_tracker.sync.messages import _process_messages_in_batches -from discord_activity_tracker.sync.utils import parse_datetime from discord_activity_tracker.workspace import get_cpp_discussion_import_dir logger = logging.getLogger(__name__) @@ -77,12 +78,13 @@ def run(self) -> None: for i, json_path in enumerate(json_files, 1): try: data = parse_exported_json(json_path) - guild_info = data.get("guild", {}) - channel_info = data.get("channel", {}) - messages = data.get("messages", []) + rel = _json_display_path(import_dir, json_path) + envelope = validate_envelope(data, source=rel) + guild_info = envelope.guild.model_dump(by_alias=True) + channel_info = envelope.channel.model_dump(by_alias=True) + messages = envelope.messages ch_name = channel_info.get("name", "?") - rel = _json_display_path(import_dir, json_path) self.stdout.write( f" [{i}/{len(json_files)}] {rel} — #{ch_name}: {len(messages)} messages" ) @@ -132,16 +134,15 @@ async def _persist_channel( category_name=channel_info.get("category") or "", ) - converted = [convert_exporter_message_to_dict(m) for m in messages] + srv_id = _safe_int(guild_info.get("id", 0)) + ch_id = _safe_int(channel_info.get("id", 0)) + converted = [ + convert_exporter_message_to_dict(m, server_id=srv_id, channel_id=ch_id) + for m in messages + ] + for idx, cmsg in enumerate(converted): + validate_normalized_message(cmsg, source=f"message[{idx}]") count = await _process_messages_in_batches(channel, converted) - - if messages: - last_converted = convert_exporter_message_to_dict(messages[-1]) - last_time = parse_datetime(last_converted.get("created_at")) - if last_time: - await sync_to_async(update_channel_last_activity)(channel, last_time) - - await sync_to_async(update_channel_last_synced)(channel) return count def sync_pinecone(self) -> None: diff --git a/discord_activity_tracker/management/commands/run_discord_activity_tracker.py b/discord_activity_tracker/management/commands/run_discord_activity_tracker.py index c108b59..1871f3a 100644 --- a/discord_activity_tracker/management/commands/run_discord_activity_tracker.py +++ b/discord_activity_tracker/management/commands/run_discord_activity_tracker.py @@ -29,8 +29,11 @@ from discord_activity_tracker.services import ( get_or_create_discord_channel, get_or_create_discord_server, - update_channel_last_activity, - update_channel_last_synced, +) +from discord_activity_tracker.staging_schema import ( + StagingValidationError, + validate_envelope, + validate_normalized_message, ) from discord_activity_tracker.sync.exporter_window import ( latest_message_created_at_for_guild, @@ -44,7 +47,6 @@ parse_exported_json, ) from discord_activity_tracker.sync.messages import _process_messages_in_batches -from discord_activity_tracker.sync.utils import parse_datetime from discord_activity_tracker.workspace import ( clear_exporter_staging_dir, get_channel_raw_dir, @@ -197,9 +199,10 @@ def task_discord_sync( for i, json_path in enumerate(json_files, 1): try: data = parse_exported_json(json_path) - guild_info = data.get("guild", {}) - channel_info = data.get("channel", {}) - messages = data.get("messages", []) + envelope = validate_envelope(data, source=json_path.name) + guild_info = envelope.guild.model_dump(by_alias=True) + channel_info = envelope.channel.model_dump(by_alias=True) + messages = envelope.messages ch_name = channel_info.get("name", "?") ch_id = _safe_int(channel_info.get("id", 0)) @@ -223,6 +226,17 @@ def task_discord_sync( dest = channel_raw_dir / f"{date_tag}.json" json_path.rename(dest) + except StagingValidationError as exc: + logger.error( + "Staging validation failed for %s (file left in staging): %s", + json_path.name, + exc, + ) + continue + except ValueError as exc: + logger.error("Failed to process %s: %s", json_path.name, exc) + json_path.unlink(missing_ok=True) + continue except Exception as exc: logger.error("Failed to process %s: %s", json_path.name, exc) json_path.unlink(missing_ok=True) @@ -347,21 +361,15 @@ async def _persist_channel( category_name=channel_info.get("category") or "", ) - converted = [convert_exporter_message_to_dict(m) for m in messages] + srv_id = _safe_int(guild_info.get("id", 0)) + ch_id = _safe_int(channel_info.get("id", 0)) + converted = [ + convert_exporter_message_to_dict(m, server_id=srv_id, channel_id=ch_id) + for m in messages + ] + for idx, cmsg in enumerate(converted): + validate_normalized_message(cmsg, source=f"message[{idx}]") count = await _process_messages_in_batches(channel, converted) - - def finalize_exporter_channel_sync() -> None: - if converted: - parsed_times = [ - t - for m in converted - if (t := parse_datetime(m.get("created_at"))) is not None - ] - if parsed_times: - update_channel_last_activity(channel, max(parsed_times)) - update_channel_last_synced(channel) - - await sync_to_async(finalize_exporter_channel_sync)() return count diff --git a/discord_activity_tracker/migrations/0006_remove_channel_last_timestamps.py b/discord_activity_tracker/migrations/0006_remove_channel_last_timestamps.py new file mode 100644 index 0000000..f5c503e --- /dev/null +++ b/discord_activity_tracker/migrations/0006_remove_channel_last_timestamps.py @@ -0,0 +1,25 @@ +"""Remove DiscordChannel.last_synced_at and last_activity_at (use DiscordMessage instead).""" + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("discord_activity_tracker", "0005_channel_category_message_type_is_pinned"), + ] + + operations = [ + migrations.RemoveIndex( + model_name="discordchannel", + name="discord_act_last_ac_87ebfd_idx", + ), + migrations.RemoveField( + model_name="discordchannel", + name="last_activity_at", + ), + migrations.RemoveField( + model_name="discordchannel", + name="last_synced_at", + ), + ] diff --git a/discord_activity_tracker/models.py b/discord_activity_tracker/models.py index 5fe9636..f1f0504 100644 --- a/discord_activity_tracker/models.py +++ b/discord_activity_tracker/models.py @@ -36,8 +36,6 @@ class DiscordChannel(models.Model): category_name = models.CharField(max_length=255, blank=True) topic = models.TextField(blank=True) position = models.IntegerField(default=0) - last_synced_at = models.DateTimeField(null=True, blank=True, db_index=True) - last_activity_at = models.DateTimeField(null=True, blank=True, db_index=True) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) @@ -45,7 +43,6 @@ class Meta: ordering = ["server", "position", "channel_name"] indexes = [ models.Index(fields=["server", "channel_name"]), - models.Index(fields=["last_activity_at"]), ] def __str__(self): diff --git a/discord_activity_tracker/schemas/discord_staging_v1.json b/discord_activity_tracker/schemas/discord_staging_v1.json new file mode 100644 index 0000000..fcccad4 --- /dev/null +++ b/discord_activity_tracker/schemas/discord_staging_v1.json @@ -0,0 +1,337 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "discord_staging_v1", + "description": "Optional JSON Schema bundle for Discord staging data. Runtime validation uses Pydantic models in discord_activity_tracker/staging_schema.py.", + "discord_chat_exporter_envelope": { + "$defs": { + "DiscordExporterChannel": { + "additionalProperties": true, + "description": "Channel object inside a DiscordChatExporter JSON file.", + "properties": { + "id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Id" + }, + "name": { + "default": "", + "title": "Name", + "type": "string" + }, + "type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Type" + }, + "topic": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Topic" + }, + "category": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Category" + }, + "categoryId": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Categoryid" + } + }, + "title": "DiscordExporterChannel", + "type": "object" + }, + "DiscordExporterGuild": { + "additionalProperties": true, + "description": "Guild object inside a DiscordChatExporter JSON file.", + "properties": { + "id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Id" + }, + "name": { + "default": "", + "title": "Name", + "type": "string" + }, + "iconUrl": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Iconurl" + } + }, + "title": "DiscordExporterGuild", + "type": "object" + } + }, + "additionalProperties": true, + "description": "Top-level shape of a DiscordChatExporter ``.json`` export.", + "properties": { + "guild": { + "$ref": "#/discord_chat_exporter_envelope/$defs/DiscordExporterGuild" + }, + "channel": { + "$ref": "#/discord_chat_exporter_envelope/$defs/DiscordExporterChannel" + }, + "messages": { + "items": {}, + "title": "Messages", + "type": "array" + } + }, + "title": "DiscordChatExporterEnvelope", + "type": "object" + }, + "normalized_discord_message": { + "$defs": { + "NormalizedAttachment": { + "additionalProperties": true, + "properties": { + "url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Url" + } + }, + "title": "NormalizedAttachment", + "type": "object" + }, + "NormalizedAuthorExport": { + "additionalProperties": true, + "description": "Author block after ``convert_exporter_message_to_dict``.", + "properties": { + "id": { + "default": 0, + "title": "Id", + "type": "integer" + }, + "username": { + "default": "unknown", + "title": "Username", + "type": "string" + }, + "global_name": { + "default": "", + "title": "Global Name", + "type": "string" + }, + "avatar_url": { + "default": "", + "title": "Avatar Url", + "type": "string" + }, + "bot": { + "default": false, + "title": "Bot", + "type": "boolean" + } + }, + "title": "NormalizedAuthorExport", + "type": "object" + }, + "NormalizedReaction": { + "additionalProperties": true, + "properties": { + "emoji": { + "minLength": 1, + "title": "Emoji", + "type": "string" + }, + "count": { + "minimum": 0, + "title": "Count", + "type": "integer" + } + }, + "required": [ + "emoji", + "count" + ], + "title": "NormalizedReaction", + "type": "object" + } + }, + "additionalProperties": false, + "description": "Post-converter message dict (API-shaped + canonical enrichment fields).", + "properties": { + "id": { + "title": "Id", + "type": "integer" + }, + "content": { + "default": "", + "title": "Content", + "type": "string" + }, + "created_at": { + "pattern": "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(?:\\.\\d+)?Z$", + "title": "Created At", + "type": "string" + }, + "edited_at": { + "anyOf": [ + { + "pattern": "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(?:\\.\\d+)?Z$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Edited At" + }, + "message_type": { + "default": "Default", + "title": "Message Type", + "type": "string" + }, + "is_pinned": { + "default": false, + "title": "Is Pinned", + "type": "boolean" + }, + "author": { + "$ref": "#/normalized_discord_message/$defs/NormalizedAuthorExport" + }, + "attachments": { + "items": { + "$ref": "#/normalized_discord_message/$defs/NormalizedAttachment" + }, + "title": "Attachments", + "type": "array" + }, + "reactions": { + "items": { + "$ref": "#/normalized_discord_message/$defs/NormalizedReaction" + }, + "title": "Reactions", + "type": "array" + }, + "reference": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Reference" + }, + "occurred_at": { + "anyOf": [ + { + "pattern": "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(?:\\.\\d+)?Z$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Occurred At" + }, + "actor_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Actor Id" + }, + "source_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Source Url" + } + }, + "required": [ + "id", + "created_at", + "author" + ], + "title": "NormalizedDiscordMessage", + "type": "object" + } +} diff --git a/discord_activity_tracker/scripts/__init__.py b/discord_activity_tracker/scripts/__init__.py new file mode 100644 index 0000000..5afb933 --- /dev/null +++ b/discord_activity_tracker/scripts/__init__.py @@ -0,0 +1 @@ +"""Utility entry points for discord_activity_tracker (e.g. schema export).""" diff --git a/discord_activity_tracker/scripts/write_staging_json_schema.py b/discord_activity_tracker/scripts/write_staging_json_schema.py new file mode 100644 index 0000000..a7d3992 --- /dev/null +++ b/discord_activity_tracker/scripts/write_staging_json_schema.py @@ -0,0 +1,21 @@ +"""Write ``discord_activity_tracker/schemas/discord_staging_v1.json``. + +Run from the repository root:: + + python -m discord_activity_tracker.scripts.write_staging_json_schema + +See ``docs/discord-tracker-schema.md`` (section *JSON Schema artifact vs runtime validation*). +""" + +from __future__ import annotations + +from discord_activity_tracker.staging_schema import write_staging_json_schema + + +def main() -> None: + path = write_staging_json_schema() + print(path) + + +if __name__ == "__main__": + main() diff --git a/discord_activity_tracker/services.py b/discord_activity_tracker/services.py index 5c6f24d..647e894 100644 --- a/discord_activity_tracker/services.py +++ b/discord_activity_tracker/services.py @@ -8,6 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple from django.db import transaction +from django.db.models import Max, QuerySet from django.utils import timezone as django_timezone from cppa_user_tracker.models import DiscordProfile @@ -175,46 +176,47 @@ def add_or_update_reaction( return reaction, created -def update_channel_last_activity( - channel: DiscordChannel, last_activity_at: datetime -) -> DiscordChannel: - """Update channel last_activity_at timestamp.""" - channel.last_activity_at = last_activity_at - channel.save(update_fields=["last_activity_at", "updated_at"]) - return channel - +def get_channel_latest_message_at(channel: DiscordChannel) -> Optional[datetime]: + """Latest ``message_created_at`` among non-deleted messages in this channel, or None.""" + row = DiscordMessage.objects.filter(channel=channel, is_deleted=False).aggregate( + m=Max("message_created_at") + ) + return row["m"] -def update_channel_last_synced( - channel: DiscordChannel, timestamp: Optional[datetime] = None -) -> DiscordChannel: - """Update channel last_synced_at (defaults to now).""" - if timestamp is None: - timestamp = django_timezone.now() - channel.last_synced_at = timestamp - channel.save(update_fields=["last_synced_at", "updated_at"]) - logger.info(f"Updated last_synced_at for channel {channel.channel_name}") - return channel +def queryset_channels_with_recent_messages( + server: DiscordServer, + cutoff: datetime, + channel_ids: Optional[List[int]] = None, +) -> QuerySet[DiscordChannel]: + """Channels on *server* that have at least one non-deleted message at or after *cutoff*.""" + pks = ( + DiscordMessage.objects.filter( + channel__server=server, + message_created_at__gte=cutoff, + is_deleted=False, + ) + .values_list("channel_id", flat=True) + .distinct() + ) + qs = DiscordChannel.objects.filter(server=server, pk__in=pks).order_by( + "position", "channel_name" + ) + if channel_ids: + qs = qs.filter(channel_id__in=channel_ids) + return qs def get_active_channels( server: DiscordServer, days: int = 30, channel_ids: Optional[List[int]] = None, -) -> list: - """Get channels with activity in last N days, optionally filtered by channel_ids allowlist.""" +) -> QuerySet[DiscordChannel]: + """Channels with at least one non-deleted message in the last *days*, optional allowlist.""" from datetime import timedelta cutoff = django_timezone.now() - timedelta(days=days) - - qs = DiscordChannel.objects.filter( - server=server, last_activity_at__gte=cutoff - ).order_by("position", "channel_name") - - if channel_ids: - qs = qs.filter(channel_id__in=channel_ids) - - return qs + return queryset_channels_with_recent_messages(server, cutoff, channel_ids) # --------------------------------------------------------------------------- diff --git a/discord_activity_tracker/staging_schema.py b/discord_activity_tracker/staging_schema.py new file mode 100644 index 0000000..614b241 --- /dev/null +++ b/discord_activity_tracker/staging_schema.py @@ -0,0 +1,195 @@ +"""Pydantic validation for Discord staging / ingestion payloads. + +Runtime validation uses the models in this module only. + +Reviewers who prefer raw JSON Schema may read the optional committed copy at +``discord_activity_tracker/schemas/discord_staging_v1.json`` (see generation +notes in ``docs/discord-tracker-schema.md``, section **JSON Schema artifact vs +runtime validation**). That file can drift if models change; regenerate it with +``python -m discord_activity_tracker.scripts.write_staging_json_schema`` (see +script docstring) or by calling ``write_staging_json_schema`` from a REPL. + +Human-readable field definitions and cross-tracker alignment notes live in +``docs/discord-tracker-schema.md``. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Annotated, Any, Union + +from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator + +from core.utils.datetime_parsing import CANONICAL_INSTANT_UTC_Z_PATTERN + +NormalizedMessageInstantUtcZ = Annotated[ + str, Field(pattern=CANONICAL_INSTANT_UTC_Z_PATTERN) +] + + +class StagingValidationError(ValueError): + """Discord staging payload failed Pydantic validation (envelope or normalized message).""" + + +class DiscordExporterGuild(BaseModel): + """Guild object inside a DiscordChatExporter JSON file.""" + + model_config = ConfigDict(extra="allow", populate_by_name=True) + + id: Union[str, int, None] = None + name: str = "" + iconUrl: str | None = Field(default=None, validation_alias="iconUrl") + + +class DiscordExporterChannel(BaseModel): + """Channel object inside a DiscordChatExporter JSON file.""" + + model_config = ConfigDict(extra="allow", populate_by_name=True) + + id: Union[str, int, None] = None + name: str = "" + type: str | None = None + topic: str | None = None + category: str | None = None + categoryId: Union[str, int, None] = Field( + default=None, validation_alias="categoryId" + ) + + +class DiscordChatExporterEnvelope(BaseModel): + """Top-level shape of a DiscordChatExporter ``.json`` export.""" + + model_config = ConfigDict(extra="allow") + + guild: DiscordExporterGuild = Field(default_factory=DiscordExporterGuild) + channel: DiscordExporterChannel = Field(default_factory=DiscordExporterChannel) + messages: list[Any] = Field(default_factory=list) + + @field_validator("messages", mode="before") + @classmethod + def _messages_must_be_list(cls, v: Any) -> Any: + if v is None: + return [] + if not isinstance(v, list): + raise ValueError("messages must be a JSON array") + return v + + +class NormalizedAttachment(BaseModel): + model_config = ConfigDict(extra="allow") + + url: str | None = None + + +class NormalizedAuthorExport(BaseModel): + """Author block after ``convert_exporter_message_to_dict``.""" + + model_config = ConfigDict(extra="allow") + + id: int = 0 + username: str = "unknown" + global_name: str = "" + avatar_url: str = "" + bot: bool = False + + +class NormalizedReaction(BaseModel): + model_config = ConfigDict(extra="allow") + + emoji: str = Field(min_length=1) + count: int = Field(ge=0) + + +class NormalizedDiscordMessage(BaseModel): + """Post-converter message dict (API-shaped + canonical enrichment fields).""" + + model_config = ConfigDict(extra="forbid") + + id: int + content: str = "" + created_at: NormalizedMessageInstantUtcZ + edited_at: NormalizedMessageInstantUtcZ | None = None + message_type: str = "Default" + is_pinned: bool = False + author: NormalizedAuthorExport + attachments: list[NormalizedAttachment] = Field(default_factory=list) + reactions: list[NormalizedReaction] = Field(default_factory=list) + reference: dict[str, Any] | None = None + occurred_at: NormalizedMessageInstantUtcZ | None = None + actor_id: str | None = None + source_url: str | None = None + + @field_validator("edited_at", "occurred_at", mode="before") + @classmethod + def _blank_optional_timestamp_to_none(cls, v: Any) -> Any: + if v is None: + return None + if isinstance(v, str) and not v.strip(): + return None + return v + + +def _validation_error(prefix: str, err: ValidationError) -> None: + detail = err.errors()[:5] + msg = f"{prefix}: " + "; ".join( + f"{e.get('loc', ())}: {e.get('msg', '')}" for e in detail + ) + if len(err.errors()) > 5: + msg += f" … ({len(err.errors())} errors total)" + raise StagingValidationError(msg) from err + + +def validate_envelope( + data: dict[str, Any], + *, + source: str | None = None, +) -> DiscordChatExporterEnvelope: + """Validate parsed DiscordChatExporter file contents. Raises ``StagingValidationError``.""" + prefix = f"Invalid Discord export envelope{f' ({source})' if source else ''}" + try: + return DiscordChatExporterEnvelope.model_validate(data) + except ValidationError as e: + raise _validation_error(prefix, e) from e + + +def validate_normalized_message( + obj: dict[str, Any], + *, + source: str | None = None, +) -> NormalizedDiscordMessage: + """Validate one normalized message dict. Raises ``StagingValidationError``.""" + prefix = f"Invalid normalized Discord message{f' ({source})' if source else ''}" + try: + return NormalizedDiscordMessage.model_validate(obj) + except ValidationError as e: + raise _validation_error(prefix, e) from e + + +def build_staging_json_schema_bundle() -> dict[str, Any]: + """Build a JSON-serializable object holding JSON Schemas for reviewer use.""" + return { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "discord_staging_v1", + "description": ( + "Optional JSON Schema bundle for Discord staging data. Runtime " + "validation uses Pydantic models in discord_activity_tracker/staging_schema.py." + ), + "discord_chat_exporter_envelope": DiscordChatExporterEnvelope.model_json_schema( + ref_template="#/discord_chat_exporter_envelope/$defs/{model}" + ), + "normalized_discord_message": NormalizedDiscordMessage.model_json_schema( + ref_template="#/normalized_discord_message/$defs/{model}" + ), + } + + +def write_staging_json_schema(path: Path | None = None) -> Path: + """Write ``discord_staging_v1.json`` next to this package's ``schemas/`` dir.""" + target = path or ( + Path(__file__).resolve().parent / "schemas" / "discord_staging_v1.json" + ) + target.parent.mkdir(parents=True, exist_ok=True) + bundle = build_staging_json_schema_bundle() + target.write_text(json.dumps(bundle, indent=2) + "\n", encoding="utf-8") + return target diff --git a/discord_activity_tracker/sync/chat_exporter.py b/discord_activity_tracker/sync/chat_exporter.py index 09421a5..6186359 100644 --- a/discord_activity_tracker/sync/chat_exporter.py +++ b/discord_activity_tracker/sync/chat_exporter.py @@ -12,10 +12,31 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Sequence +from core.utils.datetime_parsing import ( + CANONICAL_INSTANT_UTC_Z_PATTERN, + format_instant_iso_z, +) + +from .utils import format_discord_url from ..workspace import get_workspace_root _SAFE_INT_MAX = 2**63 - 1 # max safe BigIntegerField value +_INSTANT_Z_RE = re.compile(CANONICAL_INSTANT_UTC_Z_PATTERN) + + +def _coerce_exporter_timestamp(raw: Any, *, optional: bool = False) -> str | None: + """Normalize DiscordChatExporter timestamp strings toward ISO 8601 UTC ``Z``. + + Uses :func:`format_instant_iso_z`. When *optional* is True, missing or blank + values return ``None`` (e.g. ``timestampEdited``). Otherwise empty input is + normalized via ``format_instant_iso_z`` like any other string (typically ``""``). + """ + if optional: + if raw is None or (isinstance(raw, str) and not str(raw).strip()): + return None + return format_instant_iso_z(raw if raw is not None else "") + def _safe_int(value: object, default: int = 0) -> int: """Convert a snowflake string or int to int; clamp to BigIntegerField range.""" @@ -623,22 +644,59 @@ def parse_exported_json(json_path: Path) -> Dict[str, Any]: raise -def convert_exporter_message_to_dict(msg_data: Dict[str, Any]) -> Dict[str, Any]: +def convert_exporter_message_to_dict( + msg_data: Dict[str, Any], + *, + server_id: Optional[int] = None, + channel_id: Optional[int] = None, +) -> Dict[str, Any]: """Convert DiscordChatExporter message format to our internal format. Key normalizations applied here: - All snowflake IDs coerced from string → int via _safe_int. - - Reaction emoji extracted from nested {"name": ...} dict to plain string. + - Reaction emoji extracted from nested {"name": ...} dict to plain string; + reactions with no resolvable emoji are dropped (not persisted). + - Reaction ``count`` coerced via :func:`_safe_int` (malformed values → ``0``). - Author avatarUrl mapped to avatar_url. - message_type and is_pinned mapped from DiscordChatExporter fields. + - When ``server_id`` and ``channel_id`` are set, adds ``source_url``; + ``occurred_at`` (ISO 8601 UTC ``Z``) and ``actor_id`` when derivable. + - ``created_at`` / ``edited_at`` are normalized with :func:`format_instant_iso_z` + where possible so values match the canonical ``Z`` instant pattern used at + validation time. """ author = msg_data.get("author", {}) - converted = { + reactions_out: List[Dict[str, Any]] = [] + for reaction in msg_data.get("reactions", []): + emoji_raw = reaction.get("emoji") + if isinstance(emoji_raw, dict): + emoji_name = emoji_raw.get("name") or "" + elif emoji_raw is None: + emoji_name = "" + else: + emoji_name = str(emoji_raw) + emoji_name = (emoji_name or "").strip() + if not emoji_name: + continue + count = _safe_int(reaction.get("count", 0), default=0) + reactions_out.append( + { + "emoji": emoji_name, + "count": max(0, count), + } + ) + + created_at_z = _coerce_exporter_timestamp(msg_data.get("timestamp", "")) + edited_at_z = _coerce_exporter_timestamp( + msg_data.get("timestampEdited"), optional=True + ) + + converted: Dict[str, Any] = { "id": _safe_int(msg_data.get("id", 0)), "content": msg_data.get("content", ""), - "created_at": msg_data.get("timestamp", ""), - "edited_at": msg_data.get("timestampEdited"), + "created_at": created_at_z, + "edited_at": edited_at_z, "message_type": msg_data.get("type", "Default") or "Default", "is_pinned": bool(msg_data.get("isPinned", False)), "author": { @@ -651,13 +709,7 @@ def convert_exporter_message_to_dict(msg_data: Dict[str, Any]) -> Dict[str, Any] "attachments": [ {"url": att.get("url")} for att in msg_data.get("attachments", []) ], - "reactions": [ - { - "emoji": (reaction.get("emoji") or {}).get("name") or "", - "count": reaction.get("count", 0), - } - for reaction in msg_data.get("reactions", []) - ], + "reactions": reactions_out, "reference": None, } @@ -666,6 +718,25 @@ def convert_exporter_message_to_dict(msg_data: Dict[str, Any]) -> Dict[str, Any] ref_id = ref.get("messageId") or ref.get("message_id") converted["reference"] = {"message_id": _safe_int(ref_id) if ref_id else None} + if created_at_z and _INSTANT_Z_RE.fullmatch(created_at_z): + converted["occurred_at"] = created_at_z + + author_id = _safe_int(author.get("id", 0)) + if author_id: + converted["actor_id"] = str(author_id) + + mid = converted["id"] + if ( + server_id is not None + and channel_id is not None + and mid + and int(server_id) > 0 + and int(channel_id) > 0 + ): + converted["source_url"] = format_discord_url( + int(server_id), int(channel_id), int(mid) + ) + return converted diff --git a/discord_activity_tracker/sync/export.py b/discord_activity_tracker/sync/export.py index 18ba216..1051a0f 100644 --- a/discord_activity_tracker/sync/export.py +++ b/discord_activity_tracker/sync/export.py @@ -12,6 +12,7 @@ from django.utils import timezone as django_timezone from ..models import DiscordServer, DiscordChannel, DiscordMessage +from ..services import queryset_channels_with_recent_messages from .utils import sanitize_channel_name, format_discord_url logger = logging.getLogger(__name__) @@ -286,10 +287,8 @@ def export_all_active_channels( logger.info(f"Exporting all active channels for last {months_back} months") cutoff = django_timezone.now() - timedelta(days=active_days) - channels = ( - DiscordChannel.objects.filter(server=server, last_activity_at__gte=cutoff) - .select_related("server") - .order_by("position", "channel_name") + channels = queryset_channels_with_recent_messages(server, cutoff).select_related( + "server" ) logger.info(f"Found {channels.count()} active channels") diff --git a/discord_activity_tracker/sync/messages.py b/discord_activity_tracker/sync/messages.py index ef1a191..709fe81 100644 --- a/discord_activity_tracker/sync/messages.py +++ b/discord_activity_tracker/sync/messages.py @@ -9,18 +9,19 @@ from asgiref.sync import sync_to_async from cppa_user_tracker.services import get_or_create_discord_profile -from ..models import DiscordServer, DiscordChannel +from core.utils.datetime_parsing import parse_iso_datetime_lenient + +from ..models import DiscordServer, DiscordChannel, DiscordMessage from ..services import ( get_or_create_discord_server, get_or_create_discord_channel, create_or_update_discord_message, add_or_update_reaction, - update_channel_last_synced, - update_channel_last_activity, + get_channel_latest_message_at, bulk_process_message_batch, ) from .client import DiscordSyncClient -from .utils import parse_datetime, parse_discord_user +from .utils import parse_discord_user logger = logging.getLogger(__name__) @@ -90,8 +91,8 @@ async def _process_message_data(channel: DiscordChannel, message_data: Dict[str, is_bot=author_info["is_bot"], ) - created_at = parse_datetime(message_data.get("created_at")) - edited_at = parse_datetime(message_data.get("edited_at")) + created_at = parse_iso_datetime_lenient(message_data.get("created_at")) + edited_at = parse_iso_datetime_lenient(message_data.get("edited_at")) if created_at is None: logger.error( @@ -142,8 +143,8 @@ def _prepare_message_data( author_data = message_data.get("author", {}) author_info = parse_discord_user(author_data) - created_at = parse_datetime(message_data.get("created_at")) - edited_at = parse_datetime(message_data.get("edited_at")) + created_at = parse_iso_datetime_lenient(message_data.get("created_at")) + edited_at = parse_iso_datetime_lenient(message_data.get("edited_at")) if created_at is None: logger.error(f"Message {message_data.get('id')} has no created_at timestamp") @@ -217,13 +218,15 @@ async def sync_channel_messages_async( elif since_date: after = since_date logger.info(f"Syncing messages since: {after}") - elif channel.last_synced_at: - after = channel.last_synced_at - logger.info(f"Syncing messages since last sync: {after}") else: - # Default: fetch last 30 days - after = django_timezone.now() - timedelta(days=30) - logger.info(f"First sync: fetching messages from last 30 days ({after})") + latest = await sync_to_async(get_channel_latest_message_at)(channel) + if latest: + after = latest + logger.info(f"Syncing messages since last stored message: {after}") + else: + # Default: fetch last 30 days + after = django_timezone.now() - timedelta(days=30) + logger.info(f"First sync: fetching messages from last 30 days ({after})") discord_channel = await client.get_channel(channel.channel_id) if discord_channel is None: @@ -242,15 +245,6 @@ async def sync_channel_messages_async( processed = await _process_messages_in_batches(channel, messages) logger.info(f"Bulk-processed {processed} messages for #{channel.channel_name}") - if messages: - last_message_time = parse_datetime(messages[-1]["created_at"]) - if last_message_time: - await sync_to_async(update_channel_last_activity)( - channel, last_message_time - ) - - await sync_to_async(update_channel_last_synced)(channel) - logger.info( f"Successfully synced {len(messages)} messages for #{channel.channel_name}" ) @@ -344,11 +338,16 @@ def sync_all_channels( # Filter for active channels if requested if active_only and not full_sync: cutoff = django_timezone.now() - timedelta(days=active_days) - channels = [ - ch - for ch in channels - if ch.last_activity_at and ch.last_activity_at >= cutoff - ] + recent_pks = set( + DiscordMessage.objects.filter( + channel__server=server, + message_created_at__gte=cutoff, + is_deleted=False, + ) + .values_list("channel_id", flat=True) + .distinct() + ) + channels = [ch for ch in channels if ch.pk in recent_pks] logger.info( f"Filtered to {len(channels)} active channels " f"(last {active_days} days)" diff --git a/discord_activity_tracker/sync/utils.py b/discord_activity_tracker/sync/utils.py index 8471e88..a0a72a7 100644 --- a/discord_activity_tracker/sync/utils.py +++ b/discord_activity_tracker/sync/utils.py @@ -1,24 +1,6 @@ """Helpers for Discord sync.""" -import logging -from datetime import datetime -from typing import Optional, Dict, Any - -logger = logging.getLogger(__name__) - - -def parse_datetime(date_str: Optional[str]) -> Optional[datetime]: - """Parse ISO datetime.""" - if not date_str: - return None - - try: - if date_str.endswith("Z"): - date_str = date_str[:-1] + "+00:00" - return datetime.fromisoformat(date_str) - except (ValueError, AttributeError) as e: - logger.debug(f"Failed to parse datetime '{date_str}': {e}") - return None +from typing import Any, Dict, Optional def parse_discord_user(user_data: Optional[Dict[str, Any]]) -> Dict[str, Any]: @@ -74,10 +56,3 @@ def sanitize_channel_name(channel_name: str) -> str: def format_discord_url(server_id: int, channel_id: int, message_id: int) -> str: """Build Discord message URL.""" return f"https://discord.com/channels/{server_id}/{channel_id}/{message_id}" - - -def truncate_content(content: str, max_length: int = 100) -> str: - """Truncate with ellipsis.""" - if len(content) <= max_length: - return content - return content[: max_length - 3] + "..." diff --git a/discord_activity_tracker/tests/test_messages_more.py b/discord_activity_tracker/tests/test_messages_more.py index 151c601..e39b8f5 100644 --- a/discord_activity_tracker/tests/test_messages_more.py +++ b/discord_activity_tracker/tests/test_messages_more.py @@ -7,7 +7,9 @@ import pytest from django.utils import timezone as django_timezone +from cppa_user_tracker.models import DiscordProfile from discord_activity_tracker.models import DiscordChannel, DiscordServer +from discord_activity_tracker.services import create_or_update_discord_message from discord_activity_tracker.sync import messages as messages_mod from discord_activity_tracker.sync.messages import ( _sync_all_channels_async, @@ -68,8 +70,8 @@ async def main(): asyncio.run(main()) -@pytest.mark.django_db -def test_sync_channel_messages_async_last_synced_branch(): +@pytest.mark.django_db(transaction=True) +def test_sync_channel_messages_async_uses_latest_stored_message_for_after(): gid = _uniq() cid = _uniq() server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") @@ -78,13 +80,27 @@ def test_sync_channel_messages_async_last_synced_branch(): channel_id=cid, channel_name="c", channel_type="text", - last_synced_at=django_timezone.now() - timedelta(hours=3), + ) + author = DiscordProfile.objects.create( + discord_user_id=_uniq(), + username="u", + display_name="", + avatar_url="", + is_bot=False, + ) + stored_ts = django_timezone.now() - timedelta(hours=3) + create_or_update_discord_message( + _uniq(), channel, author, "x", message_created_at=stored_ts ) async def main(): client = MagicMock() - client.get_channel = AsyncMock(return_value=None) + dch = MagicMock() + client.get_channel = AsyncMock(return_value=dch) + client.fetch_messages_since = AsyncMock(return_value=[]) await sync_channel_messages_async(client, channel, gid) + client.fetch_messages_since.assert_awaited_once() + assert client.fetch_messages_since.await_args.kwargs["after"] == stored_ts asyncio.run(main()) @@ -102,13 +118,18 @@ def test_sync_channel_messages_async_default_window(monkeypatch): channel_id=cid, channel_name="c", channel_type="text", - last_synced_at=None, ) async def main(): client = MagicMock() - client.get_channel = AsyncMock(return_value=None) + dch = MagicMock() + client.get_channel = AsyncMock(return_value=dch) + client.fetch_messages_since = AsyncMock(return_value=[]) await sync_channel_messages_async(client, channel, gid) + client.fetch_messages_since.assert_awaited_once() + assert client.fetch_messages_since.await_args.kwargs[ + "after" + ] == fixed_now - timedelta(days=30) asyncio.run(main()) diff --git a/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py b/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py index dd823a9..2d38cee 100644 --- a/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py +++ b/discord_activity_tracker/tests/test_run_discord_activity_tracker_command.py @@ -14,6 +14,7 @@ _parse_channel_ids, _resolve_exporter_date_bounds, ) +from discord_activity_tracker.staging_schema import StagingValidationError def _cmd_and_collector(**opts): @@ -38,6 +39,10 @@ def _cmd_and_collector(**opts): return cmd, collector +def test_staging_validation_error_subclasses_value_error(): + assert issubclass(StagingValidationError, ValueError) + + # --------------------------------------------------------------------------- # _parse_channel_ids # --------------------------------------------------------------------------- diff --git a/discord_activity_tracker/tests/test_services_core.py b/discord_activity_tracker/tests/test_services_core.py index 2c03749..bf403ea 100644 --- a/discord_activity_tracker/tests/test_services_core.py +++ b/discord_activity_tracker/tests/test_services_core.py @@ -12,11 +12,10 @@ add_or_update_reaction, create_or_update_discord_message, get_active_channels, + get_channel_latest_message_at, get_or_create_discord_channel, get_or_create_discord_server, mark_message_deleted, - update_channel_last_activity, - update_channel_last_synced, ) @@ -199,22 +198,39 @@ def test_add_or_update_reaction(channel, author): @pytest.mark.django_db -def test_update_channel_last_activity_and_synced(channel): - act = django_timezone.now() - update_channel_last_activity(channel, act) - channel.refresh_from_db() - assert channel.last_activity_at == act +def test_get_channel_latest_message_at(channel, author): + assert get_channel_latest_message_at(channel) is None + ts = django_timezone.now() + create_or_update_discord_message( + _uniq_id(), channel, author, "x", message_created_at=ts + ) + assert get_channel_latest_message_at(channel) == ts + - sync_ts = django_timezone.now() - timedelta(hours=1) - update_channel_last_synced(channel, sync_ts) - channel.refresh_from_db() - assert channel.last_synced_at == sync_ts +@pytest.mark.django_db +def test_get_channel_latest_message_at_ignores_deleted(channel, author): + t_old = django_timezone.now() - timedelta(hours=1) + t_new = django_timezone.now() + create_or_update_discord_message( + _uniq_id(), channel, author, "old", message_created_at=t_old + ) + msg_new, _ = create_or_update_discord_message( + _uniq_id(), channel, author, "new", message_created_at=t_new + ) + assert get_channel_latest_message_at(channel) == t_new + mark_message_deleted(msg_new) + assert get_channel_latest_message_at(channel) == t_old @pytest.mark.django_db -def test_get_active_channels_filters_by_days(channel, server): - channel.last_activity_at = django_timezone.now() - channel.save() +def test_get_active_channels_filters_by_days(channel, server, author): + create_or_update_discord_message( + _uniq_id(), + channel, + author, + "recent", + message_created_at=django_timezone.now(), + ) stale = DiscordChannel.objects.create( server=server, channel_id=_uniq_id(), @@ -222,7 +238,13 @@ def test_get_active_channels_filters_by_days(channel, server): channel_type="text", topic="", position=1, - last_activity_at=django_timezone.now() - timedelta(days=60), + ) + create_or_update_discord_message( + _uniq_id(), + stale, + author, + "old", + message_created_at=django_timezone.now() - timedelta(days=60), ) active = get_active_channels(server, days=30) ids = {c.channel_id for c in active} @@ -231,7 +253,7 @@ def test_get_active_channels_filters_by_days(channel, server): @pytest.mark.django_db -def test_get_active_channels_allowlist_filter(server): +def test_get_active_channels_allowlist_filter(server, author): """channel_ids allowlist pre-filters the queryset.""" now = django_timezone.now() ch1 = DiscordChannel.objects.create( @@ -239,15 +261,17 @@ def test_get_active_channels_allowlist_filter(server): channel_id=_uniq_id(), channel_name="allowed", channel_type="text", - last_activity_at=now, ) ch2 = DiscordChannel.objects.create( server=server, channel_id=_uniq_id(), channel_name="blocked", channel_type="text", - last_activity_at=now, ) + for ch in (ch1, ch2): + create_or_update_discord_message( + _uniq_id(), ch, author, "x", message_created_at=now + ) result = get_active_channels(server, days=30, channel_ids=[ch1.channel_id]) ids = {c.channel_id for c in result} assert ch1.channel_id in ids @@ -255,7 +279,7 @@ def test_get_active_channels_allowlist_filter(server): @pytest.mark.django_db -def test_get_active_channels_empty_allowlist_returns_all(server): +def test_get_active_channels_empty_allowlist_returns_all(server, author): """Empty channel_ids means no filter — all active channels returned.""" now = django_timezone.now() ch1 = DiscordChannel.objects.create( @@ -263,15 +287,17 @@ def test_get_active_channels_empty_allowlist_returns_all(server): channel_id=_uniq_id(), channel_name="a", channel_type="text", - last_activity_at=now, ) ch2 = DiscordChannel.objects.create( server=server, channel_id=_uniq_id(), channel_name="b", channel_type="text", - last_activity_at=now, ) + for ch in (ch1, ch2): + create_or_update_discord_message( + _uniq_id(), ch, author, "x", message_created_at=now + ) result = get_active_channels(server, days=30, channel_ids=None) ids = {c.channel_id for c in result} assert ch1.channel_id in ids diff --git a/discord_activity_tracker/tests/test_services_extras.py b/discord_activity_tracker/tests/test_services_extras.py index d2035c3..4a8f205 100644 --- a/discord_activity_tracker/tests/test_services_extras.py +++ b/discord_activity_tracker/tests/test_services_extras.py @@ -13,7 +13,6 @@ bulk_upsert_discord_reactions, bulk_upsert_discord_users, mark_message_deleted, - update_channel_last_synced, ) @@ -57,13 +56,6 @@ def test_mark_message_deleted_default_timestamp(channel): assert msg.deleted_at >= before -@pytest.mark.django_db -def test_update_channel_last_synced_default_now(channel): - update_channel_last_synced(channel) - channel.refresh_from_db() - assert channel.last_synced_at is not None - - @pytest.mark.django_db def test_bulk_upsert_skips_message_without_author(channel): """Covers bulk_upsert_discord_messages warning path when author missing.""" diff --git a/discord_activity_tracker/tests/test_staging_schema.py b/discord_activity_tracker/tests/test_staging_schema.py new file mode 100644 index 0000000..5c84795 --- /dev/null +++ b/discord_activity_tracker/tests/test_staging_schema.py @@ -0,0 +1,119 @@ +"""Tests for discord_activity_tracker.staging_schema validation.""" + +import pytest + +from discord_activity_tracker.staging_schema import ( + StagingValidationError, + validate_envelope, + validate_normalized_message, +) +from discord_activity_tracker.sync.chat_exporter import convert_exporter_message_to_dict + + +def _minimal_exporter_message(): + return { + "id": "1399663560723923005", + "type": "Default", + "isPinned": False, + "timestamp": "2026-01-01T12:00:00Z", + "content": "hello world example text long enough", + "author": {"id": "1082347485026070548", "name": "user"}, + "attachments": [], + "reactions": [], + } + + +def test_validate_normalized_well_formed_message(): + raw = _minimal_exporter_message() + converted = convert_exporter_message_to_dict( + raw, server_id=900, channel_id=851121440425639956 + ) + model = validate_normalized_message(converted, source="test") + assert model.id == 1399663560723923005 + assert model.source_url.startswith("https://discord.com/channels/") + assert model.actor_id == "1082347485026070548" + assert model.occurred_at.endswith("Z") + + +def test_validate_normalized_well_formed_reactions(): + raw = { + "id": "1", + "timestamp": "2026-01-01T00:00:00Z", + "content": "x", + "author": {"id": "1", "name": "a"}, + "attachments": [], + "reactions": [ + {"emoji": {"id": None, "name": "thumbsup", "isAnimated": False}, "count": 2} + ], + } + converted = convert_exporter_message_to_dict(raw, server_id=1, channel_id=2) + model = validate_normalized_message(converted) + assert len(model.reactions) == 1 + assert model.reactions[0].emoji == "thumbsup" + assert model.reactions[0].count == 2 + + +def test_validate_normalized_malformed_rejects_with_staging_validation_error(): + bad = { + "id": 1, + "content": "", + "created_at": "", + "edited_at": None, + "message_type": "Default", + "is_pinned": False, + "author": { + "id": 0, + "username": "x", + "global_name": "", + "avatar_url": "", + "bot": False, + }, + "attachments": [], + "reactions": [], + "reference": None, + } + with pytest.raises( + StagingValidationError, match="Invalid normalized Discord message" + ) as excinfo: + validate_normalized_message(bad, source="unit") + assert "pydantic" not in type(excinfo.value).__name__.lower() + err = excinfo.value + assert err.__cause__ is not None + + +def test_validate_normalized_rejects_created_at_without_z_suffix(): + raw = _minimal_exporter_message() + bad = convert_exporter_message_to_dict(raw, server_id=1, channel_id=2) + bad["created_at"] = "2026-01-01T00:00:00+00:00" + with pytest.raises( + StagingValidationError, match="Invalid normalized Discord message" + ): + validate_normalized_message(bad, source="unit") + + +def test_validate_envelope_rejects_non_list_messages(): + with pytest.raises(StagingValidationError, match="Invalid Discord export envelope"): + validate_envelope( + { + "guild": {"id": "1", "name": "G"}, + "channel": {"id": "2", "name": "C"}, + "messages": "nope", + }, + source="x.json", + ) + + +def test_validate_envelope_messages_none_becomes_empty_list(): + env = validate_envelope( + { + "guild": {"id": "1", "name": "G"}, + "channel": {"id": "2", "name": "C"}, + "messages": None, + }, + source="empty.json", + ) + assert env.messages == [] + guild = env.guild.model_dump(by_alias=True) + channel = env.channel.model_dump(by_alias=True) + assert guild.get("id") == "1" + assert channel.get("name") == "C" diff --git a/discord_activity_tracker/tests/test_sync_chat_exporter.py b/discord_activity_tracker/tests/test_sync_chat_exporter.py index 121b9ca..584fc9f 100644 --- a/discord_activity_tracker/tests/test_sync_chat_exporter.py +++ b/discord_activity_tracker/tests/test_sync_chat_exporter.py @@ -520,6 +520,21 @@ def test_convert_exporter_message_ids_are_int(): assert isinstance(out["author"]["id"], int) +def test_convert_exporter_message_canonical_fields_with_server_channel(): + raw = { + "id": "10", + "timestamp": "2026-01-01T00:00:00Z", + "content": "c", + "author": {"id": "1", "name": "a"}, + "attachments": [], + "reactions": [], + } + out = convert_exporter_message_to_dict(raw, server_id=99, channel_id=100) + assert out["actor_id"] == "1" + assert out["occurred_at"] == "2026-01-01T00:00:00Z" + assert out["source_url"] == "https://discord.com/channels/99/100/10" + + def test_convert_exporter_message_reaction_emoji_flattened(): """Reaction emoji dict must be flattened to a plain string.""" raw = { @@ -537,7 +552,25 @@ def test_convert_exporter_message_reaction_emoji_flattened(): assert out["reactions"][0]["count"] == 3 -def test_convert_exporter_message_reaction_null_emoji_is_empty_string(): +def test_convert_exporter_message_reaction_malformed_count_defaults(): + raw = { + "id": "1", + "timestamp": "2026-01-01T00:00:00Z", + "content": "", + "author": {"id": "1", "name": "a"}, + "attachments": [], + "reactions": [ + {"emoji": {"name": "x"}, "count": "not-a-number"}, + {"emoji": {"name": "y"}, "count": -2}, + ], + } + out = convert_exporter_message_to_dict(raw) + assert len(out["reactions"]) == 2 + assert out["reactions"][0]["count"] == 0 + assert out["reactions"][1]["count"] == 0 + + +def test_convert_exporter_message_reaction_null_emoji_is_dropped(): raw = { "id": "1", "timestamp": "2026-01-01T00:00:00Z", @@ -547,7 +580,7 @@ def test_convert_exporter_message_reaction_null_emoji_is_empty_string(): "reactions": [{"emoji": None, "count": 1}], } out = convert_exporter_message_to_dict(raw) - assert out["reactions"][0]["emoji"] == "" + assert out["reactions"] == [] def test_convert_exporter_message_avatarUrl_mapped(): diff --git a/discord_activity_tracker/tests/test_sync_messages.py b/discord_activity_tracker/tests/test_sync_messages.py index bbe6391..19dedc4 100644 --- a/discord_activity_tracker/tests/test_sync_messages.py +++ b/discord_activity_tracker/tests/test_sync_messages.py @@ -9,6 +9,7 @@ from django.utils import timezone as django_timezone from cppa_user_tracker.services import get_or_create_discord_profile +from cppa_user_tracker.models import DiscordProfile from discord_activity_tracker.models import DiscordChannel, DiscordServer from discord_activity_tracker.services import ( add_or_update_reaction, @@ -383,30 +384,6 @@ async def main(): asyncio.run(main()) -def _sync_to_async_updates_only(): - def router(fn): - if fn.__name__ == "update_channel_last_synced": - - async def inner(ch): - ch.last_synced_at = django_timezone.now() - return ch - - return inner - if fn.__name__ == "update_channel_last_activity": - - async def inner(_ch, _ts): - return None - - return inner - - return asgiref_sync_to_async(fn, thread_sensitive=True) - - return patch( - "discord_activity_tracker.sync.messages.sync_to_async", - side_effect=router, - ) - - @pytest.mark.django_db def test_sync_channel_messages_async_full_sync(): gid = _uniq_id() @@ -420,6 +397,8 @@ def test_sync_channel_messages_async_full_sync(): channel_type="text", ) + holder = {} + async def main(): client = MagicMock() dch = MagicMock() @@ -429,20 +408,18 @@ async def main(): msg = _sample_api_message(mid=_uniq_id(), uid=author_uid) client.fetch_messages_since = AsyncMock(return_value=[msg]) - with ( - patch.object( - messages_mod, - "_process_messages_in_batches", - new_callable=AsyncMock, - return_value=1, - ), - _sync_to_async_updates_only(), + with patch.object( + messages_mod, + "_process_messages_in_batches", + new_callable=AsyncMock, + return_value=1, ): await sync_channel_messages_async(client, channel, gid, full_sync=True) + holder["client"] = client asyncio.run(main()) - assert channel.last_synced_at is not None + assert holder["client"].fetch_messages_since.await_args.kwargs["after"] is None @pytest.mark.django_db @@ -457,15 +434,17 @@ def test_sync_channel_messages_async_no_discord_channel(): channel_type="text", ) + holder = {} + async def main(): client = MagicMock() client.get_channel = AsyncMock(return_value=None) + holder["client"] = client await sync_channel_messages_async(client, channel, gid) asyncio.run(main()) - channel.refresh_from_db() - assert channel.last_synced_at is None + assert not holder["client"].fetch_messages_since.called @pytest.mark.django_db @@ -599,19 +578,38 @@ def test_sync_all_channels_respects_active_filter(): cid_active = _uniq_id() cid_stale = _uniq_id() server = DiscordServer.objects.create(server_id=gid, server_name="S", icon_url="") + author = DiscordProfile.objects.create( + discord_user_id=_uniq_id(), + username="u", + display_name="", + avatar_url="", + is_bot=False, + ) active_ch = DiscordChannel.objects.create( server=server, channel_id=cid_active, channel_name="active", channel_type="text", - last_activity_at=now, ) - DiscordChannel.objects.create( + stale_ch = DiscordChannel.objects.create( server=server, channel_id=cid_stale, channel_name="stale", channel_type="text", - last_activity_at=now - timedelta(days=90), + ) + create_or_update_discord_message( + _uniq_id(), + active_ch, + author, + "recent", + message_created_at=now, + ) + create_or_update_discord_message( + _uniq_id(), + stale_ch, + author, + "old", + message_created_at=now - timedelta(days=90), ) channels_snapshot = list(DiscordChannel.objects.filter(server=server)) @@ -672,7 +670,6 @@ def test_sync_all_channels_full_sync_no_active_filter(): channel_id=cid, channel_name="c", channel_type="text", - last_activity_at=None, ) channels_snapshot = list(DiscordChannel.objects.filter(server=server)) diff --git a/discord_activity_tracker/tests/test_sync_utils.py b/discord_activity_tracker/tests/test_sync_utils.py index 2f7ce48..789126c 100644 --- a/discord_activity_tracker/tests/test_sync_utils.py +++ b/discord_activity_tracker/tests/test_sync_utils.py @@ -1,29 +1,19 @@ -"""Tests for discord_activity_tracker.sync.utils.""" +"""Tests for discord_activity_tracker.sync.utils. + +ISO datetime parsing is implemented and tested in ``core.utils.datetime_parsing`` +(see ``core/tests/test_datetime_parsing.py``). + +``truncate_content`` lives in ``core.utils.text_processing``; see +``core/tests/test_text_processing.py``. +""" from discord_activity_tracker.sync.utils import ( format_discord_url, - parse_datetime, parse_discord_user, sanitize_channel_name, - truncate_content, ) -def test_parse_datetime_empty(): - assert parse_datetime("") is None - assert parse_datetime(None) is None - - -def test_parse_datetime_z_normalized(): - dt = parse_datetime("2026-03-01T15:30:00Z") - assert dt is not None - assert dt.tzinfo is not None - - -def test_parse_datetime_invalid_returns_none(): - assert parse_datetime("not-a-timestamp") is None - - def test_parse_discord_user_empty_dict(): out = parse_discord_user(None) assert out["user_id"] == 0 @@ -127,14 +117,3 @@ def test_sanitize_channel_name_strips_unsafe_chars(): def test_format_discord_url(): assert format_discord_url(1, 2, 3) == "https://discord.com/channels/1/2/3" - - -def test_truncate_content_short_unchanged(): - assert truncate_content("hi", max_length=100) == "hi" - - -def test_truncate_content_long_adds_ellipsis(): - s = "x" * 50 - out = truncate_content(s, max_length=10) - assert out.endswith("...") - assert len(out) == 10 diff --git a/discord_activity_tracker/tests/test_workspace.py b/discord_activity_tracker/tests/test_workspace.py index 54ad0d3..2f037c8 100644 --- a/discord_activity_tracker/tests/test_workspace.py +++ b/discord_activity_tracker/tests/test_workspace.py @@ -107,3 +107,17 @@ def test_iter_existing_message_jsons_empty_when_missing(mock_discord_workspace): return_value=mock_discord_workspace, ): assert list(iter_existing_message_jsons(99, 99)) == [] + + +def test_iter_existing_message_jsons_sorted_by_filename(mock_discord_workspace): + """Paths must be yielded in sorted order for deterministic incremental reads.""" + with patch( + "discord_activity_tracker.workspace.get_workspace_path", + return_value=mock_discord_workspace, + ): + msg_dir = mock_discord_workspace / "7" / "messages" / "8" + msg_dir.mkdir(parents=True) + (msg_dir / "2026-01-02.json").write_text("{}", encoding="utf-8") + (msg_dir / "2026-01-01.json").write_text("{}", encoding="utf-8") + names = [p.name for p in iter_existing_message_jsons(7, 8)] + assert names == ["2026-01-01.json", "2026-01-02.json"] diff --git a/discord_activity_tracker/workspace.py b/discord_activity_tracker/workspace.py index f5ee63a..7bfde80 100644 --- a/discord_activity_tracker/workspace.py +++ b/discord_activity_tracker/workspace.py @@ -83,7 +83,7 @@ def iter_existing_message_jsons(server_id: int, channel_id: int): messages_dir = get_server_dir(server_id) / "messages" / str(channel_id) if not messages_dir.is_dir(): return - for path in messages_dir.glob("*.json"): + for path in sorted(messages_dir.glob("*.json")): if path.name.startswith("._"): continue yield path diff --git a/docs/Schema.md b/docs/Schema.md index 74e5dcf..39d9819 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -899,8 +899,6 @@ erDiagram string category_name text topic int position - datetime last_synced_at "IX" - datetime last_activity_at "IX" datetime created_at datetime updated_at } diff --git a/docs/discord-tracker-schema.md b/docs/discord-tracker-schema.md new file mode 100644 index 0000000..d1817ef --- /dev/null +++ b/docs/discord-tracker-schema.md @@ -0,0 +1,127 @@ +# Discord activity tracker — staging JSON schema + +This document describes the JSON shapes used when Discord data is staged on disk or normalized immediately before database writes in `discord_activity_tracker`. Runtime validation is implemented with **Pydantic** in [`discord_activity_tracker/staging_schema.py`](../discord_activity_tracker/staging_schema.py) (`validate_envelope`, `validate_normalized_message`). + +## 1. Envelope (DiscordChatExporter file) + +A single exported channel file is one JSON object with three top-level keys: + +| Key | Type | Description | +| --- | --- | --- | +| `guild` | object | Guild metadata from DiscordChatExporter. | +| `channel` | object | Channel metadata. | +| `messages` | array | Message objects in export order. | + +Common **guild** keys (camelCase as emitted by the exporter; `extra` fields are allowed and ignored by validation): + +- `id` — guild snowflake (string or number in JSON). +- `name` — guild name. +- `iconUrl` — optional guild icon URL. + +Common **channel** keys: + +- `id`, `name`, `type`, `topic`, `category`, `categoryId` — as provided by the exporter. + +**Normalization contract:** After `json.load`, ingestion validates the envelope with `validate_envelope`, then converts each raw message with `convert_exporter_message_to_dict` in [`discord_activity_tracker/sync/chat_exporter.py`](../discord_activity_tracker/sync/chat_exporter.py) before bulk DB upsert. + +## 2. Normalized message record + +The dict returned by `convert_exporter_message_to_dict` (and consumed by `_prepare_message_data` in [`discord_activity_tracker/sync/messages.py`](../discord_activity_tracker/sync/messages.py), which **drops unknown keys** before ORM bulk write) uses **snake_case** for nested author fields aligned with the Discord bot API shape. + +| Field | Type | Notes | +| --- | --- | --- | +| `id` | integer | Message snowflake. | +| `content` | string | Message body; may be empty. | +| `created_at` | string | ISO 8601 timestamp (from exporter `timestamp`). Required non-empty for validation. | +| `edited_at` | string or null | ISO 8601 if edited; otherwise JSON `null` or omitted when absent. | +| `message_type` | string | Exporter/API `type` string (e.g. `Default`, `Reply`). **Opaque passthrough** — see [Limitations](#6-limitations--out-of-scope). | +| `is_pinned` | boolean | | +| `author` | object | `id`, `username`, `global_name`, `avatar_url`, `bot`. | +| `attachments` | array | Objects with optional `url`. | +| `reactions` | array | Only entries with a non-empty resolved emoji; `{ "emoji": string, "count": integer >= 0 }`. | +| `reference` | object or null | When present: `{ "message_id": integer or null }`. | + +### Canonical cross-tracker fields (additive) + +These are set by `convert_exporter_message_to_dict` when enough context exists. They are **not** persisted as separate ORM columns; they exist on the normalized dict for validation, logs, and downstream consumers. + +| Field | Type | When set | +| --- | --- | --- | +| `occurred_at` | string | ISO 8601 instant in UTC with `Z` suffix, from `created_at` when non-empty (implementation: `core.utils.datetime_parsing.format_instant_iso_z`). | +| `actor_id` | string | Discord user snowflake as decimal string when author `id` is non-zero. | +| `source_url` | string | When `server_id` and `channel_id` are passed into the converter and message id is non-zero: `https://discord.com/channels/{server}/{channel}/{message}` via `format_discord_url`. | + +### Null vs omitted + +- Prefer JSON **`null`** for nullable scalars when serializing (e.g. `edited_at`, `reference`) to match common REST-style workspace files elsewhere in the monorepo. +- Omit optional keys when the exporter does not provide them (e.g. `edited_at` absent vs `null`). + +## 3. Reactions + +Each reaction in the normalized message: + +- `emoji` — non-empty string (custom emoji name or Unicode). +- `count` — integer `>= 0`. + +Exporter rows with no resolvable emoji are **dropped** during conversion (they are not stored). + +## 4. `message_type` + +Treated as an **opaque string** from DiscordChatExporter or the Discord API. The app stores it on `DiscordMessage.message_type` without interpreting join/leave semantics from this field alone. See [Limitations](#6-limitations--out-of-scope). + +## 5. Channel activity summary (derived) + +Not materialized as a separate JSON file in this iteration. For a given export envelope, a logical summary can be computed as: + +- `server_id` / `channel_id` from `guild.id` / `channel.id`. +- `message_count` — `len(messages)`. +- `first_message_at` / `last_message_at` — from the first and last message `timestamp` / `created_at` after conversion, in UTC, if messages are non-empty. + +## 6. Limitations / out of scope + +The collector’s primary path fetches **per-channel** message history (DiscordChatExporter export or bot API sync). Therefore: + +- **`message_type` is not a membership lifecycle log.** System or non-default types may appear when the exporter includes them, but rows are **not** a complete or authoritative log of users joining or leaving the **server** or a **channel**. +- **Single-channel export/fetch** cannot infer server-wide join/leave; Discord does not guarantee join system messages appear in every text channel, and leaves often have no built-in chat message. Authoritative membership tracking would require gateway events, audit log (where permitted), multi-channel export including the guild system channel, or dedicated bot logging — outside the current design. + +Do not document join/leave **detection** as a capability of this schema. + +## 7. JSON Schema artifact vs runtime validation + +The committed file [`discord_activity_tracker/schemas/discord_staging_v1.json`](../discord_activity_tracker/schemas/discord_staging_v1.json) is an **optional** JSON document for reviewers who prefer raw [JSON Schema](https://json-schema.org/). It bundles `model_json_schema()` output for: + +- `DiscordChatExporterEnvelope` +- `NormalizedDiscordMessage` + +**Single source of truth at runtime:** the Pydantic models in [`discord_activity_tracker/staging_schema.py`](../discord_activity_tracker/staging_schema.py). The `.json` file can **drift** if models change and the file is not regenerated. + +**Regenerate** (from repository root, with `discord_activity_tracker` importable, e.g. `PYTHONPATH=.`): + +```bash +python -m discord_activity_tracker.scripts.write_staging_json_schema +``` + +or: + +```bash +python -c "from discord_activity_tracker.staging_schema import write_staging_json_schema; write_staging_json_schema()" +``` + +## Alignment with other trackers (conventions) + +| Concern | `github_activity_tracker` | `cppa_slack_tracker` | `discord_activity_tracker` (this doc) | +| --- | --- | --- | --- | +| Workspace layout | Per-owner/repo trees; JSON per commit/issue/PR under [`github_activity_tracker/workspace.py`](../github_activity_tracker/workspace.py). | Per team/channel; daily `YYYY-MM-DD.json`; iterators **sorted** by path. | Per-server under `workspace/discord_activity_tracker/`; raw archive under `WORKSPACE_DIR/raw/discord_activity_tracker/`; `iter_existing_message_jsons` yields **sorted** paths. | +| Field naming | Mostly GitHub REST / snake_case in cached JSON. | Slack API native keys in daily lists (`ts`, `text`, `user`, …). | Exporter camelCase in file → **normalized** snake_case + ISO timestamps on message dict. | +| Links | e.g. `html_url` on GitHub entities. | Slack permalinks vary by payload. | Canonical `source_url` on normalized message when guild/channel ids are known. | + +### Shared conceptual fields (mapping) + +| Concept | Discord (normalized dict) | Slack (workspace message) | GitHub (example) | +| --- | --- | --- | --- | +| When | `created_at`, `occurred_at` | `ts` (Unix fractional string) | `created_at`, `commit.author.date`, … | +| Actor | `author.id` + `actor_id` string | `user` | `author.login`, `user.login`, … | +| Body | `content` | `text` | `body`, commit `message`, … | +| Link | `source_url` | (construct from team/channel/ts) | `html_url` | + +Discord ingestion keeps legacy keys (`created_at`, `id`, …) for `_prepare_message_data` compatibility and adds **parallel** canonical fields above rather than renaming bulk keys. diff --git a/docs/service_api/discord_activity_tracker.md b/docs/service_api/discord_activity_tracker.md index e2bd42b..b2c3f28 100644 --- a/docs/service_api/discord_activity_tracker.md +++ b/docs/service_api/discord_activity_tracker.md @@ -22,8 +22,8 @@ New fields (migration `0005`): `category_id: BigIntegerField | null`, `category_ | Function | Parameter types | Return type | Description | | ------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------ | ------------------------------------------------------------------------- | | `get_or_create_discord_channel` | `server: DiscordServer`, `channel_id: int`, `channel_name: str`, `channel_type: str`, `topic: str = ""`, `position: int = 0`, `category_id: int \| None = None`, `category_name: str = ""` | `tuple[DiscordChannel, bool]` | Get or create channel; update all fields (incl. category) if changed. | -| `update_channel_last_activity` | `channel: DiscordChannel`, `last_activity_at: datetime` | `DiscordChannel` | Update `last_activity_at`. | -| `update_channel_last_synced` | `channel: DiscordChannel`, `timestamp: datetime \| None = None` | `DiscordChannel` | Update `last_synced_at` (defaults to now). | +| `get_channel_latest_message_at` | `channel: DiscordChannel` | `datetime \| None` | Max `message_created_at` among non-deleted `DiscordMessage` rows for the channel. | +| `queryset_channels_with_recent_messages` | `server: DiscordServer`, `cutoff: datetime`, `channel_ids: list[int] \| None = None` | `QuerySet[DiscordChannel]` | Channels on the server with at least one non-deleted message at or after `cutoff`; optional Discord `channel_id` allowlist. | --- @@ -65,7 +65,7 @@ Inputs are lists of pre-normalised message dicts (from `sync.messages._prepare_m | Function | Parameter types | Return type | Description | | --------------------- | ------------------------------------------------------------------ | ----------- | --------------------------------------------------- | -| `get_active_channels` | `server: DiscordServer`, `days: int = 30`, `channel_ids: list[int] \| None = None` | `QuerySet` | Channels with activity in last N days, optionally filtered by `channel_ids` allowlist. | +| `get_active_channels` | `server: DiscordServer`, `days: int = 30`, `channel_ids: list[int] \| None = None` | `QuerySet` | Same as `queryset_channels_with_recent_messages` with `cutoff = now - days`. | --- diff --git a/github_activity_tracker/sync/utils.py b/github_activity_tracker/sync/utils.py index 31b9e07..f852dd8 100644 --- a/github_activity_tracker/sync/utils.py +++ b/github_activity_tracker/sync/utils.py @@ -4,13 +4,10 @@ from __future__ import annotations -import logging -from datetime import datetime from typing import Any, Optional from core.operations.github_ops import get_github_client, get_github_token - -logger = logging.getLogger(__name__) +from core.utils.datetime_parsing import parse_iso_datetime_lenient as parse_datetime # Re-export for backward compatibility; prefer "from core.operations.github_ops import ..." __all__ = [ @@ -66,14 +63,3 @@ def parse_github_user(user_dict: Optional[dict]) -> dict: "display_name": user_dict.get("name", ""), "avatar_url": user_dict.get("avatar_url", ""), } - - -def parse_datetime(date_str: Optional[str]) -> Optional[datetime]: - """Parse ISO datetime string from GitHub API. Returns datetime or None.""" - if not date_str: - return None - try: - return datetime.fromisoformat(date_str.replace("Z", "+00:00")) - except Exception as e: - logger.debug(f"Failed to parse datetime '{date_str}': {e}") - return None diff --git a/requirements-dev.lock b/requirements-dev.lock index 8a0ac63..ef18404 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -8,6 +8,8 @@ aiosignal==1.4.0 # via aiohttp amqp==5.3.1 # via kombu +annotated-types==0.7.0 + # via pydantic asgiref==3.11.1 # via django async-timeout==5.0.1 @@ -32,7 +34,6 @@ cffi==2.0.0 # via # cryptography # pynacl - # trio cfgv==3.5.0 # via pre-commit charset-normalizer==3.4.7 @@ -49,11 +50,6 @@ click-plugins==1.1.1.2 # via celery click-repl==0.3.0 # via celery -colorama==0.4.6 - # via - # click - # pytest - # tqdm coverage==7.14.0 # via pytest-cov cryptography==48.0.0 @@ -177,6 +173,10 @@ pyasn1-modules==0.4.2 # via google-auth pycparser==3.0 # via cffi +pydantic==2.13.4 + # via -r requirements.in +pydantic-core==2.46.4 + # via pydantic pygithub==2.9.1 # via -r requirements.in pygments==2.20.0 @@ -262,16 +262,18 @@ typing-extensions==4.15.0 # beautifulsoup4 # pinecone # psycopg + # pydantic + # pydantic-core # pygithub # pyright # selenium + # typing-inspection +typing-inspection==0.4.2 + # via pydantic tzdata==2026.2 # via - # django # faker # kombu - # psycopg - # tzlocal tzlocal==5.3.1 # via celery uritemplate==4.2.0 diff --git a/requirements.in b/requirements.in index 349d429..1f0bd64 100644 --- a/requirements.in +++ b/requirements.in @@ -5,6 +5,7 @@ # --- Core web / config --- Django>=4.2,<5 +pydantic>=2,<3 psycopg[binary]>=3.1,<4 django-environ>=0.11,<1 requests>=2.31,<3 diff --git a/requirements.lock b/requirements.lock index 729ca6a..92eb213 100644 --- a/requirements.lock +++ b/requirements.lock @@ -8,6 +8,8 @@ aiosignal==1.4.0 # via aiohttp amqp==5.3.1 # via kombu +annotated-types==0.7.0 + # via pydantic asgiref==3.11.1 # via django async-timeout==5.0.1 @@ -32,7 +34,6 @@ cffi==2.0.0 # via # cryptography # pynacl - # trio charset-normalizer==3.4.7 # via requests click==8.3.3 @@ -47,10 +48,6 @@ click-plugins==1.1.1.2 # via celery click-repl==0.3.0 # via celery -colorama==0.4.6 - # via - # click - # tqdm cryptography==48.0.0 # via # google-auth @@ -138,6 +135,10 @@ pyasn1-modules==0.4.2 # via google-auth pycparser==3.0 # via cffi +pydantic==2.13.4 + # via -r requirements.in +pydantic-core==2.46.4 + # via pydantic pygithub==2.9.1 # via -r requirements.in pyjwt==2.12.1 @@ -200,14 +201,15 @@ typing-extensions==4.15.0 # beautifulsoup4 # pinecone # psycopg + # pydantic + # pydantic-core # pygithub # selenium + # typing-inspection +typing-inspection==0.4.2 + # via pydantic tzdata==2026.2 - # via - # django - # kombu - # psycopg - # tzlocal + # via kombu tzlocal==5.3.1 # via celery uritemplate==4.2.0