From 60ce771aadae61496004f266a167b19648cbf484 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 13 May 2026 23:05:41 -0700 Subject: [PATCH 1/3] feat: add Pyright configuration and enhance development dependencies --- core/collectors/base.py | 38 +++++- core/collectors/base_collector.py | 86 +++++++++++-- core/collectors/command_base.py | 62 ++++++++-- core/errors.py | 54 +++++++- core/protocols.py | 85 +++++++++++++ .../protocol_assignment_negative.py | 9 ++ .../protocol_assignment_positive.py | 18 +++ core/tests/test_collectors_base.py | 52 ++++++++ core/tests/test_errors.py | 38 ++++++ core/tests/test_protocols.py | 117 ++++++++++++++++++ discord_activity_tracker/protocol_impl.py | 96 ++++++++++++++ .../sync/chat_exporter.py | 23 ++++ .../tests/test_protocol_impl.py | 54 ++++++++ docs/Core_public_API.md | 17 ++- docs/How_to_add_a_collector.md | 10 ++ docs/cross-app-dependencies.md | 2 + github_activity_tracker/protocol_impl.py | 87 +++++++++++++ .../tests/test_protocol_impl.py | 32 +++++ pyrightconfig.json | 15 +++ requirements-dev.in | 2 + requirements-dev.lock | 14 +++ 21 files changed, 885 insertions(+), 26 deletions(-) create mode 100644 core/protocols.py create mode 100644 core/pyright_samples/protocol_assignment_negative.py create mode 100644 core/pyright_samples/protocol_assignment_positive.py create mode 100644 core/tests/test_protocols.py create mode 100644 discord_activity_tracker/protocol_impl.py create mode 100644 discord_activity_tracker/tests/test_protocol_impl.py create mode 100644 github_activity_tracker/protocol_impl.py create mode 100644 github_activity_tracker/tests/test_protocol_impl.py create mode 100644 pyrightconfig.json diff --git a/core/collectors/base.py b/core/collectors/base.py index 1ef838f..25cad8b 100644 --- a/core/collectors/base.py +++ b/core/collectors/base.py @@ -17,11 +17,45 @@ class CollectorBase(_CollectorLifecycleMixin, ABC): - """Base type for collectors run via management commands or YAML schedules.""" + """ + Legacy abstract base for collectors run via management commands or YAML schedules. + + **Purpose:** Provide a single orchestration hook, :meth:`run`, while sharing + Pinecone and structured logging behavior with :class:`AbstractCollector` through + :class:`_CollectorLifecycleMixin`. + + **Subclasses must implement:** :meth:`run` only. For new collectors, prefer + :class:`AbstractCollector` (``name``, ``validate_config``, ``collect``); its + concrete :meth:`~AbstractCollector.run` calls those hooks so + :class:`~core.collectors.command_base.BaseCollectorCommand` stays unchanged. + + **Lifecycle hooks (inherited):** :meth:`~_CollectorLifecycleMixin.sync_pinecone` + (optional post-run, default no-op) and :meth:`~_CollectorLifecycleMixin.handle_error` + (structured logging). When the collector is driven by + :class:`~core.collectors.command_base.BaseCollectorCommand`, attribute + ``_error_phase`` is set to ``\"run\"`` or ``\"sync_pinecone\"`` for the duration + of each phase so logs include ``collector_phase``. + + **Error handling contract:** Exceptions other than :class:`django.core.management.base.CommandError` + are passed to :meth:`~_CollectorLifecycleMixin.handle_error`, which uses + :func:`core.errors.classify_failure` and ``logger.exception`` with ``failure_category`` + in ``extra``. The original exception is then re-raised. ``CommandError`` is logged + by the command layer without calling :meth:`~_CollectorLifecycleMixin.handle_error`. + """ @abstractmethod def run(self) -> None: - """Execute the collector work unit.""" + """ + Execute the collector work unit. + + Returns: + None: Implementations should return implicitly after successful work. + + Note: + Prefer :class:`AbstractCollector` when splitting validation from + collection improves clarity; it still satisfies this base's contract via + its concrete :meth:`~AbstractCollector.run`. + """ class DjangoCommandCollector(CollectorBase): diff --git a/core/collectors/base_collector.py b/core/collectors/base_collector.py index d3883a9..e8c5c34 100644 --- a/core/collectors/base_collector.py +++ b/core/collectors/base_collector.py @@ -19,25 +19,56 @@ @runtime_checkable class CollectorRunnable(Protocol): - """Collector instance with ``run``, ``sync_pinecone``, and ``handle_error`` (see ``BaseCollectorCommand``).""" + """ + Structural type for objects executed by :class:`BaseCollectorCommand`. - def run(self) -> None: ... + Implementations are typically :class:`CollectorBase` or :class:`AbstractCollector` + subclasses. The command invokes :meth:`run`, then :meth:`sync_pinecone`, and + routes failures through :meth:`handle_error` (except :class:`~django.core.management.base.CommandError`). + """ - def sync_pinecone(self) -> None: ... + def run(self) -> None: + """Main collection phase; see :class:`CollectorBase` or :class:`AbstractCollector`.""" + ... - def handle_error(self, exc: BaseException) -> None: ... + def sync_pinecone(self) -> None: + """Optional post-run sync; may be a no-op.""" + ... + + def handle_error(self, exc: BaseException) -> None: + """Log *exc* with structured ``failure_category``; must not swallow *exc*.""" + ... class _CollectorLifecycleMixin: """ Shared ``handle_error`` / ``sync_pinecone`` for legacy and structured collectors. - Uses :func:`core.errors.classify_failure` so logs align with - :class:`core.errors.CollectorFailureCategory`. + Uses :func:`core.errors.classify_failure` (not a method on + :class:`~core.errors.CollectorFailureCategory`) so log ``extra`` includes a stable + ``failure_category`` enum value. + + **``_error_phase``:** Set only by :class:`~core.collectors.command_base.BaseCollectorCommand` + around each phase; used when logging. If :func:`~core.errors.classify_failure` + or logging raises, the command's ``finally`` still clears ``_error_phase`` on the + collector instance. + + **Intentional gaps:** Many domain or SDK exceptions map to ``unknown``. Override + :meth:`handle_error` when you need a different category or extra context. """ def handle_error(self, exc: BaseException) -> None: - """Log failures with ``failure_category`` from :class:`CollectorFailureCategory`.""" + """ + Log *exc* at exception level with structured fields for metrics and alerting. + + Args: + exc: The exception from ``run`` or ``sync_pinecone`` (never a + :class:`~django.core.management.base.CommandError`; those are handled + in the command). + + ``logger.exception`` receives ``extra`` keys: ``collector``, ``collector_phase``, + ``failure_category`` (string value of :class:`~core.errors.CollectorFailureCategory`). + """ category = classify_failure(exc) phase = getattr(self, "_error_phase", None) or "unknown" collector_id = getattr(self, "name", None) @@ -56,7 +87,12 @@ def handle_error(self, exc: BaseException) -> None: ) def sync_pinecone(self) -> None: - """Optional post-run Pinecone sync; default is no-op.""" + """ + Optional post-run Pinecone sync; default is no-op. + + Returns: + None + """ return None @@ -74,15 +110,43 @@ class AbstractCollector(_CollectorLifecycleMixin, ABC): @property @abstractmethod def name(self) -> str: - """Stable collector id for logs and metrics (e.g. app or command slug).""" + """ + Stable collector id for logs and metrics (e.g. app or command slug). + + Returns: + Non-empty string used as the ``collector`` field in structured logs when + :meth:`handle_error` runs. + """ @abstractmethod def validate_config(self) -> None: - """Raise or no-op before :meth:`collect`; keep side effects in services.""" + """ + Validate settings and environment before :meth:`collect`. + + Returns: + None + + Raises: + Exception: Typically validation-related errors; should not perform + heavy I/O—keep that in :meth:`collect` via services. + """ @abstractmethod def collect(self) -> None: - """Main collection work; DB writes belong in ``services.py``.""" + """ + Main collection work (fetch, transform, persist). + + Returns: + None + + Raises: + Exception: Domain failures; propagate after logging when run under + :class:`~core.collectors.command_base.BaseCollectorCommand`. + + Note: + DB writes should go through the app's ``services.py`` module per project + conventions. + """ def run(self) -> None: self.validate_config() diff --git a/core/collectors/command_base.py b/core/collectors/command_base.py index 4199c24..c336540 100644 --- a/core/collectors/command_base.py +++ b/core/collectors/command_base.py @@ -1,9 +1,17 @@ """Django management command base class for CollectorBase-backed collectors.""" +# Design notes (review summary): +# - Template method: handle() -> get_collector(**options) -> phase(run) -> phase(sync_pinecone). +# - ABC: subclasses that omit get_collector() raise TypeError at instantiation, not at import. +# - Each _run_collector_phase uses try/except/finally; finally clears _error_phase even if +# handle_error, classify_failure, or logging raises (double fault cleanup). +# - classify_failure (core.errors) maps a core dependency surface; many SDK/DB errors stay +# unknown—override handle_error on the collector when you need a specific category. + from __future__ import annotations import logging -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Any from django.core.management.base import BaseCommand, CommandError @@ -13,12 +21,42 @@ logger = logging.getLogger(__name__) -class BaseCollectorCommand(BaseCommand): - """Runs ``get_collector(**options).run()`` then ``sync_pinecone()`` with shared error handling.""" +class BaseCollectorCommand(ABC, BaseCommand): + """ + Thin Django ``BaseCommand`` adapter using the template-method pattern. + + **Flow:** :meth:`django.core.management.base.BaseCommand.handle` is implemented as + ``get_collector(**options)``, then :meth:`_run_collector_phase` with ``collector.run``, + then :meth:`_run_collector_phase` with ``collector.sync_pinecone``. + + **``get_collector`` contract:** Must return a :class:`CollectorRunnable`—any object + with ``run()``, ``sync_pinecone()``, and ``handle_error(exc)``. Typical implementations + return a :class:`~core.collectors.base.CollectorBase` or + :class:`~core.collectors.base_collector.AbstractCollector` instance. Subclasses that + do not implement :meth:`get_collector` cannot be instantiated (``TypeError`` from + ``abc``), which surfaces as soon as the command object is constructed, usually when + Django loads the command. + + **Errors:** :class:`~django.core.management.base.CommandError` is logged with + ``failure_category`` set to ``\"command\"`` and re-raised without calling + ``handle_error``. Any other :class:`Exception` is passed to ``collector.handle_error`` + (which classifies via :func:`core.errors.classify_failure` and logs), then re-raised. + A ``finally`` block always removes ``collector._error_phase`` after each phase. + """ @abstractmethod def get_collector(self, **options: Any) -> CollectorRunnable: - """Instantiate the collector from parsed CLI options.""" + """ + Build the collector instance from parsed CLI options. + + Args: + **options: Keyword arguments forwarded from :meth:`handle` (Django-parsed + command-line options and defaults). + + Returns: + A :class:`CollectorRunnable` executed by :meth:`handle` (``run`` then + ``sync_pinecone``). + """ def handle(self, *args: Any, **options: Any) -> None: collector = self.get_collector(**options) @@ -31,11 +69,19 @@ def _run_collector_phase( phase: Any, ) -> None: """ - Run one collector phase (``run`` or ``sync_pinecone``). + Run a single zero-argument callable phase on *collector*. + + Sets ``collector._error_phase`` to the callable's ``__name__`` (for example + ``\"run\"`` or ``\"sync_pinecone\"``) before invoking *phase*, clears it in + ``finally``, and routes failures per :class:`BaseCollectorCommand` error rules. + + Args: + collector: Object providing ``handle_error`` for non-command failures. + phase: Bound method or callable with no arguments (typically + ``collector.run`` or ``collector.sync_pinecone``). - On unexpected exceptions, :meth:`CollectorRunnable.handle_error` is invoked with - ``collector._error_phase`` set to the phase name for structured logs - (``collector``, ``collector_phase``, ``failure_category`` on the log record). + Returns: + None """ phase_name = getattr(phase, "__name__", str(phase)) setattr(collector, "_error_phase", phase_name) diff --git a/core/errors.py b/core/errors.py index 13c75a0..1ab3ef4 100644 --- a/core/errors.py +++ b/core/errors.py @@ -138,10 +138,35 @@ def classify_failure(exc: BaseException) -> CollectorFailureCategory: """ Map an exception to a failure category for structured logging. - ``requests.HTTPError`` with ``response.status_code`` 429 maps to - :attr:`CollectorFailureCategory.RATE_LIMIT`; 401 and 403 map to - :attr:`CollectorFailureCategory.AUTH`. ``discord.errors.HTTPException`` - with ``status`` is classified similarly when discord.py is in use. + **Django:** :class:`~django.core.management.base.CommandError` and + :class:`~django.core.exceptions.ValidationError` are recognized when Django is + importable; if those imports fail (e.g. unusual test doubles), matching exceptions + fall through to :attr:`~CollectorFailureCategory.UNKNOWN`. All + :class:`~django.db.Error` subclasses map to :attr:`~CollectorFailureCategory.UNKNOWN` + (schema vs transport ambiguity—override :meth:`handle_error` on the collector when + you need finer buckets). + + **HTTP clients:** ``requests`` / ``urllib3`` / ``httpx`` exceptions are classified + by module and type name; ``requests.HTTPError`` with ``response.status_code`` 429 + maps to :attr:`~CollectorFailureCategory.RATE_LIMIT`; 401 and 403 map to + :attr:`~CollectorFailureCategory.AUTH`. + + **discord.py:** ``discord.errors.HTTPException`` and related types use ``status`` + when present (429 → rate limit; 401/403 → auth; 5xx → network; other 4xx → unknown). + ``HTTPException`` without a status is treated as network; ``LoginFailure`` and + similar map to auth. + + **slack_sdk:** Exceptions use ``response.status_code`` when present; otherwise + :attr:`~CollectorFailureCategory.UNKNOWN`. + + Everything else maps to :attr:`~CollectorFailureCategory.UNKNOWN` unless it matches + built-ins (for example :class:`OSError`, :class:`ValueError`) handled below. + + Args: + exc: Any exception raised during collector work. + + Returns: + A :class:`CollectorFailureCategory` member (use ``.value`` for logs). """ # Django / app try: @@ -159,6 +184,13 @@ def classify_failure(exc: BaseException) -> CollectorFailureCategory: if ValidationError and isinstance(exc, ValidationError): return CollectorFailureCategory.VALIDATION + try: + from django.db import Error as DjangoDBError + except ImportError: + DjangoDBError = () # type: ignore[misc, assignment] + if DjangoDBError and isinstance(exc, DjangoDBError): + return CollectorFailureCategory.UNKNOWN + if isinstance(exc, PermissionError): return CollectorFailureCategory.PERMISSION if isinstance(exc, (TimeoutError,)): @@ -210,6 +242,20 @@ def classify_failure(exc: BaseException) -> CollectorFailureCategory: if exc_name in ("LoginFailure", "PrivilegedIntentsRequired", "ClientException"): return CollectorFailureCategory.AUTH + if exc_mod.startswith("slack_sdk"): + response = getattr(exc, "response", None) + status = getattr(response, "status_code", None) + if isinstance(status, int): + if status == 429: + return CollectorFailureCategory.RATE_LIMIT + if status in (401, 403): + return CollectorFailureCategory.AUTH + if status >= 500: + return CollectorFailureCategory.NETWORK + if status >= 400: + return CollectorFailureCategory.NETWORK + return CollectorFailureCategory.UNKNOWN + if isinstance(exc, OSError): return _classify_os_error(exc) diff --git a/core/protocols.py b/core/protocols.py new file mode 100644 index 0000000..d62d3f1 --- /dev/null +++ b/core/protocols.py @@ -0,0 +1,85 @@ +""" +Portable DTO protocols for tracker sync and collection boundaries. + +These types complement **orchestration** protocols in :mod:`core.collectors` +(e.g. :class:`~core.collectors.base_collector.CollectorRunnable`), which describe +**how** a management command runs phases. Here we describe **what** crosses layer +and app boundaries: run outcomes, activity events before ORM persistence, and +incremental checkpoints. + +Implementations should be small frozen :class:`dataclasses.dataclass` types in each +tracker app. Prefer them over plain ``dict`` for :func:`isinstance` checks with +``@runtime_checkable`` — dict instances do not reliably satisfy attribute-based +protocols at runtime. Attributes on these protocols are read-only ``@property`` +stubs so frozen dataclasses (and Pyright) treat implementations as structurally +compatible. +""" + +from __future__ import annotations + +from typing import Any, Mapping, Protocol, runtime_checkable + + +@runtime_checkable +class TrackerResult(Protocol): + """Outcome of one logical collection or sync cycle.""" + + @property + def success(self) -> bool: ... + + @property + def counts(self) -> Mapping[str, int]: ... + + +@runtime_checkable +class ActivityRecord(Protocol): + """Portable activity event (not a Django model).""" + + @property + def source_system(self) -> str: ... + + @property + def external_id(self) -> str: ... + + @property + def occurred_at(self) -> str: ... + + @property + def activity_type(self) -> str: ... + + @property + def actor_external_id(self) -> str: ... + + @property + def source_url(self) -> str | None: ... + + @property + def summary(self) -> str: ... + + +@runtime_checkable +class IncrementalState(Protocol): + """Serializable checkpoint between runs (opaque token + human marker + extras).""" + + @property + def checkpoint_token(self) -> str | None: ... + + @property + def human_readable_marker(self) -> str | None: ... + + @property + def extras(self) -> Mapping[str, Any]: ... + + +def require_tracker_result(obj: object) -> TrackerResult: + """Return *obj* if it satisfies :class:`TrackerResult`; else raise ``TypeError``.""" + if not isinstance(obj, TrackerResult): + raise TypeError(f"expected TrackerResult, got {type(obj).__name__!r}") + return obj + + +def require_activity_record(obj: object) -> ActivityRecord: + """Return *obj* if it satisfies :class:`ActivityRecord`; else raise ``TypeError``.""" + if not isinstance(obj, ActivityRecord): + raise TypeError(f"expected ActivityRecord, got {type(obj).__name__!r}") + return obj diff --git a/core/pyright_samples/protocol_assignment_negative.py b/core/pyright_samples/protocol_assignment_negative.py new file mode 100644 index 0000000..11f3565 --- /dev/null +++ b/core/pyright_samples/protocol_assignment_negative.py @@ -0,0 +1,9 @@ +"""Pyright-negative sample: invalid return type for :class:`core.protocols.TrackerResult`.""" + +from __future__ import annotations + +from core.protocols import TrackerResult + + +def broken_tracker_result() -> TrackerResult: + return "not a TrackerResult" diff --git a/core/pyright_samples/protocol_assignment_positive.py b/core/pyright_samples/protocol_assignment_positive.py new file mode 100644 index 0000000..bd968ca --- /dev/null +++ b/core/pyright_samples/protocol_assignment_positive.py @@ -0,0 +1,18 @@ +"""Pyright-positive sample: structural conformance to :class:`core.protocols.TrackerResult`.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Mapping + +from core.protocols import TrackerResult + + +@dataclass(frozen=True) +class _LocalTrackerResult: + success: bool + counts: Mapping[str, int] + + +def sample_tracker_result() -> TrackerResult: + return _LocalTrackerResult(success=True, counts={"items": 0}) diff --git a/core/tests/test_collectors_base.py b/core/tests/test_collectors_base.py index f2d544f..7266004 100644 --- a/core/tests/test_collectors_base.py +++ b/core/tests/test_collectors_base.py @@ -93,6 +93,58 @@ def get_collector(self, **options): assert isinstance(mock_handle.call_args[0][0], RuntimeError) +def test_base_collector_command_requires_get_collector_at_instantiation(): + class IncompleteCmd(BaseCollectorCommand): + help = "test" + + with pytest.raises(TypeError, match="get_collector"): + IncompleteCmd(stdout=StringIO(), stderr=StringIO()) + + +def test_base_collector_command_failure_classifies_in_handle_error(): + class BadCollector(CollectorBase): + def run(self) -> None: + raise ValueError("bad input") + + class Cmd(BaseCollectorCommand): + help = "test" + + def get_collector(self, **options): + return BadCollector() + + with patch.object(collector_lifecycle.logger, "exception") as mock_exc: + with pytest.raises(ValueError, match="bad input"): + Cmd(stdout=StringIO(), stderr=StringIO()).handle() + mock_exc.assert_called_once() + assert mock_exc.call_args[1]["extra"]["failure_category"] == "validation" + + +def test_base_collector_command_double_fault_clears_error_phase(): + class BadCollector(CollectorBase): + def run(self) -> None: + raise RuntimeError("primary") + + held: dict[str, CollectorBase] = {} + + class Cmd(BaseCollectorCommand): + help = "test" + + def get_collector(self, **options): + c = BadCollector() + held["c"] = c + return c + + cmd = Cmd(stdout=StringIO(), stderr=StringIO()) + with patch.object( + BadCollector, + "handle_error", + side_effect=AssertionError("secondary"), + ): + with pytest.raises(AssertionError, match="secondary"): + cmd.handle() + assert not hasattr(held["c"], "_error_phase") + + def test_abstract_collector_run_calls_validate_then_collect(): order = [] diff --git a/core/tests/test_errors.py b/core/tests/test_errors.py index 6f8a96c..02311d6 100644 --- a/core/tests/test_errors.py +++ b/core/tests/test_errors.py @@ -58,6 +58,44 @@ def test_classify_command_error(): assert classify_failure(CommandError("bad")) is CollectorFailureCategory.COMMAND +def test_classify_django_db_error_unknown(): + from django.db import OperationalError + + assert classify_failure(OperationalError("db")) is CollectorFailureCategory.UNKNOWN + + +def test_classify_discord_errors_module_429(): + class HTTPException(Exception): + pass + + HTTPException.__module__ = "discord.errors" + exc = HTTPException("rate limited") + exc.status = 429 + assert classify_failure(exc) is CollectorFailureCategory.RATE_LIMIT + + +def test_classify_discord_errors_module_unknown_without_status(): + class DiscordExc(Exception): + pass + + DiscordExc.__module__ = "discord.errors" + assert classify_failure(DiscordExc("x")) is CollectorFailureCategory.UNKNOWN + + +def test_classify_slack_sdk_errors_module_401(): + class SlackApiError(Exception): + pass + + SlackApiError.__module__ = "slack_sdk.errors" + + class Resp: + status_code = 401 + + exc = SlackApiError("auth") + exc.response = Resp() + assert classify_failure(exc) is CollectorFailureCategory.AUTH + + def test_classify_validation_error(): assert classify_failure(ValidationError("x")) is CollectorFailureCategory.VALIDATION diff --git a/core/tests/test_protocols.py b/core/tests/test_protocols.py new file mode 100644 index 0000000..d91f426 --- /dev/null +++ b/core/tests/test_protocols.py @@ -0,0 +1,117 @@ +"""Runtime checks for :mod:`core.protocols` and ``core/pyright_samples`` Pyright snippets.""" + +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path + +import pytest + +from core.protocols import ( + ActivityRecord, + IncrementalState, + TrackerResult, + require_activity_record, + require_tracker_result, +) +from discord_activity_tracker.protocol_impl import ( + DiscordActivityRecord, + DiscordCollectionTrackerResult, +) +from github_activity_tracker.protocol_impl import ( + GitHubActivityRecord, + GitHubIncrementalState, + GitHubSyncTrackerResult, +) + +_REPO_ROOT = Path(__file__).resolve().parents[2] +_TYPING_DIR = _REPO_ROOT / "core" / "pyright_samples" + + +def test_tracker_result_isinstance_github_dataclass() -> None: + r = GitHubSyncTrackerResult(success=True, counts={"issues": 2, "pull_requests": 1}) + assert isinstance(r, TrackerResult) + assert r.counts["issues"] == 2 + + +def test_activity_record_isinstance_discord_dataclass() -> None: + rec = DiscordActivityRecord( + source_system="discord", + external_id="1:2:3", + occurred_at="2024-01-01T00:00:00Z", + activity_type="discord.Default", + actor_external_id="99", + source_url="https://discord.com/channels/1/2/3", + summary="hi", + ) + assert isinstance(rec, ActivityRecord) + + +def test_incremental_state_isinstance_github() -> None: + st = GitHubIncrementalState.from_repo_watermark(repo_id=42, marker="2024-06") + assert isinstance(st, IncrementalState) + + +def test_activity_record_isinstance_github_from_issue() -> None: + rec = GitHubActivityRecord.from_issue(repo_id=7, issue_number=123, summary="title") + assert isinstance(rec, ActivityRecord) + assert "7:issue:123" in rec.external_id + + +def test_tracker_result_isinstance_discord_dataclass() -> None: + r = DiscordCollectionTrackerResult( + success=True, counts={"messages": 5, "channels": 1} + ) + assert isinstance(r, TrackerResult) + + +def test_require_tracker_result_raises_type_error_on_bad_object() -> None: + class NotAResult: + success = True + # missing counts + + with pytest.raises(TypeError, match="TrackerResult"): + require_tracker_result(NotAResult()) + + +def test_require_activity_record_raises_type_error_on_bad_object() -> None: + class NotARecord: + source_system = "x" + # missing fields + + with pytest.raises(TypeError, match="ActivityRecord"): + require_activity_record(NotARecord()) + + +def test_pyright_positive_protocol_assignment_file() -> None: + path = _TYPING_DIR / "protocol_assignment_positive.py" + proc = subprocess.run( + [sys.executable, "-m", "pyright", str(path)], + cwd=_REPO_ROOT, + capture_output=True, + text=True, + check=False, + ) + if proc.returncode != 0 and "No module named pyright" in (proc.stderr or ""): + pytest.skip("pyright not installed (pip install -r requirements-dev.lock)") + assert proc.returncode == 0, proc.stdout + proc.stderr + + +def test_pyright_negative_protocol_assignment_file() -> None: + path = _TYPING_DIR / "protocol_assignment_negative.py" + proc = subprocess.run( + [sys.executable, "-m", "pyright", str(path)], + cwd=_REPO_ROOT, + capture_output=True, + text=True, + check=False, + ) + stderr = proc.stderr or "" + stdout = proc.stdout or "" + if "No module named pyright" in stderr: + pytest.skip("pyright not installed (pip install -r requirements-dev.lock)") + assert proc.returncode != 0, ( + "expected pyright errors for protocol_assignment_negative.py; " + f"stdout={stdout!r} stderr={stderr!r}" + ) diff --git a/discord_activity_tracker/protocol_impl.py b/discord_activity_tracker/protocol_impl.py new file mode 100644 index 0000000..1a1ab49 --- /dev/null +++ b/discord_activity_tracker/protocol_impl.py @@ -0,0 +1,96 @@ +"""Frozen DTOs implementing :mod:`core.protocols` for Discord activity sync.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any, Mapping + + +@dataclass(frozen=True) +class DiscordCollectionTrackerResult: + """Counts for a Discord collection slice (messages, channels, etc.).""" + + success: bool + counts: Mapping[str, int] + + +@dataclass(frozen=True) +class DiscordIncrementalState: + """Checkpoint between Discord runs (after-cursor + optional snowflake).""" + + checkpoint_token: str | None + human_readable_marker: str | None + extras: Mapping[str, Any] = field(default_factory=dict) + + @classmethod + def from_after_date( + cls, + *, + after: datetime | None, + last_message_id: int | None = None, + channel_id: int | None = None, + ) -> DiscordIncrementalState: + marker = after.isoformat() if after is not None else "" + tok_parts = ["discord"] + if channel_id is not None: + tok_parts.append(f"ch:{channel_id}") + if last_message_id is not None: + tok_parts.append(f"msg:{last_message_id}") + checkpoint = ":".join(tok_parts) + return cls( + checkpoint_token=checkpoint, + human_readable_marker=marker or None, + extras={ + "after_iso": marker, + "last_message_id": last_message_id, + "channel_id": channel_id, + }, + ) + + +@dataclass(frozen=True) +class DiscordActivityRecord: + """Normalized Discord message as a portable activity row.""" + + source_system: str + external_id: str + occurred_at: str + activity_type: str + actor_external_id: str + source_url: str | None + summary: str + + @classmethod + def from_converted_export_dict( + cls, + converted: Mapping[str, Any], + *, + server_id: int, + channel_id: int, + ) -> DiscordActivityRecord: + mid = int(converted.get("id") or 0) + author = converted.get("author") or {} + if isinstance(author, Mapping): + aid = author.get("id") + actor = str(aid) if aid is not None else "" + else: + actor = "" + occurred = str( + converted.get("occurred_at") or converted.get("created_at") or "" + ) + content = str(converted.get("content") or "") + summary = content[:2000] + ext_id = f"{server_id}:{channel_id}:{mid}" + src = converted.get("source_url") + source_url = str(src) if src else None + mtype = str(converted.get("message_type") or "Default") + return cls( + source_system="discord", + external_id=ext_id, + occurred_at=occurred, + activity_type=f"discord.{mtype}", + actor_external_id=actor, + source_url=source_url, + summary=summary, + ) diff --git a/discord_activity_tracker/sync/chat_exporter.py b/discord_activity_tracker/sync/chat_exporter.py index 6186359..ac9ceed 100644 --- a/discord_activity_tracker/sync/chat_exporter.py +++ b/discord_activity_tracker/sync/chat_exporter.py @@ -1,5 +1,7 @@ """DiscordChatExporter CLI wrapper for user token-based scraping.""" +from __future__ import annotations + import json import logging import os @@ -17,6 +19,8 @@ format_instant_iso_z, ) +from discord_activity_tracker.protocol_impl import DiscordActivityRecord + from .utils import format_discord_url from ..workspace import get_workspace_root @@ -776,3 +780,22 @@ def export_and_parse_guild( continue return parsed_channels + + +def exporter_message_to_activity_record( + msg_data: Dict[str, Any], + *, + server_id: int, + channel_id: int, +) -> DiscordActivityRecord: + """Convert exporter JSON to :class:`~discord_activity_tracker.protocol_impl.DiscordActivityRecord`. + + Thin bridge for :mod:`core.protocols` without changing :func:`convert_exporter_message_to_dict` + return shape for existing callers. + """ + converted = convert_exporter_message_to_dict( + msg_data, server_id=server_id, channel_id=channel_id + ) + return DiscordActivityRecord.from_converted_export_dict( + converted, server_id=server_id, channel_id=channel_id + ) diff --git a/discord_activity_tracker/tests/test_protocol_impl.py b/discord_activity_tracker/tests/test_protocol_impl.py new file mode 100644 index 0000000..4eedad9 --- /dev/null +++ b/discord_activity_tracker/tests/test_protocol_impl.py @@ -0,0 +1,54 @@ +"""Tests for :mod:`discord_activity_tracker.protocol_impl` and chat_exporter bridge.""" + +from __future__ import annotations + +from core.protocols import ActivityRecord, IncrementalState + +from discord_activity_tracker.protocol_impl import ( + DiscordActivityRecord, + DiscordIncrementalState, +) +from discord_activity_tracker.sync.chat_exporter import ( + exporter_message_to_activity_record, +) + + +def test_discord_incremental_state_from_after_date(): + st = DiscordIncrementalState.from_after_date( + after=None, last_message_id=100, channel_id=55 + ) + assert isinstance(st, IncrementalState) + assert st.extras["channel_id"] == 55 + + +def test_exporter_message_to_activity_record_matches_protocol(): + msg = { + "id": "12", + "timestamp": "2024-06-01T12:00:00.0000000+00:00", + "content": "hello world", + "type": "Default", + "author": {"id": "99", "name": "user1"}, + "attachments": [], + "reactions": [], + } + rec = exporter_message_to_activity_record(msg, server_id=1, channel_id=2) + assert isinstance(rec, ActivityRecord) + assert rec.external_id == "1:2:12" + assert "hello" in rec.summary + + +def test_discord_activity_record_from_converted_export_dict(): + converted = { + "id": 5, + "created_at": "2024-01-01T00:00:00.0000000Z", + "occurred_at": "2024-01-01T00:00:00.0000000Z", + "message_type": "Reply", + "content": "x", + "author": {"id": 7}, + "source_url": "https://discord.com/channels/1/2/5", + } + rec = DiscordActivityRecord.from_converted_export_dict( + converted, server_id=1, channel_id=2 + ) + assert rec.actor_external_id == "7" + assert rec.activity_type == "discord.Reply" diff --git a/docs/Core_public_API.md b/docs/Core_public_API.md index aa5d065..300f437 100644 --- a/docs/Core_public_API.md +++ b/docs/Core_public_API.md @@ -21,11 +21,26 @@ The `core` Django app holds shared infrastructure. Treat the following as the ** Log records from `CollectorBase.handle_error` / `AbstractCollector.handle_error` include `extra` keys: `collector`, `collector_phase`, `failure_category`. +## Tracker protocols (DTOs) + +Structural contracts for **data** that crosses tracker layers (sync outcomes, activity events before ORM persistence, incremental checkpoints). These live in **`core.protocols`** and complement **orchestration** types in `core.collectors` (for example `CollectorRunnable` for management-command phases). + +| Import | Purpose | +|--------|---------| +| `core.protocols.TrackerResult` | `@runtime_checkable` protocol: `success`, `counts` (`Mapping[str, int]`). | +| `core.protocols.ActivityRecord` | `@runtime_checkable` protocol: portable activity row (`source_system`, `external_id`, `occurred_at`, …). | +| `core.protocols.IncrementalState` | `@runtime_checkable` protocol: `checkpoint_token`, `human_readable_marker`, `extras`. | +| `core.protocols.require_tracker_result` / `require_activity_record` | Runtime guards raising `TypeError` when an object does not satisfy the protocol. | + +Implementations are frozen dataclasses in each tracker app (for example `github_activity_tracker.protocol_impl`, `discord_activity_tracker.protocol_impl`). Prefer dataclasses over plain `dict` for reliable `isinstance` checks with `@runtime_checkable`. + +**Local static check:** with dev dependencies installed (`requirements-dev.lock`), from the repo root run `python -m pyright` (optionally scoped to `core/pyright_samples/` snippet files). Pyright is not required in CI for this repository. + ## Reducing coupling - Prefer **no** `ForeignKey` from one tracker app into another's models (see Development guideline). - When you need shared behavior, add it under `core` (for example **`core.operations`** for Slack/markdown/file helpers, or **`core.operations.github_ops`** for GitHub API/git/tokens). Those utilities are **not** separate Django apps—they live under the **`core`** package and are not listed in **`INSTALLED_APPS`**. -- Long-term: shrink opportunistic imports between tracker apps by extracting shared protocols into `core` or small neutral apps. +- Long-term: shrink opportunistic imports between tracker apps by extracting shared protocols into `core` or small neutral apps (see **[Tracker protocols (DTOs)](#tracker-protocols-dtos)** for typed data shapes). - The current state of all cross-app FKs, ORM read-coupling, and Python imports is catalogued in **[cross-app-dependencies.md](cross-app-dependencies.md)**, together with `import-linter` contracts that can enforce the coupling guideline mechanically. ## Related docs diff --git a/docs/How_to_add_a_collector.md b/docs/How_to_add_a_collector.md index 4319a5f..0b3753b 100644 --- a/docs/How_to_add_a_collector.md +++ b/docs/How_to_add_a_collector.md @@ -19,6 +19,16 @@ Stable imports live under **`core.collectors`** (re-exported in [`core/collector - **Legacy:** Subclass **`CollectorBase`** and implement `run()` only (same error/Pinecone hooks). New work should prefer **`AbstractCollector`**. - **`DjangoCommandCollector`** remains available for tests or internal `call_command` wrappers. +### Collector contracts (source of truth) + +The detailed contracts (abstract methods, lifecycle hooks, error handling, template-method flow) live in the **class docstrings** in the codebase—read these when wiring a new collector: + +- [`core/collectors/base.py`](../core/collectors/base.py) — `CollectorBase` +- [`core/collectors/command_base.py`](../core/collectors/command_base.py) — `BaseCollectorCommand` +- [`core/collectors/base_collector.py`](../core/collectors/base_collector.py) — `CollectorRunnable`, `AbstractCollector`, `_CollectorLifecycleMixin` + +**At a glance:** `BaseCollectorCommand` calls `get_collector(**options)` then runs `run` and `sync_pinecone`. During each phase it sets `collector._error_phase` (for example `"run"`) and clears it in a `finally` block. `django.core.management.base.CommandError` is logged with `failure_category="command"` and is **not** passed to `handle_error`; any other exception is passed to `handle_error`, which logs using **`classify_failure()`** from [`core/errors.py`](../core/errors.py) (the function maps exceptions to **`CollectorFailureCategory`** values—it is not a method on the enum). Override `handle_error` when the default classifier does not fit your domain. + ## 4. Skeleton collector (minimal copy-paste example) This section is a **canonical minimal pattern**: the management command is only responsible for parsing options and returning a collector from `get_collector()` (often ~10–15 lines). The **`AbstractCollector` subclass** implements `name`, `validate_config`, and `collect` (orchestration); `BaseCollectorCommand` still calls `run()`, which the base implements as validate-then-collect. The **service layer** (`services.py`) is the main place for DB and API logic—match the project rule that writes go through services (see [Contributing.md](Contributing.md#service-layer-single-place-for-writes)). diff --git a/docs/cross-app-dependencies.md b/docs/cross-app-dependencies.md index e63109a..dc23828 100644 --- a/docs/cross-app-dependencies.md +++ b/docs/cross-app-dependencies.md @@ -3,6 +3,8 @@ This document maps every cross-app dependency between the tracker Django apps in this project. It exists to make the [Contributing.md](Contributing.md) guideline — "prefer no ForeignKey from one tracker app into another's models" — visible and therefore enforceable. +For **typed data boundaries** (run results, activity rows, checkpoints) shared across apps, +prefer :mod:`core.protocols` (see [Core_public_API.md](Core_public_API.md#tracker-protocols-dtos)). **Re-generate the import tables** after large refactors: diff --git a/github_activity_tracker/protocol_impl.py b/github_activity_tracker/protocol_impl.py new file mode 100644 index 0000000..7607896 --- /dev/null +++ b/github_activity_tracker/protocol_impl.py @@ -0,0 +1,87 @@ +"""Frozen DTOs implementing :mod:`core.protocols` for GitHub activity sync.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any, Mapping + +from github_activity_tracker.sync import sync_github + + +@dataclass(frozen=True) +class GitHubSyncTrackerResult: + """Structured :class:`~core.protocols.TrackerResult` for ``sync_github`` outcomes.""" + + success: bool + counts: Mapping[str, int] + + @classmethod + def from_sync_dict(cls, d: dict[str, list[int]]) -> GitHubSyncTrackerResult: + issues = d.get("issues") or [] + prs = d.get("pull_requests") or [] + return cls( + success=True, + counts={"issues": len(issues), "pull_requests": len(prs)}, + ) + + +def sync_github_tracker_result( + repo: Any, + start_date: datetime | None = None, + end_date: datetime | None = None, +) -> GitHubSyncTrackerResult: + """Run :func:`~github_activity_tracker.sync.sync_github.sync_github` and return a protocol-friendly DTO.""" + raw = sync_github(repo, start_date=start_date, end_date=end_date) + return GitHubSyncTrackerResult.from_sync_dict(raw) + + +@dataclass(frozen=True) +class GitHubActivityRecord: + """Single issue/PR touch for cross-layer logging or bridges.""" + + source_system: str + external_id: str + occurred_at: str + activity_type: str + actor_external_id: str + source_url: str | None + summary: str + + @classmethod + def from_issue( + cls, + *, + repo_id: int, + issue_number: int, + occurred_at: str = "", + summary: str = "", + ) -> GitHubActivityRecord: + return cls( + source_system="github", + external_id=f"{repo_id}:issue:{issue_number}", + occurred_at=occurred_at, + activity_type="github.issue", + actor_external_id="", + source_url=None, + summary=summary[:2000], + ) + + +@dataclass(frozen=True) +class GitHubIncrementalState: + """Opaque + human-readable sync watermark (app-specific *extras*).""" + + checkpoint_token: str | None + human_readable_marker: str | None + extras: Mapping[str, Any] = field(default_factory=dict) + + @classmethod + def from_repo_watermark( + cls, *, repo_id: int, marker: str + ) -> GitHubIncrementalState: + return cls( + checkpoint_token=f"github:repo:{repo_id}", + human_readable_marker=marker, + extras={"repo_id": repo_id}, + ) diff --git a/github_activity_tracker/tests/test_protocol_impl.py b/github_activity_tracker/tests/test_protocol_impl.py new file mode 100644 index 0000000..68e8382 --- /dev/null +++ b/github_activity_tracker/tests/test_protocol_impl.py @@ -0,0 +1,32 @@ +"""Tests for :mod:`github_activity_tracker.protocol_impl`.""" + +from __future__ import annotations + +from datetime import datetime, timezone +from unittest.mock import MagicMock, patch + +from core.protocols import TrackerResult + +from github_activity_tracker.protocol_impl import ( + GitHubSyncTrackerResult, + sync_github_tracker_result, +) + + +def test_sync_github_tracker_result_wraps_sync_github_dict(): + repo = MagicMock() + with patch("github_activity_tracker.protocol_impl.sync_github") as m: + m.return_value = {"issues": [1, 2], "pull_requests": [9]} + out = sync_github_tracker_result( + repo, start_date=datetime(2024, 1, 1, tzinfo=timezone.utc) + ) + m.assert_called_once() + assert isinstance(out, TrackerResult) + assert out.counts == {"issues": 2, "pull_requests": 1} + assert out.success is True + + +def test_github_sync_tracker_result_from_sync_dict(): + r = GitHubSyncTrackerResult.from_sync_dict({"issues": [], "pull_requests": [1]}) + assert r.counts["issues"] == 0 + assert r.counts["pull_requests"] == 1 diff --git a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 0000000..abe32a0 --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,15 @@ +{ + "include": ["core", "github_activity_tracker", "discord_activity_tracker"], + "exclude": [ + "**/.venv/**", + "**/migrations/**", + "**/__pycache__/**", + "**/tests/**", + "**/._*", + "**/.___*" + ], + "pythonVersion": "3.11", + "typeCheckingMode": "basic", + "reportMissingImports": true, + "stubPath": "" +} diff --git a/requirements-dev.in b/requirements-dev.in index 578cc1c..f28d847 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -5,6 +5,8 @@ pytest>=7.4,<9 pytest-django>=4.5,<5 +django-stubs>=4.2.7,<5 +django-stubs-ext>=4.2.7,<5 django-test-plus>=2.2,<3 model-bakery>=0.17,<2 pre-commit>=3.6,<5 diff --git a/requirements-dev.lock b/requirements-dev.lock index ef18404..4d4126e 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -63,9 +63,17 @@ distlib==0.4.0 django==4.2.30 # via # -r requirements.in + # django-stubs + # django-stubs-ext # model-bakery django-environ==0.13.0 # via -r requirements.in +django-stubs==4.2.7 + # via -r requirements-dev.in +django-stubs-ext==4.2.7 + # via + # -r requirements-dev.in + # django-stubs django-test-plus==2.4.1 # via -r requirements-dev.in faker==37.12.0 @@ -256,10 +264,16 @@ trio==0.33.0 # trio-websocket trio-websocket==0.12.2 # via selenium +types-pytz==2026.2.0.20260506 + # via django-stubs +types-pyyaml==6.0.12.20260510 + # via django-stubs typing-extensions==4.15.0 # via # aiosignal # beautifulsoup4 + # django-stubs + # django-stubs-ext # pinecone # psycopg # pydantic From db271f23ef6663d5655069a66ca9e6eb0066b766 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 13 May 2026 23:39:22 -0700 Subject: [PATCH 2/3] feat: enhance GitHub and Slack operations with improved error handling and type checking --- .github/workflows/actions.yml | 30 ++++++++ README.md | 6 +- core/operations/github_ops/client.py | 14 ++-- core/operations/github_ops/git_ops.py | 8 +- core/operations/slack_ops/channels.py | 4 +- core/operations/slack_ops/client.py | 2 +- core/tests/test_protocols.py | 30 +++++++- core/utils/boost_version_operations.py | 2 +- discord_activity_tracker/preprocessor.py | 6 +- discord_activity_tracker/staging_schema.py | 8 +- .../sync/chat_exporter.py | 2 +- discord_activity_tracker/sync/export.py | 3 +- docs/Contributing.md | 4 +- docs/Core_public_API.md | 2 +- docs/Deployment.md | 2 +- docs/Development_guideline.md | 3 +- docs/How_to_add_a_collector.md | 2 +- docs/README.md | 2 +- github_activity_tracker/fetcher.py | 18 +++-- .../commands/backfill_300_file_commits.py | 8 +- github_activity_tracker/models.py | 30 ++++---- github_activity_tracker/services.py | 4 +- github_activity_tracker/sync/commits.py | 29 +++++-- github_activity_tracker/sync/etag_cache.py | 6 +- .../sync/issues_and_prs.py | 61 ++++++++++++--- github_activity_tracker/sync/repos.py | 3 +- .../tests/test_sync_issues_and_prs.py | 75 +++++++++++++++---- pyrightconfig.json | 1 + 28 files changed, 271 insertions(+), 94 deletions(-) diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index a2f165a..a1a4afb 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -46,6 +46,36 @@ jobs: uv pip install pre-commit uv run pre-commit run -a + pyright: + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + python-version: "3.13" + + - name: Cache uv + uses: actions/cache@v4 + with: + path: ~/.cache/uv + key: ${{ runner.os }}-uv-pyright-${{ hashFiles('requirements-dev.lock') }} + restore-keys: | + ${{ runner.os }}-uv-pyright- + ${{ runner.os }}-uv- + + - name: Typecheck with Pyright + env: + SETUPTOOLS_SCM_WRITE_TO_SOURCE: "1" + run: | + uv venv + uv pip install -r requirements-dev.lock + uv pip install -e . + uv run pyright + test: runs-on: ubuntu-latest timeout-minutes: 15 diff --git a/README.md b/README.md index 8da3230..c8680af 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ python -m pytest --tb=short --cov=. --cov-report=term-missing --cov-fail-under=9 Coverage writes a local **`.coverage`** file (binary data used by `coverage.py`; safe to delete). It is listed in `.gitignore`. -**CI:** [`.github/workflows/actions.yml`](.github/workflows/actions.yml) runs pytest with `DATABASE_URL=postgres://postgres:postgres@127.0.0.1:5432/postgres` and the same `DJANGO_SETTINGS_MODULE=config.test_settings` as local. +**CI:** [`.github/workflows/actions.yml`](.github/workflows/actions.yml) runs three jobs on pushes/PRs (see the workflow for triggers): **`lint`** (pre-commit on all files), **`pyright`** (static analysis from `pyrightconfig.json`), and **`test`** (pytest with Postgres, `DATABASE_URL=postgres://postgres:postgres@127.0.0.1:5432/postgres`, `DJANGO_SETTINGS_MODULE=config.test_settings`, coverage, and `--cov-fail-under=90`). 6. Run a subset of tests (e.g. one app or one file): @@ -140,6 +140,8 @@ python -m pytest github_activity_tracker/tests/test_sync_utils.py -v CI runs pytest with coverage (`--cov`, HTML/XML reports). To match a **local** coverage gate, use **`--cov-fail-under=90`** (see step 5 above). If coverage fails locally or you need a fresh test DB schema after model changes, run once with `python -m pytest --create-db`. +**Pyright (local):** with dev dependencies installed (`uv pip install -r requirements-dev.lock`), run **`uv run pyright`** from the repo root to match the **`pyright`** CI job (`pyrightconfig.json` scopes `core`, `github_activity_tracker`, and `discord_activity_tracker`). + See [docs/Development_guideline.md](docs/Development_guideline.md#testing-workflow) for when to run tests during development. ## Project structure @@ -202,7 +204,7 @@ Docs are organized **by topic** (one doc per concern: workflow, workspace, servi - [Onboarding.md](docs/Onboarding.md) – First-day orientation for contributors (mental model, app roles, data flow). - [docs/README.md](docs/README.md) – Per-topic index and how to find app-specific info. -- [Running tests](#running-tests) – How to run the test suite (pytest, coverage). +- [Running tests](#running-tests) – How to run the test suite (pytest, coverage) and **Pyright** (`uv run pyright`). - [Celery](#celery) – How to start the Celery worker and Beat. - [Celery_test.md](docs/Celery_test.md) – Testing the Celery task (run once, Beat, Redis). - [operations/](docs/operations/README.md) – **Operations group:** shared I/O (GitHub, Discord, etc.); index and per-operation docs. diff --git a/core/operations/github_ops/client.py b/core/operations/github_ops/client.py index 610a8fa..de3c00e 100644 --- a/core/operations/github_ops/client.py +++ b/core/operations/github_ops/client.py @@ -71,12 +71,13 @@ def _check_rate_limit(self): self.rate_limit_reset_time = data["resources"]["core"]["reset"] if self.rate_limit_remaining == 0: - wait_time = max( - 0, self.rate_limit_reset_time - int(time.time()) - ) + reset_ts = self.rate_limit_reset_time + if reset_ts is None: + reset_ts = int(time.time()) + 60 + wait_time = max(0, reset_ts - int(time.time())) if wait_time > 0: raise RateLimitException( - f"Rate limit exceeded. Reset at {datetime.fromtimestamp(self.rate_limit_reset_time)}. " + f"Rate limit exceeded. Reset at {datetime.fromtimestamp(float(reset_ts))}. " f"Wait {wait_time} seconds." ) return True @@ -133,7 +134,7 @@ def _parse_rate_limit_wait(self, response: requests.Response) -> Optional[int]: dt = dt.replace(tzinfo=timezone.utc) wait = (dt - datetime.now(timezone.utc)).total_seconds() if wait > 0: - return wait + return int(wait) except (ValueError, TypeError): pass # Retry-After missing or did not yield a positive delay; try X-RateLimit-*. @@ -289,6 +290,9 @@ def _do_request( raise ConnectionException( f"Connection error for {endpoint_for_log}: max retries exceeded" ) + raise ConnectionException( + f"Rate limit / connection handling exhausted for {endpoint_for_log}" + ) def _handle_rate_limit( self, wait_time: int, max_delay: Optional[int] = 3600 diff --git a/core/operations/github_ops/git_ops.py b/core/operations/github_ops/git_ops.py index 8912f80..15fad90 100644 --- a/core/operations/github_ops/git_ops.py +++ b/core/operations/github_ops/git_ops.py @@ -154,12 +154,14 @@ def _url_with_token(url: str, token: str) -> str: ) -def sanitize_git_output(text: str) -> str: +def sanitize_git_output(text: str | bytes) -> str: """Redact credentials from git stderr/stdout snippets before logging. Masks GitHub HTTPS PAT forms and other userinfo-in-URL patterns so logs do not leak tokens when clone/push echoes the remote URL. """ + if isinstance(text, bytes): + text = text.decode("utf-8", errors="replace") if not text: return text out = re.sub( @@ -565,6 +567,8 @@ def fetch_file_content( """ if client is None: client = get_github_client(use="scraping") + if client is None: + raise RuntimeError("GitHub scraping client unavailable (missing token?)") content, _ = client.get_file_content(owner, repo, path, ref=ref) return content @@ -589,6 +593,8 @@ def upload_file( return None if client is None: client = get_github_client(use="write") + if client is None: + raise RuntimeError("GitHub write client unavailable (missing token?)") content = local_file_path.read_bytes() content_base64 = base64.b64encode(content).decode("utf-8") if commit_message is None: diff --git a/core/operations/slack_ops/channels.py b/core/operations/slack_ops/channels.py index 82a730e..c038282 100644 --- a/core/operations/slack_ops/channels.py +++ b/core/operations/slack_ops/channels.py @@ -7,7 +7,7 @@ import logging import os import threading -from typing import Optional +from typing import Any, Optional from core.operations.slack_ops.client import SlackAPIClient from core.operations.slack_ops.tokens import get_slack_client @@ -139,7 +139,7 @@ def run_channel_join_check( public_only = config["public_only"] types = "public_channel" if public_only else "public_channel,private_channel" - result = {"joined": [], "failed": [], "skipped_policy": []} + result: dict[str, Any] = {"joined": [], "failed": [], "skipped_policy": []} try: channels_to_consider = [] cursor = None diff --git a/core/operations/slack_ops/client.py b/core/operations/slack_ops/client.py index d94b439..4c11327 100644 --- a/core/operations/slack_ops/client.py +++ b/core/operations/slack_ops/client.py @@ -145,7 +145,7 @@ def users_list( ) -> dict: """List users in the workspace. Returns members with profile, etc.""" safe_limit = max(1, min(limit, 1000)) - params = {"limit": safe_limit} + params: dict[str, int | str] = {"limit": safe_limit} if cursor: params["cursor"] = cursor return self._request("GET", "users.list", params=params) diff --git a/core/tests/test_protocols.py b/core/tests/test_protocols.py index d91f426..2c05dd3 100644 --- a/core/tests/test_protocols.py +++ b/core/tests/test_protocols.py @@ -2,6 +2,7 @@ from __future__ import annotations +import json import subprocess import sys from pathlib import Path @@ -98,10 +99,33 @@ def test_pyright_positive_protocol_assignment_file() -> None: assert proc.returncode == 0, proc.stdout + proc.stderr -def test_pyright_negative_protocol_assignment_file() -> None: - path = _TYPING_DIR / "protocol_assignment_negative.py" +def test_pyright_negative_protocol_assignment_file(tmp_path: Path) -> None: + """Run Pyright in an isolated project so root ``pyrightconfig`` excludes do not skip the file.""" + src = _TYPING_DIR / "protocol_assignment_negative.py" + dest = tmp_path / "protocol_assignment_negative.py" + dest.write_text(src.read_text(encoding="utf-8"), encoding="utf-8") + cfg_path = tmp_path / "pyrightconfig.json" + cfg_path.write_text( + json.dumps( + { + "include": ["protocol_assignment_negative.py"], + "exclude": [], + "pythonVersion": "3.11", + "typeCheckingMode": "basic", + "reportMissingImports": True, + "stubPath": "", + "executionEnvironments": [ + { + "root": str(tmp_path.resolve()), + "extraPaths": [str(_REPO_ROOT.resolve())], + } + ], + } + ), + encoding="utf-8", + ) proc = subprocess.run( - [sys.executable, "-m", "pyright", str(path)], + [sys.executable, "-m", "pyright", "--project", str(tmp_path)], cwd=_REPO_ROOT, capture_output=True, text=True, diff --git a/core/utils/boost_version_operations.py b/core/utils/boost_version_operations.py index 7383590..396df2c 100644 --- a/core/utils/boost_version_operations.py +++ b/core/utils/boost_version_operations.py @@ -114,7 +114,7 @@ def loose_version_tuple(version: str) -> tuple[int, int, int]: out.append(int(number) if number else 0) while len(out) < 3: out.append(0) - return tuple(out[:3]) + return (out[0], out[1], out[2]) # --- Normalization ------------------------------------------------------------ diff --git a/discord_activity_tracker/preprocessor.py b/discord_activity_tracker/preprocessor.py index b3cfccc..fbb074d 100644 --- a/discord_activity_tracker/preprocessor.py +++ b/discord_activity_tracker/preprocessor.py @@ -117,7 +117,7 @@ def _pinecone_channel_display_name(channel: DiscordChannel) -> str: def _format_chain_message_line(msg: DiscordMessage, cleaned: str) -> str: """One line for merged reply-chain content: ``username: "message text"``.""" - username = msg.author.username if msg.author_id else "unknown" + username = msg.author.username if getattr(msg, "author_id", None) else "unknown" escaped = cleaned.replace("\\", "\\\\").replace('"', '\\"') return f'{username}: "{escaped}"' @@ -164,7 +164,9 @@ def _chain_to_document( "channel_name": _pinecone_channel_display_name(channel), "server_id": str(server.server_id), "server_name": server.server_name, - "author": root.author.username if root.author_id else "unknown", + "author": ( + root.author.username if getattr(root, "author_id", None) else "unknown" + ), "timestamp": ts, "is_reply_chain": len(chain) > 1, "source_ids": ",".join(ids), diff --git a/discord_activity_tracker/staging_schema.py b/discord_activity_tracker/staging_schema.py index 614b241..adc37fc 100644 --- a/discord_activity_tracker/staging_schema.py +++ b/discord_activity_tracker/staging_schema.py @@ -17,7 +17,7 @@ import json from pathlib import Path -from typing import Annotated, Any, Union +from typing import Annotated, Any, NoReturn, Union from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator @@ -130,7 +130,7 @@ def _blank_optional_timestamp_to_none(cls, v: Any) -> Any: return v -def _validation_error(prefix: str, err: ValidationError) -> None: +def _validation_error(prefix: str, err: ValidationError) -> NoReturn: detail = err.errors()[:5] msg = f"{prefix}: " + "; ".join( f"{e.get('loc', ())}: {e.get('msg', '')}" for e in detail @@ -150,7 +150,7 @@ def validate_envelope( try: return DiscordChatExporterEnvelope.model_validate(data) except ValidationError as e: - raise _validation_error(prefix, e) from e + _validation_error(prefix, e) def validate_normalized_message( @@ -163,7 +163,7 @@ def validate_normalized_message( try: return NormalizedDiscordMessage.model_validate(obj) except ValidationError as e: - raise _validation_error(prefix, e) from e + _validation_error(prefix, e) def build_staging_json_schema_bundle() -> dict[str, Any]: diff --git a/discord_activity_tracker/sync/chat_exporter.py b/discord_activity_tracker/sync/chat_exporter.py index ac9ceed..2551123 100644 --- a/discord_activity_tracker/sync/chat_exporter.py +++ b/discord_activity_tracker/sync/chat_exporter.py @@ -453,7 +453,7 @@ def _export_guild_sequential( before_date: Optional[datetime], include_threads: str, channel_ids: Optional[Sequence[int]], -) -> List[Path]: +) -> None: logger.info( "DiscordChatExporter sequential mode (DISCORD_CHAT_EXPORTER_SEQUENTIAL_EXPORT)" ) diff --git a/discord_activity_tracker/sync/export.py b/discord_activity_tracker/sync/export.py index 1051a0f..02f5400 100644 --- a/discord_activity_tracker/sync/export.py +++ b/discord_activity_tracker/sync/export.py @@ -74,7 +74,8 @@ def generate_markdown_content( first_msg = messages[0] last_msg = messages[-1] message_count = len(messages) - unique_authors = set(msg.author_id for msg in messages) + unique_authors = {getattr(msg, "author_id", None) for msg in messages} + unique_authors.discard(None) active_users = len(unique_authors) else: first_msg = last_msg = None diff --git a/docs/Contributing.md b/docs/Contributing.md index fc847eb..28dfa19 100644 --- a/docs/Contributing.md +++ b/docs/Contributing.md @@ -51,7 +51,7 @@ For a full list of functions, parameter/return types, and validation (e.g. empty ### Testing -- **Running tests:** From the project root, install dev deps (`pip install -r requirements-dev.txt`), start the test database (`docker compose -f docker-compose.test.yml up -d`), set `DATABASE_URL` (and `SECRET_KEY` for the process) as in [README.md](../README.md#running-tests), then run `python -m pytest`. Tests **always use PostgreSQL** (`config.test_settings`); there is no SQLite fallback. +- **Running tests:** From the project root, install dev deps (`pip install -r requirements-dev.lock` or `uv pip install -r requirements-dev.lock`), start the test database (`docker compose -f docker-compose.test.yml up -d`), set `DATABASE_URL` (and `SECRET_KEY` for the process) as in [README.md](../README.md#running-tests), then run `python -m pytest`. Tests **always use PostgreSQL** (`config.test_settings`); there is no SQLite fallback. - See [README.md](../README.md#running-tests) and [Development_guideline.md](Development_guideline.md#testing-workflow) for full commands and options. - **Unit tests for `services.py`:** Call the service functions and assert on the database (or mocks) as needed. - **Other tests:** Prefer service functions when setting up data. If you must create models directly for tests, keep it in test code (e.g. fixtures or test helpers) and avoid doing the same in production code. @@ -59,7 +59,7 @@ For a full list of functions, parameter/return types, and validation (e.g. empty ## Other guidelines - **Branching:** Create feature branches from `develop`. Open pull requests against `develop`. See [Development_guideline.md](Development_guideline.md). -- **Code style:** Use Python 3.11+ and follow Django and project conventions. Use the project’s logging (`logging.getLogger(__name__)`). +- **Code style:** Use Python 3.11+ and follow Django and project conventions. Use the project’s logging (`logging.getLogger(__name__)`). Before pushing, run **`uv run pyright`** (with dev deps) for the paths covered by **`pyrightconfig.json`**, and ensure CI’s **lint** / **pyright** / **test** jobs would pass. - **Database:** Use the Django ORM and migrations. Writes only through the service layer as above. - **Docs:** Update this doc (and app `services.py` docstrings) when adding new apps or changing the write rules. diff --git a/docs/Core_public_API.md b/docs/Core_public_API.md index 300f437..67fda01 100644 --- a/docs/Core_public_API.md +++ b/docs/Core_public_API.md @@ -34,7 +34,7 @@ Structural contracts for **data** that crosses tracker layers (sync outcomes, ac Implementations are frozen dataclasses in each tracker app (for example `github_activity_tracker.protocol_impl`, `discord_activity_tracker.protocol_impl`). Prefer dataclasses over plain `dict` for reliable `isinstance` checks with `@runtime_checkable`. -**Local static check:** with dev dependencies installed (`requirements-dev.lock`), from the repo root run `python -m pyright` (optionally scoped to `core/pyright_samples/` snippet files). Pyright is not required in CI for this repository. +**Local static check:** with dev dependencies installed (`requirements-dev.lock`), from the repo root run **`uv run pyright`** (same as the **`pyright`** job in [`.github/workflows/actions.yml`](../.github/workflows/actions.yml)). Root **`pyrightconfig.json`** scopes analysis to `core`, `github_activity_tracker`, and `discord_activity_tracker` and excludes **`core/pyright_samples/**`** from that run; **`core/tests/test_protocols.py`** still exercises positive/negative protocol assignment snippets via subprocess. ## Reducing coupling diff --git a/docs/Deployment.md b/docs/Deployment.md index f1bb1ac..9557d53 100644 --- a/docs/Deployment.md +++ b/docs/Deployment.md @@ -7,7 +7,7 @@ This project uses a GitHub Actions CI/CD pipeline that automatically deploys to ``` Push to main/develop ↓ -CI workflow (lint + tests) +CI workflow (lint, Pyright, tests) ↓ on success Deploy workflow ↓ diff --git a/docs/Development_guideline.md b/docs/Development_guideline.md index 2beb3a7..da867ab 100644 --- a/docs/Development_guideline.md +++ b/docs/Development_guideline.md @@ -100,6 +100,7 @@ Use these steps to get the Django project running on your machine. Run tests often so you catch problems early. - **PostgreSQL for pytest:** `config.test_settings` requires `DATABASE_URL` pointing at PostgreSQL (see [README.md](../README.md#running-tests): `docker compose -f docker-compose.test.yml up -d`, then export `DATABASE_URL` / `SECRET_KEY`). This matches CI and avoids SQLite-only passes that fail in production. +- **Pyright:** Install dev dependencies (`requirements-dev.lock`), then from the project root run **`uv run pyright`**. Configuration lives in **`pyrightconfig.json`** at the repo root (typed paths: `core`, `github_activity_tracker`, `discord_activity_tracker`; `core/pyright_samples/**` is excluded from the default run—see **`core/tests/test_protocols.py`** for protocol assignment checks). The **`pyright`** job in [`.github/workflows/actions.yml`](../.github/workflows/actions.yml) runs the same check in CI. - **Before each commit:** run the test suite for the code you changed (`python -m pytest` or a subset). - **For app commands:** ensure the command runs successfully (e.g. `python manage.py run_boost_library_tracker` exits with 0 and does the expected work). - **Full workflow:** run `python manage.py run_scheduled_collectors --schedule default --group ` / `--schedule interval --interval-minutes ` when testing the YAML-driven path (matches how Celery Beat invokes it). @@ -111,7 +112,7 @@ This guide walks you from setup to merged code. 1. Set up locally - Follow "Local development setup" above. 2. Create a feature branch - Branch from `develop` (e.g. `git checkout develop && git pull && git checkout -b feature/your-feature-name`). -3. Develop and test - Make your changes in the Django app. Run the testing workflow (e.g. run tests and the app command) after each logical change. +3. Develop and test - Make your changes in the Django app. Run the testing workflow (e.g. **`uv run pyright`**, tests, and the app command) after each logical change. 4. Commit and push - Commit with clear messages and push the feature branch to the remote. 5. Open a pull request - Open a PR targeting the `develop` branch. Describe what changed and how to test it. 6. Address review - Respond to reviewer comments and update the PR as needed. diff --git a/docs/How_to_add_a_collector.md b/docs/How_to_add_a_collector.md index 0b3753b..b86cabd 100644 --- a/docs/How_to_add_a_collector.md +++ b/docs/How_to_add_a_collector.md @@ -285,7 +285,7 @@ def test_run_my_skeleton_tracker_command_integration(): ## 6. Tests - Add tests under `/tests/`; keep exit codes and boundaries mockable. -- Run `python -m pytest` locally; CI runs with `DATABASE_URL` pointing at Postgres (see [README.md](../README.md#running-tests) for local Postgres parity). +- Run `python -m pytest` locally; CI runs **lint** (pre-commit), **Pyright**, and **test** (pytest with Postgres and coverage); see [README.md](../README.md#running-tests) for local Postgres parity and `uv run pyright` for typing. ## 7. Docs diff --git a/docs/README.md b/docs/README.md index ce552a3..b005266 100644 --- a/docs/README.md +++ b/docs/README.md @@ -17,7 +17,7 @@ Documentation is organized **by topic**, not by app. Each doc covers one cross-c | **Workspace** | [Workspace.md](Workspace.md) | Workspace layout and usage for file processing (`workspace//...`). | | **Schema** | [Schema.md](Schema.md) | Database schema and table relationships. | | **Development** | [Development_guideline.md](Development_guideline.md) | Development setup, app requirements, and step-by-step workflow. | -| **Testing** | [README.md](../README.md#running-tests), [Development_guideline.md](Development_guideline.md#testing-workflow) | How to run tests (pytest), coverage, and when to run them. | +| **Testing / typing** | [README.md](../README.md#running-tests), [Development_guideline.md](Development_guideline.md#testing-workflow) | pytest (Postgres), coverage, when to run tests; **Pyright** (`uv run pyright`) and CI jobs. | | **Deployment** | [Deployment.md](Deployment.md) | CI/CD pipeline, environment secrets (`SSH_HOST`, `SSH_USER`, `SSH_PRIVATE_KEY`; optional `SSH_PORT`), server setup, and deploy script behavior. | | **Contributing** | [Contributing.md](Contributing.md) | Service layer (single place for writes) and contributor guidelines. | | **Service API** | [Service_API.md](Service_API.md) | API reference and index for all service layer functions. | diff --git a/github_activity_tracker/fetcher.py b/github_activity_tracker/fetcher.py index 7f19cc9..de0d555 100644 --- a/github_activity_tracker/fetcher.py +++ b/github_activity_tracker/fetcher.py @@ -8,7 +8,7 @@ import logging import time from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, Iterator, Optional +from typing import TYPE_CHECKING, Any, Iterator, Optional, cast from urllib.parse import parse_qs, urlparse import requests @@ -206,7 +206,9 @@ def fetch_commits_from_github( if next_url: # rel="last" omitted but rel="next" is present: fetch remaining pages, oldest-first. - pages: list[list[dict]] = [first_page_data] + pages: list[list[dict[str, Any]]] = [ + cast(list[dict[str, Any]], first_page_data or []) + ] current_links = first_page_links while current_links.get("next"): forward_url = current_links["next"] @@ -219,7 +221,7 @@ def fetch_commits_from_github( forward_url, ) time.sleep(0.2) - pages.append(page_data or []) + pages.append(cast(list[dict[str, Any]], page_data or [])) for page_data in reversed(pages): for commit in reversed(page_data): @@ -317,7 +319,7 @@ def fetch_pr_reviews_from_github( page = 1 per_page = 100 while True: - params = { + params: dict[str, Any] = { "per_page": per_page, "page": page, } @@ -394,8 +396,8 @@ def fetch_issues_and_prs_from_github( next_url: Optional[str] = None page_num = 1 - def _issues_list_params(page: int) -> dict: - params: dict = { + def _issues_list_params(page: int) -> dict[str, Any]: + params: dict[str, Any] = { "state": "all", "per_page": per_page, "page": page, @@ -501,7 +503,7 @@ def _yield_issue_pr_items_for_list_page(items: list) -> Iterator[dict]: "Fetched %d items (issues+PRs combined) from page %s", len(items), page_num ) - yield from _yield_issue_pr_items_for_list_page(items) + yield from _yield_issue_pr_items_for_list_page(cast(list[Any], items)) if etag_cache is not None and response_etag: etag_cache.set("issues_and_prs", page_num, since_iso, "", response_etag) @@ -533,7 +535,7 @@ def _yield_issue_pr_items_for_list_page(items: list) -> Iterator[dict]: "Fetched %d items (issues+PRs combined) from page %s", len(items), page_num ) - yield from _yield_issue_pr_items_for_list_page(items) + yield from _yield_issue_pr_items_for_list_page(cast(list[Any], items)) if next_url is None: logger.debug('Last page reached (no Link rel="next")') diff --git a/github_activity_tracker/management/commands/backfill_300_file_commits.py b/github_activity_tracker/management/commands/backfill_300_file_commits.py index 91885f7..724afd2 100644 --- a/github_activity_tracker/management/commands/backfill_300_file_commits.py +++ b/github_activity_tracker/management/commands/backfill_300_file_commits.py @@ -60,7 +60,9 @@ def handle(self, *args, **options): ) if limit > 0: commits_300 = list(commits_300[:limit]) - total = len(commits_300) if limit > 0 else commits_300.count() + total = len(commits_300) + else: + total = commits_300.count() if total == 0: self.stdout.write( self.style.SUCCESS("No commits with exactly 300 file changes found.") @@ -102,7 +104,9 @@ def handle(self, *args, **options): GitCommitFileChange.objects.filter(commit=commit_obj).delete() _process_commit_files(repo, commit_obj, full_files) - new_count = commit_obj.file_changes.count() + new_count = GitCommitFileChange.objects.filter( + commit=commit_obj + ).count() self.stdout.write( self.style.SUCCESS( f" Updated: 300 -> {new_count} file changes" diff --git a/github_activity_tracker/models.py b/github_activity_tracker/models.py index edfcfa3..bfcfcd0 100644 --- a/github_activity_tracker/models.py +++ b/github_activity_tracker/models.py @@ -8,30 +8,30 @@ # --- Enums --- class FileChangeStatus(models.TextChoices): - ADDED = "added", "Added" - MODIFIED = "modified", "Modified" - REMOVED = "removed", "Removed" - RENAMED = "renamed", "Renamed" - COPIED = "copied", "Copied" - CHANGED = "changed", "Changed" + ADDED = "added", "Added" # pyright: ignore[reportCallIssue] + MODIFIED = "modified", "Modified" # pyright: ignore[reportCallIssue] + REMOVED = "removed", "Removed" # pyright: ignore[reportCallIssue] + RENAMED = "renamed", "Renamed" # pyright: ignore[reportCallIssue] + COPIED = "copied", "Copied" # pyright: ignore[reportCallIssue] + CHANGED = "changed", "Changed" # pyright: ignore[reportCallIssue] class IssueState(models.TextChoices): - OPEN = "open", "Open" - CLOSED = "closed", "Closed" + OPEN = "open", "Open" # pyright: ignore[reportCallIssue] + CLOSED = "closed", "Closed" # pyright: ignore[reportCallIssue] class IssueStateReason(models.TextChoices): - COMPLETED = "completed", "Completed" - NOT_PLANNED = "not_planned", "Not planned" - REOPENED = "reopened", "Reopened" - NULL = "null", "Null" + COMPLETED = "completed", "Completed" # pyright: ignore[reportCallIssue] + NOT_PLANNED = "not_planned", "Not planned" # pyright: ignore[reportCallIssue] + REOPENED = "reopened", "Reopened" # pyright: ignore[reportCallIssue] + NULL = "null", "Null" # pyright: ignore[reportCallIssue] class PullRequestState(models.TextChoices): - OPEN = "open", "Open" - CLOSED = "closed", "Closed" - MERGED = "merged", "Merged" + OPEN = "open", "Open" # pyright: ignore[reportCallIssue] + CLOSED = "closed", "Closed" # pyright: ignore[reportCallIssue] + MERGED = "merged", "Merged" # pyright: ignore[reportCallIssue] # --- Part 1: Repository, Language, License --- diff --git a/github_activity_tracker/services.py b/github_activity_tracker/services.py index 7a5e651..3cdc841 100644 --- a/github_activity_tracker/services.py +++ b/github_activity_tracker/services.py @@ -140,7 +140,7 @@ def ensure_repository_owner( ) -> None: """Ensure repo has owner_account set (fixes rows with null owner_account_id).""" repo.refresh_from_db() - if repo.owner_account_id is None: + if getattr(repo, "owner_account_id", None) is None: repo.owner_account = owner_account repo.save(update_fields=["owner_account_id"]) @@ -272,7 +272,7 @@ def set_github_file_previous_filename( previous_file: GitHubFile, ) -> None: """Set the previous_filename reference for a renamed file.""" - if github_file.previous_filename_id != previous_file.id: + if getattr(github_file, "previous_filename_id", None) != previous_file.id: github_file.previous_filename = previous_file github_file.save(update_fields=["previous_filename"]) diff --git a/github_activity_tracker/sync/commits.py b/github_activity_tracker/sync/commits.py index 38817aa..24f63c2 100644 --- a/github_activity_tracker/sync/commits.py +++ b/github_activity_tracker/sync/commits.py @@ -20,7 +20,7 @@ get_or_create_unknown_github_account, ) from github_activity_tracker import big_commit, fetcher, services -from github_activity_tracker.models import FileChangeStatus +from github_activity_tracker.models import FileChangeStatus, GitCommit from github_activity_tracker.workspace import ( get_commit_json_path, iter_existing_commit_jsons, @@ -35,7 +35,7 @@ ) if TYPE_CHECKING: - from github_activity_tracker.models import GitCommit, GitHubRepository + from github_activity_tracker.models import GitHubRepository logger = logging.getLogger(__name__) @@ -71,7 +71,7 @@ def _process_commit_files( repo, filename, is_deleted=is_deleted ) # Link new file to old file - if github_file.previous_filename_id != old_file.id: + if getattr(github_file, "previous_filename_id", None) != old_file.id: services.set_github_file_previous_filename(github_file, old_file) else: github_file, _ = services.create_or_update_github_file( @@ -116,7 +116,11 @@ def _process_commit_data(repo: GitHubRepository, commit_data: dict) -> None: name, email = _commit_author_name_and_email(commit_data) account, _ = get_or_create_unknown_github_account(name=name, email=email) - commit_hash = commit_data.get("sha") + commit_hash_raw = commit_data.get("sha") + if not isinstance(commit_hash_raw, str) or not commit_hash_raw: + logger.warning("Commit payload missing sha; skipping") + return + commit_hash = commit_hash_raw comment = commit_data.get("commit", {}).get("message", "") commit_date_str = commit_data.get("commit", {}).get("author", {}).get( "date" @@ -167,6 +171,9 @@ def _process_big_commit_worker(owner: str, repo_name: str, commit_data: dict) -> """ try: sha = commit_data.get("sha") + if not isinstance(sha, str) or not sha: + logger.warning("Big commit payload missing sha; skipping worker") + return logger.info( "Processing big commit %s/%s:%s in background", owner, @@ -221,7 +228,11 @@ def _process_big_commit_worker(owner: str, repo_name: str, commit_data: dict) -> ) # Write original commit data (with 300 files) so we don't lose the commit try: - json_path = get_commit_json_path(owner, repo_name, commit_data.get("sha")) + sha_fallback = commit_data.get("sha") + if not isinstance(sha_fallback, str) or not sha_fallback: + logger.error("Cannot write fallback JSON: missing sha") + return + json_path = get_commit_json_path(owner, repo_name, sha_fallback) json_path.parent.mkdir(parents=True, exist_ok=True) json_path.write_text( json.dumps(commit_data, indent=2, default=str), @@ -229,7 +240,7 @@ def _process_big_commit_worker(owner: str, repo_name: str, commit_data: dict) -> ) logger.warning( "Wrote partial commit data (300 files) for %s after error", - commit_data.get("sha")[:7], + sha_fallback[:7], ) except Exception as write_error: logger.error("Failed to write fallback commit JSON: %s", write_error) @@ -269,8 +280,12 @@ def sync_commits( # Phase 2: fetch from GitHub client = get_github_client() + if client is None: + raise RuntimeError("GitHub client unavailable for sync_commits") if start_date is None: - last_commit = repo.commits.order_by("-commit_at").first() + last_commit = ( + GitCommit.objects.filter(repo=repo).order_by("-commit_at").first() + ) if last_commit: start_date = last_commit.commit_at + timedelta(seconds=1) # Leave end_date as None when not set so the fetcher uses until_iso="" diff --git a/github_activity_tracker/sync/etag_cache.py b/github_activity_tracker/sync/etag_cache.py index 3495143..4e50ef2 100644 --- a/github_activity_tracker/sync/etag_cache.py +++ b/github_activity_tracker/sync/etag_cache.py @@ -8,7 +8,7 @@ import logging import time -from typing import Optional +from typing import Optional, cast from django.conf import settings @@ -96,8 +96,8 @@ def get( return None try: key = self._key(list_type, page, since_iso, until_iso) - value = self._client.get(key) - return value if value else None + raw = self._client.get(key) + return cast(str | None, raw if raw else None) except Exception as e: logger.debug("ETag cache get failed: %s", e) _invalidate_redis_client() diff --git a/github_activity_tracker/sync/issues_and_prs.py b/github_activity_tracker/sync/issues_and_prs.py index 1bad8bb..3a03929 100644 --- a/github_activity_tracker/sync/issues_and_prs.py +++ b/github_activity_tracker/sync/issues_and_prs.py @@ -12,7 +12,7 @@ import json import logging from datetime import datetime, timedelta -from typing import TYPE_CHECKING, Optional +from typing import Optional from cppa_user_tracker.services import get_or_create_github_account from github_activity_tracker import fetcher, services @@ -36,8 +36,13 @@ from core.operations.github_ops import get_github_client from core.operations.github_ops.client import ConnectionException, RateLimitException -if TYPE_CHECKING: - from github_activity_tracker.models import GitHubRepository +from github_activity_tracker.models import ( + GitHubRepository, + Issue, + IssueLabel, + PullRequest, + PullRequestLabel, +) logger = logging.getLogger(__name__) @@ -60,11 +65,23 @@ def _process_issue_data(repo: GitHubRepository, issue_data: dict) -> None: avatar_url=user_info["avatar_url"], ) + issue_number_raw = issue_data.get("number") + issue_id_raw = issue_data.get("id") + if issue_number_raw is None or issue_id_raw is None: + logger.warning( + "Issue missing number or id; skipping (got number=%r id=%r)", + issue_number_raw, + issue_id_raw, + ) + return + issue_number = int(issue_number_raw) + issue_id = int(issue_id_raw) + issue_obj, _ = services.create_or_update_issue( repo=repo, account=account, - issue_number=issue_data.get("number"), - issue_id=issue_data.get("id"), + issue_number=issue_number, + issue_id=issue_id, title=issue_data.get("title", ""), body=issue_data.get("body", ""), state=issue_data.get("state", "open"), @@ -113,7 +130,9 @@ def _process_issue_data(repo: GitHubRepository, issue_data: dict) -> None: if (label_data.get("name") or "") } existing_label_names = { - il.label_name for il in issue_obj.labels.all() if il.label_name + il.label_name + for il in IssueLabel.objects.filter(issue=issue_obj) + if il.label_name } for label_name in existing_label_names - incoming_label_names: services.remove_issue_label(issue_obj, label_name) @@ -168,11 +187,23 @@ def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: avatar_url=user_info["avatar_url"], ) + pr_number_raw = pr_data.get("number") + pr_id_raw = pr_data.get("id") + if pr_number_raw is None or pr_id_raw is None: + logger.warning( + "PR missing number or id; skipping (got number=%r id=%r)", + pr_number_raw, + pr_id_raw, + ) + return + pr_number = int(pr_number_raw) + pr_id = int(pr_id_raw) + pr_obj, _ = services.create_or_update_pull_request( repo=repo, account=account, - pr_number=pr_data.get("number"), - pr_id=pr_data.get("id"), + pr_number=pr_number, + pr_id=pr_id, title=pr_data.get("title", ""), body=pr_data.get("body", ""), state=pr_data.get("state", "open"), @@ -242,7 +273,9 @@ def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: if (label_data.get("name") or "") } existing_pr_label_names = { - pl.label_name for pl in pr_obj.labels.all() if pl.label_name + pl.label_name + for pl in PullRequestLabel.objects.filter(pr=pr_obj) + if pl.label_name } for label_name in existing_pr_label_names - incoming_pr_label_names: services.remove_pull_request_label(pr_obj, label_name) @@ -326,8 +359,12 @@ def sync_issues_and_prs( # Phase 2: determine start date as max(last issue, last PR) +1s — shared /issues timeline. if start_date is None: - last_issue = repo.issues.order_by("-issue_updated_at").first() - last_pr = repo.pull_requests.order_by("-pr_updated_at").first() + last_issue = ( + Issue.objects.filter(repo=repo).order_by("-issue_updated_at").first() + ) + last_pr = ( + PullRequest.objects.filter(repo=repo).order_by("-pr_updated_at").first() + ) issue_date = ( (last_issue.issue_updated_at + timedelta(seconds=1)) @@ -347,6 +384,8 @@ def sync_issues_and_prs( # Phase 3: fetch from GitHub, write JSON, persist to DB, remove file. client = get_github_client() + if client is None: + raise RuntimeError("GitHub client unavailable for sync_issues_and_prs") etag_cache = RedisListETagCache(repo_id=repo.pk) count_issues = 0 count_prs = 0 diff --git a/github_activity_tracker/sync/repos.py b/github_activity_tracker/sync/repos.py index e7e3272..7408b19 100644 --- a/github_activity_tracker/sync/repos.py +++ b/github_activity_tracker/sync/repos.py @@ -25,7 +25,8 @@ def sync_repos(repo: GitHubRepository) -> None: try: client = get_github_client() - + if client is None: + raise RuntimeError("GitHub client unavailable for sync_repos") owner = repo.owner_account.username repo_name = repo.repo_name diff --git a/github_activity_tracker/tests/test_sync_issues_and_prs.py b/github_activity_tracker/tests/test_sync_issues_and_prs.py index e005a83..a73b7e6 100644 --- a/github_activity_tracker/tests/test_sync_issues_and_prs.py +++ b/github_activity_tracker/tests/test_sync_issues_and_prs.py @@ -12,19 +12,28 @@ ) +@patch("github_activity_tracker.sync.issues_and_prs.PullRequest.objects") +@patch("github_activity_tracker.sync.issues_and_prs.Issue.objects") @patch("github_activity_tracker.sync.issues_and_prs.get_github_client") @patch("github_activity_tracker.sync.issues_and_prs.fetcher") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") def test_sync_issues_and_prs_processes_both_types( - mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client + mock_existing_prs, + mock_existing_issues, + mock_fetcher, + mock_get_client, + mock_issue_objects, + mock_pr_objects, ): """sync_issues_and_prs routes items by key to issue or PR processing.""" mock_repo = MagicMock() mock_repo.owner_account.username = "owner" mock_repo.repo_name = "repo" - mock_repo.issues.order_by.return_value.first.return_value = None - mock_repo.pull_requests.order_by.return_value.first.return_value = None + mock_issue_objects.filter.return_value.order_by.return_value.first.return_value = ( + None + ) + mock_pr_objects.filter.return_value.order_by.return_value.first.return_value = None mock_existing_issues.return_value = (0, []) mock_existing_prs.return_value = (0, []) @@ -65,12 +74,19 @@ def test_sync_issues_and_prs_processes_both_types( mock_proc_pr.assert_called_once() +@patch("github_activity_tracker.sync.issues_and_prs.PullRequest.objects") +@patch("github_activity_tracker.sync.issues_and_prs.Issue.objects") @patch("github_activity_tracker.sync.issues_and_prs.get_github_client") @patch("github_activity_tracker.sync.issues_and_prs.fetcher") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") def test_sync_issues_and_prs_uses_max_start_date( - mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client + mock_existing_prs, + mock_existing_issues, + mock_fetcher, + mock_get_client, + mock_issue_objects, + mock_pr_objects, ): """sync_issues_and_prs uses the later of last_issue and last_pr (+1s) as start_date.""" mock_repo = MagicMock() @@ -80,12 +96,16 @@ def test_sync_issues_and_prs_uses_max_start_date( # Last issue updated at 2024-01-05 mock_last_issue = MagicMock() mock_last_issue.issue_updated_at = datetime(2024, 1, 5, tzinfo=timezone.utc) - mock_repo.issues.order_by.return_value.first.return_value = mock_last_issue + mock_issue_objects.filter.return_value.order_by.return_value.first.return_value = ( + mock_last_issue + ) # Last PR updated at 2024-01-03 (older than last issue) mock_last_pr = MagicMock() mock_last_pr.pr_updated_at = datetime(2024, 1, 3, tzinfo=timezone.utc) - mock_repo.pull_requests.order_by.return_value.first.return_value = mock_last_pr + mock_pr_objects.filter.return_value.order_by.return_value.first.return_value = ( + mock_last_pr + ) mock_existing_issues.return_value = (0, []) mock_existing_prs.return_value = (0, []) @@ -102,19 +122,28 @@ def test_sync_issues_and_prs_uses_max_start_date( assert start_date == datetime(2024, 1, 5, 0, 0, 1, tzinfo=timezone.utc) +@patch("github_activity_tracker.sync.issues_and_prs.PullRequest.objects") +@patch("github_activity_tracker.sync.issues_and_prs.Issue.objects") @patch("github_activity_tracker.sync.issues_and_prs.get_github_client") @patch("github_activity_tracker.sync.issues_and_prs.fetcher") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") def test_sync_issues_and_prs_processes_existing_jsons_first( - mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client + mock_existing_prs, + mock_existing_issues, + mock_fetcher, + mock_get_client, + mock_issue_objects, + mock_pr_objects, ): """sync_issues_and_prs processes leftover JSON files before fetching from GitHub.""" mock_repo = MagicMock() mock_repo.owner_account.username = "owner" mock_repo.repo_name = "repo" - mock_repo.issues.order_by.return_value.first.return_value = None - mock_repo.pull_requests.order_by.return_value.first.return_value = None + mock_issue_objects.filter.return_value.order_by.return_value.first.return_value = ( + None + ) + mock_pr_objects.filter.return_value.order_by.return_value.first.return_value = None # Existing JSONs found mock_existing_issues.return_value = (2, [10, 11]) @@ -134,12 +163,19 @@ def test_sync_issues_and_prs_processes_existing_jsons_first( mock_existing_prs.assert_called_once_with(mock_repo) +@patch("github_activity_tracker.sync.issues_and_prs.PullRequest.objects") +@patch("github_activity_tracker.sync.issues_and_prs.Issue.objects") @patch("github_activity_tracker.sync.issues_and_prs.get_github_client") @patch("github_activity_tracker.sync.issues_and_prs.fetcher") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") def test_sync_issues_and_prs_respects_override_start_date( - mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client + mock_existing_prs, + mock_existing_issues, + mock_fetcher, + mock_get_client, + mock_issue_objects, + mock_pr_objects, ): """sync_issues_and_prs uses provided start_date instead of deriving from DB.""" mock_repo = MagicMock() @@ -157,27 +193,36 @@ def test_sync_issues_and_prs_respects_override_start_date( sync_issues_and_prs(mock_repo, start_date=override_start) # Should NOT query DB for last issue/PR - mock_repo.issues.order_by.assert_not_called() - mock_repo.pull_requests.order_by.assert_not_called() + mock_issue_objects.filter.assert_not_called() + mock_pr_objects.filter.assert_not_called() # Should pass override_start to fetcher call_args = mock_fetcher.fetch_issues_and_prs_from_github.call_args assert call_args[0][3] == override_start +@patch("github_activity_tracker.sync.issues_and_prs.PullRequest.objects") +@patch("github_activity_tracker.sync.issues_and_prs.Issue.objects") @patch("github_activity_tracker.sync.issues_and_prs.get_github_client") @patch("github_activity_tracker.sync.issues_and_prs.fetcher") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") def test_sync_issues_and_prs_saves_and_removes_json_files( - mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client + mock_existing_prs, + mock_existing_issues, + mock_fetcher, + mock_get_client, + mock_issue_objects, + mock_pr_objects, ): """sync_issues_and_prs writes JSON, processes, then removes file for each item.""" mock_repo = MagicMock() mock_repo.owner_account.username = "owner" mock_repo.repo_name = "repo" - mock_repo.issues.order_by.return_value.first.return_value = None - mock_repo.pull_requests.order_by.return_value.first.return_value = None + mock_issue_objects.filter.return_value.order_by.return_value.first.return_value = ( + None + ) + mock_pr_objects.filter.return_value.order_by.return_value.first.return_value = None mock_existing_issues.return_value = (0, []) mock_existing_prs.return_value = (0, []) diff --git a/pyrightconfig.json b/pyrightconfig.json index abe32a0..109b1c4 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -5,6 +5,7 @@ "**/migrations/**", "**/__pycache__/**", "**/tests/**", + "core/pyright_samples/**", "**/._*", "**/.___*" ], From 0da5443c0e22040787666aad75c3079fdbf87c57 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 14 May 2026 07:41:06 -0700 Subject: [PATCH 3/3] feat: improve commit and issue processing with enhanced type validation and error handling --- core/operations/github_ops/client.py | 3 ++- github_activity_tracker/sync/commits.py | 13 ++++++---- .../sync/issues_and_prs.py | 24 +++++++++++++++---- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/core/operations/github_ops/client.py b/core/operations/github_ops/client.py index de3c00e..d4f6ba3 100644 --- a/core/operations/github_ops/client.py +++ b/core/operations/github_ops/client.py @@ -7,6 +7,7 @@ import base64 import logging +import math import re import time from email.utils import parsedate_to_datetime @@ -134,7 +135,7 @@ def _parse_rate_limit_wait(self, response: requests.Response) -> Optional[int]: dt = dt.replace(tzinfo=timezone.utc) wait = (dt - datetime.now(timezone.utc)).total_seconds() if wait > 0: - return int(wait) + return math.ceil(wait) except (ValueError, TypeError): pass # Retry-After missing or did not yield a positive delay; try X-RateLimit-*. diff --git a/github_activity_tracker/sync/commits.py b/github_activity_tracker/sync/commits.py index 24f63c2..bbe0f8f 100644 --- a/github_activity_tracker/sync/commits.py +++ b/github_activity_tracker/sync/commits.py @@ -117,10 +117,10 @@ def _process_commit_data(repo: GitHubRepository, commit_data: dict) -> None: account, _ = get_or_create_unknown_github_account(name=name, email=email) commit_hash_raw = commit_data.get("sha") - if not isinstance(commit_hash_raw, str) or not commit_hash_raw: + if not isinstance(commit_hash_raw, str) or not commit_hash_raw.strip(): logger.warning("Commit payload missing sha; skipping") return - commit_hash = commit_hash_raw + commit_hash = commit_hash_raw.strip() comment = commit_data.get("commit", {}).get("message", "") commit_date_str = commit_data.get("commit", {}).get("author", {}).get( "date" @@ -171,9 +171,10 @@ def _process_big_commit_worker(owner: str, repo_name: str, commit_data: dict) -> """ try: sha = commit_data.get("sha") - if not isinstance(sha, str) or not sha: + if not isinstance(sha, str) or not sha.strip(): logger.warning("Big commit payload missing sha; skipping worker") return + sha = sha.strip() logger.info( "Processing big commit %s/%s:%s in background", owner, @@ -229,9 +230,10 @@ def _process_big_commit_worker(owner: str, repo_name: str, commit_data: dict) -> # Write original commit data (with 300 files) so we don't lose the commit try: sha_fallback = commit_data.get("sha") - if not isinstance(sha_fallback, str) or not sha_fallback: + if not isinstance(sha_fallback, str) or not sha_fallback.strip(): logger.error("Cannot write fallback JSON: missing sha") return + sha_fallback = sha_fallback.strip() json_path = get_commit_json_path(owner, repo_name, sha_fallback) json_path.parent.mkdir(parents=True, exist_ok=True) json_path.write_text( @@ -304,8 +306,9 @@ def sync_commits( client, owner, repo_name, start_date, end_date, etag_cache=etag_cache ): sha = commit_data.get("sha") - if not sha: + if not isinstance(sha, str) or not sha.strip(): continue + sha = sha.strip() # Check if commit is truncated (300 files = possible truncation) is_truncated = big_commit.is_commit_truncated(commit_data) diff --git a/github_activity_tracker/sync/issues_and_prs.py b/github_activity_tracker/sync/issues_and_prs.py index 3a03929..aabf15d 100644 --- a/github_activity_tracker/sync/issues_and_prs.py +++ b/github_activity_tracker/sync/issues_and_prs.py @@ -74,8 +74,16 @@ def _process_issue_data(repo: GitHubRepository, issue_data: dict) -> None: issue_id_raw, ) return - issue_number = int(issue_number_raw) - issue_id = int(issue_id_raw) + try: + issue_number = int(issue_number_raw) + issue_id = int(issue_id_raw) + except (TypeError, ValueError): + logger.warning( + "Issue number/id not numeric; skipping (got number=%r id=%r)", + issue_number_raw, + issue_id_raw, + ) + return issue_obj, _ = services.create_or_update_issue( repo=repo, @@ -196,8 +204,16 @@ def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: pr_id_raw, ) return - pr_number = int(pr_number_raw) - pr_id = int(pr_id_raw) + try: + pr_number = int(pr_number_raw) + pr_id = int(pr_id_raw) + except (TypeError, ValueError): + logger.warning( + "PR number/id not numeric; skipping (got number=%r id=%r)", + pr_number_raw, + pr_id_raw, + ) + return pr_obj, _ = services.create_or_update_pull_request( repo=repo,