Skip to content
Merged
64 changes: 63 additions & 1 deletion core/tests/test_datetime_parsing.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
"""Tests for core.utils.datetime_parsing."""

import re
from datetime import datetime, timedelta, timezone

import pytest
from django.utils import timezone as django_timezone

from core.utils.datetime_parsing import ensure_aware_utc, parse_iso_datetime
from core.utils.datetime_parsing import (
CANONICAL_INSTANT_UTC_Z_PATTERN,
ensure_aware_utc,
format_instant_iso_z,
parse_iso_datetime,
parse_iso_datetime_lenient,
)


def test_ensure_aware_utc_none():
Expand Down Expand Up @@ -67,3 +74,58 @@ def test_parse_iso_datetime_with_offset_strips_tz_to_naive_utc():
def test_parse_iso_datetime_invalid_raises():
with pytest.raises(ValueError, match="Invalid ISO datetime"):
parse_iso_datetime("not-a-date")


def test_format_instant_iso_z_empty():
    # None, empty, and whitespace-only inputs all normalize to "".
    for blank in (None, "", " "):
        assert format_instant_iso_z(blank) == ""


def test_format_instant_iso_z_z_suffix_utc():
    # Already-canonical Z-suffixed input passes through unchanged.
    assert format_instant_iso_z("2024-03-15T10:30:00Z") == "2024-03-15T10:30:00Z"


def test_format_instant_iso_z_offset_to_z():
    # A +05:00 offset is converted to the equivalent UTC instant with Z suffix.
    assert format_instant_iso_z("2024-01-01T00:00:00+05:00") == "2023-12-31T19:00:00Z"


def test_format_instant_iso_z_invalid_returns_original():
    # Unparseable input is returned as-is (lenient contract).
    assert format_instant_iso_z("not-a-date") == "not-a-date"


def test_parse_iso_datetime_lenient_empty():
    # None, empty, and whitespace-only inputs all yield None.
    for blank in (None, "", " "):
        assert parse_iso_datetime_lenient(blank) is None


def test_parse_iso_datetime_lenient_z_utc_aware():
    # A Z suffix produces an *aware* UTC datetime (unlike the strict parser).
    expected = datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc)
    assert parse_iso_datetime_lenient("2024-01-15T10:30:00Z") == expected


def test_parse_iso_datetime_lenient_invalid_returns_none():
    # Parse failures are swallowed and reported as None.
    assert parse_iso_datetime_lenient("not-a-date") is None


@pytest.mark.parametrize(
    ("date_str", "expected_year"),
    [
        ("2023-06-01T00:00:00Z", 2023),
        ("2025-12-31T23:59:59Z", 2025),
    ],
)
def test_parse_iso_datetime_lenient_parametrized(date_str, expected_year):
    parsed = parse_iso_datetime_lenient(date_str)
    assert parsed is not None
    assert parsed.year == expected_year


def test_canonical_instant_utc_z_pattern():
    # Bind fullmatch once; the pattern must accept only Z-suffixed instants.
    matcher = re.compile(CANONICAL_INSTANT_UTC_Z_PATTERN).fullmatch
    # Canonical forms, with and without fractional seconds.
    assert matcher("2026-01-01T00:00:00Z")
    assert matcher("2026-01-01T00:00:00.5Z")
    assert matcher("2026-01-01T00:00:00.123456Z")
    # Numeric offsets and empty strings are rejected.
    assert matcher("2026-01-01T00:00:00+00:00") is None
    assert matcher("") is None
23 changes: 23 additions & 0 deletions core/tests/test_text_processing.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"""Tests for core.utils.text_processing."""

import pytest

from core.utils.text_processing import (
SLACK_GREETING_WORDS,
clean_text,
filter_sentence,
truncate_content,
validate_content_length,
)

Expand Down Expand Up @@ -67,3 +70,23 @@ def test_validate_content_length():

def test_slack_constants_non_empty():
assert "hello" in SLACK_GREETING_WORDS


def test_truncate_content_short_unchanged():
    # Content shorter than the limit is returned verbatim.
    assert truncate_content("hi", max_length=100) == "hi"


def test_truncate_content_long_adds_ellipsis():
    # Long content is cut to exactly max_length, ending in "...".
    long_text = "x" * 50
    truncated = truncate_content(long_text, max_length=10)
    assert truncated.endswith("...")
    assert len(truncated) == 10


def test_truncate_content_max_length_at_most_three():
    # With no room for an ellipsis, plain slicing is used.
    assert truncate_content("abcdef", max_length=2) == "ab"


def test_truncate_content_negative_max_length_raises():
    with pytest.raises(ValueError, match="max_length must be non-negative"):
        truncate_content("hello", max_length=-1)
67 changes: 56 additions & 11 deletions core/utils/datetime_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@

from __future__ import annotations

import logging
from datetime import datetime, timezone

from django.utils import timezone as django_timezone

logger = logging.getLogger(__name__)

# ISO 8601 UTC instant with ``Z`` (optional fractional seconds). Used by JSON Schema and Pydantic.
CANONICAL_INSTANT_UTC_Z_PATTERN = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z$"


def ensure_aware_utc(dt: datetime | None) -> datetime | None:
"""
Expand All @@ -20,15 +26,34 @@ def ensure_aware_utc(dt: datetime | None) -> datetime | None:
return dt.astimezone(timezone.utc)


def parse_iso_datetime_lenient(raw: str | None) -> datetime | None:
"""
Parse ISO-like date/datetime strings from APIs (GitHub, Discord, etc.).

Returns ``None`` for empty/whitespace input or on parse failure (logs at DEBUG).
``Z`` is normalized to ``+00:00`` for :meth:`datetime.fromisoformat`. Preserves
timezone awareness when present (unlike :func:`parse_iso_datetime`, which returns
naive UTC).

:func:`parse_iso_datetime` delegates here for the actual parse, then applies
strict error handling and naive-UTC normalization.
"""
if not raw or not str(raw).strip():
return None
s = str(raw).strip().replace("Z", "+00:00")
try:
return datetime.fromisoformat(s)
except ValueError as e:
logger.debug("Failed to parse datetime %r: %s", s, e)
return None
Comment thread
leostar0412 marked this conversation as resolved.


def parse_iso_datetime(raw: str | None) -> datetime | None:
"""
Parse a date or datetime string using ``datetime.fromisoformat``.

Accepts common ISO-style forms (e.g. ``YYYY-MM-DD``, ``YYYY-MM-DDTHH:MM:SS``,
``YYYY-MM-DD HH:MM:SS`` on Python 3.11+, optional fractional seconds and offsets).
If the string ends with ``Z`` and contains ``T``, ``Z`` is treated as UTC before parsing.

Empty or whitespace-only input returns ``None``.
Delegates to :func:`parse_iso_datetime_lenient` for parsing. Empty or
whitespace-only input returns ``None``.

Raises:
ValueError: If the string is non-empty but cannot be parsed.
Expand All @@ -39,12 +64,32 @@ def parse_iso_datetime(raw: str | None) -> datetime | None:
if not raw or not str(raw).strip():
return None
s = str(raw).strip()
if s.endswith("Z") and "T" in s:
s = s[:-1] + "+00:00"
try:
dt = datetime.fromisoformat(s)
except ValueError as e:
raise ValueError(f"Invalid ISO datetime ({s!r}): {e}") from e
dt = parse_iso_datetime_lenient(raw)
if dt is None:
raise ValueError(f"Invalid ISO datetime ({s!r})")
if dt.tzinfo:
return dt.astimezone(timezone.utc).replace(tzinfo=None)
return dt


def format_instant_iso_z(raw: str | None) -> str:
    """
    Normalize a date/datetime string to an ISO 8601 **instant** in UTC with a ``Z`` suffix.

    Uses :func:`parse_iso_datetime` for parsing. Empty or whitespace-only input
    returns ``""``. If the string is non-empty but cannot be parsed, the stripped
    original string is returned (lenient handling for odd exporter payloads).

    Naive datetimes from parsing are interpreted as UTC wall clock before formatting.
    """
    text = (raw or "").strip()
    if not text:
        return ""
    try:
        parsed = parse_iso_datetime(text)
    except ValueError:
        return text
    if parsed is None:
        return text
    # parse_iso_datetime yields naive UTC; attach tzinfo so isoformat() emits an
    # offset, then rewrite the "+00:00" suffix as the canonical "Z".
    stamped = parsed.replace(tzinfo=timezone.utc)
    return stamped.isoformat().replace("+00:00", "Z")
11 changes: 11 additions & 0 deletions core/utils/text_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,17 @@ def filter_sentence(
return sentence_lower.strip()


def truncate_content(content: str, max_length: int = 100) -> str:
    """Return ``content`` truncated to ``max_length`` characters with ``...`` when longer."""
    if max_length < 0:
        raise ValueError("max_length must be non-negative")
    if len(content) <= max_length:
        return content
    # Too short to fit an ellipsis: hard-slice; otherwise reserve 3 chars for "...".
    return content[:max_length] if max_length <= 3 else content[: max_length - 3] + "..."
Comment thread
leostar0412 marked this conversation as resolved.


def validate_content_length(content: str | None, min_length: int = 50) -> bool:
"""
Validate that content meets minimum length requirement.
Expand Down
2 changes: 0 additions & 2 deletions discord_activity_tracker/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ class DiscordChannelAdmin(admin.ModelAdmin):
"server",
"channel_type",
"category_name",
"last_synced_at",
"last_activity_at",
)
list_filter = ("channel_type", "server")
search_fields = ("channel_name", "channel_id", "category_name")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
from discord_activity_tracker.services import (
get_or_create_discord_channel,
get_or_create_discord_server,
update_channel_last_activity,
update_channel_last_synced,
)
from discord_activity_tracker.staging_schema import (
validate_envelope,
validate_normalized_message,
)
from discord_activity_tracker.sync.chat_exporter import (
convert_exporter_message_to_dict,
Expand All @@ -32,7 +34,6 @@
_safe_int,
)
from discord_activity_tracker.sync.messages import _process_messages_in_batches
from discord_activity_tracker.sync.utils import parse_datetime
from discord_activity_tracker.workspace import get_cpp_discussion_import_dir

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -77,12 +78,13 @@ def run(self) -> None:
for i, json_path in enumerate(json_files, 1):
try:
data = parse_exported_json(json_path)
guild_info = data.get("guild", {})
channel_info = data.get("channel", {})
messages = data.get("messages", [])
rel = _json_display_path(import_dir, json_path)
envelope = validate_envelope(data, source=rel)
guild_info = envelope.guild.model_dump(by_alias=True)
channel_info = envelope.channel.model_dump(by_alias=True)
messages = envelope.messages

Comment thread
leostar0412 marked this conversation as resolved.
ch_name = channel_info.get("name", "?")
rel = _json_display_path(import_dir, json_path)
self.stdout.write(
f" [{i}/{len(json_files)}] {rel} — #{ch_name}: {len(messages)} messages"
)
Expand Down Expand Up @@ -132,16 +134,15 @@ async def _persist_channel(
category_name=channel_info.get("category") or "",
)

converted = [convert_exporter_message_to_dict(m) for m in messages]
srv_id = _safe_int(guild_info.get("id", 0))
ch_id = _safe_int(channel_info.get("id", 0))
converted = [
convert_exporter_message_to_dict(m, server_id=srv_id, channel_id=ch_id)
for m in messages
]
for idx, cmsg in enumerate(converted):
validate_normalized_message(cmsg, source=f"message[{idx}]")
count = await _process_messages_in_batches(channel, converted)

if messages:
last_converted = convert_exporter_message_to_dict(messages[-1])
last_time = parse_datetime(last_converted.get("created_at"))
if last_time:
await sync_to_async(update_channel_last_activity)(channel, last_time)

await sync_to_async(update_channel_last_synced)(channel)
return count

def sync_pinecone(self) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@
from discord_activity_tracker.services import (
get_or_create_discord_channel,
get_or_create_discord_server,
update_channel_last_activity,
update_channel_last_synced,
)
from discord_activity_tracker.staging_schema import (
StagingValidationError,
validate_envelope,
validate_normalized_message,
)
from discord_activity_tracker.sync.exporter_window import (
latest_message_created_at_for_guild,
Expand All @@ -44,7 +47,6 @@
parse_exported_json,
)
from discord_activity_tracker.sync.messages import _process_messages_in_batches
from discord_activity_tracker.sync.utils import parse_datetime
from discord_activity_tracker.workspace import (
clear_exporter_staging_dir,
get_channel_raw_dir,
Expand Down Expand Up @@ -197,9 +199,10 @@ def task_discord_sync(
for i, json_path in enumerate(json_files, 1):
try:
data = parse_exported_json(json_path)
guild_info = data.get("guild", {})
channel_info = data.get("channel", {})
messages = data.get("messages", [])
envelope = validate_envelope(data, source=json_path.name)
guild_info = envelope.guild.model_dump(by_alias=True)
channel_info = envelope.channel.model_dump(by_alias=True)
messages = envelope.messages
Comment thread
coderabbitai[bot] marked this conversation as resolved.

ch_name = channel_info.get("name", "?")
ch_id = _safe_int(channel_info.get("id", 0))
Expand All @@ -223,6 +226,17 @@ def task_discord_sync(
dest = channel_raw_dir / f"{date_tag}.json"
json_path.rename(dest)

except StagingValidationError as exc:
logger.error(
"Staging validation failed for %s (file left in staging): %s",
json_path.name,
exc,
)
continue
except ValueError as exc:
logger.error("Failed to process %s: %s", json_path.name, exc)
json_path.unlink(missing_ok=True)
continue
except Exception as exc:
logger.error("Failed to process %s: %s", json_path.name, exc)
json_path.unlink(missing_ok=True)
Expand Down Expand Up @@ -347,21 +361,15 @@ async def _persist_channel(
category_name=channel_info.get("category") or "",
)

converted = [convert_exporter_message_to_dict(m) for m in messages]
srv_id = _safe_int(guild_info.get("id", 0))
ch_id = _safe_int(channel_info.get("id", 0))
converted = [
convert_exporter_message_to_dict(m, server_id=srv_id, channel_id=ch_id)
for m in messages
]
for idx, cmsg in enumerate(converted):
validate_normalized_message(cmsg, source=f"message[{idx}]")
count = await _process_messages_in_batches(channel, converted)

def finalize_exporter_channel_sync() -> None:
if converted:
parsed_times = [
t
for m in converted
if (t := parse_datetime(m.get("created_at"))) is not None
]
if parsed_times:
update_channel_last_activity(channel, max(parsed_times))
update_channel_last_synced(channel)

await sync_to_async(finalize_exporter_channel_sync)()
return count


Expand Down
Loading
Loading