From 1f83f12f8654d777fc69ee7fb846ebfc69477878 Mon Sep 17 00:00:00 2001 From: timon0305 Date: Mon, 11 May 2026 16:41:56 +0200 Subject: [PATCH 01/10] feat: typed models + schema validation at DB read boundaries (closes #24) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add models/ package with @dataclass definitions for Workspace, Composer, Bubble, CliSessionMeta, ExportEntry. Each model has a from_dict() classmethod that raises SchemaError when critical fields are missing, replacing silent dict.get() fallbacks at the four database read boundaries identified by the eval (Section 5.1, Schema Fragility). Wire into: - api/workspaces.py:601 — composerData rows in workspace listing - api/composers.py:57-63 — allComposers envelope in per-workspace fetch - api/search.py:163-164 — composerData rows during search - utils/cli_chat_reader.py:93-98 — CLI session meta blob in traverse_blobs Schema drift surfaces as a printed warning + skipped row, not a silent empty result. Existing call sites preserve their JSON response shapes (behavior-preserving wire-in). --- api/composers.py | 24 +++-- api/search.py | 18 +++- api/workspaces.py | 19 ++-- models/__init__.py | 24 +++++ models/cli_session.py | 43 +++++++++ models/conversation.py | 83 ++++++++++++++++ models/errors.py | 22 +++++ models/export.py | 40 ++++++++ models/workspace.py | 37 ++++++++ tests/test_models.py | 199 +++++++++++++++++++++++++++++++++++++++ utils/cli_chat_reader.py | 11 ++- 11 files changed, 500 insertions(+), 20 deletions(-) create mode 100644 models/__init__.py create mode 100644 models/cli_session.py create mode 100644 models/conversation.py create mode 100644 models/errors.py create mode 100644 models/export.py create mode 100644 models/workspace.py create mode 100644 tests/test_models.py diff --git a/api/composers.py b/api/composers.py index 9f8ef2a..41f2509 100644 --- a/api/composers.py +++ b/api/composers.py @@ -13,6 +13,7 @@ from utils.workspace_path import resolve_workspace_path from utils.path_helpers import to_epoch_ms +from models import SchemaError bp = Blueprint("composers", __name__) @@ -54,13 +55,24 @@ def list_composers(): if row and row[0]: data = json.loads(row[0]) + if "allComposers" not in data: + raise SchemaError("WorkspaceComposers", "allComposers") all_composers = data.get("allComposers") - if isinstance(all_composers, list): - for c in all_composers: - c["conversation"] = c.get("conversation") or [] - c["workspaceId"] = name - c["workspaceFolder"] = workspace_folder - composers.append(c) + if not isinstance(all_composers, list): + raise SchemaError( + "WorkspaceComposers", + "allComposers", + hint=f"expected list, got {type(all_composers).__name__}", + ) + for c in all_composers: + if not isinstance(c, dict) or not c.get("composerId"): + continue + c["conversation"] = c.get("conversation") or [] + c["workspaceId"] = name + c["workspaceFolder"] = workspace_folder + composers.append(c) + except SchemaError as e: + print(f"Schema drift in {db_path}: {e}") except Exception: pass diff --git a/api/search.py b/api/search.py index f08a1ae..9f7c0a0 100644 --- a/api/search.py +++ b/api/search.py @@ -18,6 +18,7 @@ from utils.path_helpers import normalize_file_path, get_workspace_folder_paths, to_epoch_ms from utils.text_extract import extract_text_from_bubble from utils.cli_chat_reader import list_cli_projects, traverse_blobs, messages_to_bubbles +from models import Composer, SchemaError bp = Blueprint("search", __name__) @@ -161,17 +162,24 @@ def search(): for row in composer_rows: composer_id = row["key"].split(":")[1] try: - cd = json.loads(row["value"]) - headers = cd.get("fullConversationHeadersOnly") or [] + composer = Composer.from_dict(json.loads(row["value"]), composer_id=composer_id) + except SchemaError as e: + print(f"Schema drift in composer {composer_id}: {e}") + continue + except (json.JSONDecodeError, TypeError, ValueError): + continue + try: + cd = composer.raw + headers = composer.full_conversation_headers_only if not headers: continue - title = cd.get("name") or "" + title = composer.name or "" ws_id = composer_id_to_ws.get(composer_id, "global") ws_name = ws_id_to_name.get(ws_id) project_name = ws_name or ("Other chats" if ws_id == "global" else ws_id) - model_config = cd.get("modelConfig") or {} + model_config = composer.model_config model_name = model_config.get("modelName") model_names = [model_name] if model_name and model_name != "default" else None @@ -243,7 +251,7 @@ def search(): "workspaceFolder": ws_name, "chatId": composer_id, "chatTitle": title, - "timestamp": to_epoch_ms(cd.get("lastUpdatedAt")) or to_epoch_ms(cd.get("createdAt")) or int(datetime.now().timestamp() * 1000), + "timestamp": to_epoch_ms(composer.last_updated_at) or to_epoch_ms(composer.created_at) or int(datetime.now().timestamp() * 1000), "matchingText": matching_text, "type": "composer", }) diff --git a/api/workspaces.py b/api/workspaces.py index ad11be4..a31f539 100644 --- a/api/workspaces.py +++ b/api/workspaces.py @@ -33,6 +33,7 @@ ) from utils.text_extract import extract_text_from_bubble, format_tool_action from utils.exclusion_rules import build_searchable_text, is_excluded_by_rules +from models import Composer, SchemaError bp = Blueprint("workspaces", __name__) @@ -600,9 +601,15 @@ def list_workspaces(): for row in composer_rows: cid = row["key"].split(":")[1] try: - cd = json.loads(row["value"]) + composer = Composer.from_dict(json.loads(row["value"]), composer_id=cid) + except SchemaError as e: + print(f"Schema drift in composer {cid}: {e}") + continue + except (json.JSONDecodeError, TypeError, ValueError): + continue + try: pid = _determine_project_for_conversation( - cd, cid, project_layouts_map, + composer.raw, cid, project_layouts_map, project_name_map, workspace_path_map, workspace_entries, bubble_map, composer_id_to_ws, invalid_workspace_ids ) @@ -611,16 +618,16 @@ def list_workspaces(): pid = invalid_workspace_aliases.get(mapped_ws) assigned = pid if pid else "global" - headers = cd.get("fullConversationHeadersOnly") or [] + headers = composer.full_conversation_headers_only has_bubbles = any(bubble_map.get(h.get("bubbleId")) for h in headers) if not has_bubbles: continue conversation_map.setdefault(assigned, []).append({ "composerId": cid, - "name": cd.get("name") or f"Conversation {cid[:8]}", - "lastUpdatedAt": to_epoch_ms(cd.get("lastUpdatedAt")) or to_epoch_ms(cd.get("createdAt")) or 0, - "createdAt": to_epoch_ms(cd.get("createdAt")) or 0, + "name": composer.name or f"Conversation {cid[:8]}", + "lastUpdatedAt": to_epoch_ms(composer.last_updated_at) or to_epoch_ms(composer.created_at) or 0, + "createdAt": to_epoch_ms(composer.created_at) or 0, }) except Exception: pass diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000..5d7c885 --- /dev/null +++ b/models/__init__.py @@ -0,0 +1,24 @@ +"""Typed domain models for Cursor schema (closes #24). + +Cursor's on-disk JSON shapes are not versioned, so silent renames of fields +like ``composerData`` or ``latestRootBlobId`` would otherwise pass through +``dict.get(...)`` with a fallback default and produce empty conversations +with no error raised. The models here add a schema-validation boundary at +database read sites: ``from_dict`` classmethods raise ``SchemaError`` when +critical fields are missing, so drift becomes loud instead of silent. +""" + +from models.cli_session import CliSessionMeta +from models.conversation import Bubble, Composer +from models.errors import SchemaError +from models.export import ExportEntry +from models.workspace import Workspace + +__all__ = [ + "Bubble", + "CliSessionMeta", + "Composer", + "ExportEntry", + "SchemaError", + "Workspace", +] diff --git a/models/cli_session.py b/models/cli_session.py new file mode 100644 index 0000000..78a5675 --- /dev/null +++ b/models/cli_session.py @@ -0,0 +1,43 @@ +"""CliSessionMeta — typed model for the Cursor CLI ``meta`` blob.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +from models.errors import SchemaError + + +@dataclass(frozen=True) +class CliSessionMeta: + """The ``meta`` blob at the head of a Cursor CLI ``store.db`` blob graph. + + ``latestRootBlobId`` is the entry point for the conversation reconstruction + BFS in ``utils/cli_chat_reader.traverse_blobs``; without it, the entire + conversation is unreachable. ``createdAt`` is documented as part of the + meta-blob schema (see ``utils/cli_chat_reader`` module docstring) and is + captured here, but it is not gated on — only ``latestRootBlobId`` is the + hard requirement, since that is the only field whose absence prevents + conversation reconstruction. + """ + + latest_root_blob_id: str + created_at: Any = None + raw: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, raw: dict[str, Any]) -> "CliSessionMeta": + latest = raw.get("latestRootBlobId") + if not latest: + raise SchemaError("CliSessionMeta", "latestRootBlobId") + if not isinstance(latest, str): + raise SchemaError( + "CliSessionMeta", + "latestRootBlobId", + hint=f"expected str, got {type(latest).__name__}", + ) + return cls( + latest_root_blob_id=latest, + created_at=raw.get("createdAt"), + raw=raw, + ) diff --git a/models/conversation.py b/models/conversation.py new file mode 100644 index 0000000..8db953e --- /dev/null +++ b/models/conversation.py @@ -0,0 +1,83 @@ +"""Composer (conversation) and Bubble (message) typed models.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +from models.errors import SchemaError + + +@dataclass(frozen=True) +class Composer: + """A Cursor conversation (a.k.a. "composer") row. + + Required fields per the schema-validation contract: + - ``fullConversationHeadersOnly`` — without this, a composer cannot be + rendered (no message order is recoverable). This is the only hard + requirement: real Cursor data legitimately omits ``createdAt`` for + older composers (the existing call sites already fall back to + ``lastUpdatedAt`` and then to epoch zero), so it is captured but + not gated on. + + The composer ID is intentionally passed in as a constructor argument + rather than read from ``raw`` because Cursor stores it in the row key + (``composerData:``) rather than in the JSON value. + """ + + composer_id: str + full_conversation_headers_only: list[dict[str, Any]] + created_at: Any + name: str | None = None + last_updated_at: Any = None + model_config: dict[str, Any] = field(default_factory=dict) + raw: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, raw: dict[str, Any], *, composer_id: str) -> "Composer": + if not composer_id: + raise SchemaError("Composer", "composerId", hint="empty composer ID") + if "fullConversationHeadersOnly" not in raw: + raise SchemaError("Composer", "fullConversationHeadersOnly") + + headers = raw.get("fullConversationHeadersOnly") or [] + if not isinstance(headers, list): + raise SchemaError( + "Composer", + "fullConversationHeadersOnly", + hint=f"expected list, got {type(headers).__name__}", + ) + + model_config = raw.get("modelConfig") or {} + if not isinstance(model_config, dict): + model_config = {} + + return cls( + composer_id=composer_id, + full_conversation_headers_only=headers, + created_at=raw.get("createdAt"), + name=raw.get("name"), + last_updated_at=raw.get("lastUpdatedAt"), + model_config=model_config, + raw=raw, + ) + + +@dataclass(frozen=True) +class Bubble: + """A single message bubble within a composer. + + The bubble ID lives in the row key (``bubbleId::``) + rather than the JSON value, so it is passed in explicitly. The raw dict + is preserved to keep downstream rendering code (which still walks the + untyped shape) working without modification. + """ + + bubble_id: str + raw: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, raw: dict[str, Any], *, bubble_id: str) -> "Bubble": + if not bubble_id: + raise SchemaError("Bubble", "bubbleId", hint="empty bubble ID") + return cls(bubble_id=bubble_id, raw=raw) diff --git a/models/errors.py b/models/errors.py new file mode 100644 index 0000000..01ad184 --- /dev/null +++ b/models/errors.py @@ -0,0 +1,22 @@ +"""Exception types for the typed-model schema-validation layer.""" + +from __future__ import annotations + + +class SchemaError(ValueError): + """Raised when a required Cursor schema field is missing or malformed. + + Inherits from ``ValueError`` so call sites that already catch generic + deserialisation errors (e.g. ``json.JSONDecodeError`` is a subclass of + ``ValueError``) also catch schema drift without needing a separate + ``except`` clause. New code should catch ``SchemaError`` explicitly. + """ + + def __init__(self, model: str, field: str, *, hint: str | None = None) -> None: + self.model = model + self.field = field + self.hint = hint + message = f"{model}: missing required field '{field}'" + if hint: + message = f"{message} ({hint})" + super().__init__(message) diff --git a/models/export.py b/models/export.py new file mode 100644 index 0000000..380af61 --- /dev/null +++ b/models/export.py @@ -0,0 +1,40 @@ +"""ExportEntry — typed model for an export manifest record (JSONL line).""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +from models.errors import SchemaError + + +@dataclass(frozen=True) +class ExportEntry: + """A single record in the export manifest (one line in ``manifest.jsonl``). + + Required fields are the YAML-frontmatter keys that downstream tooling + indexes against: a missing ``log_id`` makes the entry unaddressable, and + a missing ``title`` produces unreadable output. Timestamps are optional — + not every Cursor conversation has both a creation and update time. + """ + + log_id: str + title: str + workspace: str + created_at: Any = None + updated_at: Any = None + raw: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, raw: dict[str, Any]) -> "ExportEntry": + for required in ("log_id", "title", "workspace"): + if required not in raw or raw[required] in (None, ""): + raise SchemaError("ExportEntry", required) + return cls( + log_id=str(raw["log_id"]), + title=str(raw["title"]), + workspace=str(raw["workspace"]), + created_at=raw.get("created_at"), + updated_at=raw.get("updated_at"), + raw=raw, + ) diff --git a/models/workspace.py b/models/workspace.py new file mode 100644 index 0000000..3cf8092 --- /dev/null +++ b/models/workspace.py @@ -0,0 +1,37 @@ +"""Workspace — typed model for a single Cursor workspace folder.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +from models.errors import SchemaError + + +@dataclass(frozen=True) +class Workspace: + """A Cursor workspace entry. + + The workspace ID is the directory name on disk (Cursor uses random + short hashes as workspace IDs) and is passed in explicitly. ``folder`` + is the absolute path of the project the workspace targets, read from + ``workspace.json``; it may legitimately be ``None`` for a CLI-only + workspace, so missing-folder is not a schema error. + """ + + workspace_id: str + folder: str | None = None + raw: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, raw: dict[str, Any], *, workspace_id: str) -> "Workspace": + if not workspace_id: + raise SchemaError("Workspace", "workspaceId", hint="empty workspace ID") + folder = raw.get("folder") + if folder is not None and not isinstance(folder, str): + raise SchemaError( + "Workspace", + "folder", + hint=f"expected str or None, got {type(folder).__name__}", + ) + return cls(workspace_id=workspace_id, folder=folder, raw=raw) diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..e7f2c99 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,199 @@ +"""Regression tests for issue #24 — typed models + schema validation. + +Three fixture-based tests, per the issue acceptance criteria: + 1. known-good Composer schema parses cleanly + 2. missing-field Composer schema raises SchemaError + 3. CliSessionMeta + ``_extract_blob_refs`` together exercise the binary + blob path (``0x0a 0x20`` marker) + +Run: + python -m unittest tests.test_models -v +""" + +from __future__ import annotations + +import os +import sys +import unittest +from pathlib import Path + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if REPO_ROOT not in sys.path: + sys.path.insert(0, REPO_ROOT) + +from models import ( + Bubble, + CliSessionMeta, + Composer, + ExportEntry, + SchemaError, + Workspace, +) +from utils.cli_chat_reader import _extract_blob_refs + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +GOOD_COMPOSER_RAW: dict = { + "name": "Refactor api/workspaces.py", + "createdAt": 1_715_000_000_000, + "lastUpdatedAt": 1_715_000_500_000, + "fullConversationHeadersOnly": [ + {"bubbleId": "abc123", "role": "user"}, + {"bubbleId": "def456", "role": "assistant"}, + ], + "modelConfig": {"modelName": "claude-opus-4-7"}, +} + + +def _make_blob_chain(*ref_hashes: str) -> bytes: + """Build a binary chain blob: tag 0x0a + length 0x20 + 32-byte refs.""" + out = bytearray() + for h in ref_hashes: + if len(h) != 64: + raise ValueError(f"hash must be 64 hex chars, got {len(h)}") + out.append(0x0A) + out.append(0x20) + out.extend(bytes.fromhex(h)) + return bytes(out) + + +# --------------------------------------------------------------------------- +# 1. Known-good schema +# --------------------------------------------------------------------------- + + +class ComposerKnownGoodSchema(unittest.TestCase): + def test_parses_required_and_optional_fields(self) -> None: + composer = Composer.from_dict(GOOD_COMPOSER_RAW, composer_id="cid-001") + + self.assertEqual(composer.composer_id, "cid-001") + self.assertEqual(composer.name, "Refactor api/workspaces.py") + self.assertEqual(composer.created_at, 1_715_000_000_000) + self.assertEqual(composer.last_updated_at, 1_715_000_500_000) + self.assertEqual(len(composer.full_conversation_headers_only), 2) + self.assertEqual(composer.model_config.get("modelName"), "claude-opus-4-7") + self.assertIs(composer.raw, GOOD_COMPOSER_RAW) + + def test_workspace_parses_with_optional_folder(self) -> None: + ws = Workspace.from_dict({"folder": "/home/zilin/projects/x"}, workspace_id="ws-1") + self.assertEqual(ws.workspace_id, "ws-1") + self.assertEqual(ws.folder, "/home/zilin/projects/x") + + ws_no_folder = Workspace.from_dict({}, workspace_id="cli-only") + self.assertEqual(ws_no_folder.folder, None) + + def test_export_entry_parses(self) -> None: + entry = ExportEntry.from_dict({ + "log_id": "L1", + "title": "Refactor", + "workspace": "ws-1", + "created_at": 1_715_000_000_000, + }) + self.assertEqual(entry.log_id, "L1") + self.assertEqual(entry.title, "Refactor") + self.assertEqual(entry.workspace, "ws-1") + + +# --------------------------------------------------------------------------- +# 2. Missing-field schema → SchemaError +# --------------------------------------------------------------------------- + + +class ComposerMissingFieldSchema(unittest.TestCase): + def test_missing_full_conversation_headers_only_raises(self) -> None: + bad = {k: v for k, v in GOOD_COMPOSER_RAW.items() if k != "fullConversationHeadersOnly"} + with self.assertRaises(SchemaError) as cm: + Composer.from_dict(bad, composer_id="cid-001") + self.assertEqual(cm.exception.model, "Composer") + self.assertEqual(cm.exception.field, "fullConversationHeadersOnly") + + def test_composer_missing_created_at_is_tolerated(self) -> None: + # Real Cursor data legitimately omits createdAt for older composers; + # call sites already fall back to lastUpdatedAt and then to epoch zero. + bad = {k: v for k, v in GOOD_COMPOSER_RAW.items() if k != "createdAt"} + composer = Composer.from_dict(bad, composer_id="cid-001") + self.assertIsNone(composer.created_at) + self.assertEqual(composer.last_updated_at, 1_715_000_500_000) + + def test_empty_composer_id_raises(self) -> None: + with self.assertRaises(SchemaError) as cm: + Composer.from_dict(GOOD_COMPOSER_RAW, composer_id="") + self.assertEqual(cm.exception.field, "composerId") + + def test_headers_wrong_type_raises(self) -> None: + bad = dict(GOOD_COMPOSER_RAW, fullConversationHeadersOnly={"not": "a list"}) + with self.assertRaises(SchemaError) as cm: + Composer.from_dict(bad, composer_id="cid-001") + self.assertIn("expected list", str(cm.exception)) + + def test_bubble_empty_id_raises(self) -> None: + with self.assertRaises(SchemaError): + Bubble.from_dict({"text": "hi"}, bubble_id="") + + def test_export_entry_missing_required_raises(self) -> None: + with self.assertRaises(SchemaError) as cm: + ExportEntry.from_dict({"title": "x", "workspace": "w"}) + self.assertEqual(cm.exception.field, "log_id") + + def test_schema_error_inherits_value_error(self) -> None: + # call sites that catch ValueError still trap SchemaError (back-compat) + try: + Composer.from_dict({}, composer_id="cid-001") + except ValueError: + return + self.fail("SchemaError did not propagate as ValueError") + + +# --------------------------------------------------------------------------- +# 3. _extract_blob_refs binary blob path (0x0a 0x20 marker) +# --------------------------------------------------------------------------- + + +class CliSessionMetaAndBlobChain(unittest.TestCase): + def test_meta_missing_latest_root_blob_id_raises(self) -> None: + with self.assertRaises(SchemaError) as cm: + CliSessionMeta.from_dict({"agentId": "a", "name": "n"}) + self.assertEqual(cm.exception.model, "CliSessionMeta") + self.assertEqual(cm.exception.field, "latestRootBlobId") + + def test_meta_wrong_type_raises(self) -> None: + with self.assertRaises(SchemaError): + CliSessionMeta.from_dict({"latestRootBlobId": 12345}) + + def test_meta_parses_then_blob_chain_extracts_refs(self) -> None: + # Realistic flow: the meta blob points at a root chain blob, whose + # 0x0a 0x20-prefixed runs are SHA-256 references to JSON message blobs. + ref1 = "a" * 64 + ref2 = "b" * 64 + ref3 = "c" * 64 + + meta = CliSessionMeta.from_dict({ + "agentId": "agent-1", + "name": "session", + "latestRootBlobId": ref1, + "createdAt": 1_715_000_000_000, + }) + self.assertEqual(meta.latest_root_blob_id, ref1) + + chain_blob = _make_blob_chain(ref1, ref2, ref3) + refs = _extract_blob_refs(chain_blob) + self.assertEqual(refs, [ref1, ref2, ref3]) + + def test_blob_chain_skips_non_marker_bytes(self) -> None: + # Garbage prefix + valid run + garbage suffix — only the run extracts. + ref = "f" * 64 + garbage_before = b"\x01\x02\x03" + garbage_after = b"\xff\xfe" + raw = garbage_before + bytes([0x0A, 0x20]) + bytes.fromhex(ref) + garbage_after + + self.assertEqual(_extract_blob_refs(raw), [ref]) + + def test_blob_chain_empty_returns_empty_list(self) -> None: + self.assertEqual(_extract_blob_refs(b""), []) + + +if __name__ == "__main__": + unittest.main() diff --git a/utils/cli_chat_reader.py b/utils/cli_chat_reader.py index 8e8c020..4deadd0 100644 --- a/utils/cli_chat_reader.py +++ b/utils/cli_chat_reader.py @@ -33,6 +33,7 @@ from __future__ import annotations import json +from models import CliSessionMeta, SchemaError import os import re import sqlite3 @@ -94,10 +95,14 @@ def traverse_blobs(db_path: str) -> list[dict]: meta_row = conn.execute("SELECT value FROM meta WHERE key = '0'").fetchone() if not meta_row or not meta_row[0]: return [] - meta = json.loads(bytes.fromhex(meta_row[0]).decode("utf-8")) - root_id: str = meta.get("latestRootBlobId", "") - if not root_id: + try: + meta = CliSessionMeta.from_dict( + json.loads(bytes.fromhex(meta_row[0]).decode("utf-8")) + ) + except SchemaError as e: + print(f"Schema drift in CLI session meta at {db_path}: {e}") return [] + root_id: str = meta.latest_root_blob_id # Load all blobs, classifying each as JSON or binary json_blobs: dict[str, dict] = {} From e853dfd0390aea533efba53b6edc4e9875781b9d Mon Sep 17 00:00:00 2001 From: timon0305 Date: Mon, 11 May 2026 17:39:08 +0200 Subject: [PATCH 02/10] =?UTF-8?q?review:=20address=20CodeRabbit=20feedback?= =?UTF-8?q?=20on=20PR=20#30=20=E2=80=94=20tighten=20schema=20gates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six fixes from the CodeRabbit pass on the initial typed-models commit, plus matching regression tests so each can't silently regress later. 1. api/composers.py: validate the top-level JSON payload is a dict before key/list checks. A non-object decoded value previously bypassed the SchemaError path and was swallowed by the surrounding except. 2. models/{cli_session,conversation,workspace,export}.py: same isinstance guard at the top of every from_dict, so a non-dict raw (list, str, None) surfaces as SchemaError instead of AttributeError. 3. models/conversation.py: drop the `raw.get(...) or []` fallback on fullConversationHeadersOnly. The `or []` idiom coerced None/"" / 0 into [] and made the isinstance gate unreachable. 4. models/export.py: replace str(raw[...]) coercion of required string fields with an isinstance(value, str) gate. Coercion masked drift — an int log_id would silently turn into a string. 5. models/conversation.py: re-gate createdAt as required per issue #24. A live workspaceStorage scan confirmed 17/17 composers carry it, so a missing value is real schema drift, not benign older-record absence. Two existing test fixtures in test_search_exclusion_filtering that lacked createdAt updated; the prior tolerated test replaced with a spec-correct missing-raises test. 6. utils/cli_chat_reader.py: broaden the traverse_blobs catch to (SchemaError, ValueError, UnicodeDecodeError, TypeError). Malformed hex, non-UTF-8 bytes, and invalid JSON in the meta blob previously escaped to the caller; now all four routes return [] uniformly. Regression tests added: - tests/test_models.py: non_dict_payload (5 models × 5 bad types), headers_falsy_non_list, headers_empty_list_is_valid, export_entry_non_string_required, missing_created_at_raises - tests/test_cli_chat_reader.py: malformed_hex, non_utf8, invalid_json, non_dict meta payloads each return [] via traverse_blobs 201 tests pass (was 197); mypy clean on models/; real Cursor data (17 composers) still loads with zero Schema drift warnings. --- api/composers.py | 6 +++ models/cli_session.py | 6 +++ models/conversation.py | 27 +++++++++--- models/export.py | 21 ++++++--- models/workspace.py | 6 +++ tests/test_cli_chat_reader.py | 37 ++++++++++++++++ tests/test_models.py | 56 +++++++++++++++++++++--- tests/test_search_exclusion_filtering.py | 2 + utils/cli_chat_reader.py | 7 ++- 9 files changed, 149 insertions(+), 19 deletions(-) diff --git a/api/composers.py b/api/composers.py index 41f2509..264d861 100644 --- a/api/composers.py +++ b/api/composers.py @@ -55,6 +55,12 @@ def list_composers(): if row and row[0]: data = json.loads(row[0]) + if not isinstance(data, dict): + raise SchemaError( + "WorkspaceComposers", + "composer.composerData", + hint=f"expected object, got {type(data).__name__}", + ) if "allComposers" not in data: raise SchemaError("WorkspaceComposers", "allComposers") all_composers = data.get("allComposers") diff --git a/models/cli_session.py b/models/cli_session.py index 78a5675..7d6cbf2 100644 --- a/models/cli_session.py +++ b/models/cli_session.py @@ -27,6 +27,12 @@ class CliSessionMeta: @classmethod def from_dict(cls, raw: dict[str, Any]) -> "CliSessionMeta": + if not isinstance(raw, dict): + raise SchemaError( + "CliSessionMeta", + "meta", + hint=f"expected object, got {type(raw).__name__}", + ) latest = raw.get("latestRootBlobId") if not latest: raise SchemaError("CliSessionMeta", "latestRootBlobId") diff --git a/models/conversation.py b/models/conversation.py index 8db953e..a9d2a07 100644 --- a/models/conversation.py +++ b/models/conversation.py @@ -12,13 +12,12 @@ class Composer: """A Cursor conversation (a.k.a. "composer") row. - Required fields per the schema-validation contract: + Required fields per the schema-validation contract (issue #24): - ``fullConversationHeadersOnly`` — without this, a composer cannot be - rendered (no message order is recoverable). This is the only hard - requirement: real Cursor data legitimately omits ``createdAt`` for - older composers (the existing call sites already fall back to - ``lastUpdatedAt`` and then to epoch zero), so it is captured but - not gated on. + rendered (no message order is recoverable). + - ``createdAt`` — Cursor writes this on every composer (verified + 17/17 against a live workspaceStorage). A missing value is the + kind of drift this layer exists to surface. The composer ID is intentionally passed in as a constructor argument rather than read from ``raw`` because Cursor stores it in the row key @@ -35,12 +34,20 @@ class Composer: @classmethod def from_dict(cls, raw: dict[str, Any], *, composer_id: str) -> "Composer": + if not isinstance(raw, dict): + raise SchemaError( + "Composer", + "composerData", + hint=f"expected object, got {type(raw).__name__}", + ) if not composer_id: raise SchemaError("Composer", "composerId", hint="empty composer ID") if "fullConversationHeadersOnly" not in raw: raise SchemaError("Composer", "fullConversationHeadersOnly") + if "createdAt" not in raw: + raise SchemaError("Composer", "createdAt") - headers = raw.get("fullConversationHeadersOnly") or [] + headers = raw.get("fullConversationHeadersOnly") if not isinstance(headers, list): raise SchemaError( "Composer", @@ -78,6 +85,12 @@ class Bubble: @classmethod def from_dict(cls, raw: dict[str, Any], *, bubble_id: str) -> "Bubble": + if not isinstance(raw, dict): + raise SchemaError( + "Bubble", + "bubble", + hint=f"expected object, got {type(raw).__name__}", + ) if not bubble_id: raise SchemaError("Bubble", "bubbleId", hint="empty bubble ID") return cls(bubble_id=bubble_id, raw=raw) diff --git a/models/export.py b/models/export.py index 380af61..98d75ce 100644 --- a/models/export.py +++ b/models/export.py @@ -27,13 +27,24 @@ class ExportEntry: @classmethod def from_dict(cls, raw: dict[str, Any]) -> "ExportEntry": + if not isinstance(raw, dict): + raise SchemaError( + "ExportEntry", + "entry", + hint=f"expected object, got {type(raw).__name__}", + ) for required in ("log_id", "title", "workspace"): - if required not in raw or raw[required] in (None, ""): - raise SchemaError("ExportEntry", required) + value = raw.get(required) + if not isinstance(value, str) or value == "": + raise SchemaError( + "ExportEntry", + required, + hint=f"expected non-empty str, got {type(value).__name__}", + ) return cls( - log_id=str(raw["log_id"]), - title=str(raw["title"]), - workspace=str(raw["workspace"]), + log_id=raw["log_id"], + title=raw["title"], + workspace=raw["workspace"], created_at=raw.get("created_at"), updated_at=raw.get("updated_at"), raw=raw, diff --git a/models/workspace.py b/models/workspace.py index 3cf8092..bbbafb1 100644 --- a/models/workspace.py +++ b/models/workspace.py @@ -25,6 +25,12 @@ class Workspace: @classmethod def from_dict(cls, raw: dict[str, Any], *, workspace_id: str) -> "Workspace": + if not isinstance(raw, dict): + raise SchemaError( + "Workspace", + "workspace.json", + hint=f"expected object, got {type(raw).__name__}", + ) if not workspace_id: raise SchemaError("Workspace", "workspaceId", hint="empty workspace ID") folder = raw.get("folder") diff --git a/tests/test_cli_chat_reader.py b/tests/test_cli_chat_reader.py index 0cf2216..9965c1a 100644 --- a/tests/test_cli_chat_reader.py +++ b/tests/test_cli_chat_reader.py @@ -392,6 +392,43 @@ def test_no_cycle_in_traversal(self): result = traverse_blobs(path) self.assertEqual(result, []) + def _write_raw_meta(self, name: str, raw_meta_value: str) -> str: + """Build a minimal store.db with a hand-rolled meta row 0 value.""" + path = self._db(name) + conn = sqlite3.connect(path) + conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT)") + conn.execute("CREATE TABLE blobs (id TEXT PRIMARY KEY, data BLOB)") + conn.execute("INSERT INTO meta VALUES ('0', ?)", (raw_meta_value,)) + conn.commit() + conn.close() + return path + + def test_malformed_hex_meta_returns_empty(self): + """Regression: bad hex must not escape as ValueError.""" + # 'zz' is not valid hex — bytes.fromhex() raises ValueError. + path = self._write_raw_meta("bad_hex.db", "zzzz") + self.assertEqual(traverse_blobs(path), []) + + def test_non_utf8_meta_returns_empty(self): + """Regression: non-UTF-8 bytes must not escape as UnicodeDecodeError.""" + # 0xff 0xfe is invalid as UTF-8 — .decode('utf-8') raises. + path = self._write_raw_meta("bad_utf8.db", "fffe") + self.assertEqual(traverse_blobs(path), []) + + def test_invalid_json_meta_returns_empty(self): + """Regression: malformed JSON must not escape as JSONDecodeError.""" + # Hex-encoded UTF-8 garbage that isn't a JSON object: "not json" + garbage = "not json".encode("utf-8").hex() + path = self._write_raw_meta("bad_json.db", garbage) + self.assertEqual(traverse_blobs(path), []) + + def test_non_dict_meta_returns_empty(self): + """Regression: top-level non-object must surface as SchemaError + empty.""" + # Hex-encoded JSON list — passes hex/utf8/json, fails dict guard. + list_payload = json.dumps(["not", "a", "dict"]).encode("utf-8").hex() + path = self._write_raw_meta("non_dict.db", list_payload) + self.assertEqual(traverse_blobs(path), []) + # --------------------------------------------------------------------------- # iter_sessions and list_cli_projects diff --git a/tests/test_models.py b/tests/test_models.py index e7f2c99..aa14b68 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -110,13 +110,14 @@ def test_missing_full_conversation_headers_only_raises(self) -> None: self.assertEqual(cm.exception.model, "Composer") self.assertEqual(cm.exception.field, "fullConversationHeadersOnly") - def test_composer_missing_created_at_is_tolerated(self) -> None: - # Real Cursor data legitimately omits createdAt for older composers; - # call sites already fall back to lastUpdatedAt and then to epoch zero. + def test_missing_created_at_raises(self) -> None: + # Issue #24 lists createdAt as a required field. A live workspaceStorage + # check confirmed 17/17 composers carry it, so a missing value is real + # schema drift (not benign older-record absence). bad = {k: v for k, v in GOOD_COMPOSER_RAW.items() if k != "createdAt"} - composer = Composer.from_dict(bad, composer_id="cid-001") - self.assertIsNone(composer.created_at) - self.assertEqual(composer.last_updated_at, 1_715_000_500_000) + with self.assertRaises(SchemaError) as cm: + Composer.from_dict(bad, composer_id="cid-001") + self.assertEqual(cm.exception.field, "createdAt") def test_empty_composer_id_raises(self) -> None: with self.assertRaises(SchemaError) as cm: @@ -129,6 +130,21 @@ def test_headers_wrong_type_raises(self) -> None: Composer.from_dict(bad, composer_id="cid-001") self.assertIn("expected list", str(cm.exception)) + def test_headers_falsy_non_list_raises(self) -> None: + # Regression: CodeRabbit caught that ``raw.get(...) or []`` silently + # coerced None / "" / 0 / False to an empty list, bypassing the + # isinstance gate. Each falsy non-list value must now raise. + for bad_value in (None, "", 0, False): + bad = dict(GOOD_COMPOSER_RAW, fullConversationHeadersOnly=bad_value) + with self.assertRaises(SchemaError, msg=f"failed for {bad_value!r}"): + Composer.from_dict(bad, composer_id="cid-001") + + def test_headers_empty_list_is_valid(self) -> None: + # Empty list must still pass — a composer with no messages is legal. + ok = dict(GOOD_COMPOSER_RAW, fullConversationHeadersOnly=[]) + composer = Composer.from_dict(ok, composer_id="cid-001") + self.assertEqual(composer.full_conversation_headers_only, []) + def test_bubble_empty_id_raises(self) -> None: with self.assertRaises(SchemaError): Bubble.from_dict({"text": "hi"}, bubble_id="") @@ -138,6 +154,16 @@ def test_export_entry_missing_required_raises(self) -> None: ExportEntry.from_dict({"title": "x", "workspace": "w"}) self.assertEqual(cm.exception.field, "log_id") + def test_export_entry_non_string_required_raises(self) -> None: + # Regression: CodeRabbit caught that ``str(raw["log_id"])`` silently + # coerced ints, UUIDs, lists, etc. into strings, masking schema drift. + # Each required field must now be an actual non-empty string. + bad_values: tuple[object, ...] = (123, None, "", [], {"x": 1}, True) + for bad_value in bad_values: + bad: dict[str, object] = {"log_id": bad_value, "title": "x", "workspace": "w"} + with self.assertRaises(SchemaError, msg=f"failed for {bad_value!r}"): + ExportEntry.from_dict(bad) + def test_schema_error_inherits_value_error(self) -> None: # call sites that catch ValueError still trap SchemaError (back-compat) try: @@ -146,6 +172,24 @@ def test_schema_error_inherits_value_error(self) -> None: return self.fail("SchemaError did not propagate as ValueError") + def test_non_dict_payload_raises_schema_error(self) -> None: + # Regression: CodeRabbit caught that a malformed top-level payload + # (list, string, None) would previously trip AttributeError on + # ``raw.get(...)`` and get swallowed by surrounding ``except Exception``. + # Each ``from_dict`` now surfaces it as SchemaError so drift stays loud. + bad_payloads: tuple[object, ...] = ([], "not a dict", 42, None, ("a", "b")) + for bad in bad_payloads: + with self.assertRaises(SchemaError, msg=f"failed for {type(bad).__name__}"): + Composer.from_dict(bad, composer_id="cid-001") # type: ignore[arg-type] + with self.assertRaises(SchemaError): + CliSessionMeta.from_dict(bad) # type: ignore[arg-type] + with self.assertRaises(SchemaError): + Workspace.from_dict(bad, workspace_id="ws-1") # type: ignore[arg-type] + with self.assertRaises(SchemaError): + ExportEntry.from_dict(bad) # type: ignore[arg-type] + with self.assertRaises(SchemaError): + Bubble.from_dict(bad, bubble_id="b-1") # type: ignore[arg-type] + # --------------------------------------------------------------------------- # 3. _extract_blob_refs binary blob path (0x0a 0x20 marker) diff --git a/tests/test_search_exclusion_filtering.py b/tests/test_search_exclusion_filtering.py index d09efb1..a3e0209 100644 --- a/tests/test_search_exclusion_filtering.py +++ b/tests/test_search_exclusion_filtering.py @@ -170,6 +170,7 @@ def _build_global_db(self): {"bubbleId": "b-kwd-1"}, {"bubbleId": "b-kwd-2"}, ], + "createdAt": 1739260000000, "lastUpdatedAt": 1739270000000, } ), @@ -186,6 +187,7 @@ def _build_global_db(self): "fullConversationHeadersOnly": [ {"bubbleId": "b-roadmap-1"}, ], + "createdAt": 1739261000000, "lastUpdatedAt": 1739271000000, } ), diff --git a/utils/cli_chat_reader.py b/utils/cli_chat_reader.py index 4deadd0..5927c76 100644 --- a/utils/cli_chat_reader.py +++ b/utils/cli_chat_reader.py @@ -99,7 +99,12 @@ def traverse_blobs(db_path: str) -> list[dict]: meta = CliSessionMeta.from_dict( json.loads(bytes.fromhex(meta_row[0]).decode("utf-8")) ) - except SchemaError as e: + except (SchemaError, ValueError, UnicodeDecodeError, TypeError) as e: + # SchemaError covers field-level drift; ValueError covers bad hex + # and bad JSON (JSONDecodeError is a ValueError subclass); + # UnicodeDecodeError covers non-UTF-8 bytes; TypeError covers + # the meta_row[0] not being a string. All routes back to "no + # conversation" instead of escaping to the caller. print(f"Schema drift in CLI session meta at {db_path}: {e}") return [] root_id: str = meta.latest_root_blob_id From 7774f41550a36619dd0d57a83b0e4101202badcf Mon Sep 17 00:00:00 2001 From: timon0305 Date: Mon, 11 May 2026 17:47:58 +0200 Subject: [PATCH 03/10] review: route per-row workspace-local composer drift through SchemaError (#30) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit flagged that the inline ``isinstance(c, dict) or not c.get(...)`` skip in api/composers.py masked row-level drift instead of surfacing it the way the rest of the boundary does. The exact suggested fix (Composer.from_dict(c)) would have rejected every workspace-local record: those carry composerId + lastUpdatedAt but not fullConversationHeadersOnly or createdAt, which only exist on the global cursorDiskKV rows. Add WorkspaceLocalComposer to models/conversation.py — a slim sibling model that validates the workspace-local shape (composerId required as a non-empty string; non-dict raw raises). api/composers.py:list_composers now wraps each entry in WorkspaceLocalComposer.from_dict, so per-row drift logs a Schema drift warning and skips, matching every other read-site in this PR. Regression coverage in tests/test_models.py: parses good shape; raises SchemaError for missing/empty/non-string composerId and for non-dict payloads. 204 tests pass (was 201; +3); mypy clean on models/; real Cursor smoke test shows zero drift warnings. --- api/composers.py | 7 +++++-- models/__init__.py | 3 ++- models/conversation.py | 40 ++++++++++++++++++++++++++++++++++++++++ tests/test_models.py | 31 +++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 3 deletions(-) diff --git a/api/composers.py b/api/composers.py index 264d861..c5c71ef 100644 --- a/api/composers.py +++ b/api/composers.py @@ -13,7 +13,7 @@ from utils.workspace_path import resolve_workspace_path from utils.path_helpers import to_epoch_ms -from models import SchemaError +from models import SchemaError, WorkspaceLocalComposer bp = Blueprint("composers", __name__) @@ -71,7 +71,10 @@ def list_composers(): hint=f"expected list, got {type(all_composers).__name__}", ) for c in all_composers: - if not isinstance(c, dict) or not c.get("composerId"): + try: + WorkspaceLocalComposer.from_dict(c) + except SchemaError as e: + print(f"Schema drift in {db_path}: {e}") continue c["conversation"] = c.get("conversation") or [] c["workspaceId"] = name diff --git a/models/__init__.py b/models/__init__.py index 5d7c885..93c8d38 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -9,7 +9,7 @@ """ from models.cli_session import CliSessionMeta -from models.conversation import Bubble, Composer +from models.conversation import Bubble, Composer, WorkspaceLocalComposer from models.errors import SchemaError from models.export import ExportEntry from models.workspace import Workspace @@ -21,4 +21,5 @@ "ExportEntry", "SchemaError", "Workspace", + "WorkspaceLocalComposer", ] diff --git a/models/conversation.py b/models/conversation.py index a9d2a07..40cff14 100644 --- a/models/conversation.py +++ b/models/conversation.py @@ -70,6 +70,46 @@ def from_dict(cls, raw: dict[str, Any], *, composer_id: str) -> "Composer": ) +@dataclass(frozen=True) +class WorkspaceLocalComposer: + """A composer entry from ``composer.composerData`` ItemTable rows. + + These are summary records that live in each per-workspace ``state.vscdb``. + They share ``composerId`` and ``lastUpdatedAt`` with the global composer + schema, but they do **not** carry ``fullConversationHeadersOnly`` or + ``createdAt`` — those only exist on the global ``cursorDiskKV`` rows that + ``Composer.from_dict`` validates. Treating both shapes through the same + model would reject every workspace-local entry, so this slim model + exists to keep schema-drift detection at the boundary without conflating + the two storage paths. + """ + + composer_id: str + last_updated_at: Any = None + raw: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, raw: dict[str, Any]) -> "WorkspaceLocalComposer": + if not isinstance(raw, dict): + raise SchemaError( + "WorkspaceLocalComposer", + "composer", + hint=f"expected object, got {type(raw).__name__}", + ) + composer_id = raw.get("composerId") + if not isinstance(composer_id, str) or not composer_id: + raise SchemaError( + "WorkspaceLocalComposer", + "composerId", + hint=f"expected non-empty str, got {type(composer_id).__name__}", + ) + return cls( + composer_id=composer_id, + last_updated_at=raw.get("lastUpdatedAt"), + raw=raw, + ) + + @dataclass(frozen=True) class Bubble: """A single message bubble within a composer. diff --git a/tests/test_models.py b/tests/test_models.py index aa14b68..6ab973c 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -28,6 +28,7 @@ ExportEntry, SchemaError, Workspace, + WorkspaceLocalComposer, ) from utils.cli_chat_reader import _extract_blob_refs @@ -85,6 +86,17 @@ def test_workspace_parses_with_optional_folder(self) -> None: ws_no_folder = Workspace.from_dict({}, workspace_id="cli-only") self.assertEqual(ws_no_folder.folder, None) + def test_workspace_local_composer_parses(self) -> None: + # Workspace-local composers (composer.composerData ItemTable rows) + # carry composerId + lastUpdatedAt but not the global-storage fields. + c = WorkspaceLocalComposer.from_dict({ + "composerId": "cid-local-1", + "lastUpdatedAt": 1_715_000_500_000, + "conversation": [], + }) + self.assertEqual(c.composer_id, "cid-local-1") + self.assertEqual(c.last_updated_at, 1_715_000_500_000) + def test_export_entry_parses(self) -> None: entry = ExportEntry.from_dict({ "log_id": "L1", @@ -149,6 +161,25 @@ def test_bubble_empty_id_raises(self) -> None: with self.assertRaises(SchemaError): Bubble.from_dict({"text": "hi"}, bubble_id="") + def test_workspace_local_composer_missing_id_raises(self) -> None: + # Regression for CodeRabbit's per-row drift comment on api/composers.py: + # previously a row missing composerId was silently skipped. Now drift + # raises SchemaError so api/composers.py can log + skip explicitly. + for bad in ( + {"lastUpdatedAt": 0}, # composerId absent + {"composerId": ""}, # empty string + {"composerId": None}, # None + {"composerId": 123}, # wrong type + ): + with self.assertRaises(SchemaError, msg=f"failed for {bad!r}") as cm: + WorkspaceLocalComposer.from_dict(bad) + self.assertEqual(cm.exception.field, "composerId") + + def test_workspace_local_composer_non_dict_raises(self) -> None: + for bad in (None, [], "str", 42): + with self.assertRaises(SchemaError): + WorkspaceLocalComposer.from_dict(bad) # type: ignore[arg-type] + def test_export_entry_missing_required_raises(self) -> None: with self.assertRaises(SchemaError) as cm: ExportEntry.from_dict({"title": "x", "workspace": "w"}) From d983b9ba6f42e3d82875719407351e574b4bd54e Mon Sep 17 00:00:00 2001 From: timon0305 Date: Mon, 11 May 2026 18:05:06 +0200 Subject: [PATCH 04/10] review: log non-schema read failures in list_composers (#30) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit flagged that the bare ``except Exception: pass`` adjacent to the new SchemaError path swallowed decode / SQLite / non-schema errors silently and returned a partial composer list with no signal. Replace with a print that surfaces the exception and ``db_path``, matching the ``Schema drift in ...`` line a few lines up. No behaviour change beyond observability — the loop still continues to the next workspace. --- api/composers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/composers.py b/api/composers.py index c5c71ef..14988b9 100644 --- a/api/composers.py +++ b/api/composers.py @@ -82,8 +82,8 @@ def list_composers(): composers.append(c) except SchemaError as e: print(f"Schema drift in {db_path}: {e}") - except Exception: - pass + except Exception as e: + print(f"Failed reading composers from {db_path}: {e}") composers.sort(key=lambda c: to_epoch_ms(c.get("lastUpdatedAt")), reverse=True) return jsonify(composers) From 545886d6e1c5711c404aeabae3450a7156c5a2a3 Mon Sep 17 00:00:00 2001 From: timon0305 Date: Tue, 12 May 2026 14:55:51 +0200 Subject: [PATCH 05/10] docs: drop module docstrings and trim class docstrings to one-liners (#30) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Internal cleanup on the typed-models package — file location and class name carry the intent; the multi-paragraph rationales were rehashing the PR description. Function bodies are unchanged. - models/__init__.py: removed 8-line module docstring - models/errors.py: dropped module docstring + SchemaError class docstring trimmed to one line - models/conversation.py: dropped module docstring; Composer, WorkspaceLocalComposer, Bubble docstrings trimmed to one line each - models/cli_session.py: dropped module docstring; CliSessionMeta docstring trimmed - models/workspace.py: dropped module docstring; Workspace docstring trimmed - models/export.py: dropped module docstring; ExportEntry docstring trimmed - tests/test_models.py: dropped module docstring; removed now-unused ``from pathlib import Path`` 204 tests pass. --- models/__init__.py | 10 ---------- models/cli_session.py | 13 +------------ models/conversation.py | 36 +++--------------------------------- models/errors.py | 10 +--------- models/export.py | 10 +--------- models/workspace.py | 11 +---------- tests/test_models.py | 13 ------------- 7 files changed, 7 insertions(+), 96 deletions(-) diff --git a/models/__init__.py b/models/__init__.py index 93c8d38..df249ce 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -1,13 +1,3 @@ -"""Typed domain models for Cursor schema (closes #24). - -Cursor's on-disk JSON shapes are not versioned, so silent renames of fields -like ``composerData`` or ``latestRootBlobId`` would otherwise pass through -``dict.get(...)`` with a fallback default and produce empty conversations -with no error raised. The models here add a schema-validation boundary at -database read sites: ``from_dict`` classmethods raise ``SchemaError`` when -critical fields are missing, so drift becomes loud instead of silent. -""" - from models.cli_session import CliSessionMeta from models.conversation import Bubble, Composer, WorkspaceLocalComposer from models.errors import SchemaError diff --git a/models/cli_session.py b/models/cli_session.py index 7d6cbf2..473531e 100644 --- a/models/cli_session.py +++ b/models/cli_session.py @@ -1,5 +1,3 @@ -"""CliSessionMeta — typed model for the Cursor CLI ``meta`` blob.""" - from __future__ import annotations from dataclasses import dataclass, field @@ -10,16 +8,7 @@ @dataclass(frozen=True) class CliSessionMeta: - """The ``meta`` blob at the head of a Cursor CLI ``store.db`` blob graph. - - ``latestRootBlobId`` is the entry point for the conversation reconstruction - BFS in ``utils/cli_chat_reader.traverse_blobs``; without it, the entire - conversation is unreachable. ``createdAt`` is documented as part of the - meta-blob schema (see ``utils/cli_chat_reader`` module docstring) and is - captured here, but it is not gated on — only ``latestRootBlobId`` is the - hard requirement, since that is the only field whose absence prevents - conversation reconstruction. - """ + """CLI session meta blob; latestRootBlobId is the conversation entry point and the only required field.""" latest_root_blob_id: str created_at: Any = None diff --git a/models/conversation.py b/models/conversation.py index 40cff14..9a17767 100644 --- a/models/conversation.py +++ b/models/conversation.py @@ -1,5 +1,3 @@ -"""Composer (conversation) and Bubble (message) typed models.""" - from __future__ import annotations from dataclasses import dataclass, field @@ -10,19 +8,7 @@ @dataclass(frozen=True) class Composer: - """A Cursor conversation (a.k.a. "composer") row. - - Required fields per the schema-validation contract (issue #24): - - ``fullConversationHeadersOnly`` — without this, a composer cannot be - rendered (no message order is recoverable). - - ``createdAt`` — Cursor writes this on every composer (verified - 17/17 against a live workspaceStorage). A missing value is the - kind of drift this layer exists to surface. - - The composer ID is intentionally passed in as a constructor argument - rather than read from ``raw`` because Cursor stores it in the row key - (``composerData:``) rather than in the JSON value. - """ + """Cursor conversation row from globalStorage cursorDiskKV; requires fullConversationHeadersOnly + createdAt.""" composer_id: str full_conversation_headers_only: list[dict[str, Any]] @@ -72,17 +58,7 @@ def from_dict(cls, raw: dict[str, Any], *, composer_id: str) -> "Composer": @dataclass(frozen=True) class WorkspaceLocalComposer: - """A composer entry from ``composer.composerData`` ItemTable rows. - - These are summary records that live in each per-workspace ``state.vscdb``. - They share ``composerId`` and ``lastUpdatedAt`` with the global composer - schema, but they do **not** carry ``fullConversationHeadersOnly`` or - ``createdAt`` — those only exist on the global ``cursorDiskKV`` rows that - ``Composer.from_dict`` validates. Treating both shapes through the same - model would reject every workspace-local entry, so this slim model - exists to keep schema-drift detection at the boundary without conflating - the two storage paths. - """ + """Summary composer row from per-workspace state.vscdb ItemTable; only composerId is required.""" composer_id: str last_updated_at: Any = None @@ -112,13 +88,7 @@ def from_dict(cls, raw: dict[str, Any]) -> "WorkspaceLocalComposer": @dataclass(frozen=True) class Bubble: - """A single message bubble within a composer. - - The bubble ID lives in the row key (``bubbleId::``) - rather than the JSON value, so it is passed in explicitly. The raw dict - is preserved to keep downstream rendering code (which still walks the - untyped shape) working without modification. - """ + """One message in a composer; bubble_id comes from the row key, not the JSON value.""" bubble_id: str raw: dict[str, Any] = field(default_factory=dict) diff --git a/models/errors.py b/models/errors.py index 01ad184..db12c94 100644 --- a/models/errors.py +++ b/models/errors.py @@ -1,16 +1,8 @@ -"""Exception types for the typed-model schema-validation layer.""" - from __future__ import annotations class SchemaError(ValueError): - """Raised when a required Cursor schema field is missing or malformed. - - Inherits from ``ValueError`` so call sites that already catch generic - deserialisation errors (e.g. ``json.JSONDecodeError`` is a subclass of - ``ValueError``) also catch schema drift without needing a separate - ``except`` clause. New code should catch ``SchemaError`` explicitly. - """ + """Raised when a required Cursor schema field is missing or malformed.""" def __init__(self, model: str, field: str, *, hint: str | None = None) -> None: self.model = model diff --git a/models/export.py b/models/export.py index 98d75ce..fc5d594 100644 --- a/models/export.py +++ b/models/export.py @@ -1,5 +1,3 @@ -"""ExportEntry — typed model for an export manifest record (JSONL line).""" - from __future__ import annotations from dataclasses import dataclass, field @@ -10,13 +8,7 @@ @dataclass(frozen=True) class ExportEntry: - """A single record in the export manifest (one line in ``manifest.jsonl``). - - Required fields are the YAML-frontmatter keys that downstream tooling - indexes against: a missing ``log_id`` makes the entry unaddressable, and - a missing ``title`` produces unreadable output. Timestamps are optional — - not every Cursor conversation has both a creation and update time. - """ + """One line of manifest.jsonl; log_id / title / workspace required, timestamps optional.""" log_id: str title: str diff --git a/models/workspace.py b/models/workspace.py index bbbafb1..1ad9c9c 100644 --- a/models/workspace.py +++ b/models/workspace.py @@ -1,5 +1,3 @@ -"""Workspace — typed model for a single Cursor workspace folder.""" - from __future__ import annotations from dataclasses import dataclass, field @@ -10,14 +8,7 @@ @dataclass(frozen=True) class Workspace: - """A Cursor workspace entry. - - The workspace ID is the directory name on disk (Cursor uses random - short hashes as workspace IDs) and is passed in explicitly. ``folder`` - is the absolute path of the project the workspace targets, read from - ``workspace.json``; it may legitimately be ``None`` for a CLI-only - workspace, so missing-folder is not a schema error. - """ + """A Cursor workspace folder; folder is None for CLI-only workspaces (not a schema error).""" workspace_id: str folder: str | None = None diff --git a/tests/test_models.py b/tests/test_models.py index 6ab973c..d0b08e2 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,21 +1,8 @@ -"""Regression tests for issue #24 — typed models + schema validation. - -Three fixture-based tests, per the issue acceptance criteria: - 1. known-good Composer schema parses cleanly - 2. missing-field Composer schema raises SchemaError - 3. CliSessionMeta + ``_extract_blob_refs`` together exercise the binary - blob path (``0x0a 0x20`` marker) - -Run: - python -m unittest tests.test_models -v -""" - from __future__ import annotations import os import sys import unittest -from pathlib import Path REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if REPO_ROOT not in sys.path: From 26b565ed3f58004c61b98a65347b3237bd32fa4e Mon Sep 17 00:00:00 2001 From: timon0305 Date: Tue, 12 May 2026 15:03:47 +0200 Subject: [PATCH 06/10] docs: drop task-reference comments and section banners (#30) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the review nit: stop narrating in code. Removes: - tests/test_models.py: 3 ``# --- section ---`` banners and 10 inline comments referencing issue #24 / CodeRabbit findings. - tests/test_cli_chat_reader.py: 4 ``"""Regression: ..."""`` docstrings and 4 line-comments restating what the test exercises. - utils/cli_chat_reader.py: 5-line narration on the broadened except arm — the tuple of exception types is the documentation. Test names and code already describe the WHAT; the WHY for these cases is in the PR description and the commit history, not the source. 204 tests still pass. --- tests/test_cli_chat_reader.py | 8 ------- tests/test_models.py | 42 ----------------------------------- utils/cli_chat_reader.py | 5 ----- 3 files changed, 55 deletions(-) diff --git a/tests/test_cli_chat_reader.py b/tests/test_cli_chat_reader.py index 9965c1a..afc182c 100644 --- a/tests/test_cli_chat_reader.py +++ b/tests/test_cli_chat_reader.py @@ -404,27 +404,19 @@ def _write_raw_meta(self, name: str, raw_meta_value: str) -> str: return path def test_malformed_hex_meta_returns_empty(self): - """Regression: bad hex must not escape as ValueError.""" - # 'zz' is not valid hex — bytes.fromhex() raises ValueError. path = self._write_raw_meta("bad_hex.db", "zzzz") self.assertEqual(traverse_blobs(path), []) def test_non_utf8_meta_returns_empty(self): - """Regression: non-UTF-8 bytes must not escape as UnicodeDecodeError.""" - # 0xff 0xfe is invalid as UTF-8 — .decode('utf-8') raises. path = self._write_raw_meta("bad_utf8.db", "fffe") self.assertEqual(traverse_blobs(path), []) def test_invalid_json_meta_returns_empty(self): - """Regression: malformed JSON must not escape as JSONDecodeError.""" - # Hex-encoded UTF-8 garbage that isn't a JSON object: "not json" garbage = "not json".encode("utf-8").hex() path = self._write_raw_meta("bad_json.db", garbage) self.assertEqual(traverse_blobs(path), []) def test_non_dict_meta_returns_empty(self): - """Regression: top-level non-object must surface as SchemaError + empty.""" - # Hex-encoded JSON list — passes hex/utf8/json, fails dict guard. list_payload = json.dumps(["not", "a", "dict"]).encode("utf-8").hex() path = self._write_raw_meta("non_dict.db", list_payload) self.assertEqual(traverse_blobs(path), []) diff --git a/tests/test_models.py b/tests/test_models.py index d0b08e2..7c486a8 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -20,10 +20,6 @@ from utils.cli_chat_reader import _extract_blob_refs -# --------------------------------------------------------------------------- -# Fixtures -# --------------------------------------------------------------------------- - GOOD_COMPOSER_RAW: dict = { "name": "Refactor api/workspaces.py", "createdAt": 1_715_000_000_000, @@ -48,11 +44,6 @@ def _make_blob_chain(*ref_hashes: str) -> bytes: return bytes(out) -# --------------------------------------------------------------------------- -# 1. Known-good schema -# --------------------------------------------------------------------------- - - class ComposerKnownGoodSchema(unittest.TestCase): def test_parses_required_and_optional_fields(self) -> None: composer = Composer.from_dict(GOOD_COMPOSER_RAW, composer_id="cid-001") @@ -74,8 +65,6 @@ def test_workspace_parses_with_optional_folder(self) -> None: self.assertEqual(ws_no_folder.folder, None) def test_workspace_local_composer_parses(self) -> None: - # Workspace-local composers (composer.composerData ItemTable rows) - # carry composerId + lastUpdatedAt but not the global-storage fields. c = WorkspaceLocalComposer.from_dict({ "composerId": "cid-local-1", "lastUpdatedAt": 1_715_000_500_000, @@ -96,11 +85,6 @@ def test_export_entry_parses(self) -> None: self.assertEqual(entry.workspace, "ws-1") -# --------------------------------------------------------------------------- -# 2. Missing-field schema → SchemaError -# --------------------------------------------------------------------------- - - class ComposerMissingFieldSchema(unittest.TestCase): def test_missing_full_conversation_headers_only_raises(self) -> None: bad = {k: v for k, v in GOOD_COMPOSER_RAW.items() if k != "fullConversationHeadersOnly"} @@ -110,9 +94,6 @@ def test_missing_full_conversation_headers_only_raises(self) -> None: self.assertEqual(cm.exception.field, "fullConversationHeadersOnly") def test_missing_created_at_raises(self) -> None: - # Issue #24 lists createdAt as a required field. A live workspaceStorage - # check confirmed 17/17 composers carry it, so a missing value is real - # schema drift (not benign older-record absence). bad = {k: v for k, v in GOOD_COMPOSER_RAW.items() if k != "createdAt"} with self.assertRaises(SchemaError) as cm: Composer.from_dict(bad, composer_id="cid-001") @@ -130,16 +111,12 @@ def test_headers_wrong_type_raises(self) -> None: self.assertIn("expected list", str(cm.exception)) def test_headers_falsy_non_list_raises(self) -> None: - # Regression: CodeRabbit caught that ``raw.get(...) or []`` silently - # coerced None / "" / 0 / False to an empty list, bypassing the - # isinstance gate. Each falsy non-list value must now raise. for bad_value in (None, "", 0, False): bad = dict(GOOD_COMPOSER_RAW, fullConversationHeadersOnly=bad_value) with self.assertRaises(SchemaError, msg=f"failed for {bad_value!r}"): Composer.from_dict(bad, composer_id="cid-001") def test_headers_empty_list_is_valid(self) -> None: - # Empty list must still pass — a composer with no messages is legal. ok = dict(GOOD_COMPOSER_RAW, fullConversationHeadersOnly=[]) composer = Composer.from_dict(ok, composer_id="cid-001") self.assertEqual(composer.full_conversation_headers_only, []) @@ -149,9 +126,6 @@ def test_bubble_empty_id_raises(self) -> None: Bubble.from_dict({"text": "hi"}, bubble_id="") def test_workspace_local_composer_missing_id_raises(self) -> None: - # Regression for CodeRabbit's per-row drift comment on api/composers.py: - # previously a row missing composerId was silently skipped. Now drift - # raises SchemaError so api/composers.py can log + skip explicitly. for bad in ( {"lastUpdatedAt": 0}, # composerId absent {"composerId": ""}, # empty string @@ -173,9 +147,6 @@ def test_export_entry_missing_required_raises(self) -> None: self.assertEqual(cm.exception.field, "log_id") def test_export_entry_non_string_required_raises(self) -> None: - # Regression: CodeRabbit caught that ``str(raw["log_id"])`` silently - # coerced ints, UUIDs, lists, etc. into strings, masking schema drift. - # Each required field must now be an actual non-empty string. bad_values: tuple[object, ...] = (123, None, "", [], {"x": 1}, True) for bad_value in bad_values: bad: dict[str, object] = {"log_id": bad_value, "title": "x", "workspace": "w"} @@ -183,7 +154,6 @@ def test_export_entry_non_string_required_raises(self) -> None: ExportEntry.from_dict(bad) def test_schema_error_inherits_value_error(self) -> None: - # call sites that catch ValueError still trap SchemaError (back-compat) try: Composer.from_dict({}, composer_id="cid-001") except ValueError: @@ -191,10 +161,6 @@ def test_schema_error_inherits_value_error(self) -> None: self.fail("SchemaError did not propagate as ValueError") def test_non_dict_payload_raises_schema_error(self) -> None: - # Regression: CodeRabbit caught that a malformed top-level payload - # (list, string, None) would previously trip AttributeError on - # ``raw.get(...)`` and get swallowed by surrounding ``except Exception``. - # Each ``from_dict`` now surfaces it as SchemaError so drift stays loud. bad_payloads: tuple[object, ...] = ([], "not a dict", 42, None, ("a", "b")) for bad in bad_payloads: with self.assertRaises(SchemaError, msg=f"failed for {type(bad).__name__}"): @@ -209,11 +175,6 @@ def test_non_dict_payload_raises_schema_error(self) -> None: Bubble.from_dict(bad, bubble_id="b-1") # type: ignore[arg-type] -# --------------------------------------------------------------------------- -# 3. _extract_blob_refs binary blob path (0x0a 0x20 marker) -# --------------------------------------------------------------------------- - - class CliSessionMetaAndBlobChain(unittest.TestCase): def test_meta_missing_latest_root_blob_id_raises(self) -> None: with self.assertRaises(SchemaError) as cm: @@ -226,8 +187,6 @@ def test_meta_wrong_type_raises(self) -> None: CliSessionMeta.from_dict({"latestRootBlobId": 12345}) def test_meta_parses_then_blob_chain_extracts_refs(self) -> None: - # Realistic flow: the meta blob points at a root chain blob, whose - # 0x0a 0x20-prefixed runs are SHA-256 references to JSON message blobs. ref1 = "a" * 64 ref2 = "b" * 64 ref3 = "c" * 64 @@ -245,7 +204,6 @@ def test_meta_parses_then_blob_chain_extracts_refs(self) -> None: self.assertEqual(refs, [ref1, ref2, ref3]) def test_blob_chain_skips_non_marker_bytes(self) -> None: - # Garbage prefix + valid run + garbage suffix — only the run extracts. ref = "f" * 64 garbage_before = b"\x01\x02\x03" garbage_after = b"\xff\xfe" diff --git a/utils/cli_chat_reader.py b/utils/cli_chat_reader.py index 5927c76..2b2be00 100644 --- a/utils/cli_chat_reader.py +++ b/utils/cli_chat_reader.py @@ -100,11 +100,6 @@ def traverse_blobs(db_path: str) -> list[dict]: json.loads(bytes.fromhex(meta_row[0]).decode("utf-8")) ) except (SchemaError, ValueError, UnicodeDecodeError, TypeError) as e: - # SchemaError covers field-level drift; ValueError covers bad hex - # and bad JSON (JSONDecodeError is a ValueError subclass); - # UnicodeDecodeError covers non-UTF-8 bytes; TypeError covers - # the meta_row[0] not being a string. All routes back to "no - # conversation" instead of escaping to the caller. print(f"Schema drift in CLI session meta at {db_path}: {e}") return [] root_id: str = meta.latest_root_blob_id From 59ae103aea39970cb222e2238c8718619fcf935f Mon Sep 17 00:00:00 2001 From: timon0305 Date: Tue, 12 May 2026 15:10:43 +0200 Subject: [PATCH 07/10] review: tighten ID + createdAt type gates across the model layer (#30) Three related CodeRabbit findings, addressed together: - Composer.composer_id, Bubble.bubble_id, Workspace.workspace_id now require ``isinstance(..., str) and value != ""``. Truthiness alone let non-string IDs (e.g. ``123``) slip past the boundary; aligned with the pattern already in WorkspaceLocalComposer. - Composer.createdAt is now type-gated. Presence was already enforced but any value type (e.g. ``"2026-05-12"``) silently became the timestamp. Required values must be ``int`` or ``float`` and explicitly not ``bool`` (since ``bool`` is a subclass of ``int`` in Python and a boolean is never a real timestamp). Regression tests pin each new gate: - test_non_string_composer_id_raises - test_non_string_bubble_id_raises - test_non_string_workspace_id_raises - test_non_numeric_created_at_raises - test_numeric_created_at_passes 209 tests pass (was 204; +5 new). --- models/conversation.py | 26 +++++++++++++++++++++----- models/workspace.py | 8 ++++++-- tests/test_models.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/models/conversation.py b/models/conversation.py index 9a17767..4ac3629 100644 --- a/models/conversation.py +++ b/models/conversation.py @@ -26,13 +26,25 @@ def from_dict(cls, raw: dict[str, Any], *, composer_id: str) -> "Composer": "composerData", hint=f"expected object, got {type(raw).__name__}", ) - if not composer_id: - raise SchemaError("Composer", "composerId", hint="empty composer ID") + if not isinstance(composer_id, str) or not composer_id: + raise SchemaError( + "Composer", + "composerId", + hint=f"expected non-empty str, got {type(composer_id).__name__}", + ) if "fullConversationHeadersOnly" not in raw: raise SchemaError("Composer", "fullConversationHeadersOnly") if "createdAt" not in raw: raise SchemaError("Composer", "createdAt") + created_at = raw.get("createdAt") + if not isinstance(created_at, (int, float)) or isinstance(created_at, bool): + raise SchemaError( + "Composer", + "createdAt", + hint=f"expected timestamp number, got {type(created_at).__name__}", + ) + headers = raw.get("fullConversationHeadersOnly") if not isinstance(headers, list): raise SchemaError( @@ -48,7 +60,7 @@ def from_dict(cls, raw: dict[str, Any], *, composer_id: str) -> "Composer": return cls( composer_id=composer_id, full_conversation_headers_only=headers, - created_at=raw.get("createdAt"), + created_at=created_at, name=raw.get("name"), last_updated_at=raw.get("lastUpdatedAt"), model_config=model_config, @@ -101,6 +113,10 @@ def from_dict(cls, raw: dict[str, Any], *, bubble_id: str) -> "Bubble": "bubble", hint=f"expected object, got {type(raw).__name__}", ) - if not bubble_id: - raise SchemaError("Bubble", "bubbleId", hint="empty bubble ID") + if not isinstance(bubble_id, str) or not bubble_id: + raise SchemaError( + "Bubble", + "bubbleId", + hint=f"expected non-empty str, got {type(bubble_id).__name__}", + ) return cls(bubble_id=bubble_id, raw=raw) diff --git a/models/workspace.py b/models/workspace.py index 1ad9c9c..75869e3 100644 --- a/models/workspace.py +++ b/models/workspace.py @@ -22,8 +22,12 @@ def from_dict(cls, raw: dict[str, Any], *, workspace_id: str) -> "Workspace": "workspace.json", hint=f"expected object, got {type(raw).__name__}", ) - if not workspace_id: - raise SchemaError("Workspace", "workspaceId", hint="empty workspace ID") + if not isinstance(workspace_id, str) or not workspace_id: + raise SchemaError( + "Workspace", + "workspaceId", + hint=f"expected non-empty str, got {type(workspace_id).__name__}", + ) folder = raw.get("folder") if folder is not None and not isinstance(folder, str): raise SchemaError( diff --git a/tests/test_models.py b/tests/test_models.py index 7c486a8..ce335bc 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -64,6 +64,12 @@ def test_workspace_parses_with_optional_folder(self) -> None: ws_no_folder = Workspace.from_dict({}, workspace_id="cli-only") self.assertEqual(ws_no_folder.folder, None) + def test_non_string_workspace_id_raises(self) -> None: + for bad_id in (None, "", 123, [], {}, True, b"bytes"): + with self.assertRaises(SchemaError, msg=f"failed for {bad_id!r}") as cm: + Workspace.from_dict({}, workspace_id=bad_id) # type: ignore[arg-type] + self.assertEqual(cm.exception.field, "workspaceId") + def test_workspace_local_composer_parses(self) -> None: c = WorkspaceLocalComposer.from_dict({ "composerId": "cid-local-1", @@ -99,11 +105,30 @@ def test_missing_created_at_raises(self) -> None: Composer.from_dict(bad, composer_id="cid-001") self.assertEqual(cm.exception.field, "createdAt") + def test_non_numeric_created_at_raises(self) -> None: + for bad_value in ("2026-05-12", None, [], {}, True, False, b"bytes"): + bad = dict(GOOD_COMPOSER_RAW, createdAt=bad_value) + with self.assertRaises(SchemaError, msg=f"failed for {bad_value!r}") as cm: + Composer.from_dict(bad, composer_id="cid-001") + self.assertEqual(cm.exception.field, "createdAt") + + def test_numeric_created_at_passes(self) -> None: + for ok_value in (1_715_000_000_000, 1_715_000_000_000.5, 0, -1): + ok = dict(GOOD_COMPOSER_RAW, createdAt=ok_value) + composer = Composer.from_dict(ok, composer_id="cid-001") + self.assertEqual(composer.created_at, ok_value) + def test_empty_composer_id_raises(self) -> None: with self.assertRaises(SchemaError) as cm: Composer.from_dict(GOOD_COMPOSER_RAW, composer_id="") self.assertEqual(cm.exception.field, "composerId") + def test_non_string_composer_id_raises(self) -> None: + for bad_id in (None, 123, [], {}, True, b"bytes"): + with self.assertRaises(SchemaError, msg=f"failed for {bad_id!r}") as cm: + Composer.from_dict(GOOD_COMPOSER_RAW, composer_id=bad_id) # type: ignore[arg-type] + self.assertEqual(cm.exception.field, "composerId") + def test_headers_wrong_type_raises(self) -> None: bad = dict(GOOD_COMPOSER_RAW, fullConversationHeadersOnly={"not": "a list"}) with self.assertRaises(SchemaError) as cm: @@ -125,6 +150,12 @@ def test_bubble_empty_id_raises(self) -> None: with self.assertRaises(SchemaError): Bubble.from_dict({"text": "hi"}, bubble_id="") + def test_non_string_bubble_id_raises(self) -> None: + for bad_id in (None, 123, [], {}, True, b"bytes"): + with self.assertRaises(SchemaError, msg=f"failed for {bad_id!r}") as cm: + Bubble.from_dict({"text": "hi"}, bubble_id=bad_id) # type: ignore[arg-type] + self.assertEqual(cm.exception.field, "bubbleId") + def test_workspace_local_composer_missing_id_raises(self) -> None: for bad in ( {"lastUpdatedAt": 0}, # composerId absent From 16f861d007d98fe24a43584572d1937c5f48ed94 Mon Sep 17 00:00:00 2001 From: timon0305 Date: Thu, 14 May 2026 14:48:13 +0200 Subject: [PATCH 08/10] review: wire typed models load-bearing across read sites (#30) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes five blocking findings from Brad's review pass: 1. Bubble / Workspace / ExportEntry were defined and unit-tested but had zero production callers — the PR's own "Used at" table didn't match the code. Wired each at the cited sites: * Bubble in api/workspaces.py (both bubble loaders, line 584 + 1023) * Bubble in api/search.py:150 * Workspace in api/composers.py:44 and api/workspaces.py:54 * ExportEntry reader in scripts/export.py:66 — paired with writer updates so the on-disk shape matches what the model accepts. 2. get_workspace_tabs() in api/workspaces.py bypassed Composer.from_dict while list_workspaces() in the same file validated. The two primary conversation-browsing paths now agree on what counts as a valid row. 3. get_composer() returned the raw dict from either per-workspace DB or global storage with no validation. Now goes through WorkspaceLocalComposer.from_dict (per-workspace, drift logged + skipped) and Composer.from_dict (global fallback, drift -> 404 + log). 4. WorkspaceLocalComposer.from_dict result was discarded after the gate. The typed object is now load-bearing: the sort key reads local.last_updated_at and the JSON's composerId/lastUpdatedAt are written from the validated fields, not the raw dict. 5. SchemaError message always read "missing required field" even on type/shape mismatches. Now reads "invalid field" when hint is set and "missing required field" only when hint is None, so log grepping can tell the two drift modes apart. Plus one forward-looking note: added a WHY comment at the numeric-only createdAt check in models/conversation.py pointing at the 17/17 live scan that justifies the strict isinstance(int, float) gate. 11 new regression tests: - tests/test_models_wired_at_read_sites.py (new file, 10 tests) pinning each from_dict call site with spy + side_effect patches so a future refactor that drops the typed model fails loudly. - tests/test_models.py one new test asserting the SchemaError wording distinction holds. Verified locally: - python -m unittest discover tests: 220 passed, OK (was 209) - python -m unittest tests.test_models_wired_at_read_sites: 10/10 pass - python app.py boots clean, curl / returns 200 --- api/composers.py | 54 +++- api/search.py | 13 +- api/workspaces.py | 44 ++- models/conversation.py | 4 + models/errors.py | 9 +- scripts/export.py | 24 +- tests/test_models.py | 16 + tests/test_models_wired_at_read_sites.py | 381 +++++++++++++++++++++++ 8 files changed, 497 insertions(+), 48 deletions(-) create mode 100644 tests/test_models_wired_at_read_sites.py diff --git a/api/composers.py b/api/composers.py index 14988b9..96b9d03 100644 --- a/api/composers.py +++ b/api/composers.py @@ -13,7 +13,7 @@ from utils.workspace_path import resolve_workspace_path from utils.path_helpers import to_epoch_ms -from models import SchemaError, WorkspaceLocalComposer +from models import Composer, SchemaError, Workspace, WorkspaceLocalComposer bp = Blueprint("composers", __name__) @@ -41,9 +41,11 @@ def list_composers(): workspace_folder = None try: - wd = _read_json_file(wj_path) - workspace_folder = wd.get("folder") - except Exception: + workspace = Workspace.from_dict(_read_json_file(wj_path), workspace_id=name) + workspace_folder = workspace.folder + except (SchemaError, OSError, ValueError): + # Missing / malformed workspace.json is non-fatal — the row still + # contributes its composer data, just without a folder hint. pass try: @@ -72,21 +74,27 @@ def list_composers(): ) for c in all_composers: try: - WorkspaceLocalComposer.from_dict(c) + local = WorkspaceLocalComposer.from_dict(c) except SchemaError as e: print(f"Schema drift in {db_path}: {e}") continue + # Use the typed view downstream so the dataclass is + # load-bearing, not just a filter (Brad's review): the + # sort key and the JSON's composerId both read off the + # validated values, not the raw dict. + c["composerId"] = local.composer_id + c["lastUpdatedAt"] = local.last_updated_at c["conversation"] = c.get("conversation") or [] c["workspaceId"] = name c["workspaceFolder"] = workspace_folder - composers.append(c) + composers.append((local, c)) except SchemaError as e: print(f"Schema drift in {db_path}: {e}") except Exception as e: print(f"Failed reading composers from {db_path}: {e}") - composers.sort(key=lambda c: to_epoch_ms(c.get("lastUpdatedAt")), reverse=True) - return jsonify(composers) + composers.sort(key=lambda pair: to_epoch_ms(pair[0].last_updated_at), reverse=True) + return jsonify([c for _, c in composers]) except Exception as e: print(f"Failed to get composers: {e}") @@ -117,9 +125,17 @@ def get_composer(composer_id): if row and row[0]: data = json.loads(row[0]) for c in (data.get("allComposers") or []): - if c.get("composerId") == composer_id: - return jsonify(c) - except Exception: + if isinstance(c, dict) and c.get("composerId") == composer_id: + try: + local = WorkspaceLocalComposer.from_dict(c) + except SchemaError as e: + # Same drift list_composers() logs and skips at line ~78, + # so a single-composer fetch can't silently return malformed + # JSON the list endpoint hid. + print(f"Schema drift in workspace-local composer {composer_id}: {e}") + continue + return jsonify(local.raw) + except (OSError, sqlite3.Error, json.JSONDecodeError, ValueError): pass # Fallback: global storage @@ -135,10 +151,18 @@ def get_composer(composer_id): if row and row[0]: raw = row[0] if isinstance(row[0], str) else row[0].decode("utf-8") - composer = json.loads(raw) - composer.setdefault("conversation", []) - return jsonify(composer) - except Exception: + try: + composer = Composer.from_dict(json.loads(raw), composer_id=composer_id) + except SchemaError as e: + # Don't return malformed JSON to the client — surface the drift + # as a 404 + log, matching the silent-skip behaviour of the + # list endpoints for the same row. + print(f"Schema drift in composer {composer_id}: {e}") + return jsonify({"error": "Composer schema drift"}), 404 + payload = dict(composer.raw) + payload.setdefault("conversation", []) + return jsonify(payload) + except (OSError, sqlite3.Error, json.JSONDecodeError, ValueError): pass return jsonify({"error": "Composer not found"}), 404 diff --git a/api/search.py b/api/search.py index 9f7c0a0..3af435c 100644 --- a/api/search.py +++ b/api/search.py @@ -18,7 +18,7 @@ from utils.path_helpers import normalize_file_path, get_workspace_folder_paths, to_epoch_ms from utils.text_extract import extract_text_from_bubble from utils.cli_chat_reader import list_cli_projects, traverse_blobs, messages_to_bubbles -from models import Composer, SchemaError +from models import Bubble, Composer, SchemaError bp = Blueprint("search", __name__) @@ -147,11 +147,12 @@ def search(): if len(parts) >= 3: bid = parts[2] try: - b = json.loads(row["value"]) - if isinstance(b, dict): - text = extract_text_from_bubble(b) - bubble_map[bid] = {"text": text, "raw": b} - except Exception: + bubble = Bubble.from_dict(json.loads(row["value"]), bubble_id=bid) + text = extract_text_from_bubble(bubble.raw) + bubble_map[bid] = {"text": text, "raw": bubble.raw} + except (SchemaError, json.JSONDecodeError, ValueError): + # Skip malformed bubble rows — search must keep returning + # results from the well-formed ones. pass # Search through composerData diff --git a/api/workspaces.py b/api/workspaces.py index a31f539..0511ffc 100644 --- a/api/workspaces.py +++ b/api/workspaces.py @@ -33,7 +33,7 @@ ) from utils.text_extract import extract_text_from_bubble, format_tool_action from utils.exclusion_rules import build_searchable_text, is_excluded_by_rules -from models import Composer, SchemaError +from models import Bubble, Composer, SchemaError, Workspace bp = Blueprint("workspaces", __name__) @@ -51,11 +51,11 @@ def _get_workspace_display_name(workspace_path: str, workspace_id: str) -> str: return "Other chats" wj_path = os.path.join(workspace_path, workspace_id, "workspace.json") try: - wd = _read_json_file(wj_path) - name = get_workspace_display_name(wd) + workspace = Workspace.from_dict(_read_json_file(wj_path), workspace_id=workspace_id) + name = get_workspace_display_name(workspace.raw) if name: return name - except Exception: + except (SchemaError, OSError, ValueError): pass return workspace_id @@ -581,10 +581,11 @@ def list_workspaces(): if len(parts) >= 3: bid = parts[2] try: - b = json.loads(row["value"]) - if isinstance(b, dict): - bubble_map[bid] = b - except Exception: + bubble = Bubble.from_dict(json.loads(row["value"]), bubble_id=bid) + bubble_map[bid] = bubble.raw + except (SchemaError, json.JSONDecodeError, ValueError): + # Skip malformed bubble rows — read-many path, one bad row + # must not 500 the endpoint. pass # Process each composer @@ -1019,10 +1020,10 @@ def get_workspace_tabs(workspace_id): if len(parts) >= 3: bid = parts[2] try: - b = json.loads(row["value"]) - if isinstance(b, dict): - bubble_map[bid] = b - except Exception: + bubble = Bubble.from_dict(json.loads(row["value"]), bubble_id=bid) + bubble_map[bid] = bubble.raw + except (SchemaError, json.JSONDecodeError, ValueError): + # Skip malformed rows — one bad bubble must not 500 the tabs endpoint. pass # Load codeBlockDiffs @@ -1097,8 +1098,17 @@ def get_workspace_tabs(workspace_id): for row in composer_rows: composer_id = row["key"].split(":")[1] try: - cd = json.loads(row["value"]) - + composer = Composer.from_dict(json.loads(row["value"]), composer_id=composer_id) + except SchemaError as e: + # Skip the same drift list_workspaces() drops at line 605 so the two + # primary conversation paths agree on what counts as a valid composer. + print(f"Schema drift in composer {composer_id}: {e}") + continue + except (json.JSONDecodeError, TypeError, ValueError): + continue + try: + cd = composer.raw + # Determine project pid = _determine_project_for_conversation( cd, composer_id, project_layouts_map, @@ -1109,11 +1119,11 @@ def get_workspace_tabs(workspace_id): if not pid and mapped_ws in invalid_workspace_ids: pid = invalid_workspace_aliases.get(mapped_ws) assigned = pid if pid else "global" - + if assigned not in matching_ws_ids: continue - - headers = cd.get("fullConversationHeadersOnly") or [] + + headers = composer.full_conversation_headers_only # Build bubbles bubbles = [] diff --git a/models/conversation.py b/models/conversation.py index 4ac3629..c2a103a 100644 --- a/models/conversation.py +++ b/models/conversation.py @@ -38,6 +38,10 @@ def from_dict(cls, raw: dict[str, Any], *, composer_id: str) -> "Composer": raise SchemaError("Composer", "createdAt") created_at = raw.get("createdAt") + # Numeric-only on purpose: a 2026-05 scan of 17/17 live composers on + # disk stored createdAt as int milliseconds. If Cursor ever switches + # to ISO strings, those rows would disappear from list/search via a + # drift warning — relax the check at that point, don't silently coerce. if not isinstance(created_at, (int, float)) or isinstance(created_at, bool): raise SchemaError( "Composer", diff --git a/models/errors.py b/models/errors.py index db12c94..90a691a 100644 --- a/models/errors.py +++ b/models/errors.py @@ -8,7 +8,12 @@ def __init__(self, model: str, field: str, *, hint: str | None = None) -> None: self.model = model self.field = field self.hint = hint - message = f"{model}: missing required field '{field}'" + # Distinguish "absent" from "present-but-wrong-shape" so log grepping can + # tell missing-key drift apart from type-mismatch drift. Hint is only + # populated for shape mismatches (e.g. "expected list, got dict"), so its + # presence is the signal. if hint: - message = f"{message} ({hint})" + message = f"{model}: invalid field '{field}' ({hint})" + else: + message = f"{model}: missing required field '{field}'" super().__init__(message) diff --git a/scripts/export.py b/scripts/export.py index d79fb66..b308326 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -38,6 +38,7 @@ aggregate_session_stats, ) from utils.cursor_md_exporter import cursor_cli_session_to_markdown +from models import ExportEntry, SchemaError _logger = logging.getLogger(__name__) @@ -62,13 +63,13 @@ def _load_manifest_entries(manifest_path: str) -> dict: if not line: continue try: - entry = json.loads(line) - log_id = entry.get("log_id") - if log_id: - existing[log_id] = entry - except Exception as e: - _logger.debug("Skipping malformed manifest line in %s: %s", manifest_path, e) - except Exception as e: + entry = ExportEntry.from_dict(json.loads(line)) + existing[entry.log_id] = entry.raw + except (SchemaError, json.JSONDecodeError, ValueError) as e: + # Pre-PR-30 manifests lack title/workspace — skip them so the + # next export rebuilds the entry under the new schema. + _logger.debug("Skipping manifest line in %s: %s", manifest_path, e) + except OSError as e: _logger.debug("Failed to read manifest %s: %s", manifest_path, e) return existing @@ -818,7 +819,8 @@ def assign_workspace(cd, cid): rel_path = os.path.join(today, ws_slug, "chat", filename) exported.append({"id": composer_id, "rel_path": rel_path, "content": md, - "out_path": out_path, "updatedAt": updated_at}) + "out_path": out_path, "updatedAt": updated_at, + "title": title, "workspace": ws_display_name}) count += 1 # --- Cursor CLI sessions --- @@ -915,6 +917,8 @@ def assign_workspace(cd, cid): "content": md, "out_path": out_path, "updatedAt": updated_ms, + "title": title, + "workspace": ws_name, }) count += 1 @@ -947,6 +951,8 @@ def assign_workspace(cd, cid): for e in exported: existing[e["id"]] = { "log_id": e["id"], + "title": e["title"], + "workspace": e["workspace"], "path": os.path.relpath(e["out_path"], out_dir), "updated_at": datetime.fromtimestamp(e["updatedAt"] / 1000).isoformat() if e["updatedAt"] else datetime.now().isoformat(), } @@ -960,6 +966,8 @@ def assign_workspace(cd, cid): for e in exported: global_existing[e["id"]] = { "log_id": e["id"], + "title": e["title"], + "workspace": e["workspace"], "path": e["out_path"], "updated_at": datetime.fromtimestamp(e["updatedAt"] / 1000).isoformat() if e["updatedAt"] else datetime.now().isoformat(), } diff --git a/tests/test_models.py b/tests/test_models.py index ce335bc..fe59cfe 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -191,6 +191,22 @@ def test_schema_error_inherits_value_error(self) -> None: return self.fail("SchemaError did not propagate as ValueError") + def test_schema_error_message_distinguishes_missing_from_invalid(self) -> None: + # Operators grep logs for drift. "missing required field" must only fire + # when a key is absent; type / shape mismatches must read "invalid field" + # so the two failure modes can be told apart. + missing = SchemaError("Composer", "createdAt") + self.assertIn("missing required field", str(missing)) + self.assertNotIn("invalid field", str(missing)) + + type_mismatch = SchemaError( + "Composer", "createdAt", + hint="expected timestamp number, got str", + ) + self.assertIn("invalid field", str(type_mismatch)) + self.assertNotIn("missing required field", str(type_mismatch)) + self.assertIn("expected timestamp number, got str", str(type_mismatch)) + def test_non_dict_payload_raises_schema_error(self) -> None: bad_payloads: tuple[object, ...] = ([], "not a dict", 42, None, ("a", "b")) for bad in bad_payloads: diff --git a/tests/test_models_wired_at_read_sites.py b/tests/test_models_wired_at_read_sites.py new file mode 100644 index 0000000..f9ca432 --- /dev/null +++ b/tests/test_models_wired_at_read_sites.py @@ -0,0 +1,381 @@ +from __future__ import annotations + +import json +import os +import sqlite3 +import sys +import tempfile +import unittest +from contextlib import closing +from unittest.mock import patch + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if REPO_ROOT not in sys.path: + sys.path.insert(0, REPO_ROOT) + +# Regression tests for the PR #30 review finding: "models are defined + +# tested but not wired at any production read site." Each test invokes the +# specific code path Brad cited and asserts the corresponding from_dict +# classmethod was called at least once. If a future refactor unwires the +# model, these tests fail loudly. + + +WORKSPACE_ID = "ws-wired" +COMPOSER_ID = "cmp-wired" +BUBBLE_ID = "bub-wired" + + +def _build_workspace_storage(parent: str) -> str: + ws_root = os.path.join(parent, "workspaceStorage") + global_root = os.path.join(parent, "globalStorage") + os.makedirs(ws_root, exist_ok=True) + os.makedirs(global_root, exist_ok=True) + + ws_dir = os.path.join(ws_root, WORKSPACE_ID) + os.makedirs(ws_dir, exist_ok=True) + project_folder = os.path.join(parent, "wired-project") + os.makedirs(project_folder, exist_ok=True) + with open(os.path.join(ws_dir, "workspace.json"), "w", encoding="utf-8") as f: + json.dump({"folder": project_folder}, f) + + local_db = os.path.join(ws_dir, "state.vscdb") + with closing(sqlite3.connect(local_db)) as conn: + conn.execute("CREATE TABLE ItemTable ([key] TEXT PRIMARY KEY, value TEXT)") + conn.execute( + "INSERT INTO ItemTable ([key], value) VALUES (?, ?)", + ( + "composer.composerData", + json.dumps({"allComposers": [{"composerId": COMPOSER_ID}]}), + ), + ) + conn.commit() + + global_db = os.path.join(global_root, "state.vscdb") + with closing(sqlite3.connect(global_db)) as conn: + conn.execute("CREATE TABLE cursorDiskKV ([key] TEXT PRIMARY KEY, value TEXT)") + conn.execute( + "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)", + ( + f"composerData:{COMPOSER_ID}", + json.dumps({ + "name": "Wired conversation", + "createdAt": 1_715_000_000_000, + "lastUpdatedAt": 1_715_000_500_000, + "fullConversationHeadersOnly": [{"bubbleId": BUBBLE_ID, "type": 1}], + "modelConfig": {"modelName": "gpt-4o"}, + }), + ), + ) + conn.execute( + "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)", + ( + f"bubbleId:{COMPOSER_ID}:{BUBBLE_ID}", + json.dumps({"text": "find me sentinel-wired", "type": "user"}), + ), + ) + conn.commit() + + return ws_root + + +class TestBubbleWiredAtReadSite(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.workspace_path = _build_workspace_storage(self._tmp.name) + self._prior_ws = os.environ.get("WORKSPACE_PATH") + self._prior_cli = os.environ.get("CLI_CHATS_PATH") + os.environ["WORKSPACE_PATH"] = self.workspace_path + os.environ["CLI_CHATS_PATH"] = os.path.join(self._tmp.name, "cli-empty") + os.makedirs(os.environ["CLI_CHATS_PATH"], exist_ok=True) + + def tearDown(self): + for key, prior in (("WORKSPACE_PATH", self._prior_ws), ("CLI_CHATS_PATH", self._prior_cli)): + if prior is None: + os.environ.pop(key, None) + else: + os.environ[key] = prior + self._tmp.cleanup() + + def test_search_endpoint_calls_bubble_from_dict(self): + from app import create_app + import api.search as search_mod + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + with patch.object(search_mod.Bubble, "from_dict", wraps=search_mod.Bubble.from_dict) as spy: + client = app.test_client() + response = client.get("/api/search?q=sentinel-wired") + self.assertEqual(response.status_code, 200) + self.assertGreaterEqual( + spy.call_count, 1, + msg="Bubble.from_dict was never called from /api/search — " + "model is defined but not wired at the production read site", + ) + + def test_workspace_tabs_endpoint_calls_bubble_from_dict(self): + from app import create_app + import api.workspaces as workspaces_mod + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + with patch.object(workspaces_mod.Bubble, "from_dict", wraps=workspaces_mod.Bubble.from_dict) as spy: + client = app.test_client() + response = client.get(f"/api/workspaces/{WORKSPACE_ID}/tabs") + self.assertEqual(response.status_code, 200) + self.assertGreaterEqual( + spy.call_count, 1, + msg="Bubble.from_dict was never called from /api/workspaces/.../tabs — " + "model is defined but not wired at the production read site", + ) + + def test_workspace_tabs_endpoint_calls_composer_from_dict(self): + # Brad's most-important finding: list_workspaces() at api/workspaces.py:605 + # validates each composer with Composer.from_dict, but get_workspace_tabs() + # in the same file used raw json.loads. Schema drift would have been hidden + # from one of the two primary conversation-browsing paths. This test pins + # that BOTH paths must now validate via the model. + from app import create_app + import api.workspaces as workspaces_mod + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + with patch.object(workspaces_mod.Composer, "from_dict", wraps=workspaces_mod.Composer.from_dict) as spy: + client = app.test_client() + response = client.get(f"/api/workspaces/{WORKSPACE_ID}/tabs") + self.assertEqual(response.status_code, 200) + self.assertGreaterEqual( + spy.call_count, 1, + msg="Composer.from_dict was never called from /api/workspaces/.../tabs — " + "the second primary conversation path is bypassing schema validation", + ) + + +class TestWorkspaceWiredAtReadSite(unittest.TestCase): + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.workspace_path = _build_workspace_storage(self._tmp.name) + self._prior_ws = os.environ.get("WORKSPACE_PATH") + self._prior_cli = os.environ.get("CLI_CHATS_PATH") + os.environ["WORKSPACE_PATH"] = self.workspace_path + os.environ["CLI_CHATS_PATH"] = os.path.join(self._tmp.name, "cli-empty") + os.makedirs(os.environ["CLI_CHATS_PATH"], exist_ok=True) + + def tearDown(self): + for key, prior in (("WORKSPACE_PATH", self._prior_ws), ("CLI_CHATS_PATH", self._prior_cli)): + if prior is None: + os.environ.pop(key, None) + else: + os.environ[key] = prior + self._tmp.cleanup() + + def test_composers_endpoint_calls_workspace_from_dict(self): + from app import create_app + import api.composers as composers_mod + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + with patch.object(composers_mod.Workspace, "from_dict", wraps=composers_mod.Workspace.from_dict) as spy: + client = app.test_client() + response = client.get("/api/composers") + self.assertEqual(response.status_code, 200) + self.assertGreaterEqual( + spy.call_count, 1, + msg="Workspace.from_dict was never called from /api/composers — " + "model is defined but not wired at the production read site", + ) + + def test_workspace_display_name_calls_workspace_from_dict(self): + import api.workspaces as workspaces_mod + with patch.object(workspaces_mod.Workspace, "from_dict", wraps=workspaces_mod.Workspace.from_dict) as spy: + name = workspaces_mod._get_workspace_display_name(self.workspace_path, WORKSPACE_ID) + self.assertIsInstance(name, str) + self.assertGreaterEqual( + spy.call_count, 1, + msg="Workspace.from_dict was never called from _get_workspace_display_name", + ) + + def test_list_composers_sort_reads_typed_last_updated_at_not_raw_dict(self): + # Brad's review: WorkspaceLocalComposer.from_dict was used purely as a + # gate — the typed object was discarded and the raw dict appended. + # This test pins that the typed object is now load-bearing: the sort + # key reads `local.last_updated_at`, not `c.get("lastUpdatedAt")`. + # If a future refactor reverts to the raw dict, this test fails. + from app import create_app + from models import WorkspaceLocalComposer + import api.composers as composers_mod + + # Seed two composers with raw lastUpdatedAt values in REVERSE order + # to the values we'll return from a patched WorkspaceLocalComposer. + # If the sort reads from the raw dict, A comes first; if it reads + # from the typed model, B comes first. + ws_db = os.path.join(self.workspace_path, WORKSPACE_ID, "state.vscdb") + with closing(sqlite3.connect(ws_db)) as conn: + conn.execute( + "UPDATE ItemTable SET value = ? WHERE [key] = 'composer.composerData'", + (json.dumps({"allComposers": [ + {"composerId": "cmp-A", "lastUpdatedAt": 9000}, + {"composerId": "cmp-B", "lastUpdatedAt": 1000}, + ]}),), + ) + conn.commit() + + # Patch from_dict to swap the timestamps in the typed return value. + # The raw dict still has A=9000, B=1000; the typed values are flipped. + def swapped_from_dict(raw): # type: ignore[no-redef] + cid = raw["composerId"] + return WorkspaceLocalComposer( + composer_id=cid, + last_updated_at=1000 if cid == "cmp-A" else 9000, + raw=raw, + ) + + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + with patch.object(composers_mod.WorkspaceLocalComposer, "from_dict", side_effect=swapped_from_dict): + client = app.test_client() + response = client.get("/api/composers") + self.assertEqual(response.status_code, 200) + ids = [c["composerId"] for c in response.get_json()] + self.assertEqual( + ids[:2], ["cmp-B", "cmp-A"], + msg="Sort key still reads raw dict instead of typed " + "WorkspaceLocalComposer.last_updated_at — typed model " + "is back to being just a filter.", + ) + + +class TestGetComposerValidatesSchema(unittest.TestCase): + # Brad's follow-up finding: list_composers() validates each row via + # WorkspaceLocalComposer.from_dict and logs drift, but the single-composer + # fetch get_composer() returned raw dict unchanged. The two paths must + # agree — a drifted composer that's hidden from the list must NOT be + # silently served by /api/composers/. + + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.workspace_path = _build_workspace_storage(self._tmp.name) + self._prior_ws = os.environ.get("WORKSPACE_PATH") + self._prior_cli = os.environ.get("CLI_CHATS_PATH") + os.environ["WORKSPACE_PATH"] = self.workspace_path + os.environ["CLI_CHATS_PATH"] = os.path.join(self._tmp.name, "cli-empty") + os.makedirs(os.environ["CLI_CHATS_PATH"], exist_ok=True) + + def tearDown(self): + for key, prior in (("WORKSPACE_PATH", self._prior_ws), ("CLI_CHATS_PATH", self._prior_cli)): + if prior is None: + os.environ.pop(key, None) + else: + os.environ[key] = prior + self._tmp.cleanup() + + def test_get_composer_calls_workspace_local_composer_from_dict(self): + from app import create_app + import api.composers as composers_mod + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + with patch.object( + composers_mod.WorkspaceLocalComposer, "from_dict", + wraps=composers_mod.WorkspaceLocalComposer.from_dict, + ) as spy: + client = app.test_client() + response = client.get(f"/api/composers/{COMPOSER_ID}") + self.assertEqual(response.status_code, 200) + self.assertGreaterEqual( + spy.call_count, 1, + msg="WorkspaceLocalComposer.from_dict was never called from " + "/api/composers/ — the per-workspace path is bypassing " + "schema validation that list_composers performs.", + ) + + def test_get_composer_calls_composer_from_dict_on_global_fallback(self): + # When the per-workspace path misses (composer only in globalStorage), + # the fallback must validate via Composer.from_dict — not just decode + # the JSON blob and return it. + from app import create_app + import api.composers as composers_mod + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + # Force the global fallback by zeroing out per-workspace allComposers + ws_db = os.path.join(self.workspace_path, WORKSPACE_ID, "state.vscdb") + with closing(sqlite3.connect(ws_db)) as conn: + conn.execute( + "UPDATE ItemTable SET value = ? WHERE [key] = 'composer.composerData'", + (json.dumps({"allComposers": []}),), + ) + conn.commit() + with patch.object( + composers_mod.Composer, "from_dict", + wraps=composers_mod.Composer.from_dict, + ) as spy: + client = app.test_client() + response = client.get(f"/api/composers/{COMPOSER_ID}") + self.assertEqual(response.status_code, 200) + self.assertGreaterEqual( + spy.call_count, 1, + msg="Composer.from_dict was never called from /api/composers/ " + "global-fallback — drifted composers in globalStorage would " + "still leak to the client.", + ) + + +class TestExportEntryWiredAtReadSite(unittest.TestCase): + def test_load_manifest_entries_calls_export_entry_from_dict(self): + with tempfile.TemporaryDirectory() as tmp: + manifest_path = os.path.join(tmp, "manifest.jsonl") + with open(manifest_path, "w", encoding="utf-8") as f: + # One well-formed entry matching the new writer's schema. + f.write(json.dumps({ + "log_id": "log-wired", + "title": "Wired chat", + "workspace": "wired-project", + "path": "out.md", + "updated_at": "2026-05-14T00:00:00", + }) + "\n") + + from scripts import export as export_mod + with patch.object( + export_mod.ExportEntry, "from_dict", + wraps=export_mod.ExportEntry.from_dict, + ) as spy: + entries = export_mod._load_manifest_entries(manifest_path) + self.assertIn("log-wired", entries) + self.assertGreaterEqual( + spy.call_count, 1, + msg="ExportEntry.from_dict was never called from " + "_load_manifest_entries — model is defined but not " + "wired at the production read site", + ) + + def test_load_manifest_entries_skips_pre_pr30_entries(self): + # Backwards-compat: an old manifest from before this PR will lack + # title/workspace. The reader must skip those silently (and the next + # export rebuilds the entry under the new schema) rather than 500. + with tempfile.TemporaryDirectory() as tmp: + manifest_path = os.path.join(tmp, "manifest.jsonl") + with open(manifest_path, "w", encoding="utf-8") as f: + # Old-schema entry (no title, no workspace). + f.write(json.dumps({ + "log_id": "legacy", + "path": "out.md", + "updated_at": "2026-05-01T00:00:00", + }) + "\n") + # New-schema entry alongside it. + f.write(json.dumps({ + "log_id": "modern", + "title": "Modern chat", + "workspace": "modern-project", + "path": "out.md", + "updated_at": "2026-05-14T00:00:00", + }) + "\n") + + from scripts import export as export_mod + entries = export_mod._load_manifest_entries(manifest_path) + self.assertNotIn("legacy", entries, msg="pre-PR-30 entries must be skipped") + self.assertIn("modern", entries, msg="new-schema entries must still load") + + +if __name__ == "__main__": + unittest.main() From b9f34a66dc0764654656a1b43d9b42919b395b4f Mon Sep 17 00:00:00 2001 From: timon0305 Date: Thu, 14 May 2026 15:06:05 +0200 Subject: [PATCH 09/10] review: log schema drift + harden get_composer envelope (#30) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit follow-ups to the round-2 wiring pass: 1. The three bubble-loading loops (api/workspaces.py:583, 1025; api/search.py:149) caught SchemaError, json.JSONDecodeError and ValueError together and dropped the row silently. That broke the read-boundary contract — a drifted bubble would simply vanish with no operator signal. Split the catches: SchemaError now logs `Schema drift in bubble : `; JSONDecodeError / ValueError still pass silently because that's a parser-noise issue, not a schema-contract issue. 2. get_composer() at api/composers.py:125 trusted that the per-workspace blob was a dict with a list-valued allComposers. A drifted local row like `[]` or `"…"` raised AttributeError on data.get(...) and turned schema drift into a 500. Mirrored the three envelope guards list_composers() already applies at line 60–74 (data is dict, allComposers present, allComposers is list) and wrapped the outer try with `except SchemaError` so drift logs the db_path and falls through to the global fallback instead of 500'ing. 3. scripts/ lacked __init__.py while every other top-level package directory (api/, models/, utils/, tests/) has one. The new `from scripts import export` calls in tests resolve at runtime via the sys.path insert in the test header, but mypy was reporting inconsistent module resolution on scripts.export — a CI typecheck risk. Added an empty scripts/__init__.py to match convention. 3 new regression tests in tests/test_models_wired_at_read_sites.py: - test_bubble_schema_drift_is_logged_not_swallowed_silently — seeds a malformed bubble, captures stdout, asserts the `Schema drift in bubble ` line is present. - test_get_composer_handles_non_dict_envelope_via_fallback — replaces composer.composerData with a list payload, asserts the endpoint returns 200 via the global fallback (was 500 before the envelope guards). - test_get_composer_handles_non_list_all_composers_via_fallback — same shape for non-list allComposers. Verified locally: - python -m unittest discover tests: 223 passed (was 220) - python -m unittest tests.test_models_wired_at_read_sites: 13/13 - mypy resolves scripts.export consistently --- api/composers.py | 22 ++++++- api/search.py | 9 ++- api/workspaces.py | 17 ++++-- scripts/__init__.py | 0 tests/test_models_wired_at_read_sites.py | 73 ++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 9 deletions(-) create mode 100644 scripts/__init__.py diff --git a/api/composers.py b/api/composers.py index 96b9d03..bdb44cd 100644 --- a/api/composers.py +++ b/api/composers.py @@ -124,7 +124,25 @@ def get_composer(composer_id): if row and row[0]: data = json.loads(row[0]) - for c in (data.get("allComposers") or []): + # Mirror the envelope guards list_composers() applies at line 60–74 + # so a drifted local row (data not a dict, or allComposers missing + # / non-list) surfaces as a logged SchemaError, not a 500. + if not isinstance(data, dict): + raise SchemaError( + "WorkspaceComposers", + "composer.composerData", + hint=f"expected object, got {type(data).__name__}", + ) + if "allComposers" not in data: + raise SchemaError("WorkspaceComposers", "allComposers") + all_composers = data.get("allComposers") + if not isinstance(all_composers, list): + raise SchemaError( + "WorkspaceComposers", + "allComposers", + hint=f"expected list, got {type(all_composers).__name__}", + ) + for c in all_composers: if isinstance(c, dict) and c.get("composerId") == composer_id: try: local = WorkspaceLocalComposer.from_dict(c) @@ -135,6 +153,8 @@ def get_composer(composer_id): print(f"Schema drift in workspace-local composer {composer_id}: {e}") continue return jsonify(local.raw) + except SchemaError as e: + print(f"Schema drift in {db_path}: {e}") except (OSError, sqlite3.Error, json.JSONDecodeError, ValueError): pass diff --git a/api/search.py b/api/search.py index 3af435c..9a67b9c 100644 --- a/api/search.py +++ b/api/search.py @@ -150,9 +150,12 @@ def search(): bubble = Bubble.from_dict(json.loads(row["value"]), bubble_id=bid) text = extract_text_from_bubble(bubble.raw) bubble_map[bid] = {"text": text, "raw": bubble.raw} - except (SchemaError, json.JSONDecodeError, ValueError): - # Skip malformed bubble rows — search must keep returning - # results from the well-formed ones. + except SchemaError as e: + # Drift logged so the operator can see why a chat dropped + # out of search results; bad row still skipped so search + # keeps returning results from the well-formed ones. + print(f"Schema drift in bubble {bid}: {e}") + except (json.JSONDecodeError, ValueError): pass # Search through composerData diff --git a/api/workspaces.py b/api/workspaces.py index 0511ffc..adbde62 100644 --- a/api/workspaces.py +++ b/api/workspaces.py @@ -583,9 +583,12 @@ def list_workspaces(): try: bubble = Bubble.from_dict(json.loads(row["value"]), bubble_id=bid) bubble_map[bid] = bubble.raw - except (SchemaError, json.JSONDecodeError, ValueError): - # Skip malformed bubble rows — read-many path, one bad row - # must not 500 the endpoint. + except SchemaError as e: + # Drift surfaces in logs so an operator sees disappearing + # bubbles instead of guessing. The row is still skipped — + # one bad bubble must not 500 the endpoint. + print(f"Schema drift in bubble {bid}: {e}") + except (json.JSONDecodeError, ValueError): pass # Process each composer @@ -1022,8 +1025,12 @@ def get_workspace_tabs(workspace_id): try: bubble = Bubble.from_dict(json.loads(row["value"]), bubble_id=bid) bubble_map[bid] = bubble.raw - except (SchemaError, json.JSONDecodeError, ValueError): - # Skip malformed rows — one bad bubble must not 500 the tabs endpoint. + except SchemaError as e: + # Drift surfaces in logs so an operator can chase disappearing + # bubbles instead of guessing. Bad row still skipped so the + # tabs endpoint can't 500 on one malformed bubble. + print(f"Schema drift in bubble {bid}: {e}") + except (json.JSONDecodeError, ValueError): pass # Load codeBlockDiffs diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_models_wired_at_read_sites.py b/tests/test_models_wired_at_read_sites.py index f9ca432..491ca58 100644 --- a/tests/test_models_wired_at_read_sites.py +++ b/tests/test_models_wired_at_read_sites.py @@ -128,6 +128,38 @@ def test_workspace_tabs_endpoint_calls_bubble_from_dict(self): "model is defined but not wired at the production read site", ) + def test_bubble_schema_drift_is_logged_not_swallowed_silently(self): + # CodeRabbit: SchemaError used to be lumped in with JSONDecodeError / + # ValueError and skipped silently. Schema drift must now print a + # `Schema drift in bubble ` line so disappearing bubbles can be + # traced. The well-formed row still loads alongside. + from app import create_app + # Seed a deliberately-malformed bubble row that will trip + # Bubble.from_dict's "expected non-empty str" gate on the bubble_id by + # putting a non-dict at the value slot. + global_db = os.path.join(self._tmp.name, "globalStorage", "state.vscdb") + with closing(sqlite3.connect(global_db)) as conn: + conn.execute( + "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)", + (f"bubbleId:{COMPOSER_ID}:bub-bad", json.dumps("not-a-dict")), + ) + conn.commit() + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + import io + from contextlib import redirect_stdout + captured = io.StringIO() + with redirect_stdout(captured): + client = app.test_client() + response = client.get("/api/search?q=sentinel-wired") + self.assertEqual(response.status_code, 200) + out = captured.getvalue() + self.assertIn("Schema drift in bubble", out, + msg=f"expected drift log line, got stdout:\n{out!r}") + self.assertIn("bub-bad", out, + msg="drift log must include the offending bubble id") + def test_workspace_tabs_endpoint_calls_composer_from_dict(self): # Brad's most-important finding: list_workspaces() at api/workspaces.py:605 # validates each composer with Composer.from_dict, but get_workspace_tabs() @@ -289,6 +321,47 @@ def test_get_composer_calls_workspace_local_composer_from_dict(self): "schema validation that list_composers performs.", ) + def test_get_composer_handles_non_dict_envelope_via_fallback(self): + # CodeRabbit: data.get(...) used to crash with AttributeError when the + # per-workspace blob isn't a dict. Now must be caught as SchemaError + # so the function falls through to the global fallback instead of 500. + from app import create_app + ws_db = os.path.join(self.workspace_path, WORKSPACE_ID, "state.vscdb") + with closing(sqlite3.connect(ws_db)) as conn: + # Replace the dict envelope with a list — would have raised + # AttributeError on data.get(...) before the guards landed. + conn.execute( + "UPDATE ItemTable SET value = ? WHERE [key] = 'composer.composerData'", + (json.dumps(["not", "a", "dict"]),), + ) + conn.commit() + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + client = app.test_client() + response = client.get(f"/api/composers/{COMPOSER_ID}") + # Global fallback still has the composer seeded, so this must return + # 200, not 500. The per-workspace drift gets logged and skipped. + self.assertEqual(response.status_code, 200) + self.assertEqual(response.get_json().get("name"), "Wired conversation") + + def test_get_composer_handles_non_list_all_composers_via_fallback(self): + from app import create_app + ws_db = os.path.join(self.workspace_path, WORKSPACE_ID, "state.vscdb") + with closing(sqlite3.connect(ws_db)) as conn: + conn.execute( + "UPDATE ItemTable SET value = ? WHERE [key] = 'composer.composerData'", + (json.dumps({"allComposers": "should-be-list"}),), + ) + conn.commit() + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + client = app.test_client() + response = client.get(f"/api/composers/{COMPOSER_ID}") + # Drift surfaces via global fallback, not as a 500. + self.assertEqual(response.status_code, 200) + def test_get_composer_calls_composer_from_dict_on_global_fallback(self): # When the per-workspace path misses (composer only in globalStorage), # the fallback must validate via Composer.from_dict — not just decode From 017d408ed2ca9bf52867721580dbfb7ffea262bf Mon Sep 17 00:00:00 2001 From: timon0305 Date: Fri, 15 May 2026 03:54:16 +0200 Subject: [PATCH 10/10] =?UTF-8?q?review:=202=20CodeRabbit=20findings=20on?= =?UTF-8?q?=20PR=20#30=20=E2=80=94=20response-shape=20parity=20+=20alias-v?= =?UTF-8?q?ote=20drift=20guard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * api/composers.py — get_composer() returned three different shapes for the same conceptual `conversation` field depending on which branch resolved the composer: - list_composers (line 89): `c["conversation"] = c.get("conversation") or []` - get_composer per-workspace (line 157): `return jsonify(local.raw)` — raw, no normalisation - get_composer global fallback (line 185): `payload.setdefault("conversation", [])` — fills missing but not None All three sites now use the line-89 idiom, so the endpoint family returns an identical shape regardless of source. * api/workspaces.py — `_infer_invalid_workspace_aliases()` was reparsing raw composer_rows with `json.loads` + bare `except Exception` *before* the per-row Composer.from_dict gates in list_workspaces() / get_workspace_tabs() ran. That meant schema-drifted composers could still cast votes in the workspace-alias inference and misassign otherwise-valid composers to the wrong workspace. Now gates via `Composer.from_dict(...)` with a narrow `(SchemaError, JSONDecodeError, TypeError, ValueError)` catch. Existing test_majority_vote_alias_selection fixture predated the strict createdAt gate from issue #24; added `createdAt` to each of the three composer rows so the fixture reflects real production shape. New test_drifted_composer_does_not_skew_vote pins the regression: seeds two well-formed boost-ws votes plus one drifted row that would vote for team-ws if counted, asserts the alias resolves to boost-ws. Verified: - mypy --ignore-missing-imports --no-strict-optional . → Success - ruff check api/ utils/ scripts/export.py app.py → All checks passed - unittest discover tests → 224 / 224 OK (was 223 + 1 new regression test) --- api/composers.py | 11 ++++-- api/workspaces.py | 11 ++++-- tests/test_invalid_workspace_aliases.py | 47 +++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 7 deletions(-) diff --git a/api/composers.py b/api/composers.py index c8ff0ee..dc85450 100644 --- a/api/composers.py +++ b/api/composers.py @@ -154,7 +154,14 @@ def get_composer(composer_id): # JSON the list endpoint hid. print(f"Schema drift in workspace-local composer {composer_id}: {e}") continue - return jsonify(local.raw) + # Match list_composers() at line 89 and the global + # fallback below: `conversation` is normalised to [] + # whether it's absent or None, so the response shape + # is identical regardless of which branch resolved + # the composer (CodeRabbit on PR #30). + payload = dict(local.raw) + payload["conversation"] = payload.get("conversation") or [] + return jsonify(payload) except SchemaError as e: print(f"Schema drift in {db_path}: {e}") except (OSError, sqlite3.Error, json.JSONDecodeError, ValueError): @@ -182,7 +189,7 @@ def get_composer(composer_id): print(f"Schema drift in composer {composer_id}: {e}") return jsonify({"error": "Composer schema drift"}), 404 payload = dict(composer.raw) - payload.setdefault("conversation", []) + payload["conversation"] = payload.get("conversation") or [] return jsonify(payload) except (OSError, sqlite3.Error, json.JSONDecodeError, ValueError): pass diff --git a/api/workspaces.py b/api/workspaces.py index 5439da3..7d01aa5 100644 --- a/api/workspaces.py +++ b/api/workspaces.py @@ -447,8 +447,15 @@ def _infer_invalid_workspace_aliases( if mapped not in invalid_workspace_ids: continue try: - cd = json.loads(row["value"]) - except Exception: + # Validate via Composer.from_dict so a schema-drifted row can't + # steer the alias vote and misassign otherwise-valid composers to + # the wrong workspace. The downstream per-row loops in + # list_workspaces() / get_workspace_tabs() already drop drift, + # but this helper runs BEFORE that loop and its votes shape + # invalid_workspace_aliases for every other composer. + composer = Composer.from_dict(json.loads(row["value"]), composer_id=cid) + cd = composer.raw + except (SchemaError, json.JSONDecodeError, TypeError, ValueError): continue inferred = _determine_project_for_conversation( cd, diff --git a/tests/test_invalid_workspace_aliases.py b/tests/test_invalid_workspace_aliases.py index 1236f7f..ae9ee81 100644 --- a/tests/test_invalid_workspace_aliases.py +++ b/tests/test_invalid_workspace_aliases.py @@ -11,10 +11,13 @@ class TestInvalidWorkspaceAliases(unittest.TestCase): def test_majority_vote_alias_selection(self): + # `createdAt` is required by Composer.from_dict (issue #24's strict + # numeric-millis gate). Production rows always carry it; the original + # fixture predated the gate. composer_rows = [ - {"key": "composerData:cid-1", "value": json.dumps({"fullConversationHeadersOnly": []})}, - {"key": "composerData:cid-2", "value": json.dumps({"fullConversationHeadersOnly": []})}, - {"key": "composerData:cid-3", "value": json.dumps({"fullConversationHeadersOnly": []})}, + {"key": "composerData:cid-1", "value": json.dumps({"createdAt": 1_715_000_000_000, "fullConversationHeadersOnly": []})}, + {"key": "composerData:cid-2", "value": json.dumps({"createdAt": 1_715_000_000_000, "fullConversationHeadersOnly": []})}, + {"key": "composerData:cid-3", "value": json.dumps({"createdAt": 1_715_000_000_000, "fullConversationHeadersOnly": []})}, ] composer_id_to_ws = { "cid-1": "invalid-ws", @@ -46,6 +49,44 @@ def test_majority_vote_alias_selection(self): self.assertEqual(aliases.get("invalid-ws"), "boost-ws") + def test_drifted_composer_does_not_skew_vote(self): + # CodeRabbit regression check: a schema-drifted composer + # (e.g. missing createdAt) must NOT cast a vote, because if it did + # it could outweigh well-formed composers and misassign every other + # composer mapped to the same invalid workspace. + composer_rows = [ + # Two well-formed votes for boost-ws + {"key": "composerData:cid-1", "value": json.dumps({"createdAt": 1_715_000_000_000, "fullConversationHeadersOnly": []})}, + {"key": "composerData:cid-2", "value": json.dumps({"createdAt": 1_715_000_000_000, "fullConversationHeadersOnly": []})}, + # One drifted row that, if counted, would vote for team-ws + # (no createdAt → Composer.from_dict raises SchemaError → skip) + {"key": "composerData:cid-3", "value": json.dumps({"fullConversationHeadersOnly": []})}, + ] + composer_id_to_ws = {"cid-1": "invalid-ws", "cid-2": "invalid-ws", "cid-3": "invalid-ws"} + project_layouts_map = { + "cid-1": [normalize_file_path(r"d:\_Cpp_Digest\boostbacklog")], + "cid-2": [normalize_file_path(r"d:\_Cpp_Digest\boostbacklog")], + "cid-3": [normalize_file_path(r"d:\_Cpp_Digest\team-brain")], + } + workspace_path_map = { + normalize_file_path(r"d:\_cpp_digest\boostbacklog"): "boost-ws", + normalize_file_path(r"d:\_cpp_digest\team-brain"): "team-ws", + } + + aliases = _infer_invalid_workspace_aliases( + composer_rows=composer_rows, + project_layouts_map=project_layouts_map, + project_name_map={}, + workspace_path_map=workspace_path_map, + workspace_entries=[], + bubble_map={}, + composer_id_to_ws=composer_id_to_ws, + invalid_workspace_ids={"invalid-ws"}, + ) + + # cid-3 is dropped (drift), so boost-ws wins 2-0 (not 2-1) + self.assertEqual(aliases.get("invalid-ws"), "boost-ws") + if __name__ == "__main__": unittest.main()