Skip to content

Commit 1f83f12

Browse files
committed
feat: typed models + schema validation at DB read boundaries (closes #24)
Add models/ package with @dataclass definitions for Workspace, Composer, Bubble, CliSessionMeta, ExportEntry. Each model has a from_dict() classmethod that raises SchemaError when critical fields are missing, replacing silent dict.get() fallbacks at the four database read boundaries identified by the eval (Section 5.1, Schema Fragility). Wire into:
- api/workspaces.py:601 — composerData rows in workspace listing
- api/composers.py:57-63 — allComposers envelope in per-workspace fetch
- api/search.py:163-164 — composerData rows during search
- utils/cli_chat_reader.py:93-98 — CLI session meta blob in traverse_blobs
Schema drift surfaces as a printed warning + skipped row, not a silent empty result. Existing call sites preserve their JSON response shapes (behavior-preserving wire-in).
1 parent 95d3140 commit 1f83f12

11 files changed

Lines changed: 500 additions & 20 deletions

File tree

api/composers.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from utils.workspace_path import resolve_workspace_path
1515
from utils.path_helpers import to_epoch_ms
16+
from models import SchemaError
1617

1718
bp = Blueprint("composers", __name__)
1819

@@ -54,13 +55,24 @@ def list_composers():
5455

5556
if row and row[0]:
5657
data = json.loads(row[0])
58+
if "allComposers" not in data:
59+
raise SchemaError("WorkspaceComposers", "allComposers")
5760
all_composers = data.get("allComposers")
58-
if isinstance(all_composers, list):
59-
for c in all_composers:
60-
c["conversation"] = c.get("conversation") or []
61-
c["workspaceId"] = name
62-
c["workspaceFolder"] = workspace_folder
63-
composers.append(c)
61+
if not isinstance(all_composers, list):
62+
raise SchemaError(
63+
"WorkspaceComposers",
64+
"allComposers",
65+
hint=f"expected list, got {type(all_composers).__name__}",
66+
)
67+
for c in all_composers:
68+
if not isinstance(c, dict) or not c.get("composerId"):
69+
continue
70+
c["conversation"] = c.get("conversation") or []
71+
c["workspaceId"] = name
72+
c["workspaceFolder"] = workspace_folder
73+
composers.append(c)
74+
except SchemaError as e:
75+
print(f"Schema drift in {db_path}: {e}")
6476
except Exception:
6577
pass
6678

api/search.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from utils.path_helpers import normalize_file_path, get_workspace_folder_paths, to_epoch_ms
1919
from utils.text_extract import extract_text_from_bubble
2020
from utils.cli_chat_reader import list_cli_projects, traverse_blobs, messages_to_bubbles
21+
from models import Composer, SchemaError
2122

2223
bp = Blueprint("search", __name__)
2324

@@ -161,17 +162,24 @@ def search():
161162
for row in composer_rows:
162163
composer_id = row["key"].split(":")[1]
163164
try:
164-
cd = json.loads(row["value"])
165-
headers = cd.get("fullConversationHeadersOnly") or []
165+
composer = Composer.from_dict(json.loads(row["value"]), composer_id=composer_id)
166+
except SchemaError as e:
167+
print(f"Schema drift in composer {composer_id}: {e}")
168+
continue
169+
except (json.JSONDecodeError, TypeError, ValueError):
170+
continue
171+
try:
172+
cd = composer.raw
173+
headers = composer.full_conversation_headers_only
166174
if not headers:
167175
continue
168176

169-
title = cd.get("name") or ""
177+
title = composer.name or ""
170178
ws_id = composer_id_to_ws.get(composer_id, "global")
171179
ws_name = ws_id_to_name.get(ws_id)
172180
project_name = ws_name or ("Other chats" if ws_id == "global" else ws_id)
173181

174-
model_config = cd.get("modelConfig") or {}
182+
model_config = composer.model_config
175183
model_name = model_config.get("modelName")
176184
model_names = [model_name] if model_name and model_name != "default" else None
177185

@@ -243,7 +251,7 @@ def search():
243251
"workspaceFolder": ws_name,
244252
"chatId": composer_id,
245253
"chatTitle": title,
246-
"timestamp": to_epoch_ms(cd.get("lastUpdatedAt")) or to_epoch_ms(cd.get("createdAt")) or int(datetime.now().timestamp() * 1000),
254+
"timestamp": to_epoch_ms(composer.last_updated_at) or to_epoch_ms(composer.created_at) or int(datetime.now().timestamp() * 1000),
247255
"matchingText": matching_text,
248256
"type": "composer",
249257
})

api/workspaces.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
)
3434
from utils.text_extract import extract_text_from_bubble, format_tool_action
3535
from utils.exclusion_rules import build_searchable_text, is_excluded_by_rules
36+
from models import Composer, SchemaError
3637

3738
bp = Blueprint("workspaces", __name__)
3839

@@ -600,9 +601,15 @@ def list_workspaces():
600601
for row in composer_rows:
601602
cid = row["key"].split(":")[1]
602603
try:
603-
cd = json.loads(row["value"])
604+
composer = Composer.from_dict(json.loads(row["value"]), composer_id=cid)
605+
except SchemaError as e:
606+
print(f"Schema drift in composer {cid}: {e}")
607+
continue
608+
except (json.JSONDecodeError, TypeError, ValueError):
609+
continue
610+
try:
604611
pid = _determine_project_for_conversation(
605-
cd, cid, project_layouts_map,
612+
composer.raw, cid, project_layouts_map,
606613
project_name_map, workspace_path_map,
607614
workspace_entries, bubble_map, composer_id_to_ws, invalid_workspace_ids
608615
)
@@ -611,16 +618,16 @@ def list_workspaces():
611618
pid = invalid_workspace_aliases.get(mapped_ws)
612619
assigned = pid if pid else "global"
613620

614-
headers = cd.get("fullConversationHeadersOnly") or []
621+
headers = composer.full_conversation_headers_only
615622
has_bubbles = any(bubble_map.get(h.get("bubbleId")) for h in headers)
616623
if not has_bubbles:
617624
continue
618625

619626
conversation_map.setdefault(assigned, []).append({
620627
"composerId": cid,
621-
"name": cd.get("name") or f"Conversation {cid[:8]}",
622-
"lastUpdatedAt": to_epoch_ms(cd.get("lastUpdatedAt")) or to_epoch_ms(cd.get("createdAt")) or 0,
623-
"createdAt": to_epoch_ms(cd.get("createdAt")) or 0,
628+
"name": composer.name or f"Conversation {cid[:8]}",
629+
"lastUpdatedAt": to_epoch_ms(composer.last_updated_at) or to_epoch_ms(composer.created_at) or 0,
630+
"createdAt": to_epoch_ms(composer.created_at) or 0,
624631
})
625632
except Exception:
626633
pass

models/__init__.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""Typed domain models for Cursor schema (closes #24).
2+
3+
Cursor's on-disk JSON shapes are not versioned, so silent renames of fields
4+
like ``composerData`` or ``latestRootBlobId`` would otherwise pass through
5+
``dict.get(...)`` with a fallback default and produce empty conversations
6+
with no error raised. The models here add a schema-validation boundary at
7+
database read sites: ``from_dict`` classmethods raise ``SchemaError`` when
8+
critical fields are missing, so drift becomes loud instead of silent.
9+
"""
10+
11+
from models.cli_session import CliSessionMeta
12+
from models.conversation import Bubble, Composer
13+
from models.errors import SchemaError
14+
from models.export import ExportEntry
15+
from models.workspace import Workspace
16+
17+
__all__ = [
18+
"Bubble",
19+
"CliSessionMeta",
20+
"Composer",
21+
"ExportEntry",
22+
"SchemaError",
23+
"Workspace",
24+
]

models/cli_session.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""CliSessionMeta — typed model for the Cursor CLI ``meta`` blob."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass, field
6+
from typing import Any
7+
8+
from models.errors import SchemaError
9+
10+
11+
@dataclass(frozen=True)
class CliSessionMeta:
    """The ``meta`` blob at the head of a Cursor CLI ``store.db`` blob graph.

    ``latestRootBlobId`` is the entry point for the conversation reconstruction
    BFS in ``utils/cli_chat_reader.traverse_blobs``; without it, the entire
    conversation is unreachable. ``createdAt`` is documented as part of the
    meta-blob schema (see ``utils/cli_chat_reader`` module docstring) and is
    captured here, but it is not gated on — only ``latestRootBlobId`` is the
    hard requirement, since that is the only field whose absence prevents
    conversation reconstruction.
    """

    latest_root_blob_id: str
    created_at: Any = None
    # The untouched decoded payload, kept so callers can still read other keys.
    raw: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, raw: dict[str, Any]) -> "CliSessionMeta":
        """Build a ``CliSessionMeta`` from a decoded ``meta`` blob.

        Args:
            raw: The decoded JSON value of the meta blob.

        Returns:
            A model holding ``latestRootBlobId``, ``createdAt`` (or ``None``)
            and the original ``raw`` mapping.

        Raises:
            SchemaError: If ``raw`` is not a dict, or ``latestRootBlobId`` is
                missing, falsy, or not a string.
        """
        # A non-object payload (list, str, number, None) would otherwise fail
        # with AttributeError on ``.get``; report it as schema drift instead.
        if not isinstance(raw, dict):
            raise SchemaError(
                "CliSessionMeta",
                "latestRootBlobId",
                hint=f"expected JSON object, got {type(raw).__name__}",
            )
        latest = raw.get("latestRootBlobId")
        if not latest:
            raise SchemaError("CliSessionMeta", "latestRootBlobId")
        if not isinstance(latest, str):
            raise SchemaError(
                "CliSessionMeta",
                "latestRootBlobId",
                hint=f"expected str, got {type(latest).__name__}",
            )
        return cls(
            latest_root_blob_id=latest,
            created_at=raw.get("createdAt"),
            raw=raw,
        )

models/conversation.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""Composer (conversation) and Bubble (message) typed models."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass, field
6+
from typing import Any
7+
8+
from models.errors import SchemaError
9+
10+
11+
@dataclass(frozen=True)
class Composer:
    """A Cursor conversation (a.k.a. "composer") row.

    Required fields per the schema-validation contract:
      - ``fullConversationHeadersOnly`` — without this, a composer cannot be
        rendered (no message order is recoverable). This is the only hard
        requirement: real Cursor data legitimately omits ``createdAt`` for
        older composers (the existing call sites already fall back to
        ``lastUpdatedAt`` and then to epoch zero), so it is captured but
        not gated on.

    The composer ID is intentionally passed in as a constructor argument
    rather than read from ``raw`` because Cursor stores it in the row key
    (``composerData:<id>``) rather than in the JSON value.
    """

    composer_id: str
    full_conversation_headers_only: list[dict[str, Any]]
    # Not gated on, so it defaults to None like the other optional fields.
    created_at: Any = None
    name: str | None = None
    last_updated_at: Any = None
    model_config: dict[str, Any] = field(default_factory=dict)
    # The untouched decoded payload, kept for code that still walks the dict.
    raw: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, raw: dict[str, Any], *, composer_id: str) -> "Composer":
        """Build a ``Composer`` from a decoded ``composerData`` JSON value.

        Args:
            raw: The decoded JSON value of the row.
            composer_id: The ID taken from the row key.

        Returns:
            A model with the headers list (``None`` or another falsy value is
            normalised to ``[]``), timestamps, name, model config (a non-dict
            or falsy value is normalised to ``{}``) and the original ``raw``.

        Raises:
            SchemaError: If ``composer_id`` is empty, ``raw`` is not a dict,
                ``fullConversationHeadersOnly`` is absent, or it is present
                but truthy and not a list.
        """
        if not composer_id:
            raise SchemaError("Composer", "composerId", hint="empty composer ID")
        # A non-object payload could slip past the ``in`` test below (e.g. a
        # list containing the key name) and then fail with AttributeError on
        # ``.get``; report it as schema drift instead.
        if not isinstance(raw, dict):
            raise SchemaError(
                "Composer",
                "fullConversationHeadersOnly",
                hint=f"expected JSON object, got {type(raw).__name__}",
            )
        if "fullConversationHeadersOnly" not in raw:
            raise SchemaError("Composer", "fullConversationHeadersOnly")

        headers = raw.get("fullConversationHeadersOnly") or []
        if not isinstance(headers, list):
            raise SchemaError(
                "Composer",
                "fullConversationHeadersOnly",
                hint=f"expected list, got {type(headers).__name__}",
            )

        # modelConfig is optional: anything unusable degrades to an empty dict.
        model_config = raw.get("modelConfig") or {}
        if not isinstance(model_config, dict):
            model_config = {}

        return cls(
            composer_id=composer_id,
            full_conversation_headers_only=headers,
            created_at=raw.get("createdAt"),
            name=raw.get("name"),
            last_updated_at=raw.get("lastUpdatedAt"),
            model_config=model_config,
            raw=raw,
        )
64+
65+
66+
@dataclass(frozen=True)
class Bubble:
    """One message bubble belonging to a composer.

    Cursor keeps the bubble ID in the row key
    (``bubbleId:<composer_id>:<bubble_id>``), not in the JSON value, which is
    why it is supplied by the caller. The decoded payload is stored unchanged
    in ``raw`` so downstream rendering code that still walks the untyped
    shape keeps working.
    """

    bubble_id: str
    raw: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, raw: dict[str, Any], *, bubble_id: str) -> "Bubble":
        """Wrap ``raw`` in a ``Bubble``; raise ``SchemaError`` if the ID is empty."""
        if bubble_id:
            return cls(bubble_id=bubble_id, raw=raw)
        raise SchemaError("Bubble", "bubbleId", hint="empty bubble ID")

models/errors.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""Exception types for the typed-model schema-validation layer."""
2+
3+
from __future__ import annotations
4+
5+
6+
class SchemaError(ValueError):
    """Raised when a required Cursor schema field is missing or malformed.

    Inherits from ``ValueError`` so call sites that already catch generic
    deserialisation errors (e.g. ``json.JSONDecodeError`` is a subclass of
    ``ValueError``) also catch schema drift without needing a separate
    ``except`` clause. New code should catch ``SchemaError`` explicitly.

    The constructor arguments stay available on the instance as ``model``,
    ``field`` and ``hint`` so handlers can inspect them.
    """

    def __init__(self, model: str, field: str, *, hint: str | None = None) -> None:
        """Create the error and format its message.

        Args:
            model: Name of the typed model being built.
            field: Name of the offending field.
            hint: Optional extra detail appended in parentheses.
        """
        self.model = model
        self.field = field
        self.hint = hint
        message = f"{model}: missing required field '{field}'"
        if hint:
            message = f"{message} ({hint})"
        super().__init__(message)

    @classmethod
    def _rebuild(cls, model: str, field: str, hint: str | None) -> "SchemaError":
        """Recreate an instance from its constructor arguments (used by ``__reduce__``)."""
        return cls(model, field, hint=hint)

    def __reduce__(self):
        """Support ``copy`` and ``pickle``.

        The inherited implementation rebuilds the exception from ``args``,
        which here holds only the formatted message, so the two-argument
        constructor would fail with ``TypeError``. Rebuild from the original
        arguments instead; the instance ``__dict__`` is passed along as
        state, as the inherited implementation does.
        """
        return (
            type(self)._rebuild,
            (self.model, self.field, self.hint),
            dict(vars(self)),
        )

models/export.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""ExportEntry — typed model for an export manifest record (JSONL line)."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass, field
6+
from typing import Any
7+
8+
from models.errors import SchemaError
9+
10+
11+
@dataclass(frozen=True)
class ExportEntry:
    """One record of the export manifest (a single ``manifest.jsonl`` line).

    The required fields are the YAML-frontmatter keys that downstream tooling
    indexes against: without ``log_id`` the entry is unaddressable, and
    without ``title`` the output is unreadable. Timestamps are optional,
    because not every Cursor conversation has both a creation and an update
    time.
    """

    log_id: str
    title: str
    workspace: str
    created_at: Any = None
    updated_at: Any = None
    raw: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, raw: dict[str, Any]) -> "ExportEntry":
        """Validate ``raw`` and build an ``ExportEntry``.

        ``log_id``, ``title`` and ``workspace`` must be present and neither
        ``None`` nor the empty string; otherwise ``SchemaError`` is raised for
        the first offending key. The three values are converted with ``str``.
        """
        mandatory = ("log_id", "title", "workspace")
        for key in mandatory:
            present = key in raw and raw[key] not in (None, "")
            if not present:
                raise SchemaError("ExportEntry", key)
        as_text = {key: str(raw[key]) for key in mandatory}
        return cls(
            **as_text,
            created_at=raw.get("created_at"),
            updated_at=raw.get("updated_at"),
            raw=raw,
        )

models/workspace.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""Workspace — typed model for a single Cursor workspace folder."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass, field
6+
from typing import Any
7+
8+
from models.errors import SchemaError
9+
10+
11+
@dataclass(frozen=True)
class Workspace:
    """A Cursor workspace entry.

    The workspace ID is the directory name on disk (Cursor uses random
    short hashes as workspace IDs) and is passed in explicitly. ``folder``
    is the absolute path of the project the workspace targets, read from
    ``workspace.json``; it may legitimately be ``None`` for a CLI-only
    workspace, so missing-folder is not a schema error.
    """

    workspace_id: str
    folder: str | None = None
    # The untouched decoded payload, kept so callers can still read other keys.
    raw: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, raw: dict[str, Any], *, workspace_id: str) -> "Workspace":
        """Build a ``Workspace`` from a decoded ``workspace.json`` value.

        Args:
            raw: The decoded JSON value.
            workspace_id: The workspace ID supplied by the caller.

        Returns:
            A model with the ID, the ``folder`` value (or ``None``) and the
            original ``raw`` mapping.

        Raises:
            SchemaError: If ``workspace_id`` is empty, ``raw`` is not a dict,
                or ``folder`` is present but neither ``None`` nor a string.
        """
        if not workspace_id:
            raise SchemaError("Workspace", "workspaceId", hint="empty workspace ID")
        # A non-object payload would otherwise fail with AttributeError on
        # ``.get``; report it as schema drift instead.
        if not isinstance(raw, dict):
            raise SchemaError(
                "Workspace",
                "folder",
                hint=f"expected JSON object, got {type(raw).__name__}",
            )
        folder = raw.get("folder")
        if folder is not None and not isinstance(folder, str):
            raise SchemaError(
                "Workspace",
                "folder",
                hint=f"expected str or None, got {type(folder).__name__}",
            )
        return cls(workspace_id=workspace_id, folder=folder, raw=raw)

0 commit comments

Comments
 (0)