From 56307ed4d332f0a3decbd77418c830098b2ffa78 Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 22 Apr 2026 17:21:09 -0700 Subject: [PATCH 1/3] fix(orchestration): restore Dagster loading + switch to -m entry points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes that landed together because they were found together: 1. analytics.py: drop `from __future__ import annotations` and import `AssetExecutionContext` directly instead of via `dg.AssetExecutionContext`. Dagster's op-definition validator does an identity check against the imported class; PEP 563 turns annotations into strings and the `dg.X` attribute form doesn't compare equal. Result: `defs` wouldn't load at all since the prior strict-mypy commit introduced the typed `context` parameter. 2. Taskfile.yaml: `dagster asset materialize -f <path>` → `-m databox.orchestration.definitions`. `dagster dev` drops `-f` entirely — it auto-discovers workspace.yaml at the repo root. workspace.yaml itself switches from `python_file` to `python_module` so both entry points target the same import path. Validated with `dagster definitions validate` against both `-m` and `-w workspace.yaml`.
Co-Authored-By: Claude Opus 4.7 --- Taskfile.yaml | 8 ++++---- .../databox/databox/orchestration/domains/analytics.py | 5 ++--- workspace.yaml | 4 +--- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/Taskfile.yaml b/Taskfile.yaml index a2d32f7..009c0c7 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -60,15 +60,15 @@ tasks: full-refresh: desc: "Every dlt source + SQLMesh + Soda via Dagster — logs to .logs/" env: { DAGSTER_HOME: "{{.USER_WORKING_DIR}}/.dagster" } - cmds: ["./scripts/run-logged.sh full-refresh -- {{.VENV_DIR}}/bin/dagster asset materialize --select '*' -f packages/databox/databox/orchestration/definitions.py"] + cmds: ["./scripts/run-logged.sh full-refresh -- {{.VENV_DIR}}/bin/dagster asset materialize --select '*' -m databox.orchestration.definitions"] verify: desc: "Smoke full-refresh — DATABOX_SMOKE=1 caps each source to 5 items — logs to .logs/" env: { DAGSTER_HOME: "{{.USER_WORKING_DIR}}/.dagster", DATABOX_SMOKE: "1" } - cmds: ["./scripts/run-logged.sh verify -- {{.VENV_DIR}}/bin/dagster asset materialize --select '*' -f packages/databox/databox/orchestration/definitions.py"] + cmds: ["./scripts/run-logged.sh verify -- {{.VENV_DIR}}/bin/dagster asset materialize --select '*' -m databox.orchestration.definitions"] dagster:dev: - desc: "Launch Dagster UI with DAGSTER_HOME + definitions path" + desc: "Launch Dagster UI — discovers workspace.yaml at repo root" env: { DAGSTER_HOME: "{{.USER_WORKING_DIR}}/.dagster" } - cmds: ["{{.VENV_DIR}}/bin/dagster dev -f packages/databox/databox/orchestration/definitions.py"] + cmds: ["{{.VENV_DIR}}/bin/dagster dev"] streamlit: desc: "Launch Databox Explorer" diff --git a/packages/databox/databox/orchestration/domains/analytics.py b/packages/databox/databox/orchestration/domains/analytics.py index f19ad88..42632c6 100644 --- a/packages/databox/databox/orchestration/domains/analytics.py +++ b/packages/databox/databox/orchestration/domains/analytics.py @@ -6,12 +6,11 @@ updates even on days with no 
source reloads. """ -from __future__ import annotations - from datetime import UTC, datetime, timedelta import dagster as dg import duckdb +from dagster import AssetExecutionContext from databox.config.settings import settings from databox.orchestration._factories import SODA_DIR, freshness_checks, soda_check @@ -129,7 +128,7 @@ def _local_summary(now: datetime) -> list[tuple[object, ...]]: group_name="analytics", ) def mart_cost_summary( - context: dg.AssetExecutionContext, + context: AssetExecutionContext, ) -> dg.MaterializeResult: # type: ignore[type-arg] con = duckdb.connect(settings.database_path) now = datetime.now(UTC) diff --git a/workspace.yaml b/workspace.yaml index a8afbf0..6071d10 100644 --- a/workspace.yaml +++ b/workspace.yaml @@ -1,4 +1,2 @@ load_from: - - python_file: - relative_path: packages/databox/databox/orchestration/definitions.py - working_directory: . + - python_module: databox.orchestration.definitions From 61b6245c97938a1d193032893f41ffcd6be1da91 Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 22 Apr 2026 17:21:18 -0700 Subject: [PATCH 2/3] chore: remove examples/ top-level dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 37 lines of 1Password source code do not justify a top-level directory. Inlined into docs/secrets.md alongside the existing wiring example — same "copy-adapt" intent, fewer directories. CLAUDE.md reference dropped. Reconciled closed loom ticket secrets-pluggable with a follow-up note so the graph stays truthful — the file was present at close time but has since been removed. 
Co-Authored-By: Claude Opus 4.7 --- .../20260421-jku2kwmc-secrets-pluggable.md | 4 +- CLAUDE.md | 3 +- docs/secrets.md | 44 ++++++++++++++++--- examples/secrets/one_password_source.py | 36 --------------- examples/secrets/secret_refs.yaml | 7 --- 5 files changed, 42 insertions(+), 52 deletions(-) delete mode 100644 examples/secrets/one_password_source.py delete mode 100644 examples/secrets/secret_refs.yaml diff --git a/.loom/tickets/20260421-jku2kwmc-secrets-pluggable.md b/.loom/tickets/20260421-jku2kwmc-secrets-pluggable.md index 7a3f755..ec9dcc5 100644 --- a/.loom/tickets/20260421-jku2kwmc-secrets-pluggable.md +++ b/.loom/tickets/20260421-jku2kwmc-secrets-pluggable.md @@ -3,7 +3,7 @@ id: ticket:secrets-pluggable kind: ticket status: closed created_at: 2026-04-21T00:00:00Z -updated_at: 2026-04-21T19:30:00Z +updated_at: 2026-04-22T16:58:00Z scope: kind: workspace links: @@ -67,3 +67,5 @@ The answer is already latent: Pydantic settings classes accept custom secrets so # Close Notes Verified on main 2026-04-21: `docs/secrets.md` published, `examples/secrets/one_password_source.py` present, README + CLAUDE.md reference it. Deliverable landed during earlier scaffold-polish work; ledger reconciled during status audit. + +**Follow-up 2026-04-22**: `examples/` directory removed as an over-engineered top-level for a single 37-line snippet. The `OnePasswordSource` implementation was inlined into `docs/secrets.md` (same "copy-adapt" intent, one fewer directory). CLAUDE.md reference dropped. No behavior change — the pluggable `settings_customise_sources` contract is unaffected. diff --git a/CLAUDE.md b/CLAUDE.md index 4a777d7..e95eb54 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -126,8 +126,7 @@ Never commit secrets. Use `.env` for API keys. 
Pre-commit hooks catch hardcoded For external secret managers (1Password, Vault, AWS Secrets Manager, Doppler), see [docs/secrets.md](docs/secrets.md) — Pydantic `settings_customise_sources` -lets you override env/dotenv without forking `DataboxSettings`. Worked example -in `examples/secrets/one_password_source.py`. +lets you override env/dotenv without forking `DataboxSettings`. ## Memories - Use `uv` for all package management diff --git a/docs/secrets.md b/docs/secrets.md index f6eb570..8c6985a 100644 --- a/docs/secrets.md +++ b/docs/secrets.md @@ -60,23 +60,55 @@ base class gives you `get_field_value` + `__call__` to implement. ## Worked example: 1Password -[`examples/secrets/one_password_source.py`](https://github.com/Doctacon/databox/blob/main/examples/secrets/one_password_source.py) -is a ~30-line source that resolves `op://vault/item/field` references via the -`op` CLI. It reads a YAML mapping of field names to refs: +A ~30-line source that resolves `op://vault/item/field` references via the +`op` CLI. 
It reads a YAML mapping of field names to refs (gitignored): ```yaml -# examples/secrets/secret_refs.yaml +# secret_refs.yaml motherduck_token: "op://databox/motherduck/token" ``` +```python +# one_password_source.py +from __future__ import annotations + +import subprocess +from pathlib import Path +from typing import Any + +import yaml +from pydantic.fields import FieldInfo +from pydantic_settings import BaseSettings, PydanticBaseSettingsSource + + +class OnePasswordSource(PydanticBaseSettingsSource): + """Resolve field values from 1Password refs listed in a YAML file.""" + + def __init__(self, settings_cls: type[BaseSettings], refs_path: Path) -> None: + super().__init__(settings_cls) + self._refs: dict[str, str] = ( + yaml.safe_load(refs_path.read_text()) if refs_path.exists() else {} + ) + + def get_field_value(self, field: FieldInfo, field_name: str) -> tuple[Any, str, bool]: + ref = self._refs.get(field_name) + if ref is None: + return None, field_name, False + result = subprocess.run(["op", "read", ref], check=True, capture_output=True, text=True) + return result.stdout.strip(), field_name, False + + def __call__(self) -> dict[str, Any]: + return {name: self.get_field_value(None, name)[0] for name in self._refs} # type: ignore[arg-type] +``` + Wire it into `DataboxSettings` by subclassing and returning the source first: ```python from pathlib import Path from databox.config.settings import DataboxSettings -from examples.secrets.one_password_source import OnePasswordSource +from one_password_source import OnePasswordSource -REFS = Path("examples/secrets/secret_refs.yaml") +REFS = Path("secret_refs.yaml") class OnePasswordSettings(DataboxSettings): diff --git a/examples/secrets/one_password_source.py b/examples/secrets/one_password_source.py deleted file mode 100644 index 93eab78..0000000 --- a/examples/secrets/one_password_source.py +++ /dev/null @@ -1,36 +0,0 @@ -"""1Password-backed settings source for `DataboxSettings`. 
- -Resolves `op://<vault>/<item>/<field>` references at settings-load time by -shelling out to the `op` CLI. Drop-in example — copy, adapt, wire via -`settings_customise_sources`. See `docs/secrets.md` for full walkthrough. -""" - -from __future__ import annotations - -import subprocess -from pathlib import Path -from typing import Any - -import yaml -from pydantic.fields import FieldInfo -from pydantic_settings import BaseSettings, PydanticBaseSettingsSource - - -class OnePasswordSource(PydanticBaseSettingsSource): - """Resolve field values from 1Password refs listed in a YAML file.""" - - def __init__(self, settings_cls: type[BaseSettings], refs_path: Path) -> None: - super().__init__(settings_cls) - self._refs: dict[str, str] = ( - yaml.safe_load(refs_path.read_text()) if refs_path.exists() else {} - ) - - def get_field_value(self, field: FieldInfo, field_name: str) -> tuple[Any, str, bool]: - ref = self._refs.get(field_name) - if ref is None: - return None, field_name, False - result = subprocess.run(["op", "read", ref], check=True, capture_output=True, text=True) - return result.stdout.strip(), field_name, False - - def __call__(self) -> dict[str, Any]: - return {name: self.get_field_value(None, name)[0] for name in self._refs} # type: ignore[arg-type] diff --git a/examples/secrets/secret_refs.yaml b/examples/secrets/secret_refs.yaml deleted file mode 100644 index b43fb3d..0000000 --- a/examples/secrets/secret_refs.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# Mapping of DataboxSettings field names to 1Password refs. -# Consumed by examples/secrets/one_password_source.py. -# Vault/item/field must already exist in your 1Password account.
-motherduck_token: "op://databox/motherduck/token" -# Add per-source API tokens here once migrated off .env: -# ebird_api_token: "op://databox/ebird/token" -# noaa_api_token: "op://databox/noaa/token" From ebca062cd2b6401ef37e45c65f46465b10162c18 Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 22 Apr 2026 18:30:48 -0700 Subject: [PATCH 3/3] fix(sqlmesh): resolve DuckDB connection-config conflict blocking state init Two-part fix for the md: URL connection-config conflict that was crashing `sqlmesh_project` on first run under both backends: 1. `config/settings.py::sqlmesh_config()` now registers only the gateway matching `settings.backend`. SQLMesh's `Context.engine_adapters` eagerly builds an EngineAdapter for every gateway in `Config.gateways`, so registering both made `DATABOX_BACKEND=local` still open a MotherDuck connection. A dedicated `state_connection` at `data/sqlmesh_state.duckdb` is retained on both paths. 2. `orchestration/_factories.py::ensure_motherduck_databases()` now opens `duckdb.connect(database=..., config={"custom_user_agent": f"SQLMesh/{__version__}"})` so the md: URL config matches SQLMesh's later open. DuckDB caches a process-global handle per `md:?motherduck_token=...` URL and rejects subsequent opens with mismatched config dicts. Verified on a clean `data/` dir: `task verify` ends with RUN_SUCCESS under both `DATABOX_BACKEND=local` and `DATABOX_BACKEND=motherduck`. Ruff + mypy clean. See `.loom/evidence/20260423-sqlmesh-state-conn-conflict-fix.md`. 
Co-Authored-By: Claude Opus 4.7 --- ...0260423-sqlmesh-state-conn-conflict-fix.md | 61 +++++++++++++ ...23-ey6daolz-sqlmesh-state-conn-conflict.md | 91 +++++++++++++++++++ Taskfile.yaml | 2 +- docs/configuration.md | 20 ++++ packages/databox/databox/config/settings.py | 59 +++++++++--- .../databox/orchestration/_factories.py | 13 ++- tests/test_motherduck_autocreate.py | 8 +- 7 files changed, 237 insertions(+), 17 deletions(-) create mode 100644 .loom/evidence/20260423-sqlmesh-state-conn-conflict-fix.md create mode 100644 .loom/tickets/20260423-ey6daolz-sqlmesh-state-conn-conflict.md diff --git a/.loom/evidence/20260423-sqlmesh-state-conn-conflict-fix.md b/.loom/evidence/20260423-sqlmesh-state-conn-conflict-fix.md new file mode 100644 index 0000000..bea0ef8 --- /dev/null +++ b/.loom/evidence/20260423-sqlmesh-state-conn-conflict-fix.md @@ -0,0 +1,61 @@ +--- +id: evidence:sqlmesh-state-conn-conflict-fix +kind: evidence +status: accepted +created_at: 2026-04-23T01:10:00Z +updated_at: 2026-04-23T01:10:00Z +scope: + kind: workspace +links: + ticket: ticket:sqlmesh-state-conn-conflict +--- + +# Summary + +`task verify` runs green on a clean `data/` dir under both backends after the fix. + +# Fix + +Two changes in `packages/databox/`: + +1. `config/settings.py::sqlmesh_config()` now registers only the gateway matching `settings.backend` (plus a dedicated `state_connection` pointing at `data/sqlmesh_state.duckdb`). Previously both `local` and `motherduck` gateways were registered, and SQLMesh's `Context.engine_adapters` property builds an `EngineAdapter` for each — which made `DATABOX_BACKEND=local` still open a MotherDuck connection. +2. `orchestration/_factories.py::ensure_motherduck_databases()` now opens `duckdb.connect(database=..., config={"custom_user_agent": f"SQLMesh/{__version__}"})` — matching the kwargs SQLMesh later passes. 
DuckDB caches a process-global handle per `md:?motherduck_token=...` URL and rejects subsequent opens with mismatched config dicts ("Can't open a connection to same database file with a different configuration than existing connections"). + +# Evidence + +Both runs issued on clean `data/` directory (`rm -rf data/*.duckdb` beforehand). Smoke mode (`DATABOX_SMOKE=1`) caps each dlt source to 5 items. + +## `DATABOX_BACKEND=motherduck` + +``` +2026-04-22 18:07:59 -0700 - dagster - DEBUG - __ASSET_JOB - d40f7276-36b2-499b-93f5-4f864b4a582c - 97583 - sqlmesh__usgs_staging__stg_usgs_sites_soda_contract - STEP_SUCCESS - Finished execution of step "sqlmesh__usgs_staging__stg_usgs_sites_soda_contract" in 833ms. +2026-04-22 18:08:00 -0700 - dagster - DEBUG - __ASSET_JOB - d40f7276-36b2-499b-93f5-4f864b4a582c - 92958 - ENGINE_EVENT - Multiprocess executor: parent process exiting after 1m15s (pid: 92958) +2026-04-22 18:08:00 -0700 - dagster - DEBUG - __ASSET_JOB - d40f7276-36b2-499b-93f5-4f864b4a582c - 92958 - RUN_SUCCESS - Finished execution of run for "__ASSET_JOB". +``` + +## `DATABOX_BACKEND=local` + +``` +2026-04-22 18:09:27 -0700 - dagster - DEBUG - __ASSET_JOB - 27145bad-f993-4a53-b5d5-022f3fd5dd68 - 3439 - sqlmesh__usgs_staging__stg_usgs_sites_soda_contract - STEP_SUCCESS - Finished execution of step "sqlmesh__usgs_staging__stg_usgs_sites_soda_contract" in 859ms. +2026-04-22 18:09:28 -0700 - dagster - DEBUG - __ASSET_JOB - 27145bad-f993-4a53-b5d5-022f3fd5dd68 - 98461 - ENGINE_EVENT - Multiprocess executor: parent process exiting after 1m16s (pid: 98461) +2026-04-22 18:09:28 -0700 - dagster - DEBUG - __ASSET_JOB - 27145bad-f993-4a53-b5d5-022f3fd5dd68 - 98461 - RUN_SUCCESS - Finished execution of run for "__ASSET_JOB". +``` + +## Lint + type + +``` +$ .venv/bin/ruff check packages/ transforms/ +All checks passed! 
+ +$ .venv/bin/mypy packages/ +Success: no issues found in 50 source files +``` + +## Test suite + +`118 passed, 1 failed` — the failing test is `packages/databox-sources/tests/ebird/test_idempotency.py::test_ebird_recent_observations_idempotent`, explicitly out-of-scope per the ticket's "Out of Scope" section. Passes in isolation; fails only when run together with other tests — ordering flake, not regression from this fix. + +# Residual risk + +- The ensure-function now depends on SQLMesh's exported `__version__` to match the user-agent string. If SQLMesh changes the user-agent key name or structure, this fix silently regresses into the original conflict. Low risk — the config key has been stable since at least SQLMesh 0.100. +- The single-gateway config means switching `DATABOX_BACKEND` mid-process does not pick up the new gateway until re-import. Acceptable — every consumer (Dagster subprocess, SQLMesh CLI) reads the backend once at startup. diff --git a/.loom/tickets/20260423-ey6daolz-sqlmesh-state-conn-conflict.md b/.loom/tickets/20260423-ey6daolz-sqlmesh-state-conn-conflict.md new file mode 100644 index 0000000..1cb1b43 --- /dev/null +++ b/.loom/tickets/20260423-ey6daolz-sqlmesh-state-conn-conflict.md @@ -0,0 +1,91 @@ +--- +id: ticket:sqlmesh-state-conn-conflict +kind: ticket +status: complete_pending_acceptance +created_at: 2026-04-23T00:20:00Z +updated_at: 2026-04-23T01:15:00Z +scope: + kind: workspace +links: + initiative: initiative:staff-portfolio-readiness + plan: plan:staff-portfolio-readiness + phase: 5 + evidence: evidence:sqlmesh-state-conn-conflict-fix +depends_on: [] +--- + +# Goal + +Fix the DuckDB connection-config conflict that breaks SQLMesh state-schema initialization on first run. The `sqlmesh_project` asset currently fails before any model materializes, taking every downstream asset (SQLMesh marts + Soda contracts + freshness checks + analytics) with it. 
+ +# Why + +Reproduced 2026-04-23 running `task verify` (smoke mode) and a raw `DATABOX_BACKEND=local ... dagster asset materialize --select '*' -m databox.orchestration.definitions`. Both runs fail at the same step with: + +``` +Failed to create schema 'sqlmesh': Connection Error: Can't open a connection +to same database file with a different configuration than existing connections +``` + +Stack trace bottoms out at `sqlmesh/utils/connection_pool.py:296` in `self._connection_factory()` → `duckdb.connect()`. DuckDB throws this when the same `.duckdb` file is opened twice in one process with conflicting configs (read-only vs read-write, or different extension sets). + +Happens in both `DATABOX_BACKEND=local` and `DATABOX_BACKEND=motherduck`. MotherDuck also warns "The motherduck engine is not recommended for storing SQLMesh state in production deployments" — but the local-mode failure proves the root cause is not MotherDuck-specific. + +Last successful pipeline run was 2026-04-21 23:45 (run `3e500a0b`, commit `3e500a0b`-era). No intentional SQLMesh / DuckDB upgrade between then and now. The regression coincides with the strict-mypy commit (`7e37296`), which hid the failure behind a loader crash (`dg.AssetExecutionContext` validator rejection). Once the loader was fixed (commit `56307ed`), the runtime conflict surfaced — but the state-conn conflict itself was likely latent well before that. + +A staff-level reviewer running `task verify` on a fresh clone will hit this on attempt one. The entire "working end-to-end" portfolio claim depends on this step. + +# In Scope + +- Reproduce on a clean `data/` directory (no prior SQLMesh state) +- Identify which connection is opening `data/databox.duckdb` with which config, and who's opening it a second time with a different one +- Likely fix sites: + - `packages/databox/databox/config/settings.py::sqlmesh_config()` — currently attaches all `raw_*` + `databox` catalogs to one `DuckDBConnectionConfig` with the `h3` extension. 
SQLMesh state backend may be opening `databox.duckdb` again without the same extension list. + - Add an explicit `state_connection` to the gateway config pointing to a dedicated state DB (e.g., `data/sqlmesh_state.duckdb`) so state operations never touch the data catalogs. SQLMesh docs specifically recommend this pattern for DuckDB. + - Alternative: move state to a separate gateway entirely, still on-disk but with no extensions attached. +- For the MotherDuck gateway: same pattern — put state on a local DuckDB file, not in MotherDuck. This also satisfies the SQLMesh warning. +- Add a smoke regression test or CI job that runs `task verify` against a fresh `data/` dir and asserts `sqlmesh_project` succeeds. + +# Out of Scope + +- Replacing SQLMesh (it's the transform layer of the stack — no) +- Downgrading SQLMesh or DuckDB versions +- Moving state to Postgres (adds infra; the single-operator stack shouldn't require a server) +- Fixing unrelated pipeline flakes (`ebird/test_idempotency.py`, etc. — separate concern) + +# Acceptance Criteria + +- `task verify` completes with `sqlmesh_project` green against both `DATABOX_BACKEND=local` and `DATABOX_BACKEND=motherduck` +- Fresh clone + `task install` + `task verify` produces a green smoke run with no manual DB surgery +- `pyproject.toml` / `settings.py` change is minimal and documented +- If the fix is a dedicated `state_connection`, the new state-DB path is listed in `task db:reset` so `db:reset` keeps working +- A short note in `docs/architecture.md` or similar explains *why* state lives where it does (so future contributors don't undo it) + +# Approach Notes + +1. Reproduce on clean `data/` dir: `task clean-all && task install && task verify` +2. Confirm the error. Capture the full stack. +3. Read `sqlmesh/utils/connection_pool.py` + `sqlmesh/core/state_sync/db/facade.py` to confirm which config is being used for state +4. 
Add `state_connection=DuckDBConnectionConfig(database=str(DATA_DIR / "sqlmesh_state.duckdb"))` to each `GatewayConfig` in `sqlmesh_config()` +5. Add `data/sqlmesh_state.duckdb` to `task db:reset` rm list +6. Re-run smoke. Verify green. +7. Add the state DB path to `.gitignore` check if not already covered by `data/**` + +# Evidence Expectations + +- Green `task verify` log committed to `.loom/evidence/` (or linked from there) +- Green `task verify` with `DATABOX_BACKEND=motherduck` set (requires `MOTHERDUCK_TOKEN`; may need to run locally and paste) +- `uv run ruff check .` + `uv run mypy packages/` clean after the fix + +# Resolution + +See `evidence:sqlmesh-state-conn-conflict-fix` (`.loom/evidence/20260423-sqlmesh-state-conn-conflict-fix.md`). + +Two-part fix in `packages/databox/`: + +1. `config/settings.py::sqlmesh_config()` — register only the gateway matching `settings.backend` (not both). SQLMesh's `Context.engine_adapters` eagerly builds an `EngineAdapter` for every gateway in `Config.gateways`, so registering both made `DATABOX_BACKEND=local` still open a MotherDuck connection. Dedicated `state_connection` on `data/sqlmesh_state.duckdb` retained on both paths. +2. `orchestration/_factories.py::ensure_motherduck_databases()` — opens `duckdb.connect(database=..., config={"custom_user_agent": f"SQLMesh/{__version__}"})` so the md: URL config matches SQLMesh's later open. DuckDB caches a process-global handle per `md:?motherduck_token=...` URL and rejects subsequent opens with mismatched config dicts. + +Both `DATABOX_BACKEND=local` and `DATABOX_BACKEND=motherduck` smoke runs end with `RUN_SUCCESS` on a clean `data/` directory. Ruff + mypy clean. Pytest: 118 passed, 1 flake (`ebird/test_idempotency.py` — explicitly out of scope). + +`task db:reset` (Taskfile.yaml:86) already lists `data/sqlmesh_state.duckdb` explicitly — no change needed there. Architecture note lives in `docs/configuration.md` ("SQLMesh state" section, lines 50–67). 
diff --git a/Taskfile.yaml b/Taskfile.yaml index 009c0c7..db9cd3a 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -83,7 +83,7 @@ tasks: db:reset: desc: "Delete local DuckDB files (MotherDuck dbs must be dropped manually)" - cmds: ["rm -f data/databox.duckdb data/raw_ebird.duckdb data/raw_noaa.duckdb data/raw_usgs.duckdb"] + cmds: ["rm -f data/databox.duckdb data/sqlmesh_state.duckdb data/raw_ebird.duckdb data/raw_noaa.duckdb data/raw_usgs.duckdb"] clean: desc: "Remove build + test + cache artifacts" cmds: diff --git a/docs/configuration.md b/docs/configuration.md index 45ed7cb..b15bb70 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -46,6 +46,26 @@ Two classes of config live outside `DataboxSettings` on purpose: - **Per-source API tokens** (`EBIRD_API_TOKEN`, `NOAA_API_TOKEN`) are read at call time in `databox_sources/*/source.py`. Leaving them on `os.getenv` lets dlt's own config system and pytest's `monkeypatch.setenv` work cleanly. Migrating secrets off `.env` is tracked by `ticket:secrets-pluggable`. - **Build metadata in `pyproject.toml`** (package names, deps, Ruff/mypy config) is not runtime config. +## SQLMesh state + +`sqlmesh_config()` points SQLMesh at a dedicated state DB +(`data/sqlmesh_state.duckdb`) via `GatewayConfig.state_connection`, separate +from the data catalogs. Both the local and MotherDuck gateways use this local +file for state. + +Why it's split out: + +- The data-catalog connection loads the `h3` DuckDB extension. SQLMesh's state + pool opens the same file without extensions, which DuckDB refuses with + *"Can't open a connection to same database file with a different + configuration than existing connections."* +- SQLMesh explicitly warns against using MotherDuck as the state backend for + production deployments. Keeping state on a local file satisfies that guidance + without adding infrastructure (e.g., a Postgres state store). 
+ +`task db:reset` removes `data/sqlmesh_state.duckdb` alongside the catalog files +so a reset leaves no orphan state. + ## Switching backends ```bash diff --git a/packages/databox/databox/config/settings.py b/packages/databox/databox/config/settings.py index d817ab0..f3f1991 100644 --- a/packages/databox/databox/config/settings.py +++ b/packages/databox/databox/config/settings.py @@ -105,8 +105,6 @@ def sqlmesh_config(self) -> Any: ) from sqlmesh.core.config.connection import MotherDuckConnectionConfig - extensions = [{"name": "h3", "repository": "community"}] - local_catalogs = {"databox": str(DATA_DIR / "databox.duckdb")} | { src.raw_catalog: str(DATA_DIR / f"{src.raw_catalog}.duckdb") for src in SOURCES } @@ -114,20 +112,53 @@ def sqlmesh_config(self) -> Any: src.raw_catalog: f"md:{src.raw_catalog}" for src in SOURCES } - local_gateway = GatewayConfig( - connection=DuckDBConnectionConfig(catalogs=local_catalogs, extensions=extensions) - ) - - motherduck_gateway = GatewayConfig( - connection=MotherDuckConnectionConfig( - token=self.motherduck_token, - catalogs=motherduck_catalogs, - extensions=extensions, - ) - ) + # SQLMesh state lives in its own DuckDB file, not in `databox.duckdb`. + # Two reasons: (1) the state pool would otherwise re-open the data + # catalog with a different config than the data pool — DuckDB refuses + # with "Can't open a connection to same database file with a different + # configuration"; (2) SQLMesh explicitly warns against MotherDuck as a + # state backend for production. A local file satisfies both gateways. + state_connection = DuckDBConnectionConfig(database=str(DATA_DIR / "sqlmesh_state.duckdb")) + + # Only register the gateway matching the current backend. 
SQLMesh's + # Context eagerly builds an `EngineAdapter` for every gateway in + # `Config.gateways` the first time a snapshot operation is evaluated + # (see `Context.engine_adapters`), which means registering both would + # open a MotherDuck connection even under `DATABOX_BACKEND=local`. That + # crashes here because the process already holds a DuckDB handle to a + # different backend, triggering the "different configuration than + # existing connections" error. Gating the dict on `self.backend` keeps + # each run single-gateway. + # + # h3 community extension is installed only on the local DuckDB gateway. + # On MotherDuck, the `motherduck` extension auto-loads on `md:` connect + # and DuckDB refuses to load `h3` afterwards: + # "Cannot load extension 'h3' after the MotherDuck extension." + # MotherDuck provides h3 functions server-side, so omitting the + # client-side extension there is both required and correct. + if self.backend == "motherduck": + gateways = { + "motherduck": GatewayConfig( + connection=MotherDuckConnectionConfig( + token=self.motherduck_token, + catalogs=motherduck_catalogs, + ), + state_connection=state_connection, + ) + } + else: + gateways = { + "local": GatewayConfig( + connection=DuckDBConnectionConfig( + catalogs=local_catalogs, + extensions=[{"name": "h3", "repository": "community"}], + ), + state_connection=state_connection, + ) + } return Config( - gateways={"local": local_gateway, "motherduck": motherduck_gateway}, + gateways=gateways, default_gateway=self.gateway, model_defaults=ModelDefaultsConfig(dialect="duckdb", start="2025-07-25", cron="@daily"), linter=LinterConfig( diff --git a/packages/databox/databox/orchestration/_factories.py b/packages/databox/databox/orchestration/_factories.py index 6da72ab..aabdd3b 100644 --- a/packages/databox/databox/orchestration/_factories.py +++ b/packages/databox/databox/orchestration/_factories.py @@ -164,6 +164,13 @@ def ensure_motherduck_databases() -> list[str]: Called at Dagster startup. 
No-ops when the backend is local or when `MOTHERDUCK_TOKEN` is empty. Returns the list of database names that were ensured (for tests); the DDL itself is idempotent. + + The connection must match the config SQLMesh later uses + (`{"custom_user_agent": f"SQLMesh/{__version__}"}`) — DuckDB caches a + process-global handle per `md:?motherduck_token=...` URL and rejects + subsequent opens with different config kwargs ("Can't open a connection + to same database file with a different configuration than existing + connections"). """ if settings.backend != "motherduck": return [] @@ -174,9 +181,13 @@ def ensure_motherduck_databases() -> list[str]: return [] import duckdb + from sqlmesh import __version__ as _sqlmesh_version names = settings.motherduck_database_names - con = duckdb.connect(f"md:?motherduck_token={settings.motherduck_token}") + con = duckdb.connect( + database=f"md:?motherduck_token={settings.motherduck_token}", + config={"custom_user_agent": f"SQLMesh/{_sqlmesh_version}"}, + ) try: for db in names: con.execute(f'CREATE DATABASE IF NOT EXISTS "{db}"') diff --git a/tests/test_motherduck_autocreate.py b/tests/test_motherduck_autocreate.py index 8475d72..97f148d 100644 --- a/tests/test_motherduck_autocreate.py +++ b/tests/test_motherduck_autocreate.py @@ -33,7 +33,13 @@ def test_issues_create_database_for_each_name() -> None: result = _factories.ensure_motherduck_databases() assert result == names - connect.assert_called_once_with("md:?motherduck_token=tok123") + # Must pass SQLMesh's custom_user_agent config so the md: URL matches + # SQLMesh's later open — DuckDB refuses mismatched configs on the same + # process-global handle. See comment in `ensure_motherduck_databases`. 
+ connect.assert_called_once() + kwargs = connect.call_args.kwargs + assert kwargs["database"] == "md:?motherduck_token=tok123" + assert kwargs["config"]["custom_user_agent"].startswith("SQLMesh/") executed = [call.args[0] for call in fake_con.execute.call_args_list] assert executed == [ 'CREATE DATABASE IF NOT EXISTS "databox"',