diff --git a/UPDATING.md b/UPDATING.md
index fb37e3bcc04d..40278c7bedd1 100644
--- a/UPDATING.md
+++ b/UPDATING.md
@@ -470,6 +470,247 @@ See `superset/mcp_service/PRODUCTION.md` for deployment guides.
}
```
+### Composite primary keys on many-to-many association tables
+
+The eight M:N association tables listed below have been changed from a synthetic surrogate `id INTEGER PRIMARY KEY` to a composite `PRIMARY KEY (fk1, fk2)` on the two foreign-key columns. The `id` column is dropped, and the two tables that previously carried a redundant `UNIQUE (fk1, fk2)` constraint have that constraint removed (it is now subsumed by the composite primary key).
+
+**Affected tables and their composite-PK column pairs:**
+
+| Table | Composite PK |
+|---|---|
+| `dashboard_roles` | `(dashboard_id, role_id)` |
+| `dashboard_slices` | `(dashboard_id, slice_id)` |
+| `dashboard_user` | `(user_id, dashboard_id)` |
+| `report_schedule_user` | `(user_id, report_schedule_id)` |
+| `rls_filter_roles` | `(role_id, rls_filter_id)` |
+| `rls_filter_tables` | `(table_id, rls_filter_id)` |
+| `slice_user` | `(user_id, slice_id)` |
+| `sqlatable_user` | `(user_id, table_id)` |
+
+**Impact on external readers:** Any BI tool, custom report, backup script, or external integration that references these tables by their old surrogate `id` column (e.g., `SELECT id FROM dashboard_slices WHERE …`, `WHERE dashboard_slices.id IN (…)`) will break. Update such queries to project or filter on the FK pair (`dashboard_id, slice_id`) instead. The FK columns themselves are unchanged.
+
+**Pre-flight inventory queries.** Before applying the upgrade, operators are encouraged to run the queries below against their database to assess what the migration will change. Two classes of pre-existing data are not preserved by the migration: duplicate `(fk1, fk2)` rows (the migration keeps `MIN(id)` and deletes the rest) and rows with `NULL` in either FK column (the migration deletes them, since FK columns are promoted to `NOT NULL` for the composite PK). Compliance- or audit-sensitive operators should also `\copy` (Postgres) or `SELECT … INTO OUTFILE` (MySQL) the affected rows for their own records before upgrading.
+
+```sql
+-- Duplicate (fk1, fk2) pairs (the migration will keep MIN(id) per group, delete the rest)
+SELECT dashboard_id, role_id, COUNT(*) FROM dashboard_roles GROUP BY dashboard_id, role_id HAVING COUNT(*) > 1;
+SELECT dashboard_id, slice_id, COUNT(*) FROM dashboard_slices GROUP BY dashboard_id, slice_id HAVING COUNT(*) > 1;
+SELECT user_id, dashboard_id, COUNT(*) FROM dashboard_user GROUP BY user_id, dashboard_id HAVING COUNT(*) > 1;
+SELECT user_id, report_schedule_id, COUNT(*) FROM report_schedule_user GROUP BY user_id, report_schedule_id HAVING COUNT(*) > 1;
+SELECT role_id, rls_filter_id, COUNT(*) FROM rls_filter_roles GROUP BY role_id, rls_filter_id HAVING COUNT(*) > 1;
+SELECT table_id, rls_filter_id, COUNT(*) FROM rls_filter_tables GROUP BY table_id, rls_filter_id HAVING COUNT(*) > 1;
+SELECT user_id, slice_id, COUNT(*) FROM slice_user GROUP BY user_id, slice_id HAVING COUNT(*) > 1;
+SELECT user_id, table_id, COUNT(*) FROM sqlatable_user GROUP BY user_id, table_id HAVING COUNT(*) > 1;
+
+-- Rows with a NULL in either FK (the migration will delete these)
+SELECT COUNT(*) FROM dashboard_roles WHERE dashboard_id IS NULL OR role_id IS NULL;
+SELECT COUNT(*) FROM dashboard_slices WHERE dashboard_id IS NULL OR slice_id IS NULL;
+SELECT COUNT(*) FROM dashboard_user WHERE user_id IS NULL OR dashboard_id IS NULL;
+SELECT COUNT(*) FROM report_schedule_user WHERE user_id IS NULL OR report_schedule_id IS NULL;
+SELECT COUNT(*) FROM rls_filter_roles WHERE role_id IS NULL OR rls_filter_id IS NULL;
+SELECT COUNT(*) FROM rls_filter_tables WHERE table_id IS NULL OR rls_filter_id IS NULL;
+SELECT COUNT(*) FROM slice_user WHERE user_id IS NULL OR slice_id IS NULL;
+SELECT COUNT(*) FROM sqlatable_user WHERE user_id IS NULL OR table_id IS NULL;
+```
+
+**Sizing the maintenance window on PostgreSQL.** The queries above are dialect-portable but only count rows. Operators on PostgreSQL can run the diagnostic queries below to characterize the migration's runtime cost ahead of time: per-table row count and on-disk size, an aggregated duplicate roll-up, the external-FK pre-flight check (the migration runs the same check and aborts if it returns rows), and a lock-window estimate. On PostgreSQL **all eight tables take the direct-ALTER path** — the two redundant `UNIQUE` constraints are dropped by name (`DROP CONSTRAINT`), avoiding any full-table rewrite; the `recreate="always"` rewrite path applies only on MySQL/SQLite. Note also that Alembic runs the whole upgrade in **one transaction on PostgreSQL: the `ACCESS EXCLUSIVE` locks acquired per table are held cumulatively until commit**, so total unavailability of these RBAC/RLS junction tables is the *sum* of the per-table windows — and a waiting `ACCESS EXCLUSIVE` queues all later reads behind it. Run the migration with the application quiesced.
+
+```sql
+-- Per-table size and row count. Two tables ("dashboard_slices",
+-- "report_schedule_user") carry a redundant UNIQUE constraint; on
+-- PostgreSQL it is dropped by name (DROP CONSTRAINT) and every table
+-- then takes the same direct-ALTER path — no full-table rewrite on
+-- this dialect. has_unique only signals the extra DROP CONSTRAINT.
+WITH affected(name, has_unique) AS (
+  VALUES
+    ('dashboard_roles', false),
+    ('dashboard_slices', true),
+    ('dashboard_user', false),
+    ('report_schedule_user', true),
+    ('rls_filter_roles', false),
+    ('rls_filter_tables', false),
+    ('slice_user', false),
+    ('sqlatable_user', false)
+)
+SELECT
+  a.name AS table_name,
+  CASE WHEN a.has_unique THEN 'direct ALTER + DROP CONSTRAINT'
+       ELSE 'direct ALTER' END AS migration_path,
+  c.reltuples::bigint AS estimated_rows,
+  pg_size_pretty(pg_total_relation_size(c.oid)) AS total_size,
+  pg_size_pretty(pg_relation_size(c.oid)) AS heap_size,
+  pg_size_pretty(pg_indexes_size(c.oid)) AS index_size
+FROM affected a
+JOIN pg_class c ON c.relname = a.name AND c.relkind = 'r'
+ORDER BY pg_total_relation_size(c.oid) DESC;
+```
+
+```sql
+-- Aggregated duplicate-row roll-up.
+-- "dup_groups" is the number of (fk1, fk2) pairs that appear more
+-- than once; "rows_dropped" is the total number of rows the
+-- migration will delete during the dedupe pass (it keeps MIN(id) per
+-- group and discards the rest).
+SELECT 'dashboard_roles' AS t, COUNT(*) AS dup_groups, SUM(c) - COUNT(*) AS rows_dropped
+ FROM (SELECT COUNT(*) c FROM dashboard_roles GROUP BY dashboard_id, role_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'dashboard_slices', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM dashboard_slices GROUP BY dashboard_id, slice_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'dashboard_user', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM dashboard_user GROUP BY user_id, dashboard_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'report_schedule_user',COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM report_schedule_user GROUP BY user_id, report_schedule_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'rls_filter_roles', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM rls_filter_roles GROUP BY role_id, rls_filter_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'rls_filter_tables', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM rls_filter_tables GROUP BY table_id, rls_filter_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'slice_user', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM slice_user GROUP BY user_id, slice_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'sqlatable_user', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM sqlatable_user GROUP BY user_id, table_id HAVING COUNT(*) > 1) g
+ORDER BY rows_dropped DESC NULLS LAST;
+```
+
+```sql
+-- External-FK pre-flight check.
+-- The migration runs the equivalent check at upgrade time and aborts
+-- if any external FK references one of the soon-to-be-removed `id`
+-- columns. Running it ahead of time lets you discover (and migrate)
+-- any such reference before the maintenance window. On a stock
+-- Superset install this should return zero rows. (Default schema
+-- only; multi-schema deployments need to broaden the lookup.)
+SELECT
+ rc.constraint_name,
+ kcu.table_schema || '.' || kcu.table_name AS referencing_table,
+ kcu.column_name AS referencing_column,
+ ccu.table_name AS referenced_table,
+ ccu.column_name AS referenced_column
+FROM information_schema.referential_constraints rc
+JOIN information_schema.key_column_usage kcu
+ ON kcu.constraint_name = rc.constraint_name
+ AND kcu.constraint_schema = rc.constraint_schema
+JOIN information_schema.constraint_column_usage ccu
+ ON ccu.constraint_name = rc.constraint_name
+ AND ccu.constraint_schema = rc.constraint_schema
+WHERE ccu.table_name IN (
+ 'dashboard_roles','dashboard_slices','dashboard_user',
+ 'report_schedule_user','rls_filter_roles','rls_filter_tables',
+ 'slice_user','sqlatable_user')
+ AND ccu.column_name = 'id';
+```
+
+```sql
+-- Lock-window estimate, all eight tables. Each direct ALTER takes
+-- ACCESS EXCLUSIVE for the duration of the composite-PK index build
+-- (plus the implicit NOT NULL validation scan) — typically seconds
+-- for tables in the low millions of rows, but the locks are held
+-- cumulatively until the migration's single transaction commits.
+SELECT
+  c.relname AS table_name,
+  pg_size_pretty(pg_relation_size(c.oid)) AS heap_size,
+  ROUND(pg_relation_size(c.oid) / 1024.0 / 1024.0, 1) AS heap_size_mb,
+  ROUND(pg_relation_size(c.oid) / 1024.0 / 1024.0 / 100.0, 1) AS est_seconds_at_100mbs
+FROM pg_class c
+WHERE c.relkind = 'r' AND c.relname IN (
+  'dashboard_roles', 'dashboard_slices', 'dashboard_user',
+  'report_schedule_user', 'rls_filter_roles', 'rls_filter_tables',
+  'slice_user', 'sqlatable_user');
+```
+
+**Sizing the maintenance window on MySQL.** Equivalent diagnostic queries for MySQL/InnoDB. One important difference from PostgreSQL: InnoDB rebuilds the clustered index on every PK change, so *all eight* tables undergo a full table rebuild on MySQL — not just the two that go through the explicit `recreate="always"` path. Additionally, the upgrade emits `DROP COLUMN id` and `ADD PRIMARY KEY (fk1, fk2)` as **separate ALTER statements, so most tables pay the clustered-index rebuild twice** — budget roughly 2× the single-rebuild estimate from the query below. The **downgrade is a comparable maintenance window in its own right** (it re-adds the `id` column and rebuilds every table on both dialects); plan rollback windows with the same sizing, not as a quick undo.
+
+```sql
+-- Per-table size, row count, and which migration path each will take.
+-- TABLE_ROWS is an InnoDB estimate (analogous to PostgreSQL's reltuples);
+-- run SELECT COUNT(*) per table for an exact count if needed.
+SELECT
+ TABLE_NAME AS table_name,
+ CASE WHEN TABLE_NAME IN ('dashboard_slices', 'report_schedule_user')
+ THEN 'recreate (explicit, drops UNIQUE)'
+ ELSE 'direct ALTER (still rebuilds InnoDB clustered index)'
+ END AS migration_path,
+ TABLE_ROWS AS estimated_rows,
+ CONCAT(ROUND((DATA_LENGTH + INDEX_LENGTH) / 1024 / 1024, 1), ' MB') AS total_size,
+ CONCAT(ROUND(DATA_LENGTH / 1024 / 1024, 1), ' MB') AS heap_size,
+ CONCAT(ROUND(INDEX_LENGTH / 1024 / 1024, 1), ' MB') AS index_size
+FROM information_schema.TABLES
+WHERE TABLE_SCHEMA = DATABASE()
+ AND TABLE_NAME IN (
+ 'dashboard_roles', 'dashboard_slices', 'dashboard_user',
+ 'report_schedule_user', 'rls_filter_roles', 'rls_filter_tables',
+ 'slice_user', 'sqlatable_user'
+ )
+ORDER BY (DATA_LENGTH + INDEX_LENGTH) DESC;
+```
+
+```sql
+-- Aggregated duplicate-row roll-up. Same as the PostgreSQL version minus
+-- NULLS LAST (unsupported on MySQL, where DESC already sorts NULLs last).
+SELECT 'dashboard_roles' AS t, COUNT(*) AS dup_groups, SUM(c) - COUNT(*) AS rows_dropped
+ FROM (SELECT COUNT(*) c FROM dashboard_roles GROUP BY dashboard_id, role_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'dashboard_slices', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM dashboard_slices GROUP BY dashboard_id, slice_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'dashboard_user', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM dashboard_user GROUP BY user_id, dashboard_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'report_schedule_user',COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM report_schedule_user GROUP BY user_id, report_schedule_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'rls_filter_roles', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM rls_filter_roles GROUP BY role_id, rls_filter_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'rls_filter_tables', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM rls_filter_tables GROUP BY table_id, rls_filter_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'slice_user', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM slice_user GROUP BY user_id, slice_id HAVING COUNT(*) > 1) g
+UNION ALL SELECT 'sqlatable_user', COUNT(*), SUM(c) - COUNT(*)
+ FROM (SELECT COUNT(*) c FROM sqlatable_user GROUP BY user_id, table_id HAVING COUNT(*) > 1) g
+ORDER BY rows_dropped DESC;
+```
+
+```sql
+-- External-FK pre-flight check. KEY_COLUMN_USAGE on MySQL carries
+-- both sides of the FK in a single row, so this is simpler than the
+-- PostgreSQL version. Should return zero rows on a stock install.
+SELECT
+ CONSTRAINT_NAME,
+ CONCAT(TABLE_SCHEMA, '.', TABLE_NAME) AS referencing_table,
+ COLUMN_NAME AS referencing_column,
+ REFERENCED_TABLE_NAME AS referenced_table,
+ REFERENCED_COLUMN_NAME AS referenced_column
+FROM information_schema.KEY_COLUMN_USAGE
+WHERE TABLE_SCHEMA = DATABASE()
+ AND REFERENCED_TABLE_NAME IN (
+ 'dashboard_roles', 'dashboard_slices', 'dashboard_user',
+ 'report_schedule_user', 'rls_filter_roles', 'rls_filter_tables',
+ 'slice_user', 'sqlatable_user'
+ )
+ AND REFERENCED_COLUMN_NAME = 'id';
+```
+
+```sql
+-- Lock-window estimate for ALL EIGHT tables (InnoDB rebuilds the
+-- clustered index on PK change, so even "direct ALTER" is a rewrite).
+-- ADD PRIMARY KEY is INPLACE but not LOCK=NONE — it allows concurrent
+-- reads but blocks writes. Use heap size combined with your effective
+-- rebuild throughput (~100-200 MB/s on commodity SSD; higher on NVMe).
+SELECT
+ TABLE_NAME AS table_name,
+ CONCAT(ROUND(DATA_LENGTH / 1024 / 1024, 1), ' MB') AS heap_size,
+ ROUND(DATA_LENGTH / 1024 / 1024, 1) AS heap_size_mb,
+ ROUND(DATA_LENGTH / 1024 / 1024 / 100.0, 1) AS est_rewrite_seconds_at_100mbs
+FROM information_schema.TABLES
+WHERE TABLE_SCHEMA = DATABASE()
+ AND TABLE_NAME IN (
+ 'dashboard_roles', 'dashboard_slices', 'dashboard_user',
+ 'report_schedule_user', 'rls_filter_roles', 'rls_filter_tables',
+ 'slice_user', 'sqlatable_user'
+ )
+ORDER BY DATA_LENGTH DESC;
+```
+
+**Restoring an old `pg_dump` (or equivalent) against the new schema.** A data dump taken before the migration carries rows for these tables that still include the now-removed `id` column (as `COPY` blocks by default, or as `INSERT` statements when dumped with `--inserts`). Restoring such data against the post-migration schema will fail. The supported workaround is to dump only the schema and reference data, then re-create the M:N associations from application data after restore — for example with one `pg_dump --exclude-table-data=<table>` flag per junction table (e.g. `--exclude-table-data=dashboard_slices`) for the eight junction tables, restore the rest, then run a one-shot script that re-INSERTs `(fk1, fk2)` pairs derived from your application export. Operators who need to restore an old dump verbatim should restore against a pre-migration Superset and then re-run the upgrade.
+
+**Intentional downgrade asymmetry.** The migration's `downgrade()` restores the surrogate `id` column and (for `dashboard_slices` and `report_schedule_user`) the original `UNIQUE (fk1, fk2)` constraint, but it does **not** restore the original `NULL`-allowed state on the FK columns — they remain `NOT NULL`. This is intentional: under SQLAlchemy's `secondary=` semantics, a `NULL` in either FK column of a junction table is meaningless (it cannot participate in either side of the relationship). Operators downgrading are not expected to need this restored. The asymmetry is documented for completeness so that round-trip schema diffs are not mistaken for migration bugs.
+
+**Constraint-name divergence between upgrade and downgrade.** The composite primary key created on upgrade is named `pk_<table>` (Alembic's default for `op.create_primary_key("pk_<table>", ...)`), while the surrogate `id` primary key restored on downgrade is named `<table>_pkey` (PostgreSQL's default convention for `PrimaryKeyConstraint("id")`). The two names alternate so that a round-trip (upgrade → downgrade → upgrade) does not collide on a pre-existing constraint name. Operators using schema-comparison tools (e.g. `pg_diff`, `migra`) against a downgraded database may see this as drift versus a fresh-install schema. It is cosmetic — no application code references either constraint name.
+
## 6.0.0
- [33055](https://github.com/apache/superset/pull/33055): Upgrades Flask-AppBuilder to 5.0.0. The AUTH_OID authentication type has been deprecated and is no longer available as an option in Flask-AppBuilder. OpenID (OID) is considered a deprecated authentication protocol - if you are using AUTH_OID, you will need to migrate to an alternative authentication method such as OAuth, LDAP, or database authentication before upgrading.
- [34871](https://github.com/apache/superset/pull/34871): Fixed Jest test hanging issue from Ant Design v5 upgrade. MessageChannel is now mocked in test environment to prevent rc-overflow from causing Jest to hang. Test environment only - no production impact.
diff --git a/docker-compose-mysql.yml b/docker-compose-mysql.yml
new file mode 100644
index 000000000000..13f4c99236cb
--- /dev/null
+++ b/docker-compose-mysql.yml
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Compose override that swaps the default Postgres metadata DB for MySQL 8.
+# Useful for evaluating dialect-specific behaviour (e.g., DDL-migration
+# cost on a deployment whose production metadata DB is MySQL).
+#
+# Usage:
+# docker compose -f docker-compose.yml -f docker-compose-mysql.yml up
+# docker compose -f docker-compose.yml -f docker-compose-mysql.yml down
+#
+# To switch back to Postgres, just drop the second `-f` flag — the MySQL
+# data lives in a separate volume (`db_home_mysql`) so neither side is
+# corrupted by switching dialects.
+#
+# Notes:
+# - Mirrors the connection settings used by CI's `test-mysql` shard:
+# dialect ``mysql+mysqldb``, charset utf8mb4 with binary_prefix.
+# - Host port 13306 (configurable via DATABASE_PORT_MYSQL) to avoid
+# colliding with a native MySQL install on 3306.
+# - The Postgres-specific init scripts under
+# docker/docker-entrypoint-initdb.d/ are not mounted (they are
+# postgres-only); examples / cypress fixtures still load via
+# `superset-init`'s post-startup steps.
+
+# Shared environment override applied to every Superset-side service that
+# connects to the metadata DB. ``environment:`` takes precedence over the
+# values inherited from the env_file in docker-compose.yml.
+x-mysql-env: &mysql-env
+ DATABASE_DIALECT: mysql+mysqldb
+ DATABASE_HOST: db
+ DATABASE_PORT: "3306"
+ DATABASE_DB: superset
+ DATABASE_USER: superset
+ DATABASE_PASSWORD: superset
+ SQLALCHEMY_DATABASE_URI: "mysql+mysqldb://superset:superset@db:3306/superset?charset=utf8mb4&binary_prefix=true"
+ # Override the analytics-examples DB connection too. ``EXAMPLES_PORT``
+ # in docker/.env is hardcoded to 5432 (the Postgres port); without
+ # this override the examples connection would try MySQL on 5432 and
+ # fail. The examples user/DB are created by docker/mysql-init/
+ # examples-init.sql on first MySQL boot.
+ EXAMPLES_HOST: db
+ EXAMPLES_PORT: "3306"
+ EXAMPLES_DB: examples
+ EXAMPLES_USER: examples
+ EXAMPLES_PASSWORD: examples
+ SUPERSET__SQLALCHEMY_EXAMPLES_URI: "mysql+mysqldb://examples:examples@db:3306/examples?charset=utf8mb4&binary_prefix=true"
+
+services:
+ db:
+ image: mysql:8.0
+ environment:
+ MYSQL_DATABASE: superset
+ MYSQL_USER: superset
+ MYSQL_PASSWORD: superset
+ MYSQL_ROOT_PASSWORD: root
+ # The original 5432 port mapping is harmless on a MySQL container
+ # (nothing listens on 5432 inside it) but we add 13306->3306 so the
+ # MySQL port is reachable from the host without colliding with a
+ # native MySQL on 3306. Compose merges port lists.
+ ports:
+ - "127.0.0.1:${DATABASE_PORT_MYSQL:-13306}:3306"
+ # Override the init-scripts mount by re-binding the same target path
+ # to a MySQL-compatible directory. Compose merges volume lists by
+ # target path; later definitions win on conflict, so this displaces
+ # the Postgres-specific ``./docker/docker-entrypoint-initdb.d`` mount
+ # from docker-compose.yml. Without this, MySQL would try to run
+ # ``cypress-init.sh`` (which invokes ``psql``, not in the MySQL
+ # image), abort the init phase, and never create the ``examples``
+ # database. Add the MySQL data volume separately.
+ volumes:
+ - db_home_mysql:/var/lib/mysql
+ - ./docker/mysql-init:/docker-entrypoint-initdb.d
+ command:
+ - --default-authentication-plugin=caching_sha2_password
+ - --character-set-server=utf8mb4
+ - --collation-server=utf8mb4_0900_ai_ci
+ healthcheck:
+ test: ["CMD-SHELL", "mysqladmin ping -h localhost -uroot -proot --silent"]
+ interval: 5s
+ timeout: 5s
+ retries: 20
+
+ superset:
+ environment: *mysql-env
+
+ superset-init:
+ environment: *mysql-env
+
+ superset-worker:
+ environment: *mysql-env
+
+ superset-worker-beat:
+ environment: *mysql-env
+
+ superset-node:
+ environment: *mysql-env
+
+ superset-tests-worker:
+ environment: *mysql-env
+
+volumes:
+ db_home_mysql:
diff --git a/docker/mysql-init/examples-init.sql b/docker/mysql-init/examples-init.sql
new file mode 100644
index 000000000000..68dabe38671d
--- /dev/null
+++ b/docker/mysql-init/examples-init.sql
@@ -0,0 +1,32 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements. See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership. The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied. See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- MySQL counterpart to docker/docker-entrypoint-initdb.d/examples-init.sh.
+-- Creates the analytics-examples database and user that Superset's
+-- ``load-examples`` command writes to. Mounted by docker-compose-mysql.yml
+-- at /docker-entrypoint-initdb.d/ so the MySQL image's first-boot
+-- entrypoint runs it automatically. (The Postgres init scripts under
+-- docker/docker-entrypoint-initdb.d/ are NOT mounted on the MySQL
+-- service — they invoke psql, which doesn't exist in the MySQL image.)
+
+CREATE DATABASE IF NOT EXISTS examples
+ CHARACTER SET utf8mb4
+ COLLATE utf8mb4_0900_ai_ci;
+
+CREATE USER IF NOT EXISTS 'examples'@'%' IDENTIFIED BY 'examples';
+GRANT ALL PRIVILEGES ON examples.* TO 'examples'@'%';
+FLUSH PRIVILEGES;
diff --git a/pyproject.toml b/pyproject.toml
index 7dc0f18ea45a..d2131ffe9a33 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -100,6 +100,7 @@ dependencies = [
"simplejson>=3.15.0",
"slack_sdk>=3.19.0, <4",
"sqlalchemy>=1.4, <2",
+ "sqlalchemy-continuum>=1.6.0, <2.0.0",
"sqlalchemy-utils>=0.38.0, <0.43", # expanding lowerbound to work with pydoris
"sqlglot>=30.8.0, <31",
# newer pandas needs 0.9+
diff --git a/requirements/base.txt b/requirements/base.txt
index 6c1b966e72cc..68dc9c39f3b9 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -407,7 +407,10 @@ sqlalchemy==1.4.54
# flask-sqlalchemy
# marshmallow-sqlalchemy
# shillelagh
+ # sqlalchemy-continuum
# sqlalchemy-utils
+sqlalchemy-continuum==1.6.0
+ # via apache-superset (pyproject.toml)
sqlalchemy-utils==0.42.0
# via
# apache-superset (pyproject.toml)
diff --git a/requirements/development.txt b/requirements/development.txt
index ebd7c13fe51d..8c957d3c7bcf 100644
--- a/requirements/development.txt
+++ b/requirements/development.txt
@@ -975,9 +975,14 @@ sqlalchemy==1.4.54
# marshmallow-sqlalchemy
# shillelagh
# sqlalchemy-bigquery
+ # sqlalchemy-continuum
# sqlalchemy-utils
sqlalchemy-bigquery==1.17.0
# via apache-superset
+sqlalchemy-continuum==1.6.0
+ # via
+ # -c requirements/base-constraint.txt
+ # apache-superset
sqlalchemy-utils==0.42.0
# via
# -c requirements/base-constraint.txt
diff --git a/scripts/seed_junction_load.py b/scripts/seed_junction_load.py
new file mode 100644
index 000000000000..4e4e5d6b42c1
--- /dev/null
+++ b/scripts/seed_junction_load.py
@@ -0,0 +1,679 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ----------------------------------------------------------------------
+# Stress-test data generator for the composite-PK migration (sc-105349).
+#
+# Bulk-inserts synthetic parent rows and many-to-many junction rows for
+# the eight association tables that the composite-PK migration touches.
+# Useful for measuring migration runtime at varying scales — run this at
+# 100K / 1M / 5M / 10M rows and time the migration at each scale to
+# verify the O(N log N) extrapolation.
+#
+# Idempotent: rerunning with the same target is a no-op; rerunning with
+# a higher target adds rows up to the new total. Batched bulk INSERTs
+# (10K rows per statement) make it fast on Postgres, MySQL, and SQLite.
+#
+# Usage (inside the Superset container):
+#
+# docker exec superset-superset-1 \\
+# /app/.venv/bin/python /app/scripts/seed_junction_load.py \\
+# --dashboard-slices 1000000 \\
+# --slice-user 100000 \\
+# --dashboard-user 100000
+#
+# Run with no flags for the defaults shown below. Use ``--dry-run`` to
+# print the planned inserts without writing anything.
+#
+# The script connects via Superset's standard ``DATABASE_*`` env vars
+# (or ``SUPERSET__SQLALCHEMY_DATABASE_URI`` if set), so it works
+# automatically inside the Superset container regardless of which
+# metadata DB backend is in use.
+
+from __future__ import annotations
+
+import argparse
+import logging
+import os
+import sys
+import time
+from contextlib import contextmanager
+from typing import Iterator
+from uuid import uuid4
+
+import sqlalchemy as sa
+from sqlalchemy.engine import Connection, Engine
+
+logger = logging.getLogger("seed_junction_load")
+
+# Bulk INSERT batch size. Larger values = fewer statements but more memory.
+BATCH = 10_000
+
+# Default per-junction-table target row counts. Tuned to mimic the shape
+# of a large multi-team Superset install. Override via CLI flags.
+# Each key is turned into a ``--<key-with-dashes>`` flag by ``main``, in
+# dict order, and is read back from the parsed args under the same key.
+DEFAULTS: dict[str, int] = {
+ "dashboard_slices": 1_000_000,
+ "slice_user": 100_000,
+ "dashboard_user": 100_000,
+ "dashboard_roles": 10_000,
+}
+
+# (junction_table, fk1_col, fk2_col, parent1_table, parent2_table)
+# parents reference id columns; we generate (fk1, fk2) pairs by sampling
+# from the parents' existing IDs.
+# Junctions are processed in list order by the seeding and
+# duplicate-injection loops.
+JUNCTIONS: list[tuple[str, str, str, str, str]] = [
+ ("dashboard_slices", "dashboard_id", "slice_id", "dashboards", "slices"),
+ ("slice_user", "user_id", "slice_id", "ab_user", "slices"),
+ ("dashboard_user", "user_id", "dashboard_id", "ab_user", "dashboards"),
+ ("dashboard_roles", "dashboard_id", "role_id", "dashboards", "ab_role"),
+]
+
+# Junction tables that originally carried ``UNIQUE(fk1, fk2)`` and therefore
+# cannot accept duplicate ``(fk1, fk2)`` pairs even on the pre-migration
+# (downgrade) schema. The other JUNCTIONS allow duplicates pre-migration.
+# Only ``dashboard_slices`` is listed: the migration's other UNIQUE table
+# (``report_schedule_user``) is not in JUNCTIONS — this script doesn't seed
+# it — so listing it here would imply coverage that doesn't exist. Add it
+# alongside a JUNCTIONS entry if that table ever gets seeded.
+JUNCTIONS_WITH_UNIQUE: set[str] = {"dashboard_slices"}
+
+
+# ----------------------------------------------------------------------
+# Connection setup
+# ----------------------------------------------------------------------
+
+
+def build_engine() -> Engine:
+    """Create the SQLAlchemy engine for the Superset metadata database.
+
+    A non-empty ``SUPERSET__SQLALCHEMY_DATABASE_URI`` wins outright.
+    Otherwise the URI is assembled from the six ``DATABASE_*`` variables;
+    the process exits with an explanatory message naming the first one
+    that is missing.
+    """
+    explicit_uri = os.environ.get("SUPERSET__SQLALCHEMY_DATABASE_URI")
+    if explicit_uri:
+        logger.info("Using SUPERSET__SQLALCHEMY_DATABASE_URI from env")
+        return sa.create_engine(explicit_uri)
+
+    # Looked up in this order so the first missing name is the one reported.
+    required = (
+        "DATABASE_DIALECT",
+        "DATABASE_USER",
+        "DATABASE_PASSWORD",
+        "DATABASE_HOST",
+        "DATABASE_PORT",
+        "DATABASE_DB",
+    )
+    try:
+        dialect, user, password, host, port, db = (
+            os.environ[name] for name in required
+        )
+    except KeyError as exc:
+        sys.exit(
+            f"Missing env var {exc}; either set DATABASE_DIALECT/USER/PASSWORD/"
+            f"HOST/PORT/DB or SUPERSET__SQLALCHEMY_DATABASE_URI before running."
+        )
+
+    # NOTE(review): the parts are interpolated verbatim, so a user name or
+    # password containing URL-reserved characters presumably has to be
+    # percent-encoded in the environment already — confirm against deployment.
+    assembled = f"{dialect}://{user}:{password}@{host}:{port}/{db}"
+    logger.info(
+        "Built URI from DATABASE_* env vars (dialect=%s, host=%s)", dialect, host
+    )
+    return sa.create_engine(assembled)
+
+
+# ----------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------
+
+
+def uuid_value(dialect_name: str) -> bytes | str:
+    """Generate a random UUID shaped for the given SQLAlchemy dialect.
+
+    Dialect names starting with ``mysql`` receive the 16 raw bytes; every
+    other dialect receives the canonical hyphenated string. Keeping the
+    branch here means the seeders never need Superset's custom column
+    types.
+    """
+    fresh = uuid4()
+    return fresh.bytes if dialect_name.startswith("mysql") else str(fresh)
+
+
+@contextmanager
+def time_phase(name: str) -> Iterator[None]:
+    """Log a start line and, on exit, the wall-clock duration of a phase.
+
+    The completion line is emitted from a ``finally`` clause, so it is
+    logged even when the wrapped body raises.
+    """
+    began = time.monotonic()
+    logger.info("[%s] starting", name)
+    try:
+        yield
+    finally:
+        logger.info("[%s] done in %.2fs", name, time.monotonic() - began)
+
+
+def count_rows(conn: Connection, table: str) -> int:
+    """Return ``COUNT(*)`` for ``table``; a falsy scalar result becomes 0.
+
+    ``table`` is interpolated into the SQL text, so callers must pass a
+    trusted identifier.
+    """
+    query = sa.text(f"SELECT COUNT(*) FROM {table}")  # noqa: S608
+    total = conn.scalar(query)
+    return total or 0
+
+
+def existing_ids(conn: Connection, table: str, limit: int | None = None) -> list[int]:
+    """Return the ``id`` values of ``table`` in ascending order.
+
+    When ``limit`` is given, only that many of the lowest ids are fetched.
+    ``table`` is interpolated into the SQL text and must be a trusted
+    identifier.
+    """
+    limit_clause = "" if limit is None else f" LIMIT {limit}"
+    query = sa.text(f"SELECT id FROM {table} ORDER BY id{limit_clause}")  # noqa: S608
+    return [record[0] for record in conn.execute(query)]
+
+
+# ----------------------------------------------------------------------
+# Parent seeders
+#
+# Each function ensures the named parent table has at least ``target``
+# rows by inserting synthetic ones with minimal-but-valid columns.
+# Returns nothing; subsequent code reads back IDs via ``existing_ids``.
+# ----------------------------------------------------------------------
+
+
+def seed_dashboards(conn: Connection, target: int, dry_run: bool) -> None:
+    """Top up ``dashboards`` with synthetic rows until it holds ``target`` rows.
+
+    Only logs when the table is already large enough or when ``dry_run``
+    is set. Inserted rows are unpublished and carry a slug with a random
+    8-hex-character suffix.
+    """
+    have = count_rows(conn, "dashboards")
+    if have >= target:
+        logger.info(
+            "dashboards: %d rows (target %d) — no insert needed", have, target
+        )
+        return
+    missing = target - have
+    logger.info("dashboards: %d → %d (+%d)", have, target, missing)
+    if dry_run:
+        return
+
+    dialect_name = conn.engine.dialect.name
+    statement = sa.text(
+        "INSERT INTO dashboards (uuid, dashboard_title, slug, published) "
+        "VALUES (:uuid, :title, :slug, :published)"
+    )
+    for offset in range(0, missing, BATCH):
+        stop = min(offset + BATCH, missing)
+        payload = []
+        # Sequence numbers continue from the pre-existing row count.
+        for seq in range(have + offset, have + stop):
+            payload.append(
+                {
+                    "uuid": uuid_value(dialect_name),
+                    "title": f"seed_dashboard_{seq}",
+                    "slug": f"seed-dashboard-{seq}-{uuid4().hex[:8]}",
+                    "published": False,
+                }
+            )
+        conn.execute(statement, payload)
+        logger.info(" dashboards: inserted %d / %d", stop, missing)
+
+
+def seed_dbs(conn: Connection, dry_run: bool) -> int:
+    """Return a ``dbs.id`` to use as ``database_id`` when seeding ``tables``.
+
+    The lowest existing id is reused when the table has rows. Otherwise a
+    single synthetic database row is inserted and its id returned; under
+    ``dry_run`` nothing is written and the placeholder ``-1`` is returned.
+    """
+    found = existing_ids(conn, "dbs", limit=1)
+    if found:
+        return found[0]
+    if dry_run:
+        return -1  # placeholder
+
+    logger.info("dbs: inserting one synthetic database (no rows present)")
+    statement = sa.text(
+        "INSERT INTO dbs (uuid, database_name, sqlalchemy_uri, expose_in_sqllab) "
+        "VALUES (:uuid, :name, :uri, :expose)"
+    )
+    params = {
+        "uuid": uuid_value(conn.engine.dialect.name),
+        "name": f"seed_db_{uuid4().hex[:8]}",
+        "uri": "sqlite:///seed.db",
+        "expose": False,
+    }
+    conn.execute(statement, params)
+    # Read the id back rather than relying on a dialect-specific RETURNING.
+    return existing_ids(conn, "dbs", limit=1)[0]
+
+
+def seed_tables(conn: Connection, target: int, dry_run: bool) -> None:
+    """Top up ``tables`` with synthetic rows until it holds ``target`` rows.
+
+    Only logs when the table is already large enough or when ``dry_run``
+    is set. Every inserted row points at the database id returned by
+    ``seed_dbs``, which creates a ``dbs`` row first if none exists.
+    """
+    have = count_rows(conn, "tables")
+    if have >= target:
+        logger.info("tables: %d rows (target %d) — no insert needed", have, target)
+        return
+    missing = target - have
+    logger.info("tables: %d → %d (+%d)", have, target, missing)
+    if dry_run:
+        return
+
+    parent_db_id = seed_dbs(conn, dry_run=False)
+    dialect_name = conn.engine.dialect.name
+    statement = sa.text(
+        "INSERT INTO tables (uuid, table_name, database_id) "
+        "VALUES (:uuid, :name, :db_id)"
+    )
+    for offset in range(0, missing, BATCH):
+        stop = min(offset + BATCH, missing)
+        payload = []
+        # Sequence numbers continue from the pre-existing row count.
+        for seq in range(have + offset, have + stop):
+            payload.append(
+                {
+                    "uuid": uuid_value(dialect_name),
+                    "name": f"seed_table_{seq}",
+                    "db_id": parent_db_id,
+                }
+            )
+        conn.execute(statement, payload)
+        logger.info(" tables: inserted %d / %d", stop, missing)
+
+
+def seed_slices(conn: Connection, target: int, dry_run: bool) -> None:
+    """Top up ``slices`` with synthetic rows until it holds ``target`` rows.
+
+    Only logs when the table is already large enough or when ``dry_run``
+    is set. All inserted slices use ``datasource_type`` and ``viz_type``
+    ``"table"`` and reference the lowest-id row of ``tables``.
+    """
+    have = count_rows(conn, "slices")
+    if have >= target:
+        logger.info("slices: %d rows (target %d) — no insert needed", have, target)
+        return
+    missing = target - have
+    logger.info("slices: %d → %d (+%d)", have, target, missing)
+    if dry_run:
+        return
+
+    # Make sure at least one ``tables`` row exists so ``datasource_id`` can
+    # point at something real; the value is set for realism rather than
+    # because the migration under test needs it.
+    seed_tables(conn, target=1, dry_run=False)
+    datasource_id = existing_ids(conn, "tables", limit=1)[0]
+    dialect_name = conn.engine.dialect.name
+    statement = sa.text(
+        "INSERT INTO slices "
+        "(uuid, slice_name, datasource_id, datasource_type, viz_type) "
+        "VALUES (:uuid, :name, :ds_id, :ds_type, :viz)"
+    )
+    for offset in range(0, missing, BATCH):
+        stop = min(offset + BATCH, missing)
+        payload = []
+        # Sequence numbers continue from the pre-existing row count.
+        for seq in range(have + offset, have + stop):
+            payload.append(
+                {
+                    "uuid": uuid_value(dialect_name),
+                    "name": f"seed_slice_{seq}",
+                    "ds_id": datasource_id,
+                    "ds_type": "table",
+                    "viz": "table",
+                }
+            )
+        conn.execute(statement, payload)
+        logger.info(" slices: inserted %d / %d", stop, missing)
+
+
+def seed_users(conn: Connection, target: int, dry_run: bool) -> None:
+    """Top up ``ab_user`` with synthetic rows until it holds ``target`` rows.
+
+    Only logs when the table is already large enough or when ``dry_run``
+    is set. Usernames and e-mail addresses each get their own random
+    8-hex-character suffix; e-mails use the ``example.invalid`` domain.
+    """
+    have = count_rows(conn, "ab_user")
+    if have >= target:
+        logger.info("ab_user: %d rows (target %d) — no insert needed", have, target)
+        return
+    missing = target - have
+    logger.info("ab_user: %d → %d (+%d)", have, target, missing)
+    if dry_run:
+        return
+
+    statement = sa.text(
+        "INSERT INTO ab_user (first_name, last_name, username, email, active) "
+        "VALUES (:first, :last, :username, :email, :active)"
+    )
+    for offset in range(0, missing, BATCH):
+        stop = min(offset + BATCH, missing)
+        payload = []
+        # Sequence numbers continue from the pre-existing row count.
+        for seq in range(have + offset, have + stop):
+            payload.append(
+                {
+                    "first": "seed",
+                    "last": f"user_{seq}",
+                    "username": f"seed_user_{seq}_{uuid4().hex[:8]}",
+                    "email": f"seed_user_{seq}_{uuid4().hex[:8]}@example.invalid",
+                    "active": True,
+                }
+            )
+        conn.execute(statement, payload)
+        logger.info(" ab_user: inserted %d / %d", stop, missing)
+
+
+def seed_roles(conn: Connection, target: int, dry_run: bool) -> None:
+    """Top up ``ab_role`` with synthetic rows until it holds ``target`` rows.
+
+    Only logs when the table is already large enough or when ``dry_run``
+    is set. Role names carry a random 8-hex-character suffix.
+    """
+    have = count_rows(conn, "ab_role")
+    if have >= target:
+        logger.info("ab_role: %d rows (target %d) — no insert needed", have, target)
+        return
+    missing = target - have
+    logger.info("ab_role: %d → %d (+%d)", have, target, missing)
+    if dry_run:
+        return
+
+    statement = sa.text("INSERT INTO ab_role (name) VALUES (:name)")
+    for offset in range(0, missing, BATCH):
+        stop = min(offset + BATCH, missing)
+        payload = []
+        # Sequence numbers continue from the pre-existing row count.
+        for seq in range(have + offset, have + stop):
+            payload.append({"name": f"seed_role_{seq}_{uuid4().hex[:8]}"})
+        conn.execute(statement, payload)
+        logger.info(" ab_role: inserted %d / %d", stop, missing)
+
+
+# ----------------------------------------------------------------------
+# Junction seeder
+# ----------------------------------------------------------------------
+
+
+def _load_existing_pairs(
+    conn: Connection, junction: str, fk1_col: str, fk2_col: str
+) -> set[tuple[int, int]]:
+    """Read every ``(fk1, fk2)`` pair currently stored in ``junction``.
+
+    The result lets the junction seeder skip pairs that are already
+    present. The whole table is materialised in memory as a set of
+    2-tuples. Table and column names are interpolated into the SQL text;
+    callers pass them from the ``JUNCTIONS`` constant rather than from
+    user input.
+    """
+    query = sa.text(f"SELECT {fk1_col}, {fk2_col} FROM {junction}")  # noqa: S608
+    seen: set[tuple[int, int]] = set()
+    for record in conn.execute(query):
+        seen.add((record[0], record[1]))
+    return seen
+
+
+def _generate_new_pairs(
+    p1_ids: list[int],
+    p2_ids: list[int],
+    existing_pairs: set[tuple[int, int]],
+) -> Iterator[tuple[int, int]]:
+    """Lazily walk ``p1_ids × p2_ids`` and yield the pairs not yet present.
+
+    Pairs come out in row-major order: all ``p2_ids`` for the first
+    ``p1_ids`` entry, then all for the second, and so on.
+    """
+    yield from (
+        (left, right)
+        for left in p1_ids
+        for right in p2_ids
+        if (left, right) not in existing_pairs
+    )
+
+
+def seed_junction(
+    conn: Connection,
+    junction: str,
+    fk1_col: str,
+    fk2_col: str,
+    parent1: str,
+    parent2: str,
+    target: int,
+    dry_run: bool,
+) -> None:
+    """Insert rows into ``junction`` until it holds ``target`` rows.
+
+    New ``(fk1, fk2)`` pairs are taken from the cross-product of the two
+    parents' ids in row-major order, skipping pairs already stored. Rows
+    are written in batches of ``BATCH``. The process exits with a message
+    when the parents cannot supply enough distinct pairs. Only logs when
+    the table is already large enough or when ``dry_run`` is set.
+    """
+    have = count_rows(conn, junction)
+    if have >= target:
+        logger.info(
+            "%s: %d rows (target %d) — no insert needed", junction, have, target
+        )
+        return
+    needed = target - have
+    logger.info("%s: %d → %d (+%d)", junction, have, target, needed)
+    if dry_run:
+        return
+
+    left_ids = existing_ids(conn, parent1)
+    right_ids = existing_ids(conn, parent2)
+    max_pairs = len(left_ids) * len(right_ids)
+    if max_pairs < target:
+        sys.exit(
+            f"Cannot reach {target} rows in {junction}: "
+            f"only {max_pairs} unique pairs available "
+            f"({len(left_ids)} × {len(right_ids)}). "
+            f"Increase parent targets and rerun."
+        )
+
+    # An empty table needs no avoidance set, so skip the full-table read.
+    taken: set[tuple[int, int]] = set()
+    if have > 0:
+        taken = _load_existing_pairs(conn, junction, fk1_col, fk2_col)
+    if taken:
+        logger.info(
+            " %s: loaded %d existing pairs into avoidance set",
+            junction,
+            len(taken),
+        )
+
+    statement = sa.text(
+        f"INSERT INTO {junction} ({fk1_col}, {fk2_col}) "  # noqa: S608
+        f"VALUES (:fk1, :fk2)"
+    )
+
+    written = 0
+    pending: list[dict[str, int]] = []
+    for left, right in _generate_new_pairs(left_ids, right_ids, taken):
+        pending.append({"fk1": left, "fk2": right})
+        written += 1
+        reached_target = written == needed
+        # Flush on a full batch, or early when the final row was just queued.
+        if reached_target or len(pending) == BATCH:
+            conn.execute(statement, pending)
+            logger.info(" %s: inserted %d / %d", junction, written, needed)
+            pending = []
+        if reached_target:
+            return
+
+    # Falling out of the loop means the candidates ran dry before the
+    # target was met (a met target returns from inside the loop).
+    sys.exit(
+        f"Ran out of unique pairs at {written}/{needed} for {junction} "
+        f"(parents have {len(left_ids)} × {len(right_ids)} = {max_pairs} pairs, "
+        f"{len(taken)} already present)"
+    )
+
+
+# ----------------------------------------------------------------------
+# Orchestration
+# ----------------------------------------------------------------------
+
+
+def _compute_parent_requirements(targets: dict[str, int]) -> dict[str, int]:
+    """Work out how many rows each parent table needs.
+
+    For every junction with a positive target, both of its parents are
+    asked for ``floor(sqrt(target)) + 1`` rows, which guarantees that
+    parent1 × parent2 exceeds the target. A parent shared by several
+    junctions gets the largest of its requirements.
+
+    Junctions whose target is missing, zero, or negative are ignored. A
+    negative target would otherwise make ``target**0.5`` complex and crash
+    the ``int()`` conversion.
+
+    Returns a mapping of parent table name to minimum row count.
+    """
+    parent_req: dict[str, int] = {}
+    for junction, _, _, p1, p2 in JUNCTIONS:
+        target = targets.get(junction, 0)
+        if target <= 0:
+            continue
+        per_parent = int(target**0.5) + 1
+        for parent in (p1, p2):
+            parent_req[parent] = max(parent_req.get(parent, 0), per_parent)
+    return parent_req
+
+
+def _seed_parents(conn: Connection, parent_req: dict[str, int], dry_run: bool) -> None:
+    """Seed every parent table named in ``parent_req``, in dependency order.
+
+    ``ab_user`` and ``ab_role`` go first, followed by ``dashboards``,
+    ``slices`` and ``tables``. Parents absent from ``parent_req`` are
+    skipped.
+    """
+    ordered_seeders = (
+        ("ab_user", seed_users),
+        ("ab_role", seed_roles),
+        ("dashboards", seed_dashboards),
+        ("slices", seed_slices),
+        ("tables", seed_tables),
+    )
+    for parent, seeder in ordered_seeders:
+        if parent in parent_req:
+            seeder(conn, parent_req[parent], dry_run)
+
+
+def _seed_all_junctions(
+    conn: Connection, targets: dict[str, int], dry_run: bool
+) -> None:
+    """Run ``seed_junction`` for each ``JUNCTIONS`` entry with a non-zero target.
+
+    Each junction is wrapped in its own ``time_phase`` so its duration is
+    logged separately.
+    """
+    for entry in JUNCTIONS:
+        table = entry[0]
+        wanted = targets.get(table, 0)
+        if wanted == 0:
+            continue
+        with time_phase(f"junction:{table}"):
+            # ``entry`` is (junction, fk1, fk2, parent1, parent2).
+            seed_junction(conn, *entry, wanted, dry_run)
+
+
+def inject_duplicates(
+    conn: Connection,
+    junction: str,
+    fk1_col: str,
+    fk2_col: str,
+    pct: float,
+    dry_run: bool,
+) -> None:
+    """Insert duplicate ``(fk1, fk2)`` rows on a non-UNIQUE junction table.
+
+    Used to stress-test the migration's ``_dedupe_by_min_id`` phase, which
+    is otherwise a no-op on cleanly-seeded data. The number of duplicates
+    is ``int(current_rows * pct / 100)``. That many existing pairs are
+    read in ``id`` order and re-inserted; if fewer rows come back than
+    requested, the sample is cycled until the count is reached.
+
+    A zero or negative ``pct`` is a no-op. A negative value would
+    otherwise produce a negative row count and a negative ``LIMIT``.
+
+    **Pre-condition: the table must NOT have UNIQUE on (fk1, fk2)**, i.e.,
+    the schema must be the pre-migration shape (after running
+    ``superset db downgrade``). The query also orders by the surrogate
+    ``id`` column, which only exists on that schema. On the post-migration
+    schema this function will error.
+    """
+    if pct <= 0:
+        return
+    current = count_rows(conn, junction)
+    count = int(current * pct / 100)
+    if count == 0:
+        logger.info(
+            "%s: 0 duplicates to inject (current=%d, pct=%g)",
+            junction,
+            current,
+            pct,
+        )
+        return
+    logger.info(
+        "%s: injecting %d duplicate rows (%g%% of %d existing)",
+        junction,
+        count,
+        pct,
+        current,
+    )
+    if dry_run:
+        return
+
+    select_sql = sa.text(
+        f"SELECT {fk1_col}, {fk2_col} FROM {junction} ORDER BY id LIMIT :n"  # noqa: S608
+    )
+    sample = conn.execute(select_sql, {"n": count}).fetchall()
+    if not sample:
+        logger.warning("%s: no rows to duplicate (table is empty)", junction)
+        return
+
+    insert_sql = sa.text(
+        f"INSERT INTO {junction} ({fk1_col}, {fk2_col}) "  # noqa: S608
+        f"VALUES (:fk1, :fk2)"
+    )
+    inserted = 0
+    while inserted < count:
+        batch: list[dict[str, int]] = []
+        while len(batch) < BATCH and inserted < count:
+            # Modulo wraps around the sample when count exceeds its length.
+            row = sample[inserted % len(sample)]
+            batch.append({"fk1": row[0], "fk2": row[1]})
+            inserted += 1
+        conn.execute(insert_sql, batch)
+        logger.info(" %s: injected %d / %d duplicates", junction, inserted, count)
+
+
+def _inject_dirty_data(conn: Connection, dirty_pct: float, dry_run: bool) -> None:
+    """Inject duplicate rows into every seeded junction that permits them.
+
+    Junctions listed in ``JUNCTIONS_WITH_UNIQUE`` are skipped with a log
+    line, because a UNIQUE constraint on the FK pair rejects duplicate
+    inserts. A ``dirty_pct`` of zero does nothing.
+    """
+    if dirty_pct == 0:
+        return
+    for junction, fk1, fk2, _, _ in JUNCTIONS:
+        if junction not in JUNCTIONS_WITH_UNIQUE:
+            with time_phase(f"dirty:{junction}"):
+                inject_duplicates(conn, junction, fk1, fk2, dirty_pct, dry_run)
+            continue
+        logger.info(
+            "%s: skipping duplicate injection (table has UNIQUE on FK pair)",
+            junction,
+        )
+
+
+def run(targets: dict[str, int], dry_run: bool, dirty_duplicates_pct: float) -> None:
+    """Execute the whole seeding pipeline on a single connection.
+
+    Phases, in order: parent tables, junction tables, then optional
+    duplicate injection when ``dirty_duplicates_pct`` is positive. All
+    phases share the connection opened by ``engine.begin()``.
+    """
+    with build_engine().begin() as conn:
+        requirements = _compute_parent_requirements(targets)
+        logger.info("Required parent row counts: %s", requirements)
+
+        with time_phase("parents"):
+            _seed_parents(conn, requirements, dry_run)
+
+        with time_phase("junctions"):
+            _seed_all_junctions(conn, targets, dry_run)
+
+        if dirty_duplicates_pct > 0:
+            with time_phase("dirty-duplicates"):
+                _inject_dirty_data(conn, dirty_duplicates_pct, dry_run)
+
+
+# ----------------------------------------------------------------------
+# CLI
+# ----------------------------------------------------------------------
+
+
+def main() -> None:
+    """Parse the command line, configure logging and run the seeder.
+
+    One ``--<table>`` integer flag is registered per ``DEFAULTS`` entry,
+    plus ``--dry-run``, ``--dirty-duplicates-pct`` and ``--verbose``. A
+    negative ``--dirty-duplicates-pct`` is rejected with a usage error.
+    """
+    # The file header is a ``#`` comment block, so ``__doc__`` is None and
+    # ``--help`` would show no description; fall back to a literal summary.
+    fallback_description = (
+        "Stress-test data generator for the composite-PK migration.\n\n"
+        "Bulk-inserts synthetic parent rows and many-to-many junction rows\n"
+        "so the migration can be timed at different scales. Rerunning with\n"
+        "the same targets is a no-op; higher targets add rows up to the new\n"
+        "total."
+    )
+    parser = argparse.ArgumentParser(
+        description=__doc__ or fallback_description,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    for table, default in DEFAULTS.items():
+        parser.add_argument(
+            f"--{table.replace('_', '-')}",
+            type=int,
+            default=default,
+            help=f"target row count for {table} (default: {default:,})",
+        )
+    parser.add_argument(
+        "--dry-run",
+        "-n",
+        action="store_true",
+        help="print planned inserts without writing to the DB",
+    )
+    parser.add_argument(
+        "--dirty-duplicates-pct",
+        type=float,
+        default=0,
+        help=(
+            "after seeding distinct pairs, inject this percentage of duplicate "
+            "rows on each non-UNIQUE junction (slice_user, dashboard_user, "
+            "dashboard_roles). Stress-tests the migration's _dedupe_by_min_id "
+            "phase. Requires 2bee73611e32 to NOT be applied: un-apply it by "
+            "downgrading to its parent (`superset db downgrade <down_revision>`, "
+            "where <down_revision> is read from the 2bee73611e32 migration "
+            "file) — the post-migration composite PK rejects duplicates and "
+            "this will error. Default: 0 (no duplicates)."
+        ),
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="increase log verbosity",
+    )
+    args = parser.parse_args()
+
+    # A negative percentage has no meaning and would turn into a negative
+    # row count downstream, so fail fast with a usage error.
+    if args.dirty_duplicates_pct < 0:
+        parser.error("--dirty-duplicates-pct must be >= 0")
+
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
+    # argparse stores ``--dashboard-slices`` under ``dashboard_slices``, so
+    # the DEFAULTS keys double as attribute names.
+    targets = {table: getattr(args, table) for table in DEFAULTS}
+
+    logger.info("Targets: %s", targets)
+    logger.info("Dry run: %s", args.dry_run)
+    logger.info("Dirty duplicates pct: %g", args.dirty_duplicates_pct)
+
+    with time_phase("total"):
+        run(
+            targets,
+            dry_run=args.dry_run,
+            dirty_duplicates_pct=args.dirty_duplicates_pct,
+        )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/superset/charts/api.py b/superset/charts/api.py
index 0098996a5022..7402113d40a7 100644
--- a/superset/charts/api.py
+++ b/superset/charts/api.py
@@ -81,7 +81,9 @@
from superset.commands.importers.v1.utils import get_contents_from_bundle
from superset.constants import MODEL_API_RW_METHOD_PERMISSION_MAP, RouteMethod
from superset.daos.chart import ChartDAO
-from superset.exceptions import ScreenshotImageNotAvailableException
+from superset.exceptions import (
+ ScreenshotImageNotAvailableException,
+)
from superset.extensions import event_logger, security_manager
from superset.models.slice import Slice
from superset.tasks.thumbnails import cache_chart_thumbnail
@@ -95,6 +97,14 @@
StatusValues,
)
from superset.utils.urls import get_url_path
+from superset.versioning.api_helpers import (
+ current_entity_etag_uuid,
+ current_entity_version_info,
+ get_version_endpoint,
+ list_versions_endpoint,
+)
+from superset.versioning.etag import set_version_etag
+from superset.versioning.schemas import VersionListItemSchema
from superset.views.base_api import (
BaseSupersetModelRestApi,
RelatedFieldFilter,
@@ -132,6 +142,8 @@ def ensure_thumbnails_enabled(self) -> Optional[Response]:
"screenshot",
"cache_screenshot",
"warm_up_cache",
+ "list_versions",
+ "get_version",
}
class_permission_name = "Chart"
method_permission_name = MODEL_API_RW_METHOD_PERMISSION_MAP
@@ -238,7 +250,7 @@ def ensure_thumbnails_enabled(self) -> Optional[Response]:
openapi_spec_tag = "Charts"
""" Override the name set for this collection of endpoints """
- openapi_spec_component_schemas = CHART_SCHEMAS
+ openapi_spec_component_schemas = CHART_SCHEMAS + (VersionListItemSchema,)
apispec_parameter_schemas = {
"screenshot_query_schema": screenshot_query_schema,
@@ -312,7 +324,11 @@ def get(self, id_or_uuid: str) -> Response:
result = self.chart_get_response_schema.dump(dash)
if resolver := current_app.config.get("EXTRA_OWNERS_RESOLVER"):
result["extra_owners"] = resolver(dash)
- return self.response(200, result=result)
+
+ return set_version_etag(
+ self.response(200, result=result),
+ current_entity_etag_uuid(Slice, dash.id, dash.uuid),
+ )
except ChartNotFoundError:
return self.response_404()
@@ -419,6 +435,34 @@ def put(self, pk: int) -> Response:
type: number
result:
$ref: '#/components/schemas/{{self.__class__.__name__}}.put'
+ old_version:
+ type: integer
+ nullable: true
+ description: >-
+ 0-based version_number of the live row before this
+ update. Unstable under retention pruning — see
+ old_transaction_id for a stable identifier.
+ new_version:
+ type: integer
+ nullable: true
+ description: >-
+ 0-based version_number of the newly-live row after
+ this update. Can equal old_version when no
+ versioned column changed, or when retention
+ pruning dropped an older closed row in the same
+ commit.
+ old_transaction_id:
+ type: integer
+ nullable: true
+ description: Continuum transaction_id of the live
+ row before this update. Stable across pruning.
+ new_transaction_id:
+ type: integer
+ nullable: true
+ description: Continuum transaction_id of the live
+ row after this update. Differs from
+ old_transaction_id when the update produced a new
+ version row.
400:
$ref: '#/components/responses/400'
401:
@@ -437,9 +481,29 @@ def put(self, pk: int) -> Response:
# This validates custom Schema with custom validations
except ValidationError as error:
return self.response_400(message=error.messages)
+
+ # Live version identifiers before the update (empty + query-free when
+ # ``ENABLE_VERSIONING_CAPTURE`` is off, so this stays inert under the
+ # kill-switch).
+ old_info = current_entity_version_info(Slice, pk)
+
try:
changed_model = UpdateChartCommand(pk, item).run()
- response = self.response(200, id=changed_model.id, result=item)
+ new_info = current_entity_version_info(
+ Slice, changed_model.id, changed_model.uuid
+ )
+ response = self.response(
+ 200,
+ id=changed_model.id,
+ result=item,
+ old_version=old_info.version,
+ new_version=new_info.version,
+ old_transaction_id=old_info.transaction_id,
+ new_transaction_id=new_info.transaction_id,
+ old_version_uuid=old_info.version_uuid,
+ new_version_uuid=new_info.version_uuid,
+ )
+ set_version_etag(response, new_info.version_uuid)
except ChartNotFoundError:
response = self.response_404()
except ChartForbiddenError:
@@ -1214,3 +1278,107 @@ def import_(self) -> Response:
)
command.run()
return self.response(200, message="OK")
+
+    # The route must declare the ``uuid_str`` URL variable that the view
+    # function takes; without it the path parameter is never bound.
+    @expose("/<uuid_str>/versions/", methods=("GET",))
+    @protect()
+    @safe
+    @statsd_metrics
+    @event_logger.log_this_with_context(
+        action=lambda self, *args, **kwargs: f"{self.__class__.__name__}.list_versions",
+        log_to_statsd=False,
+    )
+    def list_versions(self, uuid_str: str) -> Response:
+        """List version history for a chart.
+        ---
+        get:
+          summary: Return the version history for a chart
+          parameters:
+          - in: path
+            schema:
+              type: string
+              format: uuid
+            name: uuid_str
+            description: Chart UUID
+          responses:
+            200:
+              description: Version history ordered by oldest first
+              content:
+                application/json:
+                  schema:
+                    type: object
+                    properties:
+                      result:
+                        type: array
+                        items:
+                          $ref: '#/components/schemas/VersionListItemSchema'
+                      count:
+                        type: integer
+            400:
+              $ref: '#/components/responses/400'
+            401:
+              $ref: '#/components/responses/401'
+            403:
+              $ref: '#/components/responses/403'
+            404:
+              $ref: '#/components/responses/404'
+        """
+        # Delegates to the shared versioning helper, passing the Slice model
+        # and the access keyword used for charts.
+        return list_versions_endpoint(self, Slice, uuid_str, access_kwarg="chart")
+
+    # The route must declare both URL variables that the view function
+    # takes; without them the path parameters are never bound.
+    @expose(
+        "/<uuid_str>/versions/<version_uuid_str>/",
+        methods=("GET",),
+    )
+    @protect()
+    @safe
+    @statsd_metrics
+    @event_logger.log_this_with_context(
+        action=lambda self, *args, **kwargs: f"{self.__class__.__name__}.get_version",  # noqa: E501
+        log_to_statsd=False,
+    )
+    def get_version(self, uuid_str: str, version_uuid_str: str) -> Response:
+        """Return the chart's state at a specific version.
+        ---
+        get:
+          summary: Read-only snapshot of the chart at a given version
+          parameters:
+          - in: path
+            schema:
+              type: string
+              format: uuid
+            name: uuid_str
+            description: Chart UUID
+          - in: path
+            schema:
+              type: string
+              format: uuid
+            name: version_uuid_str
+            description: Version UUID as returned by the list endpoint
+          responses:
+            200:
+              description: Snapshot of the chart at the target version
+              content:
+                application/json:
+                  schema:
+                    type: object
+                    properties:
+                      result:
+                        type: object
+                        description: >-
+                          The chart's scalar fields at the target version
+                          (entity-specific keys), plus a `_version` block
+                          with the version-level metadata.
+                        properties:
+                          _version:
+                            $ref: '#/components/schemas/VersionListItemSchema'
+            400:
+              $ref: '#/components/responses/400'
+            401:
+              $ref: '#/components/responses/401'
+            403:
+              $ref: '#/components/responses/403'
+            404:
+              $ref: '#/components/responses/404'
+        """
+        # Delegates to the shared versioning helper, passing the Slice model
+        # and the access keyword used for charts.
+        return get_version_endpoint(
+            self, Slice, uuid_str, version_uuid_str, access_kwarg="chart"
+        )
diff --git a/superset/commands/dashboard/copy.py b/superset/commands/dashboard/copy.py
index b694d3686788..f8239751cce7 100644
--- a/superset/commands/dashboard/copy.py
+++ b/superset/commands/dashboard/copy.py
@@ -40,6 +40,19 @@ def __init__(self, original_dash: Dashboard, data: dict[str, Any]) -> None:
@transaction(on_error=partial(on_error, reraise=DashboardCopyError))
def run(self) -> Dashboard:
self.validate()
+ # Declare the high-level avenue before the copy touches the
+ # session. The change-record listener stamps
+ # ``version_transaction.action_kind = 'clone'`` so the new
+ # dashboard's baseline records read as "Cloned from <source dashboard>"
+ # in the timeline instead of "Dashboard created".
+ # Method-scoped imports — defer the versioning bootstrap path
+ # (``Model.metadata`` and Continuum-adjacent setup) out of this
+ # command's module-load graph; see ``changes.py`` module
+ # docstring for the broader init-order rationale.
+ from superset import db
+ from superset.versioning.changes import ACTION_KIND_CLONE, ACTION_KIND_KEY
+
+ db.session.info[ACTION_KIND_KEY] = ACTION_KIND_CLONE
return DashboardDAO.copy_dashboard(self._original_dash, self._properties)
def validate(self) -> None:
diff --git a/superset/commands/dashboard/importers/v1/__init__.py b/superset/commands/dashboard/importers/v1/__init__.py
index c3063ed5437f..044f4d63ac60 100644
--- a/superset/commands/dashboard/importers/v1/__init__.py
+++ b/superset/commands/dashboard/importers/v1/__init__.py
@@ -22,7 +22,7 @@
from marshmallow import Schema
from sqlalchemy.orm import Session # noqa: F401
-from sqlalchemy.sql import delete, select
+from sqlalchemy.sql import select
from superset import db
from superset.charts.schemas import ImportV1ChartSchema
@@ -47,6 +47,7 @@
from superset.extensions import feature_flag_manager
from superset.migrations.shared.native_filters import migrate_dashboard
from superset.models.dashboard import Dashboard, dashboard_slices
+from superset.models.slice import Slice
from superset.themes.schemas import ImportV1ThemeSchema
logger = logging.getLogger(__name__)
@@ -167,8 +168,18 @@ def _import(
)
# import dashboards
+ #
+ # Dashboard → charts associations go through the ORM relationship
+ # (``dashboard.slices = [...]``) rather than Core
+ # ``delete()``/``insert()`` on the ``dashboard_slices`` table.
+ # Bulk DML via Core would emit a malformed INSERT into
+ # ``dashboard_slices_version`` (missing the composite-PK columns)
+ # because SQLAlchemy-Continuum's M2M tracker can't see per-row
+ # column values when the DELETE/INSERT goes through the Core
+ # layer. The same pattern is applied in
+ # ``superset/commands/importers/v1/assets.py`` and the spike's
+ # ``DatasetDAO.update_columns`` rewrite.
dashboards: list[Dashboard] = []
- dashboard_chart_ids: list[tuple[int, int]] = []
for file_name, config in configs.items():
if file_name.startswith("dashboards/"):
config = update_id_refs(config, chart_ids, dataset_info)
@@ -183,16 +194,9 @@ def _import(
dashboard = import_dashboard(config, overwrite=overwrite)
dashboards.append(dashboard)
- # When overwriting, first delete all existing chart relationships
- # so the dashboard is replaced rather than merged
- if overwrite:
- db.session.execute(
- delete(dashboard_slices).where(
- dashboard_slices.c.dashboard_id == dashboard.id
- )
- )
-
- # Collect chart IDs to associate with this dashboard
+ # Resolve the dashboard's chart membership from the imported
+ # position_json and apply it to the ORM relationship.
+ target_chart_ids: list[int] = []
for uuid in find_chart_uuids(config["position"]):
if uuid not in chart_ids:
continue
@@ -201,7 +205,31 @@ def _import(
overwrite
or (dashboard.id, chart_id) not in existing_relationships
):
- dashboard_chart_ids.append((dashboard.id, chart_id))
+ target_chart_ids.append(chart_id)
+
+ if overwrite:
+ # Replace the dashboard's chart membership entirely.
+ dashboard.slices = (
+ db.session.query(Slice)
+ .filter(Slice.id.in_(target_chart_ids))
+ .all()
+ if target_chart_ids
+ else []
+ )
+ # Flush eagerly so the M2M rows land in
+ # ``dashboard_slices`` before any subsequent
+ # autoflush fires an inner-flush event handler
+ # that would reset the relationship change.
+ db.session.flush()
+ elif target_chart_ids:
+ # Append only the new associations to existing ones.
+ new_slices = (
+ db.session.query(Slice)
+ .filter(Slice.id.in_(target_chart_ids))
+ .all()
+ )
+ dashboard.slices = list(dashboard.slices) + new_slices
+ db.session.flush()
# Handle tags using import_tag function
if feature_flag_manager.is_feature_enabled("TAGGING_SYSTEM"):
@@ -215,14 +243,6 @@ def _import(
db.session,
)
- # set ref in the dashboard_slices table
- if dashboard_chart_ids:
- values = [
- {"dashboard_id": dashboard_id, "slice_id": chart_id}
- for (dashboard_id, chart_id) in dashboard_chart_ids
- ]
- db.session.execute(dashboard_slices.insert(), values)
-
# Migrate any filter-box charts to native dashboard filters.
for dashboard in dashboards:
migrate_dashboard(dashboard)
diff --git a/superset/commands/dashboard/update.py b/superset/commands/dashboard/update.py
index dd81d96deeb1..b39ef4af9463 100644
--- a/superset/commands/dashboard/update.py
+++ b/superset/commands/dashboard/update.py
@@ -59,23 +59,31 @@ def __init__(self, model_id: int, data: dict[str, Any]):
def run(self) -> Model:
self.validate()
assert self._model is not None
- self.process_tab_diff()
- self.process_native_filter_diff()
-
- # Update tags
- if (tags := self._properties.pop("tags", None)) is not None:
- update_tags(ObjectType.dashboard, self._model.id, self._model.tags, tags)
-
- # Re-serialize position_json to escape 4-byte Unicode characters
- if position_json := self._properties.get("position_json"):
- self._properties["position_json"] = json.dumps(json.loads(position_json))
-
- dashboard = DashboardDAO.update(self._model, self._properties)
- if self._properties.get("json_metadata"):
- DashboardDAO.set_dash_metadata(
- dashboard,
- data=json.loads(self._properties.get("json_metadata", "{}")),
- )
+ # Suppress autoflush during the update body so that Continuum's
+ # before_flush baseline listener does not fire mid-operation while
+ # the session is only partially populated.
+ with db.session.no_autoflush:
+ self.process_tab_diff()
+ self.process_native_filter_diff()
+
+ # Update tags
+ if (tags := self._properties.pop("tags", None)) is not None:
+ update_tags(
+ ObjectType.dashboard, self._model.id, self._model.tags, tags
+ )
+
+ # Re-serialize position_json to escape 4-byte Unicode characters
+ if position_json := self._properties.get("position_json"):
+ self._properties["position_json"] = json.dumps(
+ json.loads(position_json)
+ )
+
+ dashboard = DashboardDAO.update(self._model, self._properties)
+ if self._properties.get("json_metadata"):
+ DashboardDAO.set_dash_metadata(
+ dashboard,
+ data=json.loads(self._properties.get("json_metadata", "{}")),
+ )
return dashboard
def validate(self) -> None:
diff --git a/superset/commands/dataset/duplicate.py b/superset/commands/dataset/duplicate.py
index 2be7be5690b9..8371610fe55b 100644
--- a/superset/commands/dataset/duplicate.py
+++ b/superset/commands/dataset/duplicate.py
@@ -52,6 +52,16 @@ def __init__(self, data: dict[str, Any]) -> None:
@transaction(on_error=partial(on_error, reraise=DatasetDuplicateFailedError))
def run(self) -> Model:
self.validate()
+ # Declare the high-level avenue before the duplicate touches
+ # the session. The change-record listener stamps
+ # ``version_transaction.action_kind = 'clone'`` so the new
+ # dataset's baseline records read as a clone in the timeline.
+ # Method-scoped import — defers the versioning bootstrap path
+ # out of this command's module-load graph; see ``changes.py``
+ # module docstring for the broader init-order rationale.
+ from superset.versioning.changes import ACTION_KIND_CLONE, ACTION_KIND_KEY
+
+ db.session.info[ACTION_KIND_KEY] = ACTION_KIND_CLONE
database_id = self._base_model.database_id
table_name = self._properties["table_name"]
owners = self._properties["owners"]
diff --git a/superset/commands/importers/v1/__init__.py b/superset/commands/importers/v1/__init__.py
index d8d010408761..5cf0f8ade990 100644
--- a/superset/commands/importers/v1/__init__.py
+++ b/superset/commands/importers/v1/__init__.py
@@ -86,6 +86,19 @@ def _get_uuids(cls) -> set[str]:
def run(self) -> None:
self.validate()
+ # Declare the high-level avenue before any session writes. The
+ # change-record listener reads this on its first after_flush
+ # for the resulting ``version_transaction`` row and stamps
+ # ``version_transaction.action_kind = 'import'``. Lets operators
+ # explain otherwise-confusing diffs ("Cleared default_filters")
+ # as "this was an import". See data-model.md §"Three dimensions".
+ # Method-scoped import — defers the versioning bootstrap path
+ # out of this command's module-load graph; see ``changes.py``
+ # module docstring for the broader init-order rationale.
+ from superset.versioning.changes import ACTION_KIND_IMPORT, ACTION_KIND_KEY
+
+ db.session.info[ACTION_KIND_KEY] = ACTION_KIND_IMPORT
+
try:
self._import(self._configs, self.overwrite, self.contents)
except CommandException:
diff --git a/superset/commands/importers/v1/assets.py b/superset/commands/importers/v1/assets.py
index 99e28b38f964..1b7b4b20b573 100644
--- a/superset/commands/importers/v1/assets.py
+++ b/superset/commands/importers/v1/assets.py
@@ -19,7 +19,6 @@
from marshmallow import Schema
from marshmallow.exceptions import ValidationError
-from sqlalchemy.sql import delete, insert
from superset import db
from superset.charts.schemas import ImportV1ChartSchema
@@ -49,7 +48,7 @@
from superset.extensions import feature_flag_manager
from superset.migrations.shared.native_filters import migrate_dashboard
from superset.models.core import Database
-from superset.models.dashboard import Dashboard, dashboard_slices
+from superset.models.dashboard import Dashboard
from superset.models.slice import Slice
from superset.models.sql_lab import SavedQuery
from superset.queries.saved_queries.schemas import ImportV1SavedQuerySchema
@@ -165,23 +164,33 @@ def _import( # noqa: C901
dashboard = import_dashboard(config, overwrite=overwrite)
# set ref in the dashboard_slices table
- dashboard_chart_ids: list[dict[str, int]] = []
+ # Use ORM-level reassignment instead of Core
+ # delete()/insert() so SQLAlchemy-Continuum's M2M tracker
+ # sees per-row changes through the ORM. Bulk DML via Core
+ # would emit a malformed INSERT into
+ # ``dashboard_slices_version`` (missing the composite-PK
+ # columns) — see the parallel rewrite in
+ # ``DatasetDAO.update_columns`` and the test-factory's
+ # ``delete_dashboard_slices_associations`` for the same
+ # reason.
+ slice_ids: list[int] = []
for uuid in find_chart_uuids(config["position"]):
if uuid not in chart_ids:
break
- chart_id = chart_ids[uuid]
- dashboard_chart_id = {
- "dashboard_id": dashboard.id,
- "slice_id": chart_id,
- }
- dashboard_chart_ids.append(dashboard_chart_id)
+ slice_ids.append(chart_ids[uuid])
- db.session.execute(
- delete(dashboard_slices).where(
- dashboard_slices.c.dashboard_id == dashboard.id
- )
+ dashboard.slices = (
+ db.session.query(Slice).filter(Slice.id.in_(slice_ids)).all()
+ if slice_ids
+ else []
)
- db.session.execute(insert(dashboard_slices).values(dashboard_chart_ids))
+ # Flush eagerly so the M2M rows land in
+ # ``dashboard_slices`` before any subsequent autoflush
+ # fires an inner-flush event handler that would reset
+ # the relationship change (cf. the SAWarning at
+ # ``superset/models/helpers.py`` re. "attribute history
+ # events accumulated ... have been reset").
+ db.session.flush()
# Handle tags using import_tag function
if feature_flag_manager.is_feature_enabled("TAGGING_SYSTEM"):
diff --git a/superset/config.py b/superset/config.py
index 0bb0cacb9f57..61e8bde5ea6f 100644
--- a/superset/config.py
+++ b/superset/config.py
@@ -1245,7 +1245,11 @@ class D3TimeFormat(TypedDict, total=False):
"origins": [
"https://tile.openstreetmap.org",
"https://tile.osm.ch",
- ]
+ ],
+ # Make the entity-version-history `ETag` header readable by cross-origin
+ # browser clients. Without this, `fetch()` callers cannot read the header
+ # even when CORS is otherwise permissive.
+ "expose_headers": ["ETag"],
}
# Sanitizes the HTML content used in markdowns to allow its rendering in a safe manner.
@@ -1425,6 +1429,21 @@ class D3TimeFormat(TypedDict, total=False):
# The limit for the Superset Meta DB when the feature flag ENABLE_SUPERSET_META_DB is on
SUPERSET_META_DB_LIMIT: int | None = 1000
+# Master switch for entity-version-history capture. Ships defaulted ``False``
+# so the versioning infrastructure (schema + Continuum wiring) lands inert:
+# no save writes shadow rows or a ``version_transaction``/``version_changes``
+# record, while the /versions/ endpoints stay available read-only (returning
+# empty). Set to ``True`` in ``superset_config.py`` (or via the env var of the
+# same name) to enable the before-flush listeners that drive capture.
+# The shipped default is meant to be flipped to on once capture has been
+# validated in production. The setting is an operational escape hatch, for
+# use when a versioning-induced regression needs a 30-second recovery
+# instead of revert-and-redeploy. It is not a feature flag, and it remains
+# in place as the permanent kill-switch.
+ENABLE_VERSIONING_CAPTURE: bool = utils.parse_boolean_string(
+ os.environ.get("ENABLE_VERSIONING_CAPTURE", "false")
+)
+
# Adds a warning message on sqllab save query and schedule query modals.
SQLLAB_SAVE_WARNING_MESSAGE = None
SQLLAB_SCHEDULE_WARNING_MESSAGE = None
diff --git a/superset/connectors/sqla/models.py b/superset/connectors/sqla/models.py
index 83ddbc3fcfe7..9e8050702d01 100644
--- a/superset/connectors/sqla/models.py
+++ b/superset/connectors/sqla/models.py
@@ -945,6 +945,15 @@ class TableColumn(AuditMixinNullable, ImportExportMixin, CertificationMixin, Mod
__tablename__ = "table_columns"
__table_args__ = (UniqueConstraint("table_id", "column_name"),)
+ # SPIKE (full-Continuum): ``TableColumn`` is Continuum-versioned again,
+ # with audit-field exclusions to suppress the per-column-per-save noise
+ # rows that ADR-004 flagged as Failure 3. ``changed_on`` refreshes on
+ # every parent dataset save even when the column itself wasn't
+ # user-edited; capturing it produced one shadow row per column per save
+ # with no user signal.
+ __versioned__: dict[str, Any] = {
+ "exclude": ["changed_on", "created_on", "changed_by_fk", "created_by_fk"]
+ }
id = Column(Integer, primary_key=True)
column_name = Column(String(255), nullable=False)
@@ -1190,6 +1199,10 @@ class SqlMetric(AuditMixinNullable, ImportExportMixin, CertificationMixin, Model
__tablename__ = "sql_metrics"
__table_args__ = (UniqueConstraint("table_id", "metric_name"),)
+ # SPIKE: same audit-field exclusions as TableColumn (see above).
+ __versioned__: dict[str, Any] = {
+ "exclude": ["changed_on", "created_on", "changed_by_fk", "created_by_fk"]
+ }
id = Column(Integer, primary_key=True)
metric_name = Column(String(255), nullable=False)
@@ -1285,9 +1298,18 @@ def data(self) -> dict[str, Any]:
sqlatable_user = DBTable(
"sqlatable_user",
metadata,
- Column("id", Integer, primary_key=True),
- Column("user_id", Integer, ForeignKey("ab_user.id", ondelete="CASCADE")),
- Column("table_id", Integer, ForeignKey("tables.id", ondelete="CASCADE")),
+ Column(
+ "user_id",
+ Integer,
+ ForeignKey("ab_user.id", ondelete="CASCADE"),
+ primary_key=True,
+ ),
+ Column(
+ "table_id",
+ Integer,
+ ForeignKey("tables.id", ondelete="CASCADE"),
+ primary_key=True,
+ ),
)
@@ -1318,6 +1340,33 @@ class SqlaTable(
owner_class = security_manager.user_model
__tablename__ = "tables"
+ # Exclude M2M association relationships: Continuum only captures FK columns on
+ # association INSERTs (not the auto-increment id), which breaks the NOT NULL PK.
+ # deleted_at exclusion will be added when soft delete is merged.
+ # Audit columns are auto-bumped on every save. Excluding them lets
+ # Continuum's is_modified() return False on no-op saves (e.g. owners-only
+ # edits) so we don't create empty version rows. version_transaction.user_id
+ # / issued_at preserve "who/when".
+ # The perm-string class (perm / schema_perm / catalog_perm) is derived
+ # security state, not user-authored content: permission maintenance
+ # rewrites it in bulk, and versioning it produced phantom transactions
+ # flooding the activity stream (one "updated" row per touched entity
+ # with no user edit — surfaced by the version-history UI, PR #40988).
+ # Excluding it also means a restore can't resurrect stale permission
+ # strings; the live, derived values stay authoritative.
+ __versioned__: dict[str, Any] = {
+ "exclude": [
+ "owners",
+ "row_level_security_filters",
+ "changed_on",
+ "created_on",
+ "changed_by_fk",
+ "created_by_fk",
+ "perm",
+ "schema_perm",
+ "catalog_perm",
+ ]
+ }
# Note this uniqueness constraint is not part of the physical schema, i.e., it does
# not exist in the migrations, but is required by `import_from_dict` to ensure the
@@ -1446,7 +1495,7 @@ def link(self) -> Markup:
name = escape(self.name)
url = escape(self.explore_url)
anchor = f'{name}'
- return Markup(anchor)
+ return Markup(anchor) # noqa: S704
def get_catalog_perm(self) -> str | None:
"""Returns catalog permission if present, database one otherwise."""
@@ -2220,17 +2269,25 @@ def text(self, clause: str) -> TextClause:
RLSFilterRoles = DBTable(
"rls_filter_roles",
metadata,
- Column("id", Integer, primary_key=True),
- Column("role_id", Integer, ForeignKey("ab_role.id"), nullable=False),
- Column("rls_filter_id", Integer, ForeignKey("row_level_security_filters.id")),
+ Column("role_id", Integer, ForeignKey("ab_role.id"), primary_key=True),
+ Column(
+ "rls_filter_id",
+ Integer,
+ ForeignKey("row_level_security_filters.id"),
+ primary_key=True,
+ ),
)
RLSFilterTables = DBTable(
"rls_filter_tables",
metadata,
- Column("id", Integer, primary_key=True),
- Column("table_id", Integer, ForeignKey("tables.id")),
- Column("rls_filter_id", Integer, ForeignKey("row_level_security_filters.id")),
+ Column("table_id", Integer, ForeignKey("tables.id"), primary_key=True),
+ Column(
+ "rls_filter_id",
+ Integer,
+ ForeignKey("row_level_security_filters.id"),
+ primary_key=True,
+ ),
)
diff --git a/superset/constants.py b/superset/constants.py
index 863d0cf82ba1..3525abc882c1 100644
--- a/superset/constants.py
+++ b/superset/constants.py
@@ -179,6 +179,8 @@ class RouteMethod: # pylint: disable=too-few-public-methods
"put_colors": "write",
"sync_permissions": "write",
"restore": "write",
+ "list_versions": "read",
+ "get_version": "read",
}
EXTRA_FORM_DATA_APPEND_KEYS = {
diff --git a/superset/daos/dataset.py b/superset/daos/dataset.py
index 1822fd711864..30def924c52a 100644
--- a/superset/daos/dataset.py
+++ b/superset/daos/dataset.py
@@ -275,6 +275,103 @@ def update(
return super().update(item, attributes)
+ @classmethod
+ def _validate_column_date_formats(
+ cls, property_columns: list[dict[str, Any]]
+ ) -> None:
+ for column in property_columns:
+ if column.get("python_date_format") is None:
+ continue
+ if not DatasetDAO.validate_python_date_format(column["python_date_format"]):
+ raise ValueError(
+ "python_date_format is an invalid date/timestamp format."
+ )
+
+ @classmethod
+ def _override_columns(
+ cls, model: SqlaTable, property_columns: list[dict[str, Any]]
+ ) -> None:
+ """Replace columns by natural key (``column_name``) — update in place
+ rather than delete-and-reinsert.
+
+ SPIKE (full-Continuum): the previous
+ delete-and-reinsert pattern produced overlapping shadow rows in
+ ``table_columns_version`` (the same ``column_name`` had a DELETE
+ shadow at tx N alongside an INSERT shadow at tx N for a fresh PK).
+ Continuum's ``Reverter`` couldn't unwind this on restore: its flush
+ ordering inserts the historical row before deleting the live one,
+ hitting the ``UNIQUE (table_id, column_name)`` constraint mid-flush
+ (ADR-004 Failure 1).
+
+ The natural-key upsert keeps PKs stable across metadata refresh.
+ Continuum captures only real field changes; new columns get plain
+ INSERT shadows; removed columns get plain DELETE shadows. No
+ natural-key collisions, so Reverter can restore cleanly.
+
+ Behaviour change vs. the previous implementation: PKs of unchanged
+ columns are preserved. Charts that reference columns by their
+ ``id`` continue to work across a metadata refresh — previously
+ such references would be invalidated.
+ """
+ existing_by_name = {c.column_name: c for c in model.columns}
+ incoming_by_name = {p["column_name"]: p for p in property_columns}
+
+ # Identity is the natural key here, never the payload's ``id``:
+ # setattr-ing an incoming ``id`` onto a name-matched row would
+ # rewrite a live primary key, and a renamed column whose payload
+ # still carries its old ``id`` would INSERT with a live PK while
+ # the old-named row is deleted in the same flush — INSERTs flush
+ # before DELETEs, so that collides on the PK / UNIQUE(table_id,
+ # column_name) constraints. ``table_id`` is pinned to *model*.
+ protected_keys = ("id", "table_id")
+
+ # Update columns present in both: in-place setattr.
+ for name, col in existing_by_name.items():
+ if name in incoming_by_name:
+ for key, value in incoming_by_name[name].items():
+ if key not in protected_keys:
+ setattr(col, key, value)
+
+ # Insert columns present only in incoming.
+ for name, properties in incoming_by_name.items():
+ if name not in existing_by_name:
+ cleaned = {
+ key: value
+ for key, value in properties.items()
+ if key not in protected_keys
+ }
+ db.session.add(TableColumn(**{**cleaned, "table_id": model.id}))
+
+ # Delete columns present only in existing.
+ for name, col in existing_by_name.items():
+ if name not in incoming_by_name:
+ db.session.delete(col)
+
+ @classmethod
+ def _upsert_columns(
+ cls, model: SqlaTable, property_columns: list[dict[str, Any]]
+ ) -> None:
+ columns_by_id = {column.id: column for column in model.columns}
+ property_columns_by_id = {
+ properties["id"]: properties
+ for properties in property_columns
+ if "id" in properties
+ }
+
+ for properties in property_columns:
+ if "id" not in properties:
+ db.session.add(TableColumn(**{**properties, "table_id": model.id}))
+
+ for properties in property_columns_by_id.values():
+ col = columns_by_id[properties["id"]]
+ for key, value in properties.items():
+ setattr(col, key, value)
+
+ ids_to_keep = property_columns_by_id.keys()
+ for col in model.columns:
+ if col.id not in ids_to_keep:
+ db.session.delete(col)
+
@classmethod
def update_columns(
cls,
@@ -290,64 +387,15 @@ def update_columns(
- If a column Dict does not have an `id` then we create a new metric.
- If there are extra columns on the metadata db that are not defined on the List
then we delete.
- """
-
- for column in property_columns:
- if (
- "python_date_format" in column
- and column["python_date_format"] is not None
- ):
- if not DatasetDAO.validate_python_date_format(
- column["python_date_format"]
- ):
- raise ValueError(
- "python_date_format is an invalid date/timestamp format."
- )
+ Uses individual ORM operations (not bulk) so that SQLAlchemy-Continuum
+ can capture each row change in the version history.
+ """
+ cls._validate_column_date_formats(property_columns)
if override_columns:
- db.session.query(TableColumn).filter(
- TableColumn.table_id == model.id
- ).delete(synchronize_session="fetch")
-
- db.session.bulk_insert_mappings(
- TableColumn,
- [
- {**properties, "table_id": model.id}
- for properties in property_columns
- ],
- )
+ cls._override_columns(model, property_columns)
else:
- columns_by_id = {column.id: column for column in model.columns}
-
- property_columns_by_id = {
- properties["id"]: properties
- for properties in property_columns
- if "id" in properties
- }
-
- db.session.bulk_insert_mappings(
- TableColumn,
- [
- {**properties, "table_id": model.id}
- for properties in property_columns
- if "id" not in properties
- ],
- )
-
- db.session.bulk_update_mappings(
- TableColumn,
- [
- {**columns_by_id[properties["id"]].__dict__, **properties}
- for properties in property_columns_by_id.values()
- ],
- )
-
- db.session.query(TableColumn).filter(
- TableColumn.id.in_(
- {column.id for column in model.columns}
- - property_columns_by_id.keys()
- )
- ).delete(synchronize_session="fetch")
+ cls._upsert_columns(model, property_columns)
@classmethod
def update_metrics(
@@ -363,6 +411,9 @@ def update_metrics(
- If a metric Dict does not have an `id` then we create a new metric.
- If there are extra metrics on the metadata db that are not defined on the List
then we delete.
+
+ Uses individual ORM operations (not bulk) so that SQLAlchemy-Continuum
+ can capture each row change in the version history.
"""
metrics_by_id = {metric.id: metric for metric in model.metrics}
@@ -373,28 +424,22 @@ def update_metrics(
if "id" in properties
}
- db.session.bulk_insert_mappings(
- SqlMetric,
- [
- {**properties, "table_id": model.id}
- for properties in property_metrics
- if "id" not in properties
- ],
- )
-
- db.session.bulk_update_mappings(
- SqlMetric,
- [
- {**metrics_by_id[properties["id"]].__dict__, **properties}
- for properties in property_metrics_by_id.values()
- ],
- )
-
- db.session.query(SqlMetric).filter(
- SqlMetric.id.in_(
- {metric.id for metric in model.metrics} - property_metrics_by_id.keys()
- )
- ).delete(synchronize_session="fetch")
+ # Insert new metrics
+ for properties in property_metrics:
+ if "id" not in properties:
+ db.session.add(SqlMetric(**{**properties, "table_id": model.id}))
+
+ # Update existing metrics
+ for properties in property_metrics_by_id.values():
+ metric = metrics_by_id[properties["id"]]
+ for key, value in properties.items():
+ setattr(metric, key, value)
+
+ # Delete removed metrics
+ ids_to_keep = property_metrics_by_id.keys()
+ for metric in model.metrics:
+ if metric.id not in ids_to_keep:
+ db.session.delete(metric)
@classmethod
def find_dataset_column(cls, dataset_id: int, column_id: int) -> TableColumn | None:
diff --git a/superset/daos/version.py b/superset/daos/version.py
new file mode 100644
index 000000000000..f73c7b4e347d
--- /dev/null
+++ b/superset/daos/version.py
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Backward-compat façade for the entity-versioning DAO surface.
+
+The actual implementation lives in :mod:`superset.versioning.queries`
+(read side: list/get/resolve/find/UUID derivation). This module
+re-exports it under a single ``VersionDAO`` class plus the module-level
+UUID helpers so existing callers keep working without changes. (The
+write side — restore + audit stamping — ships in a later PR; only the
+read surface is wired here.)
+
+New code should import from the versioning sub-modules directly.
+"""
+
+from __future__ import annotations
+
+from superset.versioning.queries import (
+ current_live_transaction_id,
+ current_live_version_uuid,
+ current_version_number,
+ derive_version_uuid,
+ derive_version_uuid as _derive_version_uuid, # noqa: F401
+ find_active_by_uuid,
+ get_version,
+ list_change_records_batch,
+ list_versions,
+ resolve_version_uuid,
+ VERSION_UUID_NAMESPACE,
+)
+
+# Re-exports for ``from superset.daos.version import …`` consumers.
+__all__ = [
+ "VERSION_UUID_NAMESPACE",
+ "VersionDAO",
+ "derive_version_uuid",
+]
+
+
+class VersionDAO:
+ """Thin façade over :mod:`superset.versioning.queries`.
+
+ Preserved as a single namespace for ergonomic access from API
+ handlers and command classes; the underlying functions are
+ importable directly from their respective sub-modules.
+ """
+
+ # --- read side (queries.py) -------------------------------------------
+ find_active_by_uuid = staticmethod(find_active_by_uuid)
+ current_version_number = staticmethod(current_version_number)
+ current_live_transaction_id = staticmethod(current_live_transaction_id)
+ current_live_version_uuid = staticmethod(current_live_version_uuid)
+ list_change_records_batch = staticmethod(list_change_records_batch)
+ list_versions = staticmethod(list_versions)
+ resolve_version_uuid = staticmethod(resolve_version_uuid)
+ get_version = staticmethod(get_version)
diff --git a/superset/dashboards/api.py b/superset/dashboards/api.py
index d67a45c8d27a..076528e219ba 100644
--- a/superset/dashboards/api.py
+++ b/superset/dashboards/api.py
@@ -119,7 +119,9 @@
TabsPayloadSchema,
thumbnail_query_schema,
)
-from superset.exceptions import ScreenshotImageNotAvailableException
+from superset.exceptions import (
+ ScreenshotImageNotAvailableException,
+)
from superset.extensions import event_logger, security_manager
from superset.models.dashboard import Dashboard
from superset.models.embedded_dashboard import EmbeddedDashboard
@@ -139,6 +141,14 @@
ScreenshotCachePayload,
)
from superset.utils.urls import get_url_path
+from superset.versioning.api_helpers import (
+ current_entity_etag_uuid,
+ current_entity_version_info,
+ get_version_endpoint,
+ list_versions_endpoint,
+)
+from superset.versioning.etag import set_version_etag
+from superset.versioning.schemas import VersionListItemSchema
from superset.views.base_api import (
BaseSupersetModelRestApi,
RelatedFieldFilter,
@@ -252,6 +262,8 @@ class DashboardRestApi(CustomTagsOptimizationMixin, BaseSupersetModelRestApi):
"put_chart_customizations",
"put_colors",
"export_as_example",
+ "list_versions",
+ "get_version",
}
resource_name = "dashboard"
allow_browser_login = True
@@ -434,6 +446,7 @@ def get_list(self, **kwargs: Any) -> Response:
GetFavStarIdsSchema,
EmbeddedDashboardResponseSchema,
DashboardScreenshotPostSchema,
+ VersionListItemSchema,
)
apispec_parameter_schemas = {
"get_delete_ids_schema": get_delete_ids_schema,
@@ -524,7 +537,10 @@ def get(
add_extra_log_payload(
dashboard_id=dash.id, action=f"{self.__class__.__name__}.get"
)
- return self.response(200, result=result)
+ return set_version_etag(
+ self.response(200, result=result),
+ current_entity_etag_uuid(Dashboard, dash.id, dash.uuid),
+ )
@expose("//datasets", methods=("GET",))
@protect()
@@ -808,6 +824,34 @@ def put(self, pk: int) -> Response:
$ref: '#/components/schemas/{{self.__class__.__name__}}.put'
last_modified_time:
type: number
+ old_version:
+ type: integer
+ nullable: true
+ description: >-
+ 0-based version_number of the live row before this
+ update. Unstable under retention pruning — see
+ old_transaction_id for a stable identifier.
+ new_version:
+ type: integer
+ nullable: true
+ description: >-
+ 0-based version_number of the newly-live row after
+ this update. Can equal old_version when no
+ versioned column changed, or when retention
+ pruning dropped an older closed row in the same
+ commit.
+ old_transaction_id:
+ type: integer
+ nullable: true
+ description: Continuum transaction_id of the live
+ row before this update. Stable across pruning.
+ new_transaction_id:
+ type: integer
+ nullable: true
+ description: Continuum transaction_id of the live
+ row after this update. Differs from
+ old_transaction_id when the update produced a new
+ version row.
400:
$ref: '#/components/responses/400'
401:
@@ -826,17 +870,32 @@ def put(self, pk: int) -> Response:
# This validates custom Schema with custom validations
except ValidationError as error:
return self.response_400(message=error.messages)
+
+ # Live version identifiers before the update (empty, and obtained without
+ # issuing any query, when ``ENABLE_VERSIONING_CAPTURE`` is off).
+ old_info = current_entity_version_info(Dashboard, pk)
+
try:
changed_model = UpdateDashboardCommand(pk, item).run()
last_modified_time = changed_model.changed_on.replace(
microsecond=0
).timestamp()
+ new_info = current_entity_version_info(
+ Dashboard, changed_model.id, changed_model.uuid
+ )
response = self.response(
200,
id=changed_model.id,
result=item,
last_modified_time=last_modified_time,
+ old_version=old_info.version,
+ new_version=new_info.version,
+ old_transaction_id=old_info.transaction_id,
+ new_transaction_id=new_info.transaction_id,
+ old_version_uuid=old_info.version_uuid,
+ new_version_uuid=new_info.version_uuid,
)
+ set_version_etag(response, new_info.version_uuid)
except DashboardNotFoundError:
response = self.response_404()
except DashboardForbiddenError:
@@ -2229,3 +2288,109 @@ def copy_dash(self, original_dash: Dashboard) -> Response:
).timestamp(),
},
)
+
+ @expose("/<uuid_str>/versions/", methods=("GET",))
+ @protect()
+ @safe
+ @statsd_metrics
+ @event_logger.log_this_with_context(
+ action=lambda self, *args, **kwargs: f"{self.__class__.__name__}.list_versions",
+ log_to_statsd=False,
+ )
+ def list_versions(self, uuid_str: str) -> Response:
+ """List version history for a dashboard.
+ ---
+ get:
+ summary: Return the version history for a dashboard
+ parameters:
+ - in: path
+ schema:
+ type: string
+ format: uuid
+ name: uuid_str
+ description: Dashboard UUID
+ responses:
+ 200:
+ description: Version history ordered by oldest first
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ result:
+ type: array
+ items:
+ $ref: '#/components/schemas/VersionListItemSchema'
+ count:
+ type: integer
+ 400:
+ $ref: '#/components/responses/400'
+ 401:
+ $ref: '#/components/responses/401'
+ 403:
+ $ref: '#/components/responses/403'
+ 404:
+ $ref: '#/components/responses/404'
+ """
+ return list_versions_endpoint(
+ self, Dashboard, uuid_str, access_kwarg="dashboard"
+ )
+
+ @expose(
+ "/<uuid_str>/versions/<version_uuid_str>/",
+ methods=("GET",),
+ )
+ @protect()
+ @safe
+ @statsd_metrics
+ @event_logger.log_this_with_context(
+ action=lambda self, *args, **kwargs: f"{self.__class__.__name__}.get_version", # noqa: E501
+ log_to_statsd=False,
+ )
+ def get_version(self, uuid_str: str, version_uuid_str: str) -> Response:
+ """Return the dashboard's state at a specific version.
+ ---
+ get:
+ summary: Read-only snapshot of the dashboard at a given version
+ parameters:
+ - in: path
+ schema:
+ type: string
+ format: uuid
+ name: uuid_str
+ description: Dashboard UUID
+ - in: path
+ schema:
+ type: string
+ format: uuid
+ name: version_uuid_str
+ description: Version UUID as returned by the list endpoint
+ responses:
+ 200:
+ description: Snapshot of the dashboard at the target version
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ result:
+ type: object
+ description: >-
+ The dashboard's scalar fields at the target version
+ (entity-specific keys), plus a `_version` block
+ with the version-level metadata.
+ properties:
+ _version:
+ $ref: '#/components/schemas/VersionListItemSchema'
+ 400:
+ $ref: '#/components/responses/400'
+ 401:
+ $ref: '#/components/responses/401'
+ 403:
+ $ref: '#/components/responses/403'
+ 404:
+ $ref: '#/components/responses/404'
+ """
+ return get_version_endpoint(
+ self, Dashboard, uuid_str, version_uuid_str, access_kwarg="dashboard"
+ )
diff --git a/superset/datasets/api.py b/superset/datasets/api.py
index ab7d254f2bc4..22c38db42b14 100644
--- a/superset/datasets/api.py
+++ b/superset/datasets/api.py
@@ -73,10 +73,21 @@
GetOrCreateDatasetSchema,
openapi_spec_methods_override,
)
-from superset.exceptions import SupersetSyntaxErrorException, SupersetTemplateException
+from superset.exceptions import (
+ SupersetSyntaxErrorException,
+ SupersetTemplateException,
+)
from superset.jinja_context import BaseTemplateProcessor, get_template_processor
from superset.utils import json
from superset.utils.core import parse_boolean_string, sanitize_cookie_token
+from superset.versioning.api_helpers import (
+ current_entity_etag_uuid,
+ current_entity_version_info,
+ get_version_endpoint,
+ list_versions_endpoint,
+)
+from superset.versioning.etag import set_version_etag
+from superset.versioning.schemas import VersionListItemSchema
from superset.views.base import DatasourceFilter
from superset.views.base_api import (
BaseSupersetModelRestApi,
@@ -111,6 +122,8 @@ class DatasetRestApi(BaseSupersetModelRestApi):
"get_or_create_dataset",
"warm_up_cache",
"get_drill_info",
+ "list_versions",
+ "get_version",
}
list_columns = [
"id",
@@ -299,6 +312,7 @@ class DatasetRestApi(BaseSupersetModelRestApi):
DatasetRelatedObjectsResponse,
DatasetDuplicateSchema,
GetOrCreateDatasetSchema,
+ VersionListItemSchema,
)
openapi_spec_methods = openapi_spec_methods_override
@@ -416,6 +430,40 @@ def put(self, pk: int) -> Response:
type: number
result:
$ref: '#/components/schemas/{{self.__class__.__name__}}.put'
+ old_version:
+ type: integer
+ nullable: true
+ description: >-
+ 0-based version_number of the live row before this
+ update (null if the dataset had no prior history).
+ Matches the ``version_number`` field of the list
+ versions endpoint. Unstable under retention
+ pruning — see ``old_transaction_id`` for a stable
+ identifier.
+ new_version:
+ type: integer
+ nullable: true
+ description: >-
+ 0-based version_number of the newly-live row after
+ this update. Can equal ``old_version`` when no
+ versioned column changed, or when retention
+ pruning dropped an older closed row in the same
+ commit.
+ old_transaction_id:
+ type: integer
+ nullable: true
+ description: >-
+ Continuum transaction_id of the live row before
+ this update. Stable across retention pruning.
+ new_transaction_id:
+ type: integer
+ nullable: true
+ description: >-
+ Continuum transaction_id of the live row after
+ this update. When this differs from
+ ``old_transaction_id`` the update produced a new
+ version row (regardless of whether ``new_version``
+ changed).
400:
$ref: '#/components/responses/400'
401:
@@ -439,17 +487,69 @@ def put(self, pk: int) -> Response:
# This validates custom Schema with custom validations
except ValidationError as error:
return self.response_400(message=error.messages)
+
+ # Live version identifiers before the update (empty + query-free when
+ # ``ENABLE_VERSIONING_CAPTURE`` is off).
+ old_info = current_entity_version_info(SqlaTable, pk)
+
try:
+ # Two commands, two commits, two Continuum transactions for an
+ # ``override_columns`` save — deliberately NOT merged into one
+ # transaction. A single-transaction design was attempted and
+ # reverted: ``DBEventLogger`` writes request logs through the
+ # SHARED scoped session and calls ``commit()`` /
+ # ``rollback()`` on it mid-request (superset/utils/log.py),
+ # so any save held uncommitted across a logged sub-action can
+ # be committed half-done (Postgres/MySQL) or rolled back
+ # entirely on a transient logger failure (SQLite's
+ # "database is locked"). Until the event logger gets its own
+ # session, per-command commit boundaries are the only shape
+ # whose failure modes are honest. Consequence the
+ # version-history UI must tolerate: one logical save can
+ # surface as two version transactions stamped the same second.
changed_model = UpdateDatasetCommand(pk, item, override_columns).run()
+ # Capture the post-update identifiers BEFORE the refresh:
+ # RefreshDatasetCommand commits its own transaction, so reading
+ # afterwards would attribute the refresh's version to the
+ # user's update (and old→new would span two transactions).
+ new_info = current_entity_version_info(
+ SqlaTable, changed_model.id, changed_model.uuid
+ )
+ etag_version_uuid = new_info.version_uuid
if override_columns:
RefreshDatasetCommand(pk).run()
- response = self.response(200, id=changed_model.id, result=item)
+ # The ETag must reflect the entity's *current live* version,
+ # which after the refresh is the refresh's transaction —
+ # re-read it rather than reusing the pre-refresh uuid.
+ etag_version_uuid = current_entity_etag_uuid(
+ SqlaTable, changed_model.id, changed_model.uuid
+ )
+ response = self.response(
+ 200,
+ id=changed_model.id,
+ result=item,
+ old_version=old_info.version,
+ new_version=new_info.version,
+ old_transaction_id=old_info.transaction_id,
+ new_transaction_id=new_info.transaction_id,
+ old_version_uuid=old_info.version_uuid,
+ new_version_uuid=new_info.version_uuid,
+ )
+ set_version_etag(response, etag_version_uuid)
except DatasetNotFoundError:
response = self.response_404()
except DatasetForbiddenError:
response = self.response_403()
except DatasetInvalidError as ex:
response = self.response_422(message=ex.normalized_messages())
+ except DatasetRefreshFailedError as ex:
+ logger.error(
+ "Error refreshing dataset during update %s: %s",
+ self.__class__.__name__,
+ str(ex),
+ exc_info=True,
+ )
+ response = self.response_422(message=str(ex))
except DatasetUpdateFailedError as ex:
logger.error(
"Error updating model %s: %s",
@@ -712,8 +812,9 @@ def refresh(self, pk: int) -> Response:
@safe
@statsd_metrics
@event_logger.log_this_with_context(
- action=lambda self, *args, **kwargs: f"{self.__class__.__name__}"
- ".detect_datetime_formats",
+ action=lambda self, *args, **kwargs: (
+ f"{self.__class__.__name__}.detect_datetime_formats"
+ ),
log_to_statsd=False,
)
def detect_datetime_formats(self, pk: int) -> Response:
@@ -794,8 +895,9 @@ def detect_datetime_formats(self, pk: int) -> Response:
@safe
@statsd_metrics
@event_logger.log_this_with_context(
- action=lambda self, *args, **kwargs: f"{self.__class__.__name__}"
- f".related_objects",
+ action=lambda self, *args, **kwargs: (
+ f"{self.__class__.__name__}.related_objects"
+ ),
log_to_statsd=False,
)
def related_objects(self, id_or_uuid: str) -> Response:
@@ -1053,8 +1155,9 @@ def import_(self) -> Response:
@safe
@statsd_metrics
@event_logger.log_this_with_context(
- action=lambda self, *args, **kwargs: f"{self.__class__.__name__}"
- f".get_or_create_dataset",
+ action=lambda self, *args, **kwargs: (
+ f"{self.__class__.__name__}.get_or_create_dataset"
+ ),
log_to_statsd=False,
)
def get_or_create_dataset(self) -> Response:
@@ -1266,7 +1369,10 @@ def get(self, id_or_uuid: str, **kwargs: Any) -> Response:
except SupersetTemplateException as ex:
return self.response(ex.status, message=str(ex))
- return self.response(200, **response)
+ return set_version_etag(
+ self.response(200, **response),
+ current_entity_etag_uuid(SqlaTable, table.id, table.uuid),
+ )
@expose("//drill_info/", methods=("GET",))
@protect()
@@ -1274,9 +1380,9 @@ def get(self, id_or_uuid: str, **kwargs: Any) -> Response:
@safe
@statsd_metrics
@event_logger.log_this_with_context(
- action=lambda self,
- *args,
- **kwargs: f"{self.__class__.__name__}.get_drill_info",
+ action=lambda self, *args, **kwargs: (
+ f"{self.__class__.__name__}.get_drill_info"
+ ),
log_to_statsd=False,
)
def get_drill_info(self, pk: int, **kwargs: Any) -> Response:
@@ -1411,3 +1517,114 @@ def render_item_list(item_list: list[dict[str, Any]]) -> list[dict[str, Any]]:
raise template_exception from ex
return data
+
+    # The ``uuid_str`` placeholder must match the method's parameter name: the
+    # matched URL segment is handed to the view as a keyword argument.
+    @expose("/<uuid_str>/versions/", methods=("GET",))
+    @protect()
+    @safe
+    @statsd_metrics
+    @event_logger.log_this_with_context(
+        action=lambda self, *args, **kwargs: (
+            f"{self.__class__.__name__}.list_versions"
+        ),
+        log_to_statsd=False,
+    )
+    def list_versions(self, uuid_str: str) -> Response:
+        """List version history for a dataset.
+        ---
+        get:
+          summary: Return the version history for a dataset
+          parameters:
+          - in: path
+            schema:
+              type: string
+              format: uuid
+            name: uuid_str
+            description: Dataset UUID
+          responses:
+            200:
+              description: Version history ordered by oldest first
+              content:
+                application/json:
+                  schema:
+                    type: object
+                    properties:
+                      result:
+                        type: array
+                        items:
+                          $ref: '#/components/schemas/VersionListItemSchema'
+                      count:
+                        type: integer
+            400:
+              $ref: '#/components/responses/400'
+            401:
+              $ref: '#/components/responses/401'
+            403:
+              $ref: '#/components/responses/403'
+            404:
+              $ref: '#/components/responses/404'
+        """
+        # Delegates to the shared helper; this method only binds the model and
+        # the access kwarg for datasets.
+        return list_versions_endpoint(
+            self, SqlaTable, uuid_str, access_kwarg="datasource"
+        )
+
+    # The placeholder names in the route must match this method's parameter
+    # names: the matched URL segments are handed to the view as keyword
+    # arguments.
+    @expose(
+        "/<uuid_str>/versions/<version_uuid_str>/",
+        methods=("GET",),
+    )
+    @protect()
+    @safe
+    @statsd_metrics
+    @event_logger.log_this_with_context(
+        action=lambda self, *args, **kwargs: (
+            f"{self.__class__.__name__}.get_version"
+        ),
+        log_to_statsd=False,
+    )
+    def get_version(self, uuid_str: str, version_uuid_str: str) -> Response:
+        """Return the dataset's state at a specific version.
+        ---
+        get:
+          summary: Read-only snapshot of the dataset at a given version
+          description: >-
+            Returns the dataset's scalar fields plus reconstructed
+            ``columns`` and ``metrics`` lists as they were at the target
+            version. Does not modify live state.
+          parameters:
+          - in: path
+            schema:
+              type: string
+              format: uuid
+            name: uuid_str
+            description: Dataset UUID
+          - in: path
+            schema:
+              type: string
+              format: uuid
+            name: version_uuid_str
+            description: Version UUID as returned by the list endpoint
+          responses:
+            200:
+              description: Snapshot of the dataset at the target version
+              content:
+                application/json:
+                  schema:
+                    type: object
+                    properties:
+                      result:
+                        type: object
+                        description: >-
+                          The dataset's scalar fields at the target version
+                          (entity-specific keys), plus `columns` / `metrics`
+                          as they were at that version, plus a `_version`
+                          block with the version-level metadata.
+                        properties:
+                          _version:
+                            $ref: '#/components/schemas/VersionListItemSchema'
+            400:
+              $ref: '#/components/responses/400'
+            401:
+              $ref: '#/components/responses/401'
+            403:
+              $ref: '#/components/responses/403'
+            404:
+              $ref: '#/components/responses/404'
+        """
+        # Delegates to the shared helper; this method only binds the model and
+        # the access kwarg for datasets.
+        return get_version_endpoint(
+            self, SqlaTable, uuid_str, version_uuid_str, access_kwarg="datasource"
+        )
diff --git a/superset/extensions/__init__.py b/superset/extensions/__init__.py
index e704a2a4048f..32ca2bf2c6ed 100644
--- a/superset/extensions/__init__.py
+++ b/superset/extensions/__init__.py
@@ -146,6 +146,31 @@ def init_app(self, app: Flask) -> None:
celery_app = celery.Celery()
csrf = CSRFProtect()
db = get_sqla_class()()
+
+# make_versioned() MUST be called immediately after db is constructed and before
+# any versioned model class is defined. Continuum patches the SQLAlchemy
+# metaclass at call time; models constructed before this call are silently skipped.
+from sqlalchemy_continuum import ( # noqa: E402
+ make_versioned,
+ versioning_manager as _continuum_manager,
+)
+
+from superset.versioning.factory import ( # noqa: E402
+ SkipUnmodifiedPlugin,
+ VersioningFlaskPlugin,
+ VersionTransactionFactory,
+)
+
+# Rename the transaction table from "transaction" (SQL reserved word) to
+# "version_transaction" via the custom factory before make_versioned() fires.
+_continuum_manager.transaction_cls = VersionTransactionFactory()
+
+# Runs at import time, as a module-level side effect. No user class is passed
+# (``user_cls=None``), and the two project plugins are installed in this order.
+# NOTE(review): "validity" is presumably Continuum's validity versioning
+# strategy — confirm against the pinned sqlalchemy-continuum documentation.
+make_versioned(
+    user_cls=None,
+    plugins=[VersioningFlaskPlugin(), SkipUnmodifiedPlugin()],
+    options={"strategy": "validity"},
+)
+
_event_logger: dict[str, Any] = {}
encrypted_field_factory = EncryptedFieldFactory()
event_logger = LocalProxy(lambda: _event_logger.get("event_logger"))
diff --git a/superset/initialization/__init__.py b/superset/initialization/__init__.py
index 738bfb22984c..d20838ab2197 100644
--- a/superset/initialization/__init__.py
+++ b/superset/initialization/__init__.py
@@ -612,6 +612,170 @@ def init_extensions(self) -> None:
# Surface exceptions during initialization of extensions
print(ex)
+    @staticmethod
+    def _remove_continuum_write_listeners() -> None:
+        """Unhook the write-side listeners SQLAlchemy-Continuum installed.
+
+        ``make_versioned()`` is executed unconditionally when
+        ``superset.extensions`` is imported, which registers Continuum's
+        mapper, session, and engine listeners — the ones that write shadow
+        rows and ``version_transaction`` rows on every flush. If only the
+        custom baseline/change-record listeners were skipped, those would
+        keep running and the shadow tables would keep growing even with the
+        kill-switch off, contradicting the documented contract.
+
+        Only a *targeted subset* of what
+        ``sqlalchemy_continuum.remove_versioning()`` does is performed here.
+        That helper additionally calls ``manager.reset()``, which clears
+        ``version_class_map``; ``version_class()`` would then silently hand
+        back the live model class and break the read-only ``/versions/``
+        endpoints that must keep working while capture is off.
+
+        The method is idempotent: it probes for one representative listener
+        and returns early when that listener is already gone, so repeated
+        app initializations in one process (test fixtures) do not raise on a
+        second removal.
+        """
+        # pylint: disable=import-outside-toplevel
+        import sqlalchemy as sa
+        from sqlalchemy_continuum import versioning_manager
+
+        def _insert_tracking_attached() -> bool:
+            # Representative probe: the mapper-level ``after_insert`` hook.
+            return sa.event.contains(
+                sa.orm.Mapper, "after_insert", versioning_manager.track_inserts
+            )
+
+        if not _insert_tracking_attached():
+            return  # already detached by a prior init
+
+        # Mapper- and session-level listeners go through Continuum's own
+        # removal helpers; the engine-level ones are removed one by one.
+        versioning_manager.remove_operations_tracking(sa.orm.Mapper)
+        versioning_manager.remove_session_tracking(sa.orm.session.Session)
+        for event_name, handler in (
+            ("before_execute", versioning_manager.track_association_operations),
+            ("rollback", versioning_manager.clear_connection),
+            (
+                "set_connection_execution_options",
+                versioning_manager.track_cloned_connections,
+            ),
+        ):
+            sa.event.remove(sa.engine.Engine, event_name, handler)
+
+        # Second line of defence: also switch Continuum's master option off.
+        # Every write listener consults ``manager.options['versioning']``
+        # before doing work (manager.py / unit_of_work.py), so a write
+        # listener added by a future Continuum release — one this detach does
+        # not know about — still becomes a no-op. ``version_class()`` reads
+        # ``version_class_map`` and ignores this option, so the read-only
+        # ``/versions/`` endpoints are unaffected.
+        versioning_manager.options["versioning"] = False
+
+        # Re-probe: if a Continuum upgrade renamed a handler, the removals
+        # above would silently miss and capture would stay half-on while we
+        # report "disabled". Surface that instead of booting in a
+        # contradictory state.
+        if _insert_tracking_attached():
+            logger.warning(
+                "versioning: Continuum write listeners still attached after "
+                "detach; capture may not be fully disabled. This usually means "
+                "the pinned sqlalchemy-continuum version changed how it "
+                "registers listeners."
+            )
+
+    def init_versioning(self) -> None:
+        """Register the versioning baseline and change-record listeners.
+
+        Must be called after all versioned model classes have been imported
+        so that ``VERSIONED_MODELS`` can be populated. No global
+        ``configure_mappers()`` call is made here; see the note in the body.
+
+        ``ENABLE_VERSIONING_CAPTURE`` (ships default ``False``) gates the two
+        before-flush listener registrations. The flag is operational, not
+        feature: with it off the infrastructure is inert (no save writes
+        shadow rows); flipping it on activates capture. The switch also lets
+        an operator who observes a versioning-induced regression (e.g. a
+        save-path slowdown attributable to the change-record listener)
+        disable capture in ``superset_config.py`` and restart workers — a
+        30-second recovery instead of revert-and-redeploy. Shadow tables
+        already created by the migration stay; they just stop accumulating
+        new rows.
+
+        The fallback here is ``False`` so that any app-factory path that
+        does not load ``superset.config`` (some test factories, embedded
+        use) stays inert by default rather than silently enabling capture.
+        """
+        if not self.config.get("ENABLE_VERSIONING_CAPTURE", False):
+            logger.warning(
+                "versioning: ENABLE_VERSIONING_CAPTURE is False; "
+                "skipping baseline + change-record listener registration "
+                "and detaching Continuum's write listeners. Save-path "
+                "capture is disabled; existing shadow tables and "
+                "/versions/ endpoints continue to work read-only."
+            )
+            self._remove_continuum_write_listeners()
+            return
+
+        from sqlalchemy_continuum import version_class
+
+        from superset.connectors.sqla.models import SqlaTable
+        from superset.models.dashboard import Dashboard
+        from superset.models.slice import Slice
+        from superset.versioning.baseline import (
+            register_baseline_listener,
+            VERSIONED_MODELS,
+        )
+
+        # Note: previously this block called ``configure_mappers()`` before
+        # importing the snapshot modules, believing their Table declarations
+        # needed ``version_transaction`` to exist. That's not actually the
+        # case — the snapshot tables reference ``version_transaction.id``
+        # only at the DB level (via the migration); the SQLAlchemy Table
+        # objects here intentionally declare ``transaction_id`` as a plain
+        # ``BigInteger`` without a FK to avoid the resolution dependency.
+        # Removing the global ``configure_mappers()`` avoids eagerly
+        # resolving relationships in other unrelated models (notably
+        # Flask-AppBuilder's AuditMixin on classes like Tag, whose
+        # ``created_by`` primaryjoin only resolves under specific class
+        # registry states in SQLAlchemy 1.4).
+        from superset.versioning.changes import register_change_record_listener
+
+        # All versioned models — Dashboard / Slice / SqlaTable plus their
+        # children (TableColumn / SqlMetric) and the dashboard_slices
+        # M2M — go through Continuum's shadow tables. The JSON-snapshot
+        # path that previously backed dataset / dashboard child diffs
+        # has been removed (full-Continuum spike).
+        for model_cls in (Dashboard, Slice, SqlaTable):
+            try:
+                version_class(model_cls)  # ensure Continuum wired this model
+                # Dedup guard: VERSIONED_MODELS is module-level state, and
+                # test fixtures initialize multiple Superset apps per
+                # process — without the check each re-init appends
+                # duplicate entries.
+                if model_cls not in VERSIONED_MODELS:
+                    VERSIONED_MODELS.append(model_cls)
+            except Exception:  # pylint: disable=broad-except
+                # Continuum failed to wire versioning for this model. We
+                # boot in degraded mode rather than failing startup, but a
+                # silent skip would hide that change capture has stopped for
+                # the model — so surface it at WARNING with the traceback.
+                logger.warning(
+                    "Versioning is not wired for %s; change capture will be "
+                    "skipped for it. This usually means Continuum did not "
+                    "register a version class for the model.",
+                    model_cls.__name__,
+                    exc_info=True,
+                )
+
+        register_baseline_listener()
+        register_change_record_listener()
+
+        # Retention pruning runs out-of-band as a scheduled Celery beat
+        # task, shipped as a separate stacked PR. The previous
+        # synchronous after_commit listener was retired so retention work
+        # doesn't add latency to user saves.
+
+
def init_app_in_ctx(self) -> None:
"""
Runs init logic in the context of the app
@@ -638,6 +802,9 @@ def init_app_in_ctx(self) -> None:
self.init_all_dependencies_and_extensions()
+ # Must run after all versioned models are imported and mappers configured.
+ self.init_versioning()
+
@staticmethod
def _log_config_warning(message: str) -> None:
top_banner = 80 * "-" + "\n" + 36 * " " + "WARNING\n" + 80 * "-"
diff --git a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py
new file mode 100644
index 000000000000..e31296d241f4
--- /dev/null
+++ b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py
@@ -0,0 +1,580 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""composite_pk_association_tables
+
+Replace the unused synthetic ``id INTEGER PRIMARY KEY`` on eight many-to-many
+association tables with a composite primary key on the two FK columns. Drops
+the now-redundant ``UniqueConstraint(fk1, fk2)`` on the two tables that
+already carry one. Pre-flight: deletes rows with NULL FK values (six tables
+allow them today) and any duplicate ``(fk1, fk2)`` rows.
+
+Motivated by SQLAlchemy-Continuum issue #129 (M2M restore against junction
+tables with surrogate PKs); also closes the data-integrity hole where six
+of the eight tables lacked DB-level uniqueness.
+
+Revision ID: 2bee73611e32
+Revises: 78a40c08b4be
+Create Date: 2026-05-01 23:36:34.050058
+
+"""
+
+import logging
+from typing import NamedTuple
+
+import sqlalchemy as sa
+from alembic import op
+from alembic.operations.base import BatchOperations
+from sqlalchemy import inspect
+from sqlalchemy.engine import Connection
+
+# revision identifiers, used by Alembic.
+revision = "2bee73611e32"
+down_revision = "78a40c08b4be"
+
+logger = logging.getLogger("alembic.env")
+
+
+class AssociationTable(NamedTuple):
+    """A junction table being converted from surrogate-id PK to composite-FK PK."""
+
+    # Name of the association table in the metadata database.
+    name: str
+    # First foreign-key column of the composite primary key.
+    fk1: str
+    # Second foreign-key column of the composite primary key.
+    fk2: str
+
+
+# Order is alphabetical by table name; deterministic for review and bisection.
+AFFECTED_TABLES: list[AssociationTable] = [
+ AssociationTable("dashboard_roles", "dashboard_id", "role_id"),
+ AssociationTable("dashboard_slices", "dashboard_id", "slice_id"),
+ AssociationTable("dashboard_user", "user_id", "dashboard_id"),
+ AssociationTable("report_schedule_user", "user_id", "report_schedule_id"),
+ AssociationTable("rls_filter_roles", "role_id", "rls_filter_id"),
+ AssociationTable("rls_filter_tables", "table_id", "rls_filter_id"),
+ AssociationTable("slice_user", "user_id", "slice_id"),
+ AssociationTable("sqlatable_user", "user_id", "table_id"),
+]
+
+# These two tables already declare ``UniqueConstraint(fk1, fk2)`` in the model;
+# the composite PK subsumes it, so the migration drops the redundant constraint.
+TABLES_WITH_PRE_EXISTING_UNIQUE: set[str] = {
+ "dashboard_slices",
+ "report_schedule_user",
+}
+
+# Documentation set: tables whose FK columns are nullable in their original
+# create_table migrations (``dashboard_roles.dashboard_id`` from revision
+# e11ccdd12658 is the most recent addition). ``report_schedule_user`` is the
+# only affected table created with both FK columns ``NOT NULL`` and is
+# intentionally absent here. This set is no longer consulted at runtime — the
+# upgrade now runs the NULL-FK cleanup on every affected table because the
+# DELETE is a cheap no-op when the columns are already NOT NULL, and that
+# eliminates the risk of bugs from this set going stale (the
+# ``dashboard_roles`` omission caught in PR review was exactly that bug).
+TABLES_WITH_NULLABLE_FKS: set[str] = {
+ "dashboard_roles",
+ "dashboard_slices",
+ "dashboard_user",
+ "rls_filter_roles",
+ "rls_filter_tables",
+ "slice_user",
+ "sqlatable_user",
+}
+
+
+def _check_no_external_fks_to_id(conn: Connection) -> None:
+    """Refuse to proceed if anything references a junction-table ``id`` column.
+
+    Walks every table reported by SQLAlchemy's ``Inspector`` (other than the
+    affected junction tables themselves) and raises ``RuntimeError`` on the
+    first foreign key whose target is the ``id`` column of one of the
+    tables in ``AFFECTED_TABLES``. The ``Inspector`` is used so the check is
+    dialect-agnostic across PostgreSQL, MySQL, and SQLite.
+
+    Scope limitation: ``Inspector.get_table_names()`` returns tables in the
+    connection's default schema only. On PostgreSQL deployments where Superset
+    metadata lives in a non-default schema, or on multi-schema deployments
+    that allow cross-schema FKs, an external FK in another schema would not
+    be detected. This is acceptable for the standard single-schema
+    deployment that Superset documents; operators with multi-schema
+    metadata should run the equivalent inventory query against
+    ``information_schema.referential_constraints`` themselves before
+    applying.
+
+    :param conn: connection the migration is running on
+    :raises RuntimeError: if an external FK references a junction ``id``
+    """
+    affected = {t.name for t in AFFECTED_TABLES}
+    insp = inspect(conn)
+    for table_name in insp.get_table_names():
+        if table_name in affected:
+            continue
+        for fk in insp.get_foreign_keys(table_name):
+            referred_table = fk["referred_table"]
+            if referred_table not in affected or "id" not in fk["referred_columns"]:
+                continue
+            # The reflected ``name`` key is present but ``None`` for unnamed
+            # constraints, so a ``.get`` default alone would print "None".
+            fk_name = fk.get("name") or "<unnamed>"
+            referred_columns = ", ".join(fk["referred_columns"])
+            raise RuntimeError(
+                f"Cannot drop synthetic id from {referred_table}: "
+                f"external FK {fk_name} on {table_name} "
+                f"references {referred_table}({referred_columns}). "
+                "Drop or migrate the referencing FK before applying this "
+                "migration."
+            )
+
+
+def _table_clause(t: AssociationTable) -> sa.sql.expression.TableClause:
+    """Return a lightweight ``TableClause`` for ``t``.
+
+    Only the three columns the helper queries touch are declared — ``id``
+    and the two FK columns — which lets the dedupe / cleanup / assert SQL be
+    built from SQLAlchemy core constructs instead of string interpolation.
+    """
+    column_names = ("id", t.fk1, t.fk2)
+    return sa.table(t.name, *(sa.column(col_name) for col_name in column_names))
+
+
+def _delete_null_fk_rows(conn: Connection, t: AssociationTable) -> int:
+    """Remove every row of ``t.name`` whose ``t.fk1`` or ``t.fk2`` is NULL.
+
+    Primary-key columns must be NOT NULL, so any surviving NULL-FK row would
+    make the later PK-add fail with a cryptic constraint violation. The
+    cleanup is run unconditionally on every affected table — see
+    ``TABLES_WITH_NULLABLE_FKS`` above for the rationale.
+
+    :returns: the number of rows deleted
+    """
+    tbl = _table_clause(t)
+    has_null_fk = sa.or_(tbl.c[t.fk1].is_(None), tbl.c[t.fk2].is_(None))
+    deleted = conn.execute(sa.delete(tbl).where(has_null_fk)).rowcount or 0
+    if deleted:
+        logger.warning(
+            "Deleted %d row(s) with NULL FK from %s before composite-PK promotion",
+            deleted,
+            t.name,
+        )
+    return deleted
+
+
+def _dedupe_by_min_id(conn: Connection, t: AssociationTable) -> int:
+    """Collapse duplicate ``(t.fk1, t.fk2)`` rows of ``t.name`` to one each.
+
+    For every FK pair the row with ``MIN(id)`` survives and all other rows
+    of that pair are deleted; the number of deleted rows is returned.
+
+    The surviving ids are read through a named derived table (``keep_min``)
+    instead of sitting directly inside ``NOT IN``: MySQL rejects
+    ``DELETE FROM t WHERE id NOT IN (SELECT MIN(id) FROM t GROUP BY ...)``
+    with ERROR 1093 unless the inner SELECT is materialized through a
+    derived table, and SQLAlchemy's ``.subquery()`` produces that wrap.
+
+    A sample of up to 10 discarded ``(fk1, fk2, id)`` tuples is fetched
+    before the delete runs and, when anything was deleted, logged at WARN so
+    operators can audit which rows were dropped. Keeping ``MIN(id)``
+    preserves the original row, which is correct in practice but discards
+    any later, semantically-identical re-grants.
+    """
+    tbl = _table_clause(t)
+
+    survivors = (
+        sa.select(sa.func.min(tbl.c.id).label("keep_id"))
+        .group_by(tbl.c[t.fk1], tbl.c[t.fk2])
+        .subquery("keep_min")
+    )
+    is_duplicate = tbl.c.id.notin_(sa.select(survivors.c.keep_id))
+
+    # Take the audit sample first: after the DELETE the rows no longer exist.
+    doomed_sample = list(
+        conn.execute(
+            sa.select(tbl.c[t.fk1], tbl.c[t.fk2], tbl.c.id)
+            .where(is_duplicate)
+            .limit(10)
+        )
+    )
+
+    removed = conn.execute(sa.delete(tbl).where(is_duplicate)).rowcount or 0
+    if removed:
+        logger.warning(
+            "Deduped %d duplicate row(s) from %s; sample of discarded "
+            "(%s, %s, id) tuples (up to 10): %s",
+            removed,
+            t.name,
+            t.fk1,
+            t.fk2,
+            doomed_sample,
+        )
+    return removed
+
+
+def _assert_no_duplicates(conn: Connection, t: AssociationTable) -> None:
+    """Fail loudly if ``t.name`` still holds duplicate ``(t.fk1, t.fk2)`` pairs.
+
+    Meant to run right after ``_dedupe_by_min_id``: a dedupe that silently
+    did nothing on some dialect (e.g., a MySQL syntax issue) is turned into
+    an actionable ``RuntimeError`` here, before the PK-add fires with a
+    less-helpful constraint-violation message.
+
+    :raises RuntimeError: if at least one duplicate group remains
+    """
+    tbl = _table_clause(t)
+    # One row per FK pair that occurs more than once.
+    groups_with_dupes = (
+        sa.select(sa.literal(1))
+        .select_from(tbl)
+        .group_by(tbl.c[t.fk1], tbl.c[t.fk2])
+        .having(sa.func.count() > 1)
+        .subquery("duplicate_groups")
+    )
+    how_many = sa.select(sa.func.count()).select_from(groups_with_dupes)
+    remaining = conn.scalar(how_many) or 0
+    if not remaining:
+        return
+    raise RuntimeError(
+        f"Dedupe failed for {t.name}: {remaining} duplicate "
+        f"({t.fk1}, {t.fk2}) groups remain after _dedupe_by_min_id. "
+        f"Check the dedupe SQL for dialect {conn.dialect.name}."
+    )
+
+
+def _build_pre_upgrade_table(
+    insp: sa.engine.reflection.Inspector,
+    t: AssociationTable,
+    fks: list[dict] | None = None,
+) -> sa.Table:
+    """Describe the pre-upgrade schema of ``t`` as a ``Table``, minus its UNIQUE.
+
+    The result is intended as the ``copy_from`` argument of
+    ``batch_alter_table``: because it carries no
+    ``UniqueConstraint(t.fk1, t.fk2)``, the rebuilt table omits the unnamed
+    UNIQUE constraint deterministically across dialects (SQLite reflects
+    unnamed UNIQUEs with ``name=None``, defeating the standard
+    ``batch_op.drop_constraint(name)`` path).
+
+    Column types, nullability and FK targets are taken from the live
+    database. For each FK, the original constraint name and ``ondelete``
+    action are carried over when reflection reports them.
+
+    *fks* lets a caller hand in an already-captured ``get_foreign_keys``
+    result. The MySQL upgrade path drops the live FK constraints before
+    building this table, so re-reflecting here would only see them via the
+    Inspector's per-instance ``info_cache`` — an implementation detail, not
+    a contract. Passing the pre-drop list makes that dependency explicit.
+    """
+    metadata = sa.MetaData()
+    reflected_fks = insp.get_foreign_keys(t.name) if fks is None else fks
+
+    # Index the FK descriptions by each local column they constrain.
+    fks_by_column: dict[str, list[dict]] = {}
+    for fk in reflected_fks:
+        for constrained in fk["constrained_columns"]:
+            fks_by_column.setdefault(constrained, []).append(fk)
+
+    def _as_foreign_key(fk: dict, column_name: str) -> sa.ForeignKey:
+        # Pair the local column with the referred column at the same position.
+        position = fk["constrained_columns"].index(column_name)
+        referred = f"{fk['referred_table']}.{fk['referred_columns'][position]}"
+        fk_kwargs = {}
+        ondelete = fk.get("options", {}).get("ondelete")
+        if ondelete:
+            fk_kwargs["ondelete"] = ondelete
+        if fk.get("name"):
+            fk_kwargs["name"] = fk["name"]
+        return sa.ForeignKey(referred, **fk_kwargs)
+
+    columns: list[sa.Column] = []
+    for reflected in insp.get_columns(t.name):
+        col_name = reflected["name"]
+        kwargs = {"nullable": reflected.get("nullable", True)}
+        if col_name == "id":
+            # The surrogate key is still the primary key before the upgrade.
+            kwargs["primary_key"] = True
+            kwargs["autoincrement"] = True
+        foreign_keys = [
+            _as_foreign_key(fk, col_name) for fk in fks_by_column.get(col_name, [])
+        ]
+        columns.append(
+            sa.Column(col_name, reflected["type"], *foreign_keys, **kwargs)
+        )
+    return sa.Table(t.name, metadata, *columns)
+
+
+def _drop_redundant_unique_by_name(
+    conn: Connection, insp: sa.engine.reflection.Inspector, t: AssociationTable
+) -> None:
+    """Drop the redundant ``UNIQUE(fk1, fk2)`` constraint by its reflected
+    name on PostgreSQL / MySQL.
+
+    The two tables in ``TABLES_WITH_PRE_EXISTING_UNIQUE`` carry a UNIQUE
+    constraint that the composite primary key subsumes. PostgreSQL and
+    MySQL both give UNIQUE constraints a name (``<table>_<cols>_key`` on
+    Postgres; an auto-generated name or the explicit ``uq_*`` we may have
+    given it on MySQL), so they're reflectable by name. SQLite is
+    handled separately via ``recreate="always"`` + ``copy_from`` because
+    it reflects unnamed UNIQUEs with ``name=None``.
+
+    No-op if no matching UNIQUE is found (defensive — re-runs after a
+    partial application should not error).
+
+    ``conn`` is accepted but not used by the body; the drop is issued
+    through ``op`` and the lookup through ``insp``.
+    """
+    for uc in insp.get_unique_constraints(t.name):
+        # Column order is irrelevant for the match, hence the set comparison;
+        # constraints reflected without a name cannot be dropped by name.
+        if set(uc.get("column_names", [])) == {t.fk1, t.fk2} and uc.get("name"):
+            op.drop_constraint(uc["name"], t.name, type_="unique")
+            # Only the first matching constraint is dropped.
+            return
+
+
+# MySQL ON DELETE actions that the downgrade re-create loop is allowed
+# to interpolate into raw SQL. The reflected value comes from MySQL's
+# information_schema (so not user input), but a whitelist eliminates
+# the "what if an unexpected value appears" question entirely. The
+# four entries are the SQL-standard referential actions minus
+# SET DEFAULT, which is intentionally excluded because InnoDB silently
+# downgrades it to NO ACTION.
+_VALID_ONDELETE_ACTIONS: frozenset[str] = frozenset(
+ {"CASCADE", "SET NULL", "RESTRICT", "NO ACTION"}
+)
+
+
+def _enforce_not_null_for_sqlite(
+ batch_op: BatchOperations, t: AssociationTable, conn: Connection
+) -> None:
+ """Force ``NOT NULL`` on the FK columns post-PK-promotion on SQLite only.
+
+ SQLite has a long-standing quirk: composite ``PRIMARY KEY`` does not
+ promote constituent columns to ``NOT NULL`` (only ``INTEGER PRIMARY KEY``
+ does). PostgreSQL and MySQL implicitly promote the PK columns to
+ ``NOT NULL`` when the constraint is added, making the explicit
+ ``alter_column`` redundant there.
+
+ Skipping the ``alter_column`` on MySQL is also functionally required:
+ MySQL 8 rejects ``ALTER COLUMN`` on a column that participates in a
+ foreign key constraint with ``ERROR 1832 (HY000): Cannot change column
+ 'X': used in a foreign key constraint 'Y'`` whenever the table has
+ data — even when the only change is ``NULL`` → ``NOT NULL`` and the
+ column is already part of a freshly-added composite primary key (which
+ InnoDB has just made implicitly ``NOT NULL`` anyway). The error fires
+ on populated tables but not on empty ones, which is why CI's
+ ``test-mysql`` shard (fresh schema) didn't catch this and a real
+ production-shaped install does.
+
+ Only SQLite still needs the explicit step, and SQLite has no FK
+ enforcement objection.
+ """
+ if conn.dialect.name == "sqlite":
+ batch_op.alter_column(t.fk1, existing_type=sa.Integer, nullable=False)
+ batch_op.alter_column(t.fk2, existing_type=sa.Integer, nullable=False)
+
+
+def upgrade() -> None:
+ conn = op.get_bind()
+ _check_no_external_fks_to_id(conn)
+ insp = inspect(conn)
+
+ for t in AFFECTED_TABLES:
+ # Resumability guard: on MySQL every DDL statement auto-commits, so
+ # a failure at table N of 8 leaves tables 1..N-1 already converted
+ # while ``alembic_version`` is still un-stamped. Without this guard
+ # a re-run would fail at table 1 (``drop_column("id")`` on a table
+ # that no longer has ``id``), and ``downgrade`` can't run either
+ # (the revision was never stamped) — recovery would need manual
+ # surgery. A converted table is identified by the absent ``id``
+ # column; skipping it makes re-running the upgrade safe on every
+ # dialect (Postgres/SQLite wrap the migration in a transaction, so
+ # the guard is simply never hit there).
+ if "id" not in {c["name"] for c in insp.get_columns(t.name)}:
+ logger.info(
+ "%s: already converted (no surrogate id column); skipping",
+ t.name,
+ )
+ continue
+
+ # Run NULL-FK cleanup unconditionally: it is a no-op DELETE on tables
+ # whose FK columns are already NOT NULL (cheap), and skipping it on a
+ # table whose FK was nullable would leave the PK-add to fail with a
+ # cryptic constraint violation. Cf. ``TABLES_WITH_NULLABLE_FKS`` above
+ # for documentation of which tables are known to have nullable FKs.
+ _delete_null_fk_rows(conn, t)
+ _dedupe_by_min_id(conn, t)
+ _assert_no_duplicates(conn, t)
+
+ # Two tables (``dashboard_slices``, ``report_schedule_user``)
+ # carry a redundant ``UNIQUE(fk1, fk2)`` that the composite PK
+ # subsumes. Three dialect-specific paths:
+ #
+ # * **PostgreSQL** — the UNIQUE constraint has a stable
+ # reflected name (Postgres default convention), so we
+ # ``DROP CONSTRAINT`` by name and then run the structural
+ # change as direct ALTER. This avoids the full-table copy
+ # that ``recreate="always"`` would trigger
+ # (``CREATE TABLE AS SELECT → DROP → RENAME``), holding
+ # ``ACCESS EXCLUSIVE`` only for the (much shorter) PK
+ # index build instead of the full copy duration.
+ #
+ # * **MySQL** — InnoDB binds the FK constraints to the
+ # redundant UNIQUE's underlying index for back-reference,
+ # so a direct ``DROP CONSTRAINT`` of the UNIQUE raises
+ # ``ERROR 1553``. Use ``recreate="always"`` to rebuild the
+ # table without the UNIQUE; drop the FKs first to dodge
+ # the ``ERROR 1826`` (duplicate FK constraint name) that
+ # the temp-table phase would otherwise provoke. The FKs
+ # are re-created automatically as part of ``copy_from``.
+ #
+ # * **SQLite** — unnamed UNIQUE constraints reflect with
+ # ``name=None`` and can't be dropped by name. Use
+ # ``recreate="always"`` + ``copy_from`` (omits UNIQUE).
+ # SQLite always rebuilds for PK changes anyway, so the
+ # recreate isn't extra cost there.
+ if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE:
+ if conn.dialect.name == "postgresql":
+ _drop_redundant_unique_by_name(conn, insp, t)
+ with op.batch_alter_table(t.name) as batch_op:
+ batch_op.drop_column("id")
+ batch_op.create_primary_key(f"pk_{t.name}", [t.fk1, t.fk2])
+ _enforce_not_null_for_sqlite(batch_op, t, conn)
+ else:
+ # Capture the FK list BEFORE dropping: the copy_from table
+ # below must embed these constraints, and re-reflecting
+ # after the drop only works via the Inspector's
+ # per-instance info_cache (see _build_pre_upgrade_table).
+ pre_drop_fks = insp.get_foreign_keys(t.name)
+ if conn.dialect.name == "mysql":
+ for fk in pre_drop_fks:
+ if fk_name := fk.get("name"):
+ op.drop_constraint(fk_name, t.name, type_="foreignkey")
+ with op.batch_alter_table(
+ t.name,
+ recreate="always",
+ copy_from=_build_pre_upgrade_table(insp, t, fks=pre_drop_fks),
+ ) as batch_op:
+ batch_op.drop_column("id")
+ batch_op.create_primary_key(f"pk_{t.name}", [t.fk1, t.fk2])
+ _enforce_not_null_for_sqlite(batch_op, t, conn)
+ else:
+ with op.batch_alter_table(t.name) as batch_op:
+ batch_op.drop_column("id")
+ batch_op.create_primary_key(f"pk_{t.name}", [t.fk1, t.fk2])
+ _enforce_not_null_for_sqlite(batch_op, t, conn)
+
+
+def downgrade() -> None:
+ # Inverse order: undo upgrade transformations from last-applied to
+ # first-applied. Within each table, drop the composite PK, restore the
+ # surrogate ``id`` column, and re-add the original ``UNIQUE`` constraint
+ # on the two tables that previously carried one.
+ #
+ # Note: FK columns remain NOT NULL after downgrade (intentional asymmetry
+ # — see UPDATING.md). Restoring the original nullable state would require
+ # an explicit ``alter_column`` per FK per table for no operator value;
+ # junction-table NULL FKs were always meaningless under ``secondary=``
+ # semantics.
+ # The downgrade names the restored PK ``<table>_pkey`` (matching Postgres'
+ # default constraint-naming convention, which was the original constraint
+ # name before this migration ran) so a downgrade-then-upgrade round-trip
+ # doesn't collide on the upgrade's ``pk_<table>`` name.
+ #
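+ # Adding a NOT NULL ``id`` column to a table with existing rows requires
+ # a default that fires on the existing rows. ``sa.Identity()`` (Postgres
+ # 10+) and ``sa.Sequence`` (with explicit nextval) both backfill existing
+ # rows during ALTER TABLE; bare ``autoincrement=True`` does not.
+ # ``Identity`` is the modern choice for the portable path below. MySQL
+ # is the exception: ``Identity`` on a non-PK column add does not emit
+ # ``AUTO_INCREMENT`` there, so MySQL is routed through the raw-SQL
+ # ``_downgrade_mysql_table`` instead.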
+ conn = op.get_bind()
+ insp = inspect(conn)
+ is_mysql = conn.dialect.name == "mysql"
+ for t in reversed(AFFECTED_TABLES):
+ if is_mysql:
+ _downgrade_mysql_table(insp, t)
+ else:
+ with op.batch_alter_table(t.name) as batch_op:
+ batch_op.drop_constraint(f"pk_{t.name}", type_="primary")
+ batch_op.add_column(
+ sa.Column(
+ "id",
+ sa.Integer,
+ sa.Identity(always=False),
+ nullable=False,
+ )
+ )
+ batch_op.create_primary_key(f"{t.name}_pkey", ["id"])
+ if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE:
+ batch_op.create_unique_constraint(
+ f"uq_{t.name}_{t.fk1}_{t.fk2}", [t.fk1, t.fk2]
+ )
+
+
+def _downgrade_mysql_table(
+ insp: sa.engine.reflection.Inspector, t: AssociationTable
+) -> None:
+ """MySQL-specific downgrade for one table.
+
+ Two MySQL quirks force a dialect-specific path here:
+
+ 1. **ERROR 1553 — ``Cannot drop index 'PRIMARY': needed in a foreign
+ key constraint``**. InnoDB uses the composite PK index to back the
+ FK on the leftmost column. Dropping the PK before the FKs orphans
+ that backing index. PostgreSQL and SQLite do not require an index
+ on the referencing columns of an FK and don't need this dance. We
+ drop the FKs first and re-add them after the structural change.
+
+ 2. **``Identity(always=False)`` on a non-PK column add does not emit
+ ``AUTO_INCREMENT`` on MySQL.** SQLAlchemy 1.4 only emits
+ ``AUTO_INCREMENT`` when the column has both ``Identity()`` and
+ ``primary_key=True`` at create time. Our portable path adds the
+ column first, then creates the PK separately — which works on
+ Postgres (the column gets ``GENERATED BY DEFAULT AS IDENTITY``)
+ and SQLite (``INTEGER PRIMARY KEY`` becomes a rowid alias) but
+ leaves MySQL without auto-generation, so existing rows can't be
+ backfilled and future ``INSERT`` statements fail with
+ ``Field 'id' doesn't have a default value``. The combined
+ ``DROP PRIMARY KEY, ADD COLUMN AUTO_INCREMENT, ADD PRIMARY KEY``
+ in a single ALTER statement is the canonical MySQL idiom: MySQL
+ backfills existing rows with sequential values and the column
+ remains auto-incrementing for future inserts.
+
+ Raw SQL is unavoidable here — there is no SQLAlchemy core equivalent
+ for the combined-ALTER form, and the constitution allows raw SQL for
+ dialect-specific DDL with no programmatic equivalent (preferring
+ triple-quoted strings for legibility).
+
+ Belt-and-braces guard: ``t.name`` is interpolated as a backtick-quoted
+ identifier in the ALTER statements below. The value comes from
+ ``AFFECTED_TABLES`` (a module-level literal), so SQL injection is
+ structurally precluded. The explicit ``allowed`` check here makes
+ that invariant load-bearing rather than implicit, so a future
+ refactor that loosens the call-site can't slip past review.
+ """
+ allowed = {a.name for a in AFFECTED_TABLES}
+ if t.name not in allowed:
+ raise RuntimeError(
+ f"Refusing to ALTER unknown table {t.name!r}: "
+ f"only AFFECTED_TABLES entries may flow through this path."
+ )
+
+ fks = insp.get_foreign_keys(t.name)
+
+ for fk in fks:
+ if fk_name := fk.get("name"):
+ op.execute(f"ALTER TABLE `{t.name}` DROP FOREIGN KEY `{fk_name}`")
+
+ op.execute(
+ f"""
+ ALTER TABLE `{t.name}`
+ DROP PRIMARY KEY,
+ ADD COLUMN id INT NOT NULL AUTO_INCREMENT,
+ ADD PRIMARY KEY (id)
+ """
+ )
+
+ if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE:
+ op.execute(
+ f"""
+ ALTER TABLE `{t.name}`
+ ADD UNIQUE INDEX `uq_{t.name}_{t.fk1}_{t.fk2}`
+ (`{t.fk1}`, `{t.fk2}`)
+ """
+ )
+
+ for fk in fks:
+ # Guard the FK name for symmetry with the drop loop above.
+ # MySQL/InnoDB always reflects a name for FK constraints
+ # (auto-assigning ``<table>_ibfk_<n>`` if none was specified),
+ # so this branch is defensive rather than reachable in practice.
+ fk_name = fk.get("name")
+ if not fk_name:
+ continue
+ ondelete = fk.get("options", {}).get("ondelete")
+ # Defensive whitelist: ``ondelete`` is reflected from MySQL's
+ # information_schema (not user input), but interpolating it
+ # into raw SQL without a check leaves a "what if an
+ # unexpected value appears" footgun. ``_VALID_ONDELETE_ACTIONS``
+ # lists the four accepted actions (SET DEFAULT is deliberately
+ # left out); reject anything else loudly.
+ if ondelete and ondelete.upper() not in _VALID_ONDELETE_ACTIONS:
+ raise RuntimeError(
+ f"Unexpected ON DELETE action {ondelete!r} reflected from "
+ f"{t.name}.{fk_name}; refusing to interpolate into raw SQL."
+ )
+ ondelete_clause = f" ON DELETE {ondelete}" if ondelete else ""
+ local_cols = ", ".join(f"`{c}`" for c in fk["constrained_columns"])
+ ref_cols = ", ".join(f"`{c}`" for c in fk["referred_columns"])
+ op.execute(
+ f"""
+ ALTER TABLE `{t.name}`
+ ADD CONSTRAINT `{fk_name}`
+ FOREIGN KEY ({local_cols})
+ REFERENCES `{fk["referred_table"]}` ({ref_cols})
+ {ondelete_clause}
+ """
+ )
diff --git a/superset/migrations/versions/2026-05-28_19-50_56cd24c07170_add_versioning_tables.py b/superset/migrations/versions/2026-05-28_19-50_56cd24c07170_add_versioning_tables.py
new file mode 100644
index 000000000000..141edafcec4c
--- /dev/null
+++ b/superset/migrations/versions/2026-05-28_19-50_56cd24c07170_add_versioning_tables.py
@@ -0,0 +1,567 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""add_versioning_tables
+
+Creates the full schema backing entity versioning in a single
+migration:
+
+1. ``version_transaction`` — audit log keyed by Continuum's per-flush
+ transaction id (plus a Postgres-specific id sequence).
+2. **Parent shadow tables** mirroring each versioned entity's columns:
+ ``dashboards_version`` / ``slices_version`` / ``tables_version``.
+3. ``version_changes`` — field-level diff log keyed to a
+ ``(transaction, entity)`` pair; each row describes one atomic change
+ (one field or one child-collection element) that occurred during a
+ save.
+4. **Child shadow tables** for the collections Continuum auto-registers
+ when ``__versioned__`` is applied to ``TableColumn`` / ``SqlMetric``
+ and the ``slices`` exclude is removed from
+ ``Dashboard.__versioned__``: ``table_columns_version`` /
+ ``sql_metrics_version`` / ``dashboard_slices_version``.
+
+All shadow tables follow the validity-strategy shape (mirrored columns
++ ``transaction_id`` / ``end_transaction_id`` / ``operation_type``
+bookkeeping with FKs to ``version_transaction.id``). The current
+version row has ``end_transaction_id = NULL``.
+
+This migration replaces three iterative migrations from the spike phase
+(``56cd24c07170``, ``e1f3c5a7b9d0``, ``f7a2b3c4d5e6``) that captured the
+same schema in three steps as the feature was developed. Compacting
+gives downstream operators one migration to apply / reverse and one
+review surface. The ``revision`` hash is reused from the original first
+migration so anyone still tracking the chain by that hash lands on the
+same logical change set.
+
+Generated by hand because the current Continuum + Alembic-autogenerate
+interaction trips on the renamed ``transaction`` -> ``version_transaction``
+table key (``KeyError`` lookups in ``table_key_to_table``). Column
+inventories were sourced from the live model ``__table__`` definitions
+and ``version_class(...).__table__`` / Continuum association metadata.
+
+Primary key choice. Both ``version_transaction.id`` and
+``version_changes.id`` are ``BigInteger`` autoincrement — a deliberate
+carveout from the project's UUID-PK convention for new models (see
+``CLAUDE.md`` §"UUID Migration"). ``version_transaction`` is keyed
+externally by SQLAlchemy-Continuum via
+``nextval('version_transaction_id_seq')`` on every INSERT; matching
+that contract is required for ``versioning_manager`` to function.
+``version_changes`` follows the same shape because the user-facing
+identity is the ``(transaction_id, entity_kind, entity_id, sequence)``
+composite unique key, not the row id; the API surfaces a deterministic
+UUIDv5 ``version_uuid`` derived from ``entity.uuid`` and
+``transaction_id`` for stable external references.
+
+See spec FR-016..FR-021, data-model.md §``version_changes`` /
+§"Storage architecture", and the spike notes in
+``spike-continuum-restore.md``.
+
+Revision ID: 56cd24c07170
+Revises: 2bee73611e32
+Create Date: 2026-05-28 19:50:00.000000
+
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy_utils import UUIDType
+
+from superset.utils.core import MediumText
+
+revision = "56cd24c07170"
+# Stacked on the composite-PK association-tables change (2bee73611e32) so the
+# Continuum shadow tables this migration creates can mirror the
+# composite-PK shape of the live association tables. If that change
+# is removed from the stack, this should be reverted to "ce6bd21901ab".
+down_revision = "2bee73611e32"
+
+
+def upgrade() -> None:
+ bind = op.get_bind()
+
+ # ------------------------------------------------------------------
+ # version_transaction
+ #
+ # Audit log for each versioning event. Continuum emits
+ # ``nextval('version_transaction_id_seq')`` on every INSERT, so the
+ # sequence must exist before the table on Postgres. The explicit
+ # CREATE SEQUENCE is skipped on SQLite/MySQL by the dialect check
+ # below (they auto-increment natively).
+ # ------------------------------------------------------------------
+ if bind.dialect.name == "postgresql":
+ op.execute("CREATE SEQUENCE IF NOT EXISTS version_transaction_id_seq")
+
+ op.create_table(
+ "version_transaction",
+ sa.Column(
+ "id",
+ sa.BigInteger(),
+ sa.Sequence("version_transaction_id_seq"),
+ primary_key=True,
+ autoincrement=True,
+ nullable=False,
+ ),
+ sa.Column("issued_at", sa.DateTime(), nullable=True),
+ sa.Column("remote_addr", sa.String(50), nullable=True),
+ sa.Column("user_id", sa.Integer(), nullable=True),
+ # ``action_kind`` carries the high-level avenue that produced
+ # this transaction (``restore`` / ``import`` / ``clone``).
+ # ``NULL`` is the default "ordinary save" — most rows leave
+ # this empty. Commands set
+ # ``session.info["_versioning_action_kind"]`` before commit;
+ # the change-record listener stamps the value here. Parallel
+ # to ``version_changes.entity_kind`` and ``version_changes.kind``
+ # — the schema's third ``*_kind`` column, at transaction scope.
+ sa.Column("action_kind", sa.String(32), nullable=True),
+ )
+
+ if bind.dialect.name == "postgresql":
+ op.execute(
+ "ALTER SEQUENCE version_transaction_id_seq OWNED BY version_transaction.id"
+ )
+
+ # ------------------------------------------------------------------
+ # dashboards_version
+ # ------------------------------------------------------------------
+ op.create_table(
+ "dashboards_version",
+ sa.Column("uuid", UUIDType(binary=True), nullable=True),
+ sa.Column("id", sa.Integer(), nullable=False),
+ sa.Column("dashboard_title", sa.String(500), nullable=True),
+ # ``MediumText()`` mirrors the live column type — on MySQL plain
+ # ``TEXT`` caps at 64 KB, which large dashboards exceed; an
+ # oversized live write would then fail the shadow INSERT under
+ # ``STRICT_TRANS_TABLES`` (or silently truncate without it) and
+ # corrupt the history. Postgres ``TEXT`` is unbounded and SQLite
+ # ignores the length annotation so this is MySQL-driven.
+ sa.Column("position_json", MediumText(), nullable=True),
+ sa.Column("description", sa.Text(), nullable=True),
+ sa.Column("css", MediumText(), nullable=True),
+ sa.Column("theme_id", sa.Integer(), nullable=True),
+ sa.Column("certified_by", sa.Text(), nullable=True),
+ sa.Column("certification_details", sa.Text(), nullable=True),
+ sa.Column("json_metadata", MediumText(), nullable=True),
+ sa.Column("slug", sa.String(255), nullable=True),
+ sa.Column("published", sa.Boolean(), nullable=True),
+ sa.Column("is_managed_externally", sa.Boolean(), nullable=True),
+ sa.Column("external_url", sa.Text(), nullable=True),
+ sa.Column("transaction_id", sa.BigInteger(), nullable=False),
+ sa.Column("end_transaction_id", sa.BigInteger(), nullable=True),
+ sa.Column("operation_type", sa.SmallInteger(), nullable=False),
+ sa.PrimaryKeyConstraint("id", "transaction_id"),
+ sa.ForeignKeyConstraint(
+ ["transaction_id"],
+ ["version_transaction.id"],
+ name="fk_dashboards_version_transaction_id",
+ ),
+ sa.ForeignKeyConstraint(
+ ["end_transaction_id"],
+ ["version_transaction.id"],
+ name="fk_dashboards_version_end_transaction_id",
+ ),
+ )
+ op.create_index(
+ "ix_dashboards_version_end_transaction_id",
+ "dashboards_version",
+ ["end_transaction_id"],
+ )
+ op.create_index(
+ "ix_dashboards_version_operation_type",
+ "dashboards_version",
+ ["operation_type"],
+ )
+ op.create_index(
+ "ix_dashboards_version_transaction_id",
+ "dashboards_version",
+ ["transaction_id"],
+ )
+
+ # ------------------------------------------------------------------
+ # slices_version (Charts)
+ # ------------------------------------------------------------------
+ op.create_table(
+ "slices_version",
+ sa.Column("uuid", UUIDType(binary=True), nullable=True),
+ sa.Column("id", sa.Integer(), nullable=False),
+ sa.Column("slice_name", sa.String(250), nullable=True),
+ sa.Column("datasource_id", sa.Integer(), nullable=True),
+ sa.Column("datasource_type", sa.String(200), nullable=True),
+ sa.Column("datasource_name", sa.String(2000), nullable=True),
+ sa.Column("viz_type", sa.String(250), nullable=True),
+ sa.Column("params", MediumText(), nullable=True),
+ sa.Column("description", sa.Text(), nullable=True),
+ sa.Column("cache_timeout", sa.Integer(), nullable=True),
+ sa.Column("certified_by", sa.Text(), nullable=True),
+ sa.Column("certification_details", sa.Text(), nullable=True),
+ sa.Column("is_managed_externally", sa.Boolean(), nullable=True),
+ sa.Column("external_url", sa.Text(), nullable=True),
+ sa.Column("transaction_id", sa.BigInteger(), nullable=False),
+ sa.Column("end_transaction_id", sa.BigInteger(), nullable=True),
+ sa.Column("operation_type", sa.SmallInteger(), nullable=False),
+ sa.PrimaryKeyConstraint("id", "transaction_id"),
+ sa.ForeignKeyConstraint(
+ ["transaction_id"],
+ ["version_transaction.id"],
+ name="fk_slices_version_transaction_id",
+ ),
+ sa.ForeignKeyConstraint(
+ ["end_transaction_id"],
+ ["version_transaction.id"],
+ name="fk_slices_version_end_transaction_id",
+ ),
+ )
+ op.create_index(
+ "ix_slices_version_end_transaction_id",
+ "slices_version",
+ ["end_transaction_id"],
+ )
+ op.create_index(
+ "ix_slices_version_operation_type",
+ "slices_version",
+ ["operation_type"],
+ )
+ op.create_index(
+ "ix_slices_version_transaction_id",
+ "slices_version",
+ ["transaction_id"],
+ )
+
+ # ------------------------------------------------------------------
+ # tables_version (SqlaTable / Datasets)
+ # ------------------------------------------------------------------
+ op.create_table(
+ "tables_version",
+ sa.Column("uuid", UUIDType(binary=True), nullable=True),
+ sa.Column("id", sa.Integer(), nullable=False),
+ sa.Column("description", sa.Text(), nullable=True),
+ sa.Column("default_endpoint", sa.Text(), nullable=True),
+ sa.Column("is_featured", sa.Boolean(), nullable=True),
+ sa.Column("filter_select_enabled", sa.Boolean(), nullable=True),
+ sa.Column("offset", sa.Integer(), nullable=True),
+ sa.Column("cache_timeout", sa.Integer(), nullable=True),
+ sa.Column("params", sa.String(1000), nullable=True),
+ sa.Column("is_managed_externally", sa.Boolean(), nullable=True),
+ sa.Column("external_url", sa.Text(), nullable=True),
+ sa.Column("table_name", sa.String(250), nullable=True),
+ sa.Column("main_dttm_col", sa.String(250), nullable=True),
+ sa.Column("currency_code_column", sa.String(250), nullable=True),
+ sa.Column("database_id", sa.Integer(), nullable=True),
+ sa.Column("fetch_values_predicate", sa.Text(), nullable=True),
+ sa.Column("schema", sa.String(255), nullable=True),
+ sa.Column("catalog", sa.String(256), nullable=True),
+ sa.Column("sql", MediumText(), nullable=True),
+ sa.Column("is_sqllab_view", sa.Boolean(), nullable=True),
+ sa.Column("template_params", sa.Text(), nullable=True),
+ sa.Column("extra", sa.Text(), nullable=True),
+ sa.Column("normalize_columns", sa.Boolean(), nullable=True),
+ sa.Column("always_filter_main_dttm", sa.Boolean(), nullable=True),
+ sa.Column("folders", sa.JSON(), nullable=True),
+ sa.Column("transaction_id", sa.BigInteger(), nullable=False),
+ sa.Column("end_transaction_id", sa.BigInteger(), nullable=True),
+ sa.Column("operation_type", sa.SmallInteger(), nullable=False),
+ sa.PrimaryKeyConstraint("id", "transaction_id"),
+ sa.ForeignKeyConstraint(
+ ["transaction_id"],
+ ["version_transaction.id"],
+ name="fk_tables_version_transaction_id",
+ ),
+ sa.ForeignKeyConstraint(
+ ["end_transaction_id"],
+ ["version_transaction.id"],
+ name="fk_tables_version_end_transaction_id",
+ ),
+ )
+ op.create_index(
+ "ix_tables_version_end_transaction_id",
+ "tables_version",
+ ["end_transaction_id"],
+ )
+ op.create_index(
+ "ix_tables_version_operation_type",
+ "tables_version",
+ ["operation_type"],
+ )
+ op.create_index(
+ "ix_tables_version_transaction_id",
+ "tables_version",
+ ["transaction_id"],
+ )
+
+ # ------------------------------------------------------------------
+ # version_changes
+ #
+ # Field-level diff log keyed to a (transaction, entity) pair. Each
+ # row describes one atomic change (one field or one child-collection
+ # element) that occurred to one entity during a save. See spec
+ # FR-016..FR-021 and data-model.md §version_changes.
+ #
+ # ``(entity_kind, entity_id)`` is a polymorphic reference: depending
+ # on ``entity_kind`` (``"chart"`` / ``"dashboard"`` / ``"dataset"``)
+ # the ``entity_id`` is the integer PK on ``slices`` / ``dashboards`` /
+ # ``tables`` respectively. SQL has no native polymorphic FK, so the
+ # constraint is intentionally omitted — cleanup relies on the
+ # ``CASCADE`` from ``version_transaction.id`` plus command-layer
+ # ordering for entity deletes (the command that hard-deletes the
+ # entity runs inside the same transaction that prunes its history).
+ # A bare ``DELETE FROM <entity_table> WHERE id = X`` outside that
+ # transactional boundary leaves orphan ``version_changes`` rows
+ # whose ``entity_id`` references a vanished row — the read-side
+ # tombstone-state lookup handles this gracefully.
+ # ------------------------------------------------------------------
+ op.create_table(
+ "version_changes",
+ sa.Column(
+ "id",
+ sa.BigInteger(),
+ primary_key=True,
+ autoincrement=True,
+ nullable=False,
+ ),
+ sa.Column(
+ "transaction_id",
+ sa.BigInteger(),
+ sa.ForeignKey("version_transaction.id", ondelete="CASCADE"),
+ nullable=False,
+ ),
+ sa.Column(
+ "entity_kind",
+ sa.String(length=32),
+ nullable=False,
+ ),
+ sa.Column(
+ "entity_id",
+ sa.Integer(),
+ nullable=False,
+ ),
+ sa.Column(
+ # Integer, not SmallInteger: per-entity sequence within one
+ # transaction is assigned by unbounded enumerate(); a
+ # pathological diff (e.g. a giant position_json rewrite) could
+ # overflow SmallInteger's 32767 on Postgres/MySQL.
+ "sequence",
+ sa.Integer(),
+ nullable=False,
+ ),
+ sa.Column(
+ "kind",
+ sa.String(length=32),
+ nullable=False,
+ ),
+ # ``operation`` is the per-record verb: ``add`` / ``remove`` /
+ # ``move`` / ``edit``. ``move`` only fires for layout records;
+ # the other three apply across every emit site. Made explicit
+ # so consumers don't have to infer the verb from ``from_value``
+ # / ``to_value`` null-tests or from ``path[0]`` for layout records.
+ sa.Column(
+ "operation",
+ sa.String(length=16),
+ nullable=False,
+ ),
+ sa.Column("path", sa.JSON(), nullable=False),
+ sa.Column("from_value", sa.JSON(), nullable=True),
+ sa.Column("to_value", sa.JSON(), nullable=True),
+ sa.UniqueConstraint(
+ "transaction_id",
+ "entity_kind",
+ "entity_id",
+ "sequence",
+ name="uq_version_changes_tx_entity_sequence",
+ ),
+ )
+ op.create_index(
+ "ix_version_changes_kind",
+ "version_changes",
+ ["kind"],
+ )
+ op.create_index(
+ "ix_version_changes_entity",
+ "version_changes",
+ ["entity_kind", "entity_id"],
+ )
+
+ # ------------------------------------------------------------------
+ # table_columns_version
+ # ------------------------------------------------------------------
+ op.create_table(
+ "table_columns_version",
+ sa.Column("uuid", UUIDType(binary=True), nullable=True),
+ sa.Column("id", sa.Integer(), nullable=False),
+ sa.Column("column_name", sa.String(255), nullable=True),
+ sa.Column("verbose_name", sa.String(1024), nullable=True),
+ sa.Column("is_active", sa.Boolean(), nullable=True),
+ sa.Column("type", sa.Text(), nullable=True),
+ sa.Column("advanced_data_type", sa.String(255), nullable=True),
+ sa.Column("groupby", sa.Boolean(), nullable=True),
+ sa.Column("filterable", sa.Boolean(), nullable=True),
+ sa.Column("description", MediumText(), nullable=True),
+ sa.Column("table_id", sa.Integer(), nullable=True),
+ sa.Column("is_dttm", sa.Boolean(), nullable=True),
+ sa.Column("expression", MediumText(), nullable=True),
+ sa.Column("python_date_format", sa.String(255), nullable=True),
+ sa.Column("datetime_format", sa.String(100), nullable=True),
+ sa.Column("extra", sa.Text(), nullable=True),
+ sa.Column("transaction_id", sa.BigInteger(), nullable=False),
+ sa.Column("end_transaction_id", sa.BigInteger(), nullable=True),
+ sa.Column("operation_type", sa.SmallInteger(), nullable=False),
+ sa.PrimaryKeyConstraint("id", "transaction_id"),
+ sa.ForeignKeyConstraint(
+ ["transaction_id"],
+ ["version_transaction.id"],
+ name="fk_table_columns_version_transaction_id",
+ ),
+ sa.ForeignKeyConstraint(
+ ["end_transaction_id"],
+ ["version_transaction.id"],
+ name="fk_table_columns_version_end_transaction_id",
+ ),
+ )
+ op.create_index(
+ "ix_table_columns_version_end_transaction_id",
+ "table_columns_version",
+ ["end_transaction_id"],
+ )
+ op.create_index(
+ "ix_table_columns_version_operation_type",
+ "table_columns_version",
+ ["operation_type"],
+ )
+ op.create_index(
+ "ix_table_columns_version_transaction_id",
+ "table_columns_version",
+ ["transaction_id"],
+ )
+
+ # ------------------------------------------------------------------
+ # sql_metrics_version
+ # ------------------------------------------------------------------
+ op.create_table(
+ "sql_metrics_version",
+ sa.Column("uuid", UUIDType(binary=True), nullable=True),
+ sa.Column("id", sa.Integer(), nullable=False),
+ sa.Column("metric_name", sa.String(255), nullable=True),
+ sa.Column("verbose_name", sa.String(1024), nullable=True),
+ sa.Column("metric_type", sa.String(32), nullable=True),
+ sa.Column("description", MediumText(), nullable=True),
+ sa.Column("d3format", sa.String(128), nullable=True),
+ sa.Column("currency", sa.JSON(), nullable=True),
+ sa.Column("warning_text", sa.Text(), nullable=True),
+ sa.Column("table_id", sa.Integer(), nullable=True),
+ sa.Column("expression", MediumText(), nullable=True),
+ sa.Column("extra", sa.Text(), nullable=True),
+ sa.Column("transaction_id", sa.BigInteger(), nullable=False),
+ sa.Column("end_transaction_id", sa.BigInteger(), nullable=True),
+ sa.Column("operation_type", sa.SmallInteger(), nullable=False),
+ sa.PrimaryKeyConstraint("id", "transaction_id"),
+ sa.ForeignKeyConstraint(
+ ["transaction_id"],
+ ["version_transaction.id"],
+ name="fk_sql_metrics_version_transaction_id",
+ ),
+ sa.ForeignKeyConstraint(
+ ["end_transaction_id"],
+ ["version_transaction.id"],
+ name="fk_sql_metrics_version_end_transaction_id",
+ ),
+ )
+ op.create_index(
+ "ix_sql_metrics_version_end_transaction_id",
+ "sql_metrics_version",
+ ["end_transaction_id"],
+ )
+ op.create_index(
+ "ix_sql_metrics_version_operation_type",
+ "sql_metrics_version",
+ ["operation_type"],
+ )
+ op.create_index(
+ "ix_sql_metrics_version_transaction_id",
+ "sql_metrics_version",
+ ["transaction_id"],
+ )
+
+ # ------------------------------------------------------------------
+ # dashboard_slices_version (M2M association)
+ #
+ # The live ``dashboard_slices`` table is reshaped to a
+ # composite PK on ``(dashboard_id, slice_id)`` — no surrogate ``id``.
+ # Continuum auto-mirrors the live columns into the shadow Table at
+ # ``make_versioned()`` time, so the shadow's SQLAlchemy metadata
+ # also has no ``id``. The DB shadow PK is the natural composite key
+ # plus Continuum's bookkeeping (``transaction_id``, ``operation_type``);
+ # ``operation_type`` is included because a single transaction can in
+ # principle produce both INSERT and DELETE shadows for the same
+ # ``(dashboard_id, slice_id)`` pair (slice removed and re-added in
+ # one save).
+ #
+ # If that reshape is removed from the stack, the live table reverts to
+ # carrying its surrogate ``id`` and this migration would need to
+ # match — see ``spike-continuum-restore.md`` "Branch maintenance".
+ # ------------------------------------------------------------------
+ op.create_table(
+ "dashboard_slices_version",
+ sa.Column("dashboard_id", sa.Integer(), nullable=False),
+ sa.Column("slice_id", sa.Integer(), nullable=False),
+ sa.Column("transaction_id", sa.BigInteger(), nullable=False),
+ sa.Column("end_transaction_id", sa.BigInteger(), nullable=True),
+ sa.Column("operation_type", sa.SmallInteger(), nullable=False),
+ sa.PrimaryKeyConstraint(
+ "dashboard_id", "slice_id", "transaction_id", "operation_type"
+ ),
+ sa.ForeignKeyConstraint(
+ ["transaction_id"],
+ ["version_transaction.id"],
+ name="fk_dashboard_slices_version_transaction_id",
+ ),
+ sa.ForeignKeyConstraint(
+ ["end_transaction_id"],
+ ["version_transaction.id"],
+ name="fk_dashboard_slices_version_end_transaction_id",
+ ),
+ )
+ op.create_index(
+ "ix_dashboard_slices_version_end_transaction_id",
+ "dashboard_slices_version",
+ ["end_transaction_id"],
+ )
+ op.create_index(
+ "ix_dashboard_slices_version_operation_type",
+ "dashboard_slices_version",
+ ["operation_type"],
+ )
+ op.create_index(
+ "ix_dashboard_slices_version_transaction_id",
+ "dashboard_slices_version",
+ ["transaction_id"],
+ )
+
+
+def downgrade() -> None:
+ # Drop in reverse dependency order: children with FKs to
+ # ``version_transaction`` drop first; ``version_transaction`` and its
+ # sequence drop last.
+ op.drop_table("dashboard_slices_version")
+ op.drop_table("sql_metrics_version")
+ op.drop_table("table_columns_version")
+ op.drop_table("version_changes")
+ op.drop_table("tables_version")
+ op.drop_table("slices_version")
+ op.drop_table("dashboards_version")
+ op.drop_table("version_transaction")
+
+ bind = op.get_bind()
+ if bind.dialect.name == "postgresql":
+ op.execute("DROP SEQUENCE IF EXISTS version_transaction_id_seq")
diff --git a/superset/migrations/versions/2026-06-03_12-00_8f3a1b2c4d5e_shadow_live_row_indexes.py b/superset/migrations/versions/2026-06-03_12-00_8f3a1b2c4d5e_shadow_live_row_indexes.py
new file mode 100644
index 000000000000..68723578588c
--- /dev/null
+++ b/superset/migrations/versions/2026-06-03_12-00_8f3a1b2c4d5e_shadow_live_row_indexes.py
@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""shadow_live_row_indexes
+
+Adds per-shadow-table indexes covering the canonical "current live row
+of entity X" lookup that ``find_active_by_uuid`` / ``list_versions`` /
+``get_version`` / restore validation / activity-view all funnel
+through:
+
+ SELECT ... FROM <entity>_version
+ WHERE id = ? AND end_transaction_id IS NULL
+
+The base migration (``56cd24c07170_add_versioning_tables``) created
+single-column indexes on ``transaction_id``, ``end_transaction_id``,
+and ``operation_type``, but nothing covering the predicate combination
+that actually runs in hot paths.
+
+Index choice is dialect-specific:
+
+* **PostgreSQL / SQLite** — partial index over the entity ``id`` with
+ ``WHERE end_transaction_id IS NULL``. Cuts the index size to one row
+ per live entity (vs. one row per historical version) and turns the
+ hot lookup into a single index probe.
+* **MySQL** — partial indexes aren't supported; use a plain composite
+ ``(id, end_transaction_id)``. MySQL's optimizer handles the
+ ``IS NULL`` predicate against the composite efficiently.
+
+It also adds a composite ``(table_id, transaction_id)`` index on the two
+child shadow tables (``table_columns_version`` / ``sql_metrics_version``).
+The dataset child-diff path queries these by parent ``table_id`` plus a
+transaction-range bound, neither of which the base migration's
+single-column indexes nor the ``id``-leading PK can serve:
+
+ SELECT ... FROM table_columns_version
+ WHERE table_id = ? AND transaction_id <= ? AND ... (shadow_rows_valid_at)
+
+ SELECT max(transaction_id) FROM table_columns_version
+ WHERE table_id = ? AND transaction_id < ? (prior-tx probe)
+
+A plain composite leading with ``table_id`` serves both on every dialect,
+so no partial-index split is needed here.
+
+Surfaced by sqlalchemy-review pass W-NEW-4 (live-row lookup) and a
+Codex sqlalchemy-review pass (child-diff ``table_id`` lookup).
+
+Revision ID: 8f3a1b2c4d5e
+Revises: 56cd24c07170
+Create Date: 2026-06-03 12:00:00.000000
+
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import op
+
+revision = "8f3a1b2c4d5e"
+down_revision = "56cd24c07170"
+
+
+# The parent + child shadow tables, all of which carry an ``id``
+# column (mirroring the live entity's integer PK). ``dashboard_slices_version``
+# is intentionally excluded: it's the M2M association shadow with a
+# composite PK ``(dashboard_id, slice_id, transaction_id, operation_type)``
+# and no ``id`` column. The canonical "live row" lookup doesn't apply to
+# the M2M shadow — readers query it by ``transaction_id`` (already
+# indexed by the base migration) when reconstructing per-tx changes.
+# Each table listed here receives exactly one index, named by
+# ``_index_name``.
+SHADOW_TABLES: tuple[str, ...] = (
+ "dashboards_version",
+ "slices_version",
+ "tables_version",
+ "table_columns_version",
+ "sql_metrics_version",
+)
+
+
+# Child shadow tables whose rows are looked up by parent ``table_id`` plus a
+# transaction-range bound on the dataset child-diff path. Both carry a
+# nullable ``table_id`` mirroring the live row's FK to ``tables.id``.
+# Both names also appear in ``SHADOW_TABLES``, so each of these tables ends
+# up with two new indexes (live-row and ``table_id``-leading composite).
+CHILD_SHADOW_TABLES: tuple[str, ...] = (
+ "table_columns_version",
+ "sql_metrics_version",
+)
+
+
+def _index_name(table: str) -> str:
+    """Return the name of the live-row index for shadow table *table*."""
+    return "ix_" + table + "_live_id"
+
+
+def _child_index_name(table: str) -> str:
+    """Return the name of the ``(table_id, transaction_id)`` index for *table*."""
+    return "ix_{}_table_id_transaction_id".format(table)
+
+
+def upgrade() -> None:
+    """Create the live-row and child-diff indexes on the shadow tables.
+
+    Every table in ``SHADOW_TABLES`` gets one index named by
+    ``_index_name``; its shape depends on the connected dialect:
+
+    * ``postgresql`` / ``sqlite`` — partial index on ``id`` restricted to
+      ``end_transaction_id IS NULL``.
+    * anything else (MySQL included) — plain composite index on
+      ``(id, end_transaction_id)``, because partial indexes aren't
+      supported there.
+
+    Every table in ``CHILD_SHADOW_TABLES`` additionally gets a plain
+    composite ``(table_id, transaction_id)`` index on all dialects.
+    """
+    dialect_name = op.get_bind().dialect.name
+    live_rows_only = sa.text("end_transaction_id IS NULL")
+
+    for shadow_table in SHADOW_TABLES:
+        live_index = _index_name(shadow_table)
+
+        if dialect_name == "postgresql":
+            op.create_index(
+                live_index,
+                shadow_table,
+                ["id"],
+                unique=False,
+                postgresql_where=live_rows_only,
+            )
+            continue
+
+        if dialect_name == "sqlite":
+            op.create_index(
+                live_index,
+                shadow_table,
+                ["id"],
+                unique=False,
+                sqlite_where=live_rows_only,
+            )
+            continue
+
+        # MySQL and any unrecognised dialect: fall back to the composite
+        # that covers ``id = ? AND end_transaction_id IS NULL``.
+        op.create_index(
+            live_index,
+            shadow_table,
+            ["id", "end_transaction_id"],
+            unique=False,
+        )
+
+    # Child-diff lookups filter on the parent ``table_id`` plus a
+    # transaction-range bound; one plain composite serves every dialect.
+    for child_table in CHILD_SHADOW_TABLES:
+        op.create_index(
+            _child_index_name(child_table),
+            child_table,
+            ["table_id", "transaction_id"],
+            unique=False,
+        )
+
+
+def downgrade() -> None:
+    """Drop the indexes added by ``upgrade``, skipping any that are absent.
+
+    Existence is checked through the inspector rather than by emitting
+    ``DROP INDEX IF EXISTS``: stock MySQL (5.7/8.x) has no ``IF EXISTS``
+    grammar for ``DROP INDEX`` (it's a MariaDB extension), so that clause
+    is not dialect-portable. The check also keeps a repeated downgrade
+    from raising when an upgrade created only some of the indexes before
+    failing (possible on backends whose DDL is not transactional).
+    """
+    inspector = sa.inspect(op.get_bind())
+
+    # Live-row indexes first, then the child composites — same order as
+    # they were created.
+    planned_drops = [(table, _index_name(table)) for table in SHADOW_TABLES]
+    planned_drops += [
+        (table, _child_index_name(table)) for table in CHILD_SHADOW_TABLES
+    ]
+
+    for table, index_name in planned_drops:
+        present = {ix["name"] for ix in inspector.get_indexes(table)}
+        if index_name in present:
+            op.drop_index(index_name, table_name=table)
diff --git a/superset/models/dashboard.py b/superset/models/dashboard.py
index 4653272fcbf3..25395813d4a6 100644
--- a/superset/models/dashboard.py
+++ b/superset/models/dashboard.py
@@ -35,7 +35,6 @@
String,
Table,
Text,
- UniqueConstraint,
)
from sqlalchemy.engine.base import Connection
from sqlalchemy.orm import relationship, subqueryload
@@ -93,37 +92,53 @@ def copy_dashboard(_mapper: Mapper, _connection: Connection, target: Dashboard)
+# M:N link between dashboards and slices. Both FK columns are flagged
+# ``primary_key=True``, giving a composite PK ``(dashboard_id, slice_id)`` in
+# place of the surrogate ``id`` column and the separate ``UniqueConstraint``.
dashboard_slices = Table(
"dashboard_slices",
metadata,
- Column("id", Integer, primary_key=True),
- Column("dashboard_id", Integer, ForeignKey("dashboards.id", ondelete="CASCADE")),
- Column("slice_id", Integer, ForeignKey("slices.id", ondelete="CASCADE")),
- UniqueConstraint("dashboard_id", "slice_id"),
+ Column(
+ "dashboard_id",
+ Integer,
+ ForeignKey("dashboards.id", ondelete="CASCADE"),
+ primary_key=True,
+ ),
+ Column(
+ "slice_id",
+ Integer,
+ ForeignKey("slices.id", ondelete="CASCADE"),
+ primary_key=True,
+ ),
)
+# M:N link between users and dashboards. The surrogate ``id`` column is
+# replaced by a composite PK ``(user_id, dashboard_id)`` on the two FKs.
dashboard_user = Table(
"dashboard_user",
metadata,
- Column("id", Integer, primary_key=True),
- Column("user_id", Integer, ForeignKey("ab_user.id", ondelete="CASCADE")),
- Column("dashboard_id", Integer, ForeignKey("dashboards.id", ondelete="CASCADE")),
+ Column(
+ "user_id",
+ Integer,
+ ForeignKey("ab_user.id", ondelete="CASCADE"),
+ primary_key=True,
+ ),
+ Column(
+ "dashboard_id",
+ Integer,
+ ForeignKey("dashboards.id", ondelete="CASCADE"),
+ primary_key=True,
+ ),
)
+# M:N link between dashboards and roles. The surrogate ``id`` column is
+# dropped and both FK columns switch from ``nullable=False`` to
+# ``primary_key=True``, forming a composite PK ``(dashboard_id, role_id)``.
DashboardRoles = Table(
"dashboard_roles",
metadata,
- Column("id", Integer, primary_key=True),
Column(
"dashboard_id",
Integer,
ForeignKey("dashboards.id", ondelete="CASCADE"),
- nullable=False,
+ primary_key=True,
),
Column(
"role_id",
Integer,
ForeignKey("ab_role.id", ondelete="CASCADE"),
- nullable=False,
+ primary_key=True,
),
)
@@ -132,6 +147,27 @@ class Dashboard(CoreDashboard, AuditMixinNullable, ImportExportMixin):
"""The dashboard object!"""
__tablename__ = "dashboards"
+ # deleted_at exclusion will be added when soft delete is merged.
+ # SPIKE (full-Continuum): ``slices`` removed from
+ # the exclude list so Continuum auto-creates an association version table
+ # for ``dashboard_slices`` and ``Reverter(relations=["slices"])`` can
+ # restore chart membership. Owners / roles stay excluded — access metadata,
+ # not user-authored content (ADR-005).
+ # Audit columns (changed_on/created_on/changed_by_fk/created_by_fk) are
+ # auto-bumped by AuditMixin on every save; excluding them lets Continuum's
+ # is_modified() return False on no-op saves (e.g. owners-only edits) so we
+ # don't create empty version rows. version_transaction.user_id /
+ # issued_at preserve "who/when" without per-row duplication.
+ __versioned__: dict[str, Any] = {
+ "exclude": [
+ "owners",
+ "roles",
+ "changed_on",
+ "created_on",
+ "changed_by_fk",
+ "created_by_fk",
+ ]
+ }
id = Column(Integer, primary_key=True)
dashboard_title = Column(String(500))
position_json = Column(utils.MediumText())
diff --git a/superset/models/helpers.py b/superset/models/helpers.py
index 5db4eb9f60e8..3b56181e966e 100644
--- a/superset/models/helpers.py
+++ b/superset/models/helpers.py
@@ -264,6 +264,53 @@ class UUIDMixin: # pylint: disable=too-few-public-methods
UUIDType(binary=True), primary_key=False, unique=True, default=uuid.uuid4
)
+ @validates("uuid")
+ def _coerce_uuid(self, key: str, value: Any) -> Any:  # noqa: ARG002
+     """Turn well-formed UUID strings into ``uuid.UUID`` on assignment.
+
+     Any other value — a non-string, or a string that ``uuid.UUID``
+     rejects — is returned unchanged.
+
+     **Why coerce.** ``UUIDType`` only converts at SQL bind / SQL result
+     time, so importers and ad-hoc construction
+     (``SqlMetric(uuid="…string…")``) leave a ``str`` on the in-memory
+     attribute until a DB round-trip refreshes it. With
+     SQLAlchemy-Continuum versioning attached to a child mapper
+     (``TableColumn`` / ``SqlMetric``) that refresh no longer happens
+     before callers read the attribute, which breaks equality checks
+     such as ``test_import_dataset``'s
+     ``metric.uuid == uuid.UUID(...)`` (str ≠ UUID). Coercing here keeps
+     the in-memory attribute a UUID regardless of provenance.
+
+     **Why malformed strings pass through.** Some existing unit tests use
+     human-readable placeholders as fixture uuids (e.g.
+     ``"dashboard-uuid-7"`` in ``test_dashboard_schemas.py`` and similar
+     values in importer tests). They are only compared by string
+     equality and never written to a real database, so raising here
+     would break them. Genuinely malformed values are instead left for
+     the SQL bind layer to reject.
+
+     **Tightening path** (if amin M1 is ever revisited): replace the
+     ``return value`` in the ``except`` branch with
+     ``raise ValueError(f"Invalid UUID: {value!r}")``, run the unit test
+     suite, and migrate the remaining placeholder fixtures (roughly 5–10
+     files, e.g. those using ``"dashboard-uuid"`` / ``"slice-uuid"`` or
+     ``SqlMetric(uuid="...")``) to ``uuid.uuid4()``. The change is
+     non-breaking outside tests.
+     """
+     if not isinstance(value, str):
+         return value
+     try:
+         coerced = uuid.UUID(value)
+     except ValueError:
+         # Placeholder or malformed string: hand it back untouched.
+         return value
+     return coerced
+
@property
def short_uuid(self) -> str:
return str(self.uuid)[:8]
@@ -546,14 +593,23 @@ def remove_params(self, param_to_remove: str) -> None:
def reset_ownership(self) -> None:
"""object will belong to the user the current user"""
- # make sure the object doesn't have relations to a user
- # it will be filled by appbuilder on save
- self.created_by = None
- self.changed_by = None
- # flask global context might not exist (in cli or tests for example)
+ # Reset the audit pointers. When ``g`` carries a truthy ``user`` we
+ # explicitly stamp that user on ``created_by`` / ``changed_by``;
+ # otherwise both are cleared to ``None`` in the ``else`` branch
+ # below. The explicit stamp is needed because once the
+ # ``created_by`` / ``changed_by`` relationships are configured
+ # (which happens eagerly on models registered with
+ # SQLAlchemy-Continuum), setting them to ``None`` propagates to
+ # the FK column and suppresses the ``default=`` callable.
self.owners = []
- if g and hasattr(g, "user"):
+ if g and hasattr(g, "user") and g.user:
+ self.created_by = g.user
+ self.changed_by = g.user
self.owners = [g.user]
+ else:
+ self.created_by = None
+ self.changed_by = None
@property
def params_dict(self) -> dict[Any, Any]:
diff --git a/superset/models/slice.py b/superset/models/slice.py
index 40de049df2ab..1975ca43c5c6 100644
--- a/superset/models/slice.py
+++ b/superset/models/slice.py
@@ -59,9 +59,18 @@
+# M:N link between users and slices. The surrogate ``id`` column is
+# replaced by a composite PK ``(user_id, slice_id)`` on the two FKs.
slice_user = Table(
"slice_user",
metadata,
- Column("id", Integer, primary_key=True),
- Column("user_id", Integer, ForeignKey("ab_user.id", ondelete="CASCADE")),
- Column("slice_id", Integer, ForeignKey("slices.id", ondelete="CASCADE")),
+ Column(
+ "user_id",
+ Integer,
+ ForeignKey("ab_user.id", ondelete="CASCADE"),
+ primary_key=True,
+ ),
+ Column(
+ "slice_id",
+ Integer,
+ ForeignKey("slices.id", ondelete="CASCADE"),
+ primary_key=True,
+ ),
)
logger = logging.getLogger(__name__)
@@ -74,6 +83,38 @@ class Slice( # pylint: disable=too-many-public-methods
query_context_factory: QueryContextFactory | None = None
__tablename__ = "slices"
+ # query_context is excluded: it is a cached/regenerated field, not user-authored.
+ # deleted_at exclusion will be added when soft delete is merged.
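+ # Exclude M2M association relationships: Continuum only captures FK columns on
+ # association INSERTs (not the auto-increment id), which breaks the NOT NULL PK.
+ # NOTE(review): ``slice_user`` now has a composite PK and no ``id`` column, so
+ # the auto-increment rationale above may be stale — confirm and reword.
+ # Ownership changes are administrative metadata, not user-authored content.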
+ # Audit / save-marker columns are auto-bumped on every save. Excluding
+ # them lets Continuum's is_modified() return False on no-op saves
+ # (e.g. owners-only edits) so we don't create empty version rows.
+ # version_transaction.user_id / issued_at preserve "who/when".
+ # The perm-string class (perm / schema_perm / catalog_perm) is derived
+ # security state, not user-authored content: permission maintenance
+ # rewrites it in bulk, and versioning it produced phantom transactions
+ # flooding the activity stream (10 "Chart updated" rows for one user
+ # save — surfaced by the version-history UI, PR #40988). Excluding it
+ # also means a restore can't resurrect stale permission strings; the
+ # live, derived values stay authoritative.
+ __versioned__: dict[str, Any] = {
+ "exclude": [
+ "query_context",
+ "owners",
+ "dashboards",
+ "changed_on",
+ "created_on",
+ "changed_by_fk",
+ "created_by_fk",
+ "last_saved_at",
+ "last_saved_by_fk",
+ "perm",
+ "schema_perm",
+ "catalog_perm",
+ ]
+ }
id = Column(Integer, primary_key=True)
slice_name = Column(String(250))
datasource_id = Column(Integer)
@@ -328,7 +369,11 @@ def chart(self) -> str:
@property
def slice_link(self) -> Markup:
name = escape(self.chart)
- return Markup(f'{name}')
+ # ``name`` has already been passed through ``escape`` above.
+ # ``self.url`` is ``/explore/?slice_id=``; the only
+ # interpolation is the integer primary key, so the URL has no
+ # user-controlled segment to escape (unlike ``Dashboard.url``
+ # which embeds the user-set slug). ``noqa: S704`` is safe.
+ return Markup(f'{name}') # noqa: S704
@property
def icons(self) -> str:
diff --git a/superset/reports/models.py b/superset/reports/models.py
index f0abda8a9216..7564336ae11d 100644
--- a/superset/reports/models.py
+++ b/superset/reports/models.py
@@ -101,20 +101,18 @@ class ReportSourceFormat(StrEnum):
+# M:N link between users and report schedules. The surrogate ``id`` column
+# and the separate ``UniqueConstraint`` are dropped; both FK columns become
+# ``primary_key=True``, forming the PK ``(user_id, report_schedule_id)``.
report_schedule_user = Table(
"report_schedule_user",
metadata,
- Column("id", Integer, primary_key=True),
Column(
"user_id",
Integer,
ForeignKey("ab_user.id", ondelete="CASCADE"),
- nullable=False,
+ primary_key=True,
),
Column(
"report_schedule_id",
Integer,
ForeignKey("report_schedule.id", ondelete="CASCADE"),
- nullable=False,
+ primary_key=True,
),
- UniqueConstraint("user_id", "report_schedule_id"),
)
diff --git a/superset/versioning/__init__.py b/superset/versioning/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/superset/versioning/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/superset/versioning/api_helpers.py b/superset/versioning/api_helpers.py
new file mode 100644
index 000000000000..d052f66e8f5d
--- /dev/null
+++ b/superset/versioning/api_helpers.py
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Shared handlers for the ``/versions/`` REST endpoints.
+
+Each ``ChartRestApi`` / ``DashboardRestApi`` / ``DatasetRestApi`` carries
+the same read endpoint methods — ``list_versions`` and ``get_version`` —
+whose bodies are byte-for-byte identical apart from the model class and
+the ``security_manager.raise_for_access`` kwarg. Extracting the bodies
+here lets each per-resource method collapse to a single delegation call,
+while the OpenAPI docstring + FAB decorators stay at the method site
+where they belong.
+
+(The restore endpoint ships in a later PR; only the read endpoints are
+wired here.)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+from uuid import UUID
+
+import sqlalchemy as sa
+from flask import current_app, Response
+from flask_appbuilder import Model
+
+from superset.daos.version import VersionDAO
+from superset.exceptions import SupersetSecurityException
+from superset.extensions import db, security_manager
+from superset.versioning.etag import set_version_etag_by_uuid
+from superset.versioning.schemas import VersionListItemSchema
+
+#: Serializer for version rows (list items and the ``_version`` block of a
+#: single-version snapshot — same shape). Dumping through marshmallow
+#: instead of handing raw dicts to ``jsonify`` keeps ``issued_at``
+#: ISO-8601 (Flask's default JSON provider renders datetimes as RFC-1123
+#: http-dates) and ``version_uuid`` consistently a string (the list rows
+#: carry UUID instances, the snapshot block pre-stringifies).
+_version_item_schema = VersionListItemSchema()
+
+
+@dataclass
+class EntityVersionInfo:
+ """Live version identifiers for a write-endpoint response.
+
+ Every field is ``None`` when ``ENABLE_VERSIONING_CAPTURE`` is off — the
+ write endpoints then issue no version queries at all, so they stay inert
+ under the kill-switch rather than paying save-path latency the flag is
+ meant to eliminate.
+ """
+
+ # Filled from ``VersionDAO.current_version_number``.
+ version: int | None = None
+ # Filled from ``VersionDAO.current_live_transaction_id``.
+ transaction_id: int | None = None
+ # ``str()`` of ``VersionDAO.current_live_version_uuid``'s result when truthy.
+ version_uuid: str | None = None
+
+
+def _capture_enabled() -> bool:
+    """Report whether ``ENABLE_VERSIONING_CAPTURE`` is truthy in the app config."""
+    flag = current_app.config.get("ENABLE_VERSIONING_CAPTURE", False)
+    return bool(flag)
+
+
+def current_entity_version_info(
+    model_cls: type[Model],
+    entity_id: int | None,
+    entity_uuid: UUID | None = None,
+) -> EntityVersionInfo:
+    """Look up the live version number, transaction id and version uuid.
+
+    An all-``None`` record is returned, with no queries issued, when
+    *entity_id* is ``None`` or capture is disabled. If *entity_uuid* is not
+    given it is fetched with a single ``SELECT uuid`` instead of loading
+    the whole entity row; if that also yields ``None`` the version uuid is
+    left as ``None`` while the other two fields are still resolved.
+    """
+    if entity_id is None or not _capture_enabled():
+        return EntityVersionInfo()
+
+    if entity_uuid is None:
+        uuid_lookup = sa.select(model_cls.uuid).where(model_cls.id == entity_id)
+        entity_uuid = db.session.scalar(uuid_lookup)
+
+    live_uuid = None
+    if entity_uuid is not None:
+        live_uuid = VersionDAO.current_live_version_uuid(
+            model_cls, entity_id, entity_uuid
+        )
+
+    version_number = VersionDAO.current_version_number(model_cls, entity_id)
+    live_tx_id = VersionDAO.current_live_transaction_id(model_cls, entity_id)
+    return EntityVersionInfo(
+        version=version_number,
+        transaction_id=live_tx_id,
+        version_uuid=str(live_uuid) if live_uuid else None,
+    )
+
+
+def current_entity_etag_uuid(
+    model_cls: type[Model],
+    entity_id: int | None,
+    entity_uuid: UUID | None,
+) -> str | None:
+    """Return the live version uuid as a string for use in an ETag.
+
+    Yields ``None`` without querying when either identifier is missing or
+    capture is off, and also when the DAO reports no live version uuid.
+    """
+    # Identifier checks come first so the capture flag is only consulted
+    # when both ids are present.
+    if entity_id is None or entity_uuid is None:
+        return None
+    if not _capture_enabled():
+        return None
+
+    live_uuid = VersionDAO.current_live_version_uuid(
+        model_cls, entity_id, entity_uuid
+    )
+    if not live_uuid:
+        return None
+    return str(live_uuid)
+
+
+def _resolve_entity(
+    api: Any,
+    model_cls: type[Model],
+    uuid_str: str,
+    access_kwarg: str,
+) -> tuple[Any, UUID] | Response:
+    """Shared preflight: parse the path UUID, load the entity, check access.
+
+    On success the result is ``(entity, entity_uuid)``. Otherwise it is a
+    ready-made ``Response`` for the caller to return as-is:
+
+    * 400 when *uuid_str* is not a valid UUID,
+    * 404 when ``VersionDAO.find_active_by_uuid`` finds nothing,
+    * 403 when ``security_manager.raise_for_access`` raises
+      ``SupersetSecurityException``.
+
+    *access_kwarg* is the keyword under which the entity is handed to
+    ``raise_for_access``.
+    """
+    try:
+        parsed_uuid = UUID(uuid_str)
+    except ValueError:
+        return api.response_400(message="Invalid UUID")
+
+    live_entity = VersionDAO.find_active_by_uuid(model_cls, parsed_uuid)
+    if live_entity is None:
+        return api.response_404()
+
+    access_check = {access_kwarg: live_entity}
+    try:
+        security_manager.raise_for_access(**access_check)
+    except SupersetSecurityException:
+        return api.response_403()
+    else:
+        return live_entity, parsed_uuid
+
+
+def list_versions_endpoint(
+    api: Any,
+    model_cls: type[Model],
+    uuid_str: str,
+    access_kwarg: str,
+) -> Response:
+    """Handle ``GET /api/v1/{resource}/<uuid>/versions/``.
+
+    Runs the ``_resolve_entity`` preflight and returns its error response
+    if it produced one. Otherwise the entity's versions are listed,
+    serialized through the version-item schema and returned as a 200
+    with ``result`` and ``count``, passed through
+    ``set_version_etag_by_uuid``. A ``None`` listing from the DAO becomes
+    a 404.
+    """
+    preflight = _resolve_entity(api, model_cls, uuid_str, access_kwarg)
+    if isinstance(preflight, Response):
+        return preflight
+    entity, entity_uuid = preflight
+
+    rows = VersionDAO.list_versions(model_cls, entity_uuid, entity=entity)
+    if rows is None:
+        return api.response_404()
+
+    result = _version_item_schema.dump(rows, many=True)
+    body = api.response(200, result=result, count=len(result))
+    return set_version_etag_by_uuid(
+        body,
+        model_cls,
+        entity_uuid,
+        entity_id=entity.id,
+    )
+
+
+def get_version_endpoint(
+    api: Any,
+    model_cls: type[Model],
+    uuid_str: str,
+    version_uuid_str: str,
+    access_kwarg: str,
+) -> Response:
+    """Handle ``GET /api/v1/{resource}/<uuid>/versions/<version_uuid>/``.
+
+    Runs the ``_resolve_entity`` preflight first and returns its error
+    response if it produced one. A malformed *version_uuid_str* then yields
+    a 400 and a missing snapshot a 404. On success the snapshot is
+    returned as a 200, passed through ``set_version_etag_by_uuid``.
+    """
+    preflight = _resolve_entity(api, model_cls, uuid_str, access_kwarg)
+    if isinstance(preflight, Response):
+        return preflight
+    entity, entity_uuid = preflight
+
+    try:
+        requested_version = UUID(version_uuid_str)
+    except ValueError:
+        return api.response_400(message="Invalid version UUID")
+
+    snapshot = VersionDAO.get_version(
+        model_cls, entity_uuid, requested_version, entity=entity
+    )
+    if snapshot is None:
+        return api.response_404()
+
+    # Only the version-level block goes through the schema; the entity
+    # scalar fields stay as the DAO shaped them (their keys are
+    # entity-specific by design).
+    if "_version" in snapshot:
+        snapshot["_version"] = _version_item_schema.dump(snapshot["_version"])
+
+    body = api.response(200, result=snapshot)
+    return set_version_etag_by_uuid(
+        body,
+        model_cls,
+        entity_uuid,
+        entity_id=entity.id,
+    )
diff --git a/superset/versioning/baseline/__init__.py b/superset/versioning/baseline/__init__.py
new file mode 100644
index 000000000000..664af88c4cb3
--- /dev/null
+++ b/superset/versioning/baseline/__init__.py
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""``before_flush`` listener that captures a baseline version (version 0)
+for entities being updated for the first time after the versioning
+migration.
+
+Package layout (descends from public entry point to leaf builders):
+
+* :mod:`.listener` — public :func:`register_baseline_listener` that
+ wires the before-flush event on ``db.session``.
+* :mod:`.dirty` — :func:`force_parent_dirty_on_child_change` and
+ :func:`pin_audit_columns`: promote a parent into ``session.dirty``
+ when only its versioned children changed, and pin its audit columns
+ so the synthetic flush doesn't bump them.
+* :mod:`.collection` — discovery: which parents need a baseline row?
+ Holds ``VERSIONED_MODELS`` (populated at app start),
+ :func:`collect_parents_to_baseline`, the
+ :func:`child_to_parent_registry` mapping, and the per-parent
+ Continuum-shadow-table lookups.
+* :mod:`.insertion` — parent baseline insertion + child-handler
+ dispatch.
+* :mod:`.children` — per-entity child baseline handlers
+ (``_baseline_dataset_children`` / ``_baseline_dashboard_children``)
+ plus the leaf helpers that synthesize child / slice shadow rows.
+* :mod:`.shadow` — low-level :func:`insert_baseline_shadow_row`
+ helper used by every module that writes a shadow row, and the
+ :data:`CONTINUUM_BOOKKEEPING_COLUMNS` constant re-used outside this
+ package (the change-record listener and ``queries.py`` filter on it).
+
+The re-exports below preserve the prior ``from
+superset.versioning.baseline import …`` call shape; no caller outside
+this package needs to change.
+"""
+
+from __future__ import annotations
+
+from superset.versioning.baseline.collection import (
+ child_to_parent_registry,
+ VERSIONED_MODELS,
+)
+from superset.versioning.baseline.dirty import pin_audit_columns
+from superset.versioning.baseline.listener import register_baseline_listener
+from superset.versioning.baseline.shadow import (
+ CONTINUUM_BOOKKEEPING_COLUMNS,
+ insert_baseline_shadow_row,
+)
+
+__all__ = [
+ "CONTINUUM_BOOKKEEPING_COLUMNS",
+ "VERSIONED_MODELS",
+ "child_to_parent_registry",
+ "insert_baseline_shadow_row",
+ "pin_audit_columns",
+ "register_baseline_listener",
+]
diff --git a/superset/versioning/baseline/children.py b/superset/versioning/baseline/children.py
new file mode 100644
index 000000000000..e55a5cf4fb57
--- /dev/null
+++ b/superset/versioning/baseline/children.py
@@ -0,0 +1,212 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Per-entity child-baseline handlers.
+
+After a parent baseline row lands in :mod:`.insertion`, this module's
+handlers write the parent's child baselines under the same transaction
+id. The dispatch table :data:`CHILD_BASELINE_HANDLERS` is keyed on
+the parent class name (avoids an import-cycle with the entity modules,
+which can't be loaded at app-init time).
+
+The dataset handler baselines :class:`TableColumn` and
+:class:`SqlMetric` children. The dashboard handler baselines the
+``dashboard_slices`` M2M membership *and* synthesizes
+``operation_type=0`` rows in ``slices_version`` for attached slices
+that have no prior shadow — without those slice-side baselines,
+Continuum's M2M revert query returns empty.
+
+Leaf-level helpers (:func:`_insert_child_baseline_rows`,
+:func:`_baseline_attached_slices`,
+:func:`_insert_synthetic_slice_baseline`) live here too — they're
+shared between the two parent-specific handlers.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Any
+
+import sqlalchemy as sa
+from sqlalchemy.orm import Session
+
+from superset.versioning.baseline.shadow import insert_baseline_shadow_row
+
+
+def _baseline_dataset_children(session: Session, dataset: Any, tx_id: int) -> None:
+    """Write baseline shadow rows for *dataset*'s columns and metrics.
+
+    Every live ``TableColumn`` and ``SqlMetric`` row whose ``table_id``
+    equals the dataset's id is copied into the matching Continuum
+    shadow table under *tx_id*, the dataset's own baseline transaction.
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import version_class
+
+    from superset.connectors.sqla.models import SqlMetric, TableColumn
+
+    # Columns first, then metrics; each model's shadow table is resolved
+    # just before its rows are written.
+    for model in (TableColumn, SqlMetric):
+        live_table = model.__table__
+        shadow_table = version_class(model).__table__
+        _insert_child_baseline_rows(
+            session, dataset, live_table, shadow_table, "table_id", tx_id
+        )
+
+
+def _baseline_dashboard_children(session: Session, dashboard: Any, tx_id: int) -> None:
+    """Baseline *dashboard*'s slice membership and its never-versioned slices.
+
+    Under *tx_id* this writes (1) a shadow row for every live
+    ``dashboard_slices`` row belonging to the dashboard and (2) a
+    synthetic ``operation_type=0`` row in ``slices_version`` for every
+    attached slice that has no shadow row yet.
+
+    Step (2) is needed because Continuum's version-side relationship
+    for ``Dashboard.slices`` joins through ``dashboard_slices_version``
+    AND ``slices_version``; its second exists clause keeps only slices
+    whose latest ``slices_version`` row has tx <= dashboard.tx. A slice
+    with no ``slices_version`` rows at all never matches, so
+    ``version_obj.slices`` is empty and a dashboard restore has no
+    slices to append. The synthetic row at this dashboard's tx gives
+    that query a slice version it can match.
+
+    Slices shared across dashboards get no special treatment: a slice
+    is baselined at this dashboard's tx_id only when it has no shadow
+    rows at all. A later dashboard baseline that references the same
+    slice still finds this (lower-tx) baseline on restore. The reverse
+    — a dashboard baselined AFTER the slice was first baselined under
+    another dashboard at a higher tx — is a residual gap deferred to a
+    future fix.
+
+    Returns without writing anything when ``dashboard_slices`` or
+    ``dashboard_slices_version`` is missing from the model metadata.
+    """
+    known_tables = type(dashboard).__table__.metadata.tables
+    membership_tbl = known_tables.get("dashboard_slices")
+    membership_shadow_tbl = known_tables.get("dashboard_slices_version")
+    if membership_tbl is not None and membership_shadow_tbl is not None:
+        _insert_child_baseline_rows(
+            session,
+            dashboard,
+            membership_tbl,
+            membership_shadow_tbl,
+            "dashboard_id",
+            tx_id,
+        )
+        _baseline_attached_slices(session, dashboard, membership_tbl, tx_id)
+
+
+# Dispatch table keyed by parent CLASS NAME rather than class, to avoid
+# an import cycle between the ``baseline`` package (loaded at app init)
+# and the entity modules. The class-name string comes from the model
+# definitions and is looked up via ``type(parent_obj).__name__``, so a
+# key must match the class name exactly — typo-prone if extended.
+# Declared after the handlers it references because module-level dict
+# literals evaluate at import time and need the names already bound.
+_ChildBaselineHandler = Callable[[Session, Any, int], None]
+CHILD_BASELINE_HANDLERS: dict[str, _ChildBaselineHandler] = {
+ "SqlaTable": _baseline_dataset_children,
+ "Dashboard": _baseline_dashboard_children,
+}
+
+
+def _insert_child_baseline_rows(
+    session: Session,
+    parent_obj: Any,
+    child_table: sa.Table,
+    child_version_table: sa.Table,
+    fk_column_name: str,
+    tx_id: int,
+) -> None:
+    """Copy every live child row of *parent_obj* into its shadow table.
+
+    Each row of *child_table* whose *fk_column_name* equals
+    ``parent_obj.id`` is written to *child_version_table* as an
+    ``operation_type=0`` baseline row under transaction id *tx_id*.
+    This is the child-row counterpart of
+    :func:`~superset.versioning.baseline.insertion._insert_baseline_row`.
+
+    Children of pre-existing parents predate versioning and have no
+    shadow rows; without a baseline, Continuum's ``Reverter`` would
+    treat them as "deleted at the target tx" and try to remove them on
+    revert (the ADR-004 Failure 1 reproduction scenario).
+
+    :param session: session whose connection performs the read and writes
+    :param parent_obj: parent entity; only its ``id`` is read here
+    :param child_table: the live child SQLAlchemy ``Table`` (e.g.
+        ``TableColumn.__table__`` or the bare ``dashboard_slices``
+        association)
+    :param child_version_table: the corresponding Continuum shadow ``Table``
+    :param fk_column_name: column on *child_table* that points to the
+        parent (e.g. ``"table_id"`` for ``TableColumn``,
+        ``"dashboard_id"`` for ``dashboard_slices``)
+    :param tx_id: transaction id stamped on every shadow row written
+    """
+    connection = session.connection()
+    parent_fk = getattr(child_table.c, fk_column_name)
+
+    children_query = sa.select(child_table).where(parent_fk == parent_obj.id)
+    live_children = connection.execute(children_query).mappings().all()
+
+    # An empty result simply skips the loop — nothing is written.
+    for child_row in live_children:
+        insert_baseline_shadow_row(connection, child_version_table, child_row, tx_id)
+
+
+def _baseline_attached_slices(
+    session: Session, dashboard: Any, live_tbl: sa.Table, tx_id: int
+) -> None:
+    """Give every not-yet-versioned slice of *dashboard* a baseline row.
+
+    Writes an ``operation_type=0`` row in ``slices_version`` under
+    *tx_id* for each slice that *live_tbl* attaches to the dashboard
+    and that has no shadow row yet.
+
+    The lookups are batched into three queries — membership, existing
+    shadows, live rows of the missing slices — so per-slice work is
+    limited to ``_insert_synthetic_slice_baseline``. The earlier
+    per-slice ``COUNT(*)`` + ``SELECT`` pattern cost O(N) round-trips
+    and was a measurable first-save hotspot on dashboards with many
+    charts.
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import version_class
+
+    from superset.models.slice import Slice
+
+    shadow_tbl = version_class(Slice).__table__
+    live_slice_tbl = Slice.__table__
+    connection = session.connection()
+
+    # 1) Which slices are attached to this dashboard?
+    membership_query = sa.select(live_tbl.c.slice_id).where(
+        live_tbl.c.dashboard_id == dashboard.id
+    )
+    attached_slice_ids = [
+        row.slice_id for row in connection.execute(membership_query).all()
+    ]
+    if not attached_slice_ids:
+        return
+
+    # 2) Which of them already have at least one shadow row?
+    shadowed_query = sa.select(shadow_tbl.c.id.distinct()).where(
+        shadow_tbl.c.id.in_(attached_slice_ids)
+    )
+    already_shadowed = {row[0] for row in connection.execute(shadowed_query).all()}
+    unshadowed_ids = [
+        slice_id
+        for slice_id in attached_slice_ids
+        if slice_id not in already_shadowed
+    ]
+
+    # 3) Baseline the remainder from their current live rows.
+    if unshadowed_ids:
+        live_query = sa.select(live_slice_tbl).where(
+            live_slice_tbl.c.id.in_(unshadowed_ids)
+        )
+        for live_row in connection.execute(live_query).mappings().all():
+            _insert_synthetic_slice_baseline(connection, shadow_tbl, live_row, tx_id)
+
+
+def _insert_synthetic_slice_baseline(
+ conn: Any, slice_ver_table: sa.Table, slice_row: Any, tx_id: int
+) -> None:
+ """Write one synthetic slice baseline: forwards *slice_row* to
+ :func:`insert_baseline_shadow_row` for *slice_ver_table* under *tx_id*.
+ """
+ insert_baseline_shadow_row(conn, slice_ver_table, slice_row, tx_id)
diff --git a/superset/versioning/baseline/collection.py b/superset/versioning/baseline/collection.py
new file mode 100644
index 000000000000..7ab78a9d6087
--- /dev/null
+++ b/superset/versioning/baseline/collection.py
@@ -0,0 +1,153 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Discovery: figure out which parents need a baseline row.
+
+Three helpers cooperate on the listener's "should I baseline" decision:
+
+* :func:`collect_parents_to_baseline` — walks ``session.dirty`` /
+ ``new`` / ``deleted`` and returns the unique parent entities to
+ consider (directly-dirty versioned parents + parents reachable from
+ dirty children via :func:`child_to_parent_registry`).
+* :func:`version_table_for` — resolves a Continuum shadow Table for
+ one parent object.
+* :func:`shadow_row_count` — counts existing shadow rows for the
+ parent's id; ``0`` is the signal to insert a baseline.
+
+:func:`child_to_parent_registry` is also exposed because
+:mod:`superset.versioning.factory` consumes it via inline import.
+
+**Inline imports.** ``versioning.baseline`` is imported during
+``init_versioning()`` before all SQLAlchemy mappers are configured;
+the lazy imports defer Continuum + model resolution until call time.
+"""
+
+from __future__ import annotations
+
+import functools
+import logging
+from typing import Any
+
+import sqlalchemy as sa
+from sqlalchemy.exc import OperationalError, ProgrammingError
+from sqlalchemy.orm import Session
+
+# Populated at app startup (superset/initialization/__init__.py) before
+# register_baseline_listener() is called. ``collect_parents_to_baseline``
+# tests membership with ``type(obj) in VERSIONED_MODELS``, i.e. by exact
+# class — subclasses of a listed model are not matched.
+VERSIONED_MODELS: list[type] = []
+
+logger = logging.getLogger(__name__)
+
+
+def collect_parents_to_baseline(session: Session) -> dict[int, Any]:
+    """Gather the parent entities this flush may need to baseline.
+
+    Scans ``session.dirty``, ``session.new`` and ``session.deleted``.
+    An object whose exact class is listed in ``VERSIONED_MODELS`` is
+    taken as-is; an object whose class is registered in the
+    child→parent registry contributes its parent, provided the parent
+    attribute is set and holds exactly the registered parent class.
+
+    :returns: ``{id(obj): obj}`` — keyed by Python object identity so
+        the same parent reached through several pending objects is
+        reported once.
+    """
+    registry = child_to_parent_registry()
+    found: dict[int, Any] = {}
+    for candidate in (*session.dirty, *session.new, *session.deleted):
+        candidate_cls = type(candidate)
+        if candidate_cls in VERSIONED_MODELS:
+            found[id(candidate)] = candidate
+        elif candidate_cls in registry:
+            relationship_name, expected_parent_cls = registry[candidate_cls]
+            owner = getattr(candidate, relationship_name, None)
+            if owner is not None and type(owner) is expected_parent_cls:  # noqa: E721
+                found[id(owner)] = owner
+    return found
+
+
+@functools.cache
+def child_to_parent_registry() -> dict[type, tuple[str, type]]:
+    """Return the child class → (parent-relationship-attr, parent class) map.
+
+    The map lets a flush that only touches child rows be traced back
+    to the parent, so the parent (and its siblings) are baselined in
+    the SAME flush and the baseline shadow rows capture pre-edit child
+    values. Otherwise a child-only edit produces a "silent" flush A
+    (just ``TableColumn``) followed by flush B
+    (``SqlaTable.changed_on``); flush B reads the children from the DB
+    AFTER flush A has already pushed its UPDATEs and so records
+    post-edit state.
+
+    The result is memoised with an unbounded ``functools.cache``: it
+    is consulted by ``force_parent_dirty_on_child_change`` and
+    ``collect_parents_to_baseline`` on every save flush, and it depends
+    only on model classes fixed at import time, so no invalidation is
+    needed.
+    """
+    # Lazy import: ``baseline`` is imported during ``init_versioning``,
+    # which runs before all model mappers are configured. Importing the
+    # model classes at module load would either cycle or hit unresolved
+    # mappers.
+    # pylint: disable=import-outside-toplevel
+    from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn
+
+    # Both child kinds reach their dataset through the ``table`` attribute.
+    dataset_link = ("table", SqlaTable)
+    return {TableColumn: dataset_link, SqlMetric: dataset_link}
+
+
+def version_table_for(obj: Any) -> Any:
+    """Look up the Continuum shadow ``Table`` for the class of *obj*.
+
+    :returns: the shadow table, or ``None`` if Continuum reports the
+        class as not versioned (forks / plugins that subclass without
+        ``__versioned__``).
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import version_class
+    from sqlalchemy_continuum.exc import ClassNotVersioned
+
+    try:
+        shadow_cls = version_class(type(obj))
+    except ClassNotVersioned:
+        return None
+    return shadow_cls.__table__
+
+
+def shadow_row_count(session: Session, obj: Any, version_table: Any) -> int | None:
+    """Count the rows of *version_table* whose ``id`` equals ``obj.id``.
+
+    :returns: the row count, or ``None`` when the version table is
+        missing (migration not yet applied) or the count query raised
+        unexpectedly (the latter is logged with a traceback).
+    """
+    try:
+        with session.no_autoflush:
+            # SAVEPOINT so a missing-table probe can't poison the
+            # enclosing transaction on PostgreSQL (a failed statement
+            # aborts the tx there; subsequent statements would raise
+            # InFailedSqlTransaction and fail the user's save despite
+            # the except clauses below).
+            with session.connection().begin_nested():
+                connection = session.connection()
+                count_query = (
+                    sa.select(sa.func.count())
+                    .select_from(version_table)
+                    .where(version_table.c.id == obj.id)
+                )
+                return connection.execute(count_query).scalar()
+    except (OperationalError, ProgrammingError):
+        # Missing table: OperationalError on SQLite/MySQL,
+        # ProgrammingError (UndefinedTable) on PostgreSQL.
+        return None
+    except Exception:  # pylint: disable=broad-except
+        logger.exception(
+            "baseline_listener: count query failed for %s id=%s",
+            type(obj).__name__,
+            getattr(obj, "id", None),
+        )
+        return None
diff --git a/superset/versioning/baseline/dirty.py b/superset/versioning/baseline/dirty.py
new file mode 100644
index 000000000000..f351cbb38bbe
--- /dev/null
+++ b/superset/versioning/baseline/dirty.py
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Parent-dirty force machinery for child-only saves.
+
+When a versioned child (``TableColumn`` / ``SqlMetric``) is in
+``session.dirty`` / ``new`` / ``deleted`` but its parent's scalar
+columns haven't been touched, the parent is *missing* from the dirty
+set — so Continuum's UnitOfWork never creates a parent UPDATE
+operation, no parent shadow row is written, and the version-history
+dropdown comes back empty for column/metric-only saves.
+
+:func:`force_parent_dirty_on_child_change` walks dirty/new/deleted
+children, looks them up in the child→parent registry (in
+:mod:`.collection`), and ``attributes.flag_modified``s a deterministic
+non-excluded column on the parent. SQLAlchemy adds the parent to
+``session.dirty``; Continuum then writes a parent shadow row whose
+scalars mirror the previous version (only the children actually
+changed).
+
+:func:`pin_audit_columns` is a companion: when the parent is force-
+flagged, we pin ``changed_by_fk`` / ``changed_on`` to their current
+in-memory values so the parent UPDATE doesn't invoke the audit
+columns' ``onupdate=get_user_id`` / ``onupdate=datetime.now`` hooks
+(which would attribute the synthetic flush to whoever ``g.user`` is
+at the time, possibly a deleted test user under autoflush teardown).
+
+**Inline imports.** Same init-order rationale as
+:mod:`superset.versioning.baseline.collection`.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterator
+from typing import Any
+
+from sqlalchemy.exc import InvalidRequestError
+from sqlalchemy.orm import attributes, Session
+
+from superset.versioning.baseline.collection import child_to_parent_registry
+
+logger = logging.getLogger(__name__)
+
+
+def force_parent_dirty_on_child_change(session: Session) -> None:
+    """Pull versioned parents into ``session.dirty`` for child-only edits.
+
+    When only ``TableColumn`` / ``SqlMetric`` rows change, the parent
+    ``SqlaTable`` stays out of ``session.dirty``. Continuum's
+    UnitOfWork then never creates a parent UPDATE operation, and
+    ``list_versions`` (which queries the parent shadow
+    ``tables_version``) returns just the baseline — the user sees "I
+    edited a column description but the dataset's version history
+    dropdown is empty".
+
+    For every child that represents a real edit, the parent is resolved
+    and a deterministic non-excluded column on it is passed to
+    ``attributes.flag_modified``. That makes the parent dirty without
+    altering any column value, so Continuum writes a parent shadow row
+    at this transaction whose scalars mirror the previous version (only
+    the children changed). ``SkipUnmodifiedPlugin._is_no_op_update`` is
+    taught to recognize the "scalars match but children dirty" case and
+    keep the row. After a successful flag the parent's audit columns
+    are pinned as well.
+    """
+    registry = child_to_parent_registry()
+    pending_inserts = session.new
+    for edited_child in _real_dirty_versioned_children(session, registry):
+        owner = _resolve_parent(edited_child, registry)
+        # Skip unresolvable parents, and parents that are themselves in
+        # ``session.new`` (typical during an import that adds a
+        # ``SqlaTable`` plus 50 fresh ``TableColumn`` children): such a
+        # parent will INSERT in this flush regardless, so flagging it
+        # is redundant — and with attribute defaults not yet fired,
+        # ``_flag_parent`` would only swallow an ``InvalidRequestError``.
+        if owner is None or owner in pending_inserts:
+            continue
+        if _flag_parent(owner):
+            pin_audit_columns(owner)
+
+
+def _real_dirty_versioned_children(
+    session: Session, child_map: dict[type, Any]
+) -> Iterator[Any]:
+    """Iterate over pending children that carry a genuine content change.
+
+    An object is yielded when its exact class is a key of *child_map*
+    and it is not a phantom-dirty entry, i.e. not a ``session.dirty``
+    member for which Continuum's ``is_modified`` reports no change.
+
+    Why filter: a child can land in ``session.dirty`` without a real
+    edit — lazy-load side effects, ``AuditMixin`` auto-bumps from prior
+    code paths, M2M relationship-cascade artifacts (e.g.
+    ``rls_entry.tables.extend([dataset])`` in setUp), Reverter side
+    passes. Force-touching the parent for those emits an incidental
+    ``UPDATE tables SET description=…, changed_on=…, changed_by_fk=…``
+    that can violate FK integrity on some dialects (observed in
+    ``test_rls_filter_alters_no_role_user_birth_names_query``).
+
+    Only ``session.dirty`` members are filtered. Entries of
+    ``session.new`` (creation) and ``session.deleted`` (removal) always
+    count as real changes; a deletion is a state transition with no
+    attribute history, so ``is_modified`` would return False for it
+    even though the change is real (column-removed records must still
+    emit).
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import is_modified
+
+    # ``session.dirty`` is an IdentitySet, so ``in`` compares by identity
+    # — exactly what the phantom-dirty check needs.
+    persistent_dirty = session.dirty
+    for pending in (*session.dirty, *session.new, *session.deleted):
+        if type(pending) in child_map and (
+            pending not in persistent_dirty or is_modified(pending)
+        ):
+            yield pending
+
+
+def _resolve_parent(child: Any, child_map: dict[type, Any]) -> Any | None:
+    """Look up *child*'s versioned parent through the child→parent
+    registry. Gives ``None`` when the registered parent attribute is
+    unset or holds an object whose exact type is not the registered
+    parent class."""
+    relationship_name, expected_parent_cls = child_map[type(child)]
+    candidate = getattr(child, relationship_name, None)
+    if candidate is not None and type(candidate) is expected_parent_cls:  # noqa: E721
+        return candidate
+    return None
+
+
+def _flag_parent(parent: Any) -> bool:
+    """Mark one stable versioned column of *parent* as modified.
+
+    ``attributes.flag_modified`` puts the parent into ``session.dirty``
+    without changing any value.
+
+    Column choice, in order of preference:
+
+    1. ``description`` — a plain ``Text`` column on all three versioned
+       parent classes (Dashboard, Slice, SqlaTable) and in none of
+       their ``__versioned__`` excludes. Picking it deterministically
+       keeps the flagged attribute stable across SQLAlchemy versions /
+       mapper-configuration orders, and with no marshaling layer it
+       safely round-trips its current value.
+    2. ``uuid`` — deliberately not the first choice: when a
+       versioned-parent UPDATE goes through with ``uuid`` flagged, the
+       column's ``UUIDType``/BLOB round-trip produces a memoryview that
+       fails an FK integrity check on some dialects (observed in
+       ``test_rls_filter_alters_no_role_user_birth_names_query`` and
+       ``test_restore_applies_scalar_field``). Kept only as a fallback
+       for forks that excluded ``description``.
+    3. the first versioned column, when neither of the above exists.
+
+    :returns: ``True`` when a column was flagged. ``False`` when the
+        parent has no versioned columns, or when ``flag_modified``
+        raises ``InvalidRequestError`` — the case of a freshly
+        constructed ``session.new`` instance whose attribute defaults
+        haven't fired yet, so the attribute is unloaded in instance
+        state. Such a parent will INSERT in this flush regardless, so
+        the flag was redundant. Hit by ``test_create_dataset_item``
+        (POST /api/v1/dataset/).
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum.utils import versioned_column_properties
+
+    versioned_keys = [prop.key for prop in versioned_column_properties(parent)]
+    if not versioned_keys:
+        return False
+
+    target_column = next(
+        (name for name in ("description", "uuid") if name in versioned_keys),
+        versioned_keys[0],
+    )
+    try:
+        attributes.flag_modified(parent, target_column)
+    except InvalidRequestError:
+        return False
+    else:
+        return True
+
+
+def pin_audit_columns(parent: Any) -> None:
+    """Keep ``changed_by_fk`` / ``changed_on`` at their current in-memory
+    values on a flag-flushed parent.
+
+    ``changed_by_fk`` carries ``onupdate=get_user_id`` from
+    ``AuditMixin``: an UPDATE that doesn't explicitly set the column
+    lets SQLAlchemy call ``get_user_id()`` and write whoever ``g.user``
+    is at flush time. If the flush is autoflush-triggered during an
+    earlier test's teardown (after the test user has been deleted from
+    ``ab_user``), the bumped value points at a non-existent row and the
+    parent UPDATE fails the FK to ``ab_user``. ``changed_on``'s
+    ``onupdate=datetime.now`` behaves the same way (cosmetic only, but
+    cheap to pin together).
+
+    ``flag_modified`` on each column gives it dirty attribute history,
+    which tells SQLAlchemy to use the in-memory (previously-committed)
+    value instead of invoking ``onupdate`` — so the parent UPDATE
+    carries the existing audit values rather than whatever ``g.user``
+    resolves to during the synthetic flag flush. Hits
+    ``test_rls_filter_alters_no_role_user_birth_names_query`` and
+    ``TestDatasetRestoreApi::test_restore_applies_scalar_field`` in
+    CI's full-suite ordering (autoflush during teardown).
+    """
+    pinned_columns = []
+    for column_name in ("changed_by_fk", "changed_on"):
+        if not hasattr(parent, column_name):
+            continue
+        try:
+            attributes.flag_modified(parent, column_name)
+        except InvalidRequestError:
+            continue
+        pinned_columns.append(column_name)
+
+    if pinned_columns or not hasattr(parent, "changed_by_fk"):
+        return
+
+    # The parent has ``changed_by_fk`` yet no audit column could be
+    # pinned — typically a freshly-constructed ``session.new`` instance
+    # whose attribute defaults haven't fired yet. Without the pin, the
+    # synthetic parent UPDATE in this flush invokes
+    # ``onupdate=get_user_id`` and writes whoever ``g.user`` is at flush
+    # time, which under autoflush-during-teardown can point at a deleted
+    # test user and fail the FK to ``ab_user``. Surface this so the
+    # failure mode is debuggable from the log without inspection.
+    logger.info(
+        "baseline: skipped audit-column pin on %s id=%s "
+        "(attribute defaults not loaded)",
+        type(parent).__name__,
+        getattr(parent, "id", None),
+    )
diff --git a/superset/versioning/baseline/insertion.py b/superset/versioning/baseline/insertion.py
new file mode 100644
index 000000000000..303a1e4f202c
--- /dev/null
+++ b/superset/versioning/baseline/insertion.py
@@ -0,0 +1,149 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Parent baseline insertion + child-handler dispatch.
+
+Three complementary helpers:
+
+* :func:`insert_baseline_and_children` — top-level glue called by
+ the listener. Wraps the work in ``session.no_autoflush`` (so
+ ``session.connection()`` doesn't trigger a flush of Continuum's
+ pending Transaction object before our direct-SQL insert claims its
+ tx_id) and logs any failures as listener-boundary errors.
+* :func:`_insert_baseline_row` — actually writes the
+ ``version_transaction`` row and the parent shadow row. Returns the
+ allocated ``transaction_id``.
+* :func:`_baseline_children_for_parent` — dispatches to the per-
+ entity handler in :mod:`.children` under the same tx_id.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import sqlalchemy as sa
+from sqlalchemy.orm import Session
+
+from superset.versioning.baseline.children import CHILD_BASELINE_HANDLERS
+from superset.versioning.baseline.shadow import insert_baseline_shadow_row
+from superset.versioning.utils import read_row_outside_flush
+
+logger = logging.getLogger(__name__)
+
+
+def insert_baseline_and_children(
+    session: Session, obj: Any, version_table: Any
+) -> None:
+    """Insert the parent baseline row, then baseline the parent's child
+    collections under the same transaction id.
+
+    Best-effort: every exception raised here is logged and swallowed,
+    so a baseline failure never propagates to the caller.
+
+    The work runs inside two contexts, entered in this order:
+
+    * ``session.no_autoflush`` — so ``session.connection()`` does not
+      trigger a flush of Continuum's pending Transaction object before
+      our direct-SQL insert claims its tx_id;
+    * a SAVEPOINT on the session's connection — so a failed statement
+      is rolled back to the savepoint instead of aborting the enclosing
+      transaction. On PostgreSQL a failed statement aborts the whole
+      transaction; merely swallowing the Python exception would leave
+      every later statement raising ``InFailedSqlTransaction`` and fail
+      the user's save anyway. The rollback also removes a
+      ``version_transaction`` row whose shadow-row insert failed.
+
+    Failures inside a child handler are caught by
+    ``_baseline_children_for_parent`` itself and therefore do not roll
+    this savepoint back.
+
+    :param session: session whose flush triggered the baseline
+    :param obj: the versioned parent entity to baseline
+    :param version_table: Continuum shadow table of ``type(obj)``
+    """
+    try:
+        # NOTE(review): with pysqlite, SAVEPOINT semantics depend on
+        # the driver's transaction handling — confirm the SQLite
+        # behaviour matches the existing probe in ``shadow_row_count``.
+        with session.no_autoflush, session.connection().begin_nested():
+            tx_id = _insert_baseline_row(session, obj, version_table)
+            if tx_id is None:
+                # No live row for the entity: nothing to baseline.
+                return
+            _baseline_children_for_parent(session, obj, tx_id)
+            logger.debug(
+                "baseline_listener: inserted baseline tx_id=%s for %s id=%s",
+                tx_id,
+                type(obj).__name__,
+                getattr(obj, "id", None),
+            )
+    except Exception:  # pylint: disable=broad-except
+        logger.exception(
+            "baseline_listener: failed to insert baseline for %s id=%s",
+            type(obj).__name__,
+            getattr(obj, "id", None),
+        )
+
+
+def _insert_baseline_row(
+    session: Session, obj: Any, version_table: sa.Table
+) -> int | None:
+    """Write a synthetic baseline capturing the pre-edit DB state of *obj*.
+
+    Inserts one ``version_transaction`` row and one ``operation_type=0``
+    row in *version_table*. Both writes go through the session's
+    existing connection, so they share the database transaction of the
+    triggering flush.
+
+    :returns: the allocated ``transaction_id``, so the caller can
+        baseline child collections under the same tx (see
+        :func:`~superset.versioning.baseline.children._insert_child_baseline_rows`),
+        or ``None`` when the entity has no live row.
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import versioning_manager
+
+    live_row = read_row_outside_flush(session, type(obj).__table__, obj.id)
+    if live_row is None:
+        return None
+
+    connection = session.connection()
+
+    # The transaction's ``issued_at`` / ``user_id`` come from the
+    # entity's audit fields (``changed_on`` / ``changed_by_fk``, falling
+    # back to ``created_on`` / ``created_by_fk`` if the row was never
+    # edited). The baseline then reads in the version-history UI as
+    # "the state at the time of the last pre-versioning edit, by that
+    # user". Using ``now()`` and the current user would make the
+    # baseline look chronologically newer than subsequent edits and
+    # attribute historical content to whoever happened to trigger the
+    # first save under versioning.
+    issued_at = (
+        live_row.get("changed_on") or live_row.get("created_on") or sa.func.now()
+    )
+    author_id = live_row.get("changed_by_fk") or live_row.get("created_by_fk")
+
+    transaction_table = versioning_manager.transaction_cls.__table__
+    insert_stmt = transaction_table.insert().values(
+        issued_at=issued_at,
+        user_id=author_id,
+        remote_addr=None,
+    )
+    tx_id = connection.execute(insert_stmt).inserted_primary_key[0]
+
+    insert_baseline_shadow_row(connection, version_table, live_row, tx_id)
+    return tx_id
+
+
+def _baseline_children_for_parent(
+    session: Session, parent_obj: Any, tx_id: int
+) -> None:
+    """Baseline a parent's child collections under the parent's baseline tx.
+
+    Dispatches via the
+    :data:`~superset.versioning.baseline.children.CHILD_BASELINE_HANDLERS`
+    table, keyed by the parent's class name, to the per-entity handler.
+    Parents without a registered handler are ignored.
+
+    A handler failure is logged but does not block the parent baseline.
+    To make that hold on every backend the handler runs inside its own
+    SAVEPOINT: a failed statement is rolled back to the savepoint
+    (together with any child rows the handler had already written)
+    rather than aborting the enclosing transaction, which on PostgreSQL
+    would otherwise fail every later statement of the user's save.
+
+    :param session: session whose connection the handler writes through
+    :param parent_obj: the parent entity whose children are baselined
+    :param tx_id: transaction id of the parent's baseline row
+    """
+    parent_name = type(parent_obj).__name__
+    handler = CHILD_BASELINE_HANDLERS.get(parent_name)
+    if handler is None:
+        return
+    try:
+        # ``no_autoflush`` is entered first so that obtaining the
+        # connection for the SAVEPOINT cannot itself trigger a flush.
+        with session.no_autoflush, session.connection().begin_nested():
+            handler(session, parent_obj, tx_id)
+    except Exception:  # pylint: disable=broad-except
+        logger.exception(
+            "baseline_listener: failed to baseline children of %s id=%s",
+            parent_name,
+            getattr(parent_obj, "id", None),
+        )
diff --git a/superset/versioning/baseline/listener.py b/superset/versioning/baseline/listener.py
new file mode 100644
index 000000000000..8f36844222b1
--- /dev/null
+++ b/superset/versioning/baseline/listener.py
@@ -0,0 +1,119 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Public entry point: attach the ``before_flush`` baseline listener.
+
+:func:`register_baseline_listener` is called from
+:class:`superset.initialization.SupersetAppInitializer.init_versioning`
+after ``make_versioned()`` has run and all versioned model classes
+have been imported. It registers one ``before_flush`` listener on
+``db.session`` that:
+
+1. force-dirties versioned parents whose only changes are
+ child-collection edits (:mod:`.dirty`);
+2. collects the parents that need a baseline row
+ (:mod:`.collection`);
+3. for each parent with no prior shadow row, inserts the synthetic
+ baseline row + its child baseline rows (:mod:`.insertion` +
+ :mod:`.children`).
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from sqlalchemy import event
+from sqlalchemy.orm import Session
+from sqlalchemy_continuum import versioning_manager
+
+from superset.versioning.baseline.collection import (
+ collect_parents_to_baseline,
+ shadow_row_count,
+ version_table_for,
+ VERSIONED_MODELS,
+)
+from superset.versioning.baseline.dirty import force_parent_dirty_on_child_change
+from superset.versioning.baseline.insertion import insert_baseline_and_children
+
+logger = logging.getLogger(__name__)
+
+# Sentinel attribute set on the session target after first successful
+# registration — same pattern as
+# :mod:`superset.versioning.changes.listener`. Subsequent calls become
+# no-ops so test fixtures that instantiate multiple Superset apps per
+# process don't attach a second copy of the listener to the shared
+# ``db.session`` (every flush would otherwise run the baseline pass
+# twice).
+_REGISTERED_SENTINEL = "_versioning_baseline_listener_registered"
+
+
+def register_baseline_listener() -> None:
+    """Register the ``before_flush`` hook that writes baseline versions.
+
+    Must be called once ``VERSIONED_MODELS`` is populated and
+    ``make_versioned()`` has run. A sentinel attribute stored on
+    ``db.session`` turns every call after the first into a no-op.
+    """
+    # pylint: disable=import-outside-toplevel
+    from superset.extensions import db
+
+    already_registered = getattr(db.session, _REGISTERED_SENTINEL, False)
+    if already_registered:
+        return
+
+    def capture_baseline(session: Session, flush_context: Any, instances: Any) -> None:
+        # Nothing is versioned, so there is nothing to baseline.
+        if not VERSIONED_MODELS:
+            return
+        # Honour the unified capture master switch. This writer mints its
+        # own ``version_transaction`` row through direct SQL, so unless it
+        # checks the option itself a detached/kill-switched session would
+        # keep writing baselines. ``_remove_continuum_write_listeners``
+        # is what flips the option off.
+        if not versioning_manager.options["versioning"]:
+            return
+        try:
+            # Promote parents with child-only edits into ``session.dirty``
+            # before Continuum's before_flush inspects the dirty set.
+            force_parent_dirty_on_child_change(session)
+            for parent in collect_parents_to_baseline(session).values():
+                if type(parent) not in VERSIONED_MODELS:
+                    continue
+                shadow_table = version_table_for(parent)
+                if shadow_table is None:
+                    continue
+                # Only parents without any shadow row get a baseline.
+                if shadow_row_count(session, parent, shadow_table) == 0:
+                    insert_baseline_and_children(session, parent, shadow_table)
+        except Exception:  # pylint: disable=broad-except
+            # Fail open: a versioning problem (lazy-load error, registry
+            # gap, unexpected schema state) must not abort the user's
+            # save, so record it and let the flush continue.
+            logger.warning(
+                "versioning: baseline capture failed during before_flush; "
+                "the save proceeds without a baseline row for this flush.",
+                exc_info=True,
+            )
+
+    # ``insert=True`` prepends the hook so it runs BEFORE Continuum's own
+    # before_flush. Continuum adds its pending Transaction there; if that
+    # happened first it would take a lower auto-increment id than the
+    # direct-SQL baseline insert, ordering the baseline after the update
+    # in version_number order.
+    event.listen(db.session, "before_flush", capture_baseline, insert=True)
+
+    setattr(db.session, _REGISTERED_SENTINEL, True)
diff --git a/superset/versioning/baseline/shadow.py b/superset/versioning/baseline/shadow.py
new file mode 100644
index 000000000000..49d5980467ac
--- /dev/null
+++ b/superset/versioning/baseline/shadow.py
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Continuum-shaped shadow-row writer.
+
+Two pieces:
+
+* :data:`CONTINUUM_BOOKKEEPING_COLUMNS` — the set of column names
+ Continuum uses for per-row bookkeeping (``transaction_id`` /
+ ``end_transaction_id`` / ``operation_type``). Re-used outside this
+ package as a filter (the change-record listener strips these from
+ JSON record values).
+* :func:`insert_baseline_shadow_row` — copies a live row into a
+ shadow ``Table`` as a synthetic ``operation_type=0`` baseline at
+ the given transaction id. The other modules in this package use it
+ for every parent and child baseline insert.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import sqlalchemy as sa
+
+logger = logging.getLogger(__name__)
+
+# Continuum's per-shadow-row bookkeeping columns. Skipped when copying
+# content from a live row into a synthetic baseline shadow row; set
+# explicitly by the baseline writer so the row reads as a freshly-created
+# live row at the baseline transaction.
+CONTINUUM_BOOKKEEPING_COLUMNS: frozenset[str] = frozenset(
+ {"transaction_id", "end_transaction_id", "operation_type"}
+)
+
+
+def insert_baseline_shadow_row(
+    conn: Any,
+    version_table: sa.Table,
+    source_row: Any,
+    tx_id: int,
+) -> None:
+    """Insert a synthetic baseline shadow row built from *source_row*.
+
+    Every non-bookkeeping column of *version_table* whose name is present
+    in *source_row* is copied across; the Continuum bookkeeping columns
+    are then set to ``transaction_id=tx_id``, ``end_transaction_id=None``
+    and ``operation_type=0``. ``values()`` is keyed by Column objects
+    rather than names, which sidesteps the "Unconsumed column names"
+    error a name-keyed dict raises when a Column's ``.key`` and
+    ``.name`` differ (Continuum-generated tables occasionally do this).
+    """
+    insert_values: dict[Any, Any] = {}
+    missing: list[str] = []
+    for column in version_table.columns:
+        name = column.name
+        if name in CONTINUUM_BOOKKEEPING_COLUMNS:
+            continue
+        if name not in source_row:
+            missing.append(name)
+            continue
+        insert_values[column] = source_row[name]
+    if missing:
+        # The shadow table has content columns the live row lacks, i.e.
+        # the two schemas diverged by name. Those values end up NULL,
+        # a quiet gap in history fidelity, so report it loudly.
+        logger.warning(
+            "versioning: baseline shadow row for %s is missing source "
+            "values for column(s) %s; they will be stored NULL. This "
+            "indicates a name divergence between the live table and its "
+            "Continuum shadow table.",
+            version_table.name,
+            ", ".join(missing),
+        )
+    shadow_cols = version_table.c
+    insert_values[shadow_cols.transaction_id] = tx_id
+    insert_values[shadow_cols.end_transaction_id] = None
+    insert_values[shadow_cols.operation_type] = 0
+    conn.execute(version_table.insert().values(insert_values))
diff --git a/superset/versioning/changes/__init__.py b/superset/versioning/changes/__init__.py
new file mode 100644
index 000000000000..29c1592b3e84
--- /dev/null
+++ b/superset/versioning/changes/__init__.py
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Write-side change-record capture for ``version_changes``.
+
+The package is split into four submodules that descend from public
+entry point to leaf helpers:
+
+* :mod:`.listener` — public ``register_change_record_listener`` plus
+ the session-event machinery (``before_flush`` / ``after_flush`` /
+ ``after_commit`` / ``after_rollback``) that drives the capture.
+ Holds ``ACTION_KIND_KEY``, the buffer-key constants, and the per-tx
+ ``action_kind`` stamper.
+* :mod:`.state` — per-entity diff dispatch: pre-state read,
+ post-state serialisation, JSON-safety coercion (``jsonable``),
+ cached scalar-field discovery, and bulk-insert into the
+ ``version_changes`` table.
+* :mod:`.shadow_queries` — shadow-table reads that drive child-
+ collection diffs (dataset columns/metrics, dashboard slice
+ membership). Includes the validity-strategy ``shadow_rows_valid_at``
+ helper consumed externally by :mod:`superset.versioning.queries`.
+* :mod:`.table` — the SQLAlchemy ``Table`` definition for
+ ``version_changes`` plus the ``ENTITY_KIND_BY_CLASS_NAME`` mapping
+ consumed by the API + activity-view modules.
+
+The re-exports below preserve the prior ``from
+superset.versioning.changes import …`` call shape; no caller outside
+this package needs to change.
+"""
+
+from __future__ import annotations
+
+from superset.versioning.changes.listener import (
+ ACTION_KIND_CLONE,
+ ACTION_KIND_IMPORT,
+ ACTION_KIND_KEY,
+ ACTION_KIND_RESTORE,
+ ACTION_KINDS,
+ ACTION_META_KEY,
+ build_action_headline,
+ OPERATION_ANNOUNCE,
+ register_change_record_listener,
+)
+from superset.versioning.changes.shadow_queries import shadow_rows_valid_at
+from superset.versioning.changes.table import (
+ ENTITY_KIND_BY_CLASS_NAME,
+ version_changes_table,
+)
+
+__all__ = [
+ "ACTION_KIND_CLONE",
+ "ACTION_KIND_IMPORT",
+ "ACTION_KIND_KEY",
+ "ACTION_KIND_RESTORE",
+ "ACTION_KINDS",
+ "ACTION_META_KEY",
+ "build_action_headline",
+ "OPERATION_ANNOUNCE",
+ "ENTITY_KIND_BY_CLASS_NAME",
+ "register_change_record_listener",
+ "shadow_rows_valid_at",
+ "version_changes_table",
+]
diff --git a/superset/versioning/changes/listener.py b/superset/versioning/changes/listener.py
new file mode 100644
index 000000000000..fbd944eaaf70
--- /dev/null
+++ b/superset/versioning/changes/listener.py
@@ -0,0 +1,507 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Session-level listeners that drive ``version_changes`` writes.
+
+Two flush events cooperate, plus two post-commit / post-rollback
+cleanups:
+
+- ``before_flush``: for each versioned entity in ``session.dirty``,
+ reads the pre-save scalar state from the DB via raw SQL inside
+ ``session.no_autoflush`` (same idiom as the baseline listener, not
+ Continuum's internal ``units_of_work`` which is a private API), reads
+ the post-save state from the in-memory ORM object, calls the diff
+ engine, and buffers the resulting :class:`ChangeRecord` list on
+ ``session.info``. This must run before the flush because after the
+ flush the DB already reflects the post-state; we can't recover the
+ pre-state from it.
+
+- ``after_flush``: drains the buffer, resolves the current Continuum
+ transaction id via ``versioning_manager.units_of_work``, and bulk-
+ inserts one ``version_changes`` row per record with a monotonic
+ ``sequence`` number. Records accumulated across multiple before_flush
+ calls within one transaction share the same ``transaction_id`` and
+ contiguous sequence numbers.
+
+- ``after_commit`` / ``after_rollback``: clean up session-scoped
+ state (processed-tx set, ``action_kind`` / ``action_meta`` keys, and
+ the pending-records buffer) so a long-lived session doesn't carry any
+ of it into the next transaction.
+
+Scope:
+ - Slice, Dashboard, SqlaTable **scalar fields** (via the cached
+ field set from :mod:`superset.versioning.changes.state` — new
+ columns are picked up automatically).
+ - ``Slice.params`` kind-classification (filter / metric / time_range
+ / color_palette / dimension, plus generic ``field`` fallback).
+
+Child-collection diffs (dataset ``TableColumn`` / ``SqlMetric``,
+dashboard ``dashboard_slices``) read the pre- and post-state from
+Continuum shadow tables via the helpers in
+:mod:`superset.versioning.changes.shadow_queries`, executed in
+``after_flush`` once Continuum has written its tx-N rows.
+
+``session.new`` entities are not processed in this listener:
+operation_type=0 transactions (baseline capture and first-save INSERTs)
+produce zero change records per spec §Clarifications 2026-04-24.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import sqlalchemy as sa
+from sqlalchemy import event
+from sqlalchemy.exc import OperationalError, ProgrammingError
+from sqlalchemy.orm import Session
+
+from superset.versioning.changes.shadow_queries import (
+ _dashboard_child_records_for_tx_from_shadows,
+ _dataset_child_records_for_tx_from_shadows,
+)
+from superset.versioning.changes.state import (
+ bulk_insert_records,
+ compute_records_for_entity,
+)
+from superset.versioning.changes.table import ENTITY_KIND_BY_CLASS_NAME
+from superset.versioning.diff import (
+ ChangeRecord,
+ fold_dashboard_layout_with_chart_changes,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# Key under which the pending-records buffer is stored on ``session.info``.
+# Using ``session.info`` (SQLAlchemy's user-data dict) avoids the need
+# for a module-level WeakKeyDictionary and keeps buffers naturally scoped
+# to the session's lifetime.
+_BUFFER_KEY = "_version_changes_pending"
+
+# Key for the set of Continuum transaction ids whose change records
+# have already been written in this session. ``after_flush`` can fire
+# more than once for a single transaction (e.g. autoflush triggered by
+# a mid-commit query), and our child-diff path reads shadow tables
+# that don't care about the buffer state — without this marker we'd
+# re-insert the same child records on the second flush and hit the
+# UNIQUE(transaction_id, entity_kind, entity_id, sequence) constraint.
+_PROCESSED_TXS_KEY = "_version_changes_processed_txs"
+
+# Key on ``session.info`` that commands set to declare the high-level
+# action that produced the current transaction. Read once per flush by
+# the change-record listener and stamped onto the
+# ``version_transaction.action_kind`` column via ``sa.update()``.
+# ``None`` (the default) means "ordinary save".
+#
+# Commands set this immediately before ``db.session.commit()``:
+#
+# db.session.info[ACTION_KIND_KEY] = ACTION_KIND_RESTORE
+# db.session.commit()
+#
+# The listener pops the key after stamping, and ``after_commit`` /
+# ``after_rollback`` cleanup pop it again as a safety net, so a
+# long-lived session can't accidentally carry the value into the next
+# transaction.
+ACTION_KIND_KEY = "_versioning_action_kind"
+
+# Recognised ``action_kind`` values — the single source of truth shared
+# by the four command-side stampers (restore / import / clone) and the
+# listener that writes them to ``version_transaction.action_kind``.
+# Schemas / response decorators that need an allowlist read from
+# ``ACTION_KINDS`` so a future addition (e.g. ``"thumbnail_warm"``) only
+# has to update this one constant. ``None`` is *not* a member — it
+# represents the default "ordinary save" path that never sets the key.
+ACTION_KIND_RESTORE = "restore"
+ACTION_KIND_IMPORT = "import"
+ACTION_KIND_CLONE = "clone"
+ACTION_KINDS: frozenset[str] = frozenset(
+ {ACTION_KIND_RESTORE, ACTION_KIND_IMPORT, ACTION_KIND_CLONE}
+)
+
+# Key on ``session.info`` carrying a synthetic "headline" change record
+# for the current transaction — the ``__meta__`` record convention. Set
+# by commands alongside ``ACTION_KIND_KEY`` when the avenue has a payload
+# the field-level diff can't express; the canonical case is restore,
+# whose transaction otherwise carries no pointer to WHICH version was
+# restored (surfaced by the version-history UI, PR #40988: "Restored to
+# X from [date]" can't be rendered from API data alone).
+#
+# Build the value with :func:`build_action_headline` — the single owner
+# of the record shape — rather than hand-rolling the dict; renderers
+# dispatch on ``kind == "__meta__"`` plus the transaction's
+# ``action_kind`` (the verb deliberately does NOT ride in ``path``,
+# which stays pure navigation per the ChangeRecord contract).
+#
+# The listener pops the key on the first record-bearing firing for the
+# transaction and PREPENDS the record to the entity's buffer (sequence 0
+# — headline first). Same lifecycle as ``ACTION_KIND_KEY``: popped on
+# use, and the ``after_commit`` / ``after_rollback`` cleanups pop it as
+# a safety net.
+ACTION_META_KEY = "_versioning_action_meta"
+
+# ``operation`` value for synthetic headline records: a headline
+# announces an action, it does not mutate a field, so the field-verb
+# vocabulary (add / remove / move / edit) would be dishonest here.
+OPERATION_ANNOUNCE = "announce"
+
+
+def build_action_headline(
+    entity_kind: str,
+    entity_id: int,
+    to_value: dict[str, Any],
+) -> dict[str, Any]:
+    """Assemble the payload commands store under ``ACTION_META_KEY``.
+
+    Returns a dict with the owning entity's *entity_kind* (a table-kind,
+    see ``ENTITY_KIND_BY_CLASS_NAME``) and *entity_id*, plus a ``record``
+    entry: a ``__meta__`` :class:`ChangeRecord` whose operation is
+    ``OPERATION_ANNOUNCE`` and whose ``to_value`` is the action payload
+    (for restore: ``{"version_uuid": ..., "version_number": ...}``).
+    The verb lives on the transaction's ``action_kind`` column, not here.
+    """
+    headline = ChangeRecord(
+        kind="__meta__",
+        operation=OPERATION_ANNOUNCE,
+        path=["__meta__"],
+        from_value=None,
+        to_value=to_value,
+    )
+    return {
+        "entity_kind": entity_kind,
+        "entity_id": entity_id,
+        "record": headline,
+    }
+
+
+# Sentinel attribute set on the session target after first successful
+# registration. Subsequent calls become no-ops. Storing the flag on the
+# target itself (rather than module-level state) keeps the guard
+# naturally scoped — a fresh session proxy gets a fresh registration —
+# and avoids the TOCTOU race between ``event.contains`` and
+# ``event.listen`` that a module-level ref would have under concurrent
+# init. In test fixtures that instantiate multiple Superset apps per
+# process, the shared ``db.session`` carries the sentinel and re-entry
+# is correctly deduped.
+_REGISTERED_SENTINEL = "_versioning_change_listener_registered"
+
+
+def _process_dirty_entity_into_buffer(
+    session: Session,
+    obj: Any,
+    buffer: dict[tuple[str, int], list[ChangeRecord]],
+) -> None:
+    """Diff one dirty entity and queue its scalar records in *buffer*."""
+    class_name = type(obj).__name__
+    kind = ENTITY_KIND_BY_CLASS_NAME.get(class_name)
+    if kind is None:
+        return
+    pk = getattr(obj, "id", None)
+    if pk is None:
+        return
+    try:
+        changes = compute_records_for_entity(session, obj)
+    except Exception:  # pylint: disable=broad-except
+        # Fail open: skip this entity's records rather than break the flush.
+        logger.exception(
+            "version_changes: diff failed for %s id=%s", class_name, pk
+        )
+    else:
+        if changes:
+            buffer.setdefault((kind, pk), []).extend(changes)
+
+
+def _append_child_records_to_buffer(
+    session: Session,
+    tx_id: int,
+    buffer: dict[tuple[str, int], list[ChangeRecord]],
+) -> None:
+    """Merge child-collection records for *tx_id* into *buffer*.
+
+    Called from ``after_flush``, once the Continuum shadow tables
+    (``table_columns_version`` / ``sql_metrics_version`` /
+    ``dashboard_slices_version`` / ``slices_version``) hold the current
+    transaction's rows. Any failure is logged and swallowed.
+    """
+    try:
+        by_dataset = _dataset_child_records_for_tx_from_shadows(session, tx_id)
+        for dataset_id, records in by_dataset.items():
+            buffer.setdefault(("dataset", dataset_id), []).extend(records)
+        by_dashboard = _dashboard_child_records_for_tx_from_shadows(session, tx_id)
+        for dashboard_id, records in by_dashboard.items():
+            buffer.setdefault(("dashboard", dashboard_id), []).extend(records)
+
+        # Post-merge fold (``diff.fold_dashboard_layout_with_chart_changes``):
+        # a dashboard save that adds/removes charts also yields redundant
+        # ``position_json.*`` records mirroring the membership change.
+        # Entities left with no records after folding leave the buffer.
+        dashboard_keys = [key for key in buffer if key[0] == "dashboard"]
+        for key in dashboard_keys:
+            folded = fold_dashboard_layout_with_chart_changes(buffer[key])
+            if folded:
+                buffer[key] = folded
+            else:
+                del buffer[key]
+    except Exception:  # pylint: disable=broad-except
+        logger.exception("version_changes: child-diff failed for tx %s", tx_id)
+
+
+def _current_transaction_id(session: Session) -> int | None:
+    """Look up the id of the Continuum transaction tied to *session*.
+
+    Yields ``None`` if no unit of work is registered for the session's
+    connection, or if that unit of work has no current transaction.
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import versioning_manager
+
+    unit = versioning_manager.units_of_work.get(session.connection())
+    current = None if unit is None else unit.current_transaction
+    return None if current is None else current.id
+
+
+def _inject_action_meta_record(
+    session: Session,
+    buffer: dict[tuple[str, int], list[ChangeRecord]],
+) -> None:
+    """Move the pending ``__meta__`` headline from ``session.info`` into
+    *buffer*, at the front of its owning entity's record list.
+
+    With an empty *buffer* this returns WITHOUT popping
+    ``ACTION_META_KEY``. ``flush_change_records`` short-circuits on an
+    empty buffer so that a multi-flush transaction can deliver its
+    records on a later firing; a headline-only buffer would defeat that
+    (the headline alone would be persisted, the tx marked processed, and
+    the later flush's real records silently dropped). Leaving the key
+    in place parks the headline until a record-bearing firing.
+
+    The record is inserted at index 0 so it receives ``sequence`` 0 and
+    renders first. A payload missing a key, or of the wrong type, is
+    logged and dropped: a headline is descriptive enrichment and never
+    worth failing the user's save over.
+    """
+    payload = None if not buffer else session.info.pop(ACTION_META_KEY, None)
+    if payload is None:
+        # Either nothing to attach to yet, or no command set a headline.
+        return
+    try:
+        owner = (payload["entity_kind"], payload["entity_id"])
+        headline = payload["record"]
+        buffer.setdefault(owner, []).insert(0, headline)
+    except (KeyError, TypeError):  # pragma: no cover - defensive
+        logger.exception("version_changes: malformed ACTION_META_KEY payload")
+
+
+def _stamp_action_kind_on_transaction(session: Session, tx_id: int) -> None:
+    """Pop the per-tx action_kind from ``session.info`` and stamp it
+    onto the ``version_transaction`` row identified by *tx_id*.
+
+    No-op when no command set the action_kind (ordinary saves). The
+    value is popped BEFORE the write is attempted, so a long-lived
+    session can't carry it into the next transaction even when the
+    stamp fails. The UPDATE is built with ``sa.update()`` against
+    Continuum's transaction Table, so identifier quoting and parameter
+    binding are left to the dialect instead of hand-written SQL.
+
+    A failed stamp is logged and swallowed: action_kind is descriptive
+    enrichment, not a correctness invariant. The UPDATE runs under a
+    SAVEPOINT (``begin_nested`` on the connection), as the change-record
+    insert does: on PostgreSQL a failed statement aborts the enclosing
+    transaction, so swallowing the error without a SAVEPOINT would
+    still make the user's COMMIT fail.
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import versioning_manager
+
+    action_kind = session.info.pop(ACTION_KIND_KEY, None)
+    if action_kind is None:
+        return
+    tx_tbl = versioning_manager.transaction_cls.__table__
+    try:
+        conn = session.connection()
+        # SAVEPOINT: a failed UPDATE must not poison the outer transaction.
+        with conn.begin_nested():
+            stamp = sa.update(tx_tbl).where(tx_tbl.c.id == tx_id)
+            conn.execute(stamp.values(action_kind=action_kind))
+    except Exception:  # pylint: disable=broad-except
+        logger.exception(
+            "version_changes: failed to stamp action_kind=%s on tx %s",
+            action_kind,
+            tx_id,
+        )
+
+
+def _persist_buffered_records(
+    session: Session,
+    tx_id: int,
+    buffer: dict[tuple[str, int], list[ChangeRecord]],
+) -> None:
+    """Bulk-insert *buffer*'s records under *tx_id*, failing open.
+
+    This function does not reset *buffer* itself; the caller does.
+    ``OperationalError`` / ``ProgrammingError`` (the pre-migration race:
+    ``version_changes`` missing, raised as the former on SQLite/MySQL
+    and the latter on PostgreSQL) are logged at DEBUG; anything else is
+    logged with a traceback. Nothing is re-raised.
+
+    The insert runs under a SAVEPOINT (``begin_nested`` on the
+    connection): on PostgreSQL a failed statement aborts the enclosing
+    transaction, so without it the swallowed exception would still
+    make the COMMIT that follows raise ``InFailedSqlTransaction``.
+    """
+    try:
+        with session.connection().begin_nested():
+            bulk_insert_records(session, tx_id, buffer)
+    except (OperationalError, ProgrammingError):
+        # Usually the pre-migration race; DEBUG keeps other causes traceable.
+        logger.debug(
+            "version_changes: insert skipped for tx %s", tx_id, exc_info=True
+        )
+    except Exception:  # pylint: disable=broad-except
+        logger.exception(
+            "version_changes: bulk insert failed for tx %s (%d entities)",
+            tx_id,
+            len(buffer),
+        )
+
+
+def register_change_record_listener() -> None:  # noqa: C901
+    """Attach the change-record session listeners (idempotent).
+
+    Registers ``before_flush`` (diff + buffer), ``after_flush``
+    (persist) and the ``after_commit`` / ``after_rollback`` cleanups on
+    ``db.session``. Called from ``SupersetAppInitializer.init_versioning``;
+    must run after Continuum's ``make_versioned()`` so that
+    ``versioning_manager`` has installed its own before_flush hook.
+    """
+    # pylint: disable=import-outside-toplevel
+    from superset.connectors.sqla.models import SqlaTable
+    from superset.extensions import db
+    from superset.models.dashboard import Dashboard
+    from superset.models.slice import Slice
+
+    if getattr(db.session, _REGISTERED_SENTINEL, False):
+        return
+
+    versioned_classes: tuple[type, ...] = (Dashboard, Slice, SqlaTable)
+
+    def compute_change_records(
+        session: Session, _flush_context: Any, _instances: Any
+    ) -> None:
+        # session.info persists across before_flush/after_flush within
+        # a single transaction. The buffer is keyed on
+        # ``(entity_kind, entity_id)`` so scalar records captured here
+        # and child records captured in after_flush merge
+        # under the same entity without duplication.
+        buffer: dict[tuple[str, int], list[ChangeRecord]] = session.info.setdefault(
+            _BUFFER_KEY, {}
+        )
+        for obj in list(session.dirty):
+            if isinstance(obj, versioned_classes):
+                _process_dirty_entity_into_buffer(session, obj, buffer)
+
+    def flush_change_records(session: Session, _flush_context: Any) -> None:
+        buffer: dict[tuple[str, int], list[ChangeRecord]] = session.info.setdefault(
+            _BUFFER_KEY, {}
+        )
+
+        tx_id = _current_transaction_id(session)
+        if tx_id is None:
+            session.info[_BUFFER_KEY] = {}
+            return
+
+        # Skip if we've already written records for this tx (after_flush
+        # can fire more than once per commit — e.g. autoflush from a
+        # mid-commit query). Without this guard the child-diff path would
+        # re-read the same shadow rows and re-emit the same records,
+        # tripping the UNIQUE(transaction_id, entity_kind, entity_id,
+        # sequence) constraint on insert.
+        processed: set[int] = session.info.setdefault(_PROCESSED_TXS_KEY, set())
+        if tx_id in processed:
+            # Drop anything buffered after the tx was persisted: records
+            # left here would otherwise survive on the long-lived scoped
+            # session and be inserted under the NEXT transaction's id.
+            session.info[_BUFFER_KEY] = {}
+            return
+
+        # Stamp action_kind eagerly, before the buffer-empty short-
+        # circuit. Restores / imports / clones may flush across multiple
+        # cycles; the FIRST firing for this tx is the one with the
+        # value still on ``session.info``. The helper pops the key before
+        # it attempts the write, so later firings see ``None`` and no-op.
+        _stamp_action_kind_on_transaction(session, tx_id)
+
+        _append_child_records_to_buffer(session, tx_id, buffer)
+
+        # After the child append and before the emptiness check: the
+        # headline joins whichever firing carries the transaction's real
+        # records (scalar or child), and its peek-don't-pop guard parks
+        # it across record-less firings instead of defeating the
+        # multi-flush short-circuit below.
+        _inject_action_meta_record(session, buffer)
+
+        if not buffer:
+            # Don't mark tx as processed when nothing was inserted. A
+            # later after_flush firing for the same tx may carry the
+            # records — e.g. when an entity's edit lands across two
+            # flushes (a child-only flush followed by a parent-dirty
+            # flush): the parent shadow only lands in the parent-dirty
+            # flush, so the child-diff path can't find a prior tx to
+            # compare against until then.
+            session.info[_BUFFER_KEY] = {}
+            return
+
+        try:
+            _persist_buffered_records(session, tx_id, buffer)
+        finally:
+            session.info[_BUFFER_KEY] = {}
+            processed.add(tx_id)
+
+    def reset_processed_after_commit(session: Session) -> None:
+        # ``_PROCESSED_TXS_KEY`` accumulates Continuum tx ids whose change
+        # records have already been written, to dedup against multiple
+        # ``after_flush`` firings within one transaction. After commit
+        # the tx is closed and its id will never recur on this session
+        # — drop the set so a long-lived session (Celery worker, CLI)
+        # doesn't grow it without bound.
+        session.info.pop(_PROCESSED_TXS_KEY, None)
+        # If a command set the action_kind but no flush fired (e.g. a
+        # save that touched nothing versioned), the value would
+        # otherwise leak into the next transaction. Drop it here as a
+        # belt-and-suspenders cleanup; the
+        # ``_stamp_action_kind_on_transaction`` helper already pops on
+        # the normal path.
+        session.info.pop(ACTION_KIND_KEY, None)
+        session.info.pop(ACTION_META_KEY, None)
+        session.info.pop(_BUFFER_KEY, None)
+
+    def reset_action_kind_after_rollback(session: Session) -> None:
+        # A rollback discards the whole transaction, so no per-tx state
+        # may outlive it. A command can set ``ACTION_KIND_KEY`` and then
+        # fail before any flush pops it; the next save on this session
+        # would otherwise be mislabelled "restore" / "import" / "clone".
+        # The processed-tx set goes too: the rolled-back tx persisted
+        # nothing and its id may be reissued (e.g. SQLite rowid reuse).
+        session.info.pop(_PROCESSED_TXS_KEY, None)
+        session.info.pop(ACTION_KIND_KEY, None)
+        session.info.pop(ACTION_META_KEY, None)
+        session.info.pop(_BUFFER_KEY, None)
+
+    event.listen(db.session, "before_flush", compute_change_records)
+    event.listen(db.session, "after_flush", flush_change_records)
+    event.listen(db.session, "after_commit", reset_processed_after_commit)
+    event.listen(db.session, "after_rollback", reset_action_kind_after_rollback)
+    setattr(db.session, _REGISTERED_SENTINEL, True)
diff --git a/superset/versioning/changes/shadow_queries.py b/superset/versioning/changes/shadow_queries.py
new file mode 100644
index 000000000000..1f6a290007f1
--- /dev/null
+++ b/superset/versioning/changes/shadow_queries.py
@@ -0,0 +1,323 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Shadow-table queries that drive child-collection diffs.
+
+Reads Continuum shadow tables (``table_columns_version`` /
+``sql_metrics_version`` / ``dashboard_slices_version`` /
+``slices_version``) under the validity-strategy semantics to compute
+the pre/post state of child collections at a given transaction. Used
+by the change-record listener's ``after_flush`` path once Continuum
+has written the current transaction's shadow rows.
+
+**Inline imports.** Continuum's ``version_class`` and the Superset
+model classes are imported inside each helper because this package is
+loaded from ``init_versioning()`` before all SQLAlchemy mappers are
+configured. The deferred imports keep the module-load graph free of
+mapper-resolution side effects.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import sqlalchemy as sa
+from sqlalchemy.orm import Session
+
+from superset.versioning.baseline import CONTINUUM_BOOKKEEPING_COLUMNS
+from superset.versioning.changes.state import jsonable
+from superset.versioning.diff import (
+ ChangeRecord,
+ diff_dashboard_slices,
+ diff_dataset_columns,
+ diff_dataset_metrics,
+)
+
+
+def shadow_rows_valid_at(
+    session: Session,
+    shadow_table: sa.Table,
+    fk_col_name: str,
+    fk_value: int,
+    tx: int,
+) -> list[dict[str, Any]]:
+    """Fetch the rows of *shadow_table* that were live at transaction *tx*.
+
+    Only rows whose ``fk_col_name`` column equals *fk_value* are
+    considered. Under Continuum's validity strategy a shadow row is
+    live at *tx* when all of the following hold:
+
+    * ``transaction_id <= tx``;
+    * ``end_transaction_id`` is NULL or greater than *tx*;
+    * ``operation_type`` is not 2 (i.e. it isn't a DELETE shadow).
+
+    Continuum bookkeeping columns are dropped and every value is passed
+    through :func:`jsonable`, so each returned dict mirrors the live
+    row's column set and can be handed straight to the natural-key diff
+    helpers (``diff_dataset_columns`` etc.).
+    """
+    cols = shadow_table.c
+    fk_col = getattr(cols, fk_col_name)
+    not_yet_ended = sa.or_(
+        cols.end_transaction_id.is_(None),
+        cols.end_transaction_id > tx,
+    )
+    stmt = sa.select(shadow_table).where(
+        fk_col == fk_value,
+        cols.transaction_id <= tx,
+        not_yet_ended,
+        cols.operation_type != 2,
+    )
+    # Raw shadow rows can carry ``UUID``, ``datetime``, ``bytes`` etc.
+    # that don't survive the ``version_changes.from_value/to_value``
+    # JSON column write, hence the per-value ``jsonable`` coercion.
+    fetched = session.connection().execute(stmt).mappings().all()
+    return [
+        {
+            name: jsonable(value)
+            for name, value in dict(mapping).items()
+            if name not in CONTINUUM_BOOKKEEPING_COLUMNS
+        }
+        for mapping in fetched
+    ]
+
+
+def _affected_dataset_ids_at_tx(session: Session, tx: int) -> set[int]:
+    """Return the ids of all datasets that have a shadow row at *tx*.
+
+    Covers the dataset's own shadow plus its column / metric shadows;
+    child shadows with a NULL ``table_id`` are ignored."""
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import version_class
+
+    from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn
+
+    parent = version_class(SqlaTable).__table__
+    direct = sa.select(parent.c.id).where(parent.c.transaction_id == tx)
+    touched: set[int] = {hit[0] for hit in session.connection().execute(direct)}
+    for model in (TableColumn, SqlMetric):
+        shadow = version_class(model).__table__
+        via_child = sa.select(shadow.c.table_id).where(shadow.c.transaction_id == tx)
+        touched.update(
+            hit[0]
+            for hit in session.connection().execute(via_child)
+            if hit[0] is not None
+        )
+    return touched
+
+
+def _dataset_child_records_for_tx_from_shadows(
+    session: Session, transaction_id: int
+) -> dict[int, list[ChangeRecord]]:
+    """Compute column + metric diff records for each dataset touched at
+    *transaction_id*, reading from Continuum shadow tables.
+
+    For each dataset:
+      * Post-state = rows valid at ``transaction_id`` in
+        ``table_columns_version`` / ``sql_metrics_version``.
+      * Pre-state = rows valid at the dataset's latest earlier child
+        transaction: the newest column or metric shadow below
+        ``transaction_id``.
+
+    No child row of this dataset changes between that transaction and
+    ``transaction_id``, so under Continuum's validity-strategy semantics
+    the pre-state equals "valid at ``transaction_id - 1``": the state
+    immediately before this transaction's effects. The anchor has to be
+    the maximum across BOTH shadow tables — anchoring on the column
+    shadow alone would replay an intervening metric-only edit as if it
+    had happened in this transaction.
+
+    First-edit case: when neither shadow table has an earlier row for
+    the dataset, pre-state is empty. We skip rather than emit "Added X"
+    for every column — same "baseline = zero records" semantics as the
+    snapshot path.
+
+    Returns a mapping of dataset id to its records; datasets with no
+    child changes are left out.
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import version_class
+
+    from superset.connectors.sqla.models import SqlMetric, TableColumn
+
+    cols_tbl = version_class(TableColumn).__table__
+    metrics_tbl = version_class(SqlMetric).__table__
+
+    def latest_tx_before(shadow_tbl: sa.Table, dataset_id: int) -> int | None:
+        # Newest transaction below ``transaction_id`` that wrote a row
+        # for this dataset into *shadow_tbl*; ``None`` when there is none.
+        return (
+            session.connection()
+            .execute(
+                sa.select(sa.func.max(shadow_tbl.c.transaction_id)).where(
+                    shadow_tbl.c.table_id == dataset_id,
+                    shadow_tbl.c.transaction_id < transaction_id,
+                )
+            )
+            .scalar()
+        )
+
+    result: dict[int, list[ChangeRecord]] = {}
+    for dataset_id in _affected_dataset_ids_at_tx(session, transaction_id):
+        prior_cols = latest_tx_before(cols_tbl, dataset_id)
+        prior_metrics = latest_tx_before(metrics_tbl, dataset_id)
+        earlier = [t for t in (prior_cols, prior_metrics) if t is not None]
+        if not earlier:
+            # Skip the very first transaction for this dataset (no pre-state).
+            continue
+        prior_tx = max(earlier)
+
+        post_cols = shadow_rows_valid_at(
+            session, cols_tbl, "table_id", dataset_id, transaction_id
+        )
+        pre_cols = shadow_rows_valid_at(
+            session, cols_tbl, "table_id", dataset_id, prior_tx
+        )
+        post_metrics = shadow_rows_valid_at(
+            session, metrics_tbl, "table_id", dataset_id, transaction_id
+        )
+        pre_metrics = shadow_rows_valid_at(
+            session, metrics_tbl, "table_id", dataset_id, prior_tx
+        )
+
+        records: list[ChangeRecord] = []
+        records.extend(diff_dataset_columns(pre_cols, post_cols))
+        records.extend(diff_dataset_metrics(pre_metrics, post_metrics))
+        if records:
+            result[dataset_id] = records
+    return result
+
+
+def _affected_dashboard_ids_at_tx(session: Session, tx: int) -> set[int]:
+    """Return the ids of all dashboards that have a shadow row at *tx*.
+
+    Covers the dashboard's own shadow plus slice-membership shadows in
+    ``dashboard_slices_version`` (NULL ``dashboard_id`` values ignored)."""
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import version_class
+
+    from superset.models.dashboard import Dashboard
+
+    parent = version_class(Dashboard).__table__
+    direct = sa.select(parent.c.id).where(parent.c.transaction_id == tx)
+    touched: set[int] = {hit[0] for hit in session.connection().execute(direct)}
+    # ``dashboard_slices_version`` is the M2M shadow Continuum generates
+    # on its own; there is no model class for it, so ``version_class``
+    # can't resolve it and we fetch it from the shared metadata instead.
+    membership = parent.metadata.tables.get("dashboard_slices_version")
+    if membership is None:
+        return touched
+    via_m2m = sa.select(membership.c.dashboard_id).where(
+        membership.c.transaction_id == tx
+    )
+    for hit in session.connection().execute(via_m2m):
+        if hit[0] is not None:
+            touched.add(hit[0])
+    return touched
+
+
+def _dashboard_slice_uuids_at_tx(
+    session: Session, dashboard_id: int, tx: int
+) -> list[str]:
+    """List the UUIDs of the slices attached to *dashboard_id* as of *tx*.
+
+    The query joins ``dashboard_slices_version`` (M2M membership) with
+    ``slices_version`` (slice content) and requires both sides to be
+    valid at *tx*. The double join is deliberate and mirrors the query
+    Continuum's M2M ``Reverter`` uses: a slice the M2M references but
+    that has no slice-version row at this tx is treated as "not yet
+    versioned" and left out.
+
+    UUIDs come back as strings so the result can be fed to
+    :func:`diff_dashboard_slices`, which keys on uuid. An empty list is
+    returned when the M2M shadow table isn't in the metadata.
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import version_class
+
+    from superset.models.slice import Slice
+
+    slices_tbl = version_class(Slice).__table__
+    m2m_tbl = slices_tbl.metadata.tables.get("dashboard_slices_version")
+    if m2m_tbl is None:
+        return []
+
+    def valid_at_tx(tbl: sa.Table) -> list[Any]:
+        # Validity-strategy predicate: started by tx, not ended by tx,
+        # and not a DELETE shadow (operation_type 2).
+        return [
+            tbl.c.transaction_id <= tx,
+            sa.or_(
+                tbl.c.end_transaction_id.is_(None),
+                tbl.c.end_transaction_id > tx,
+            ),
+            tbl.c.operation_type != 2,
+        ]
+
+    # Implicit join on slice id; membership and slice content must both
+    # be valid at tx.
+    stmt = sa.select(slices_tbl.c.uuid).where(
+        slices_tbl.c.id == m2m_tbl.c.slice_id,
+        m2m_tbl.c.dashboard_id == dashboard_id,
+        *valid_at_tx(m2m_tbl),
+        *valid_at_tx(slices_tbl),
+    )
+
+    found = session.connection().execute(stmt).all()
+    return [str(hit[0]) for hit in found if hit[0] is not None]
+
+
+def _dashboard_child_records_for_tx_from_shadows(
+    session: Session, transaction_id: int
+) -> dict[int, list[ChangeRecord]]:
+    """Build slice-membership diff records per dashboard touched at
+    *transaction_id*, using Continuum shadow tables as the data source.
+
+    Pre/post handling follows
+    :func:`_dataset_child_records_for_tx_from_shadows`: dashboards with
+    no earlier ``dashboard_slices_version`` row (or when that table is
+    absent from the metadata) are skipped, and only dashboards with at
+    least one record appear in the returned mapping.
+    """
+    # pylint: disable=import-outside-toplevel
+    from sqlalchemy_continuum import version_class
+
+    from superset.models.dashboard import Dashboard
+
+    shadow_metadata = version_class(Dashboard).__table__.metadata
+    membership = shadow_metadata.tables.get("dashboard_slices_version")
+
+    per_dashboard: dict[int, list[ChangeRecord]] = {}
+    for dashboard_id in _affected_dashboard_ids_at_tx(session, transaction_id):
+        # Without the M2M shadow table there is no earlier membership to
+        # diff against, so every dashboard is skipped.
+        if membership is None:
+            continue
+        earlier_tx_stmt = sa.select(
+            sa.func.max(membership.c.transaction_id)
+        ).where(
+            membership.c.dashboard_id == dashboard_id,
+            membership.c.transaction_id < transaction_id,
+        )
+        prior_tx = session.connection().execute(earlier_tx_stmt).scalar()
+        if prior_tx is None:
+            continue
+
+        after = _dashboard_slice_uuids_at_tx(session, dashboard_id, transaction_id)
+        before = _dashboard_slice_uuids_at_tx(session, dashboard_id, prior_tx)
+        if changes := diff_dashboard_slices(before, after):
+            per_dashboard[dashboard_id] = changes
+    return per_dashboard
diff --git a/superset/versioning/changes/state.py b/superset/versioning/changes/state.py
new file mode 100644
index 000000000000..5d47871e1eed
--- /dev/null
+++ b/superset/versioning/changes/state.py
@@ -0,0 +1,237 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Pre/post-state reading and the per-entity diff dispatch.
+
+Three concerns live here:
+
+1. **JSON-safety coercion** — raw column values (``datetime``, ``UUID``,
+ ``bytes``, ``Decimal``) get converted to strings before they land in
+ the ``version_changes.from_value`` / ``to_value`` JSON columns.
+2. **State capture** — :func:`_orm_to_post_state` serialises the
+ in-memory ORM object; :func:`_read_pre_state` reads the corresponding
+ pre-flush row directly from the DB inside ``session.no_autoflush``.
+3. **Diff dispatch** — :func:`compute_records_for_entity` routes to the
+ right :mod:`superset.versioning.diff` helper based on the model
+ class name (string dispatch keeps this module free of hard imports
+ on the three entity classes, which avoids import-order coupling at
+ app-init time).
+
+Bulk insert of the computed records into the ``version_changes`` table
+lives here too — it's the tail of the per-entity compute pipeline.
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import date, datetime
+from decimal import Decimal
+from typing import Any
+from uuid import UUID
+
+import sqlalchemy as sa
+from flask_appbuilder import Model
+from sqlalchemy.orm import Session
+
+from superset.versioning.changes.table import version_changes_table
+from superset.versioning.diff import (
+ ChangeRecord,
+ diff_dashboard,
+ diff_dataset,
+ diff_slice,
+ scalar_fields_for,
+)
+from superset.versioning.utils import read_row_outside_flush
+
+logger = logging.getLogger(__name__)
+
+
+# Memo of the scalar-field set per model class, filled lazily on the
+# first save of each model. Reading ``__table__.columns`` is cheap but
+# not free; memoising keeps the save-path overhead budget (FR-021)
+# from creeping up with the number of distinct model classes seen.
+_SCALAR_FIELDS_CACHE: dict[type, frozenset[str]] = {}
+
+
+def _cached_scalar_fields(model_cls: type[Model]) -> frozenset[str]:
+    """Memoised :func:`scalar_fields_for` with per-model exclusions."""
+    cached = _SCALAR_FIELDS_CACHE.get(model_cls)
+    if cached is not None:
+        return cached
+
+    # ``special`` = columns owned by a dedicated differ; ``audit`` =
+    # columns stamped as a save side effect. Neither may surface as a
+    # plain ``field`` record: ``Slice.params`` is walked by
+    # ``diff_slice_params``, ``Dashboard.json_metadata`` /
+    # ``position_json`` by ``diff_json_field`` (a raw diff would be one
+    # multi-KB record per save), and ``Slice.last_saved_at`` /
+    # ``last_saved_by_fk`` are rewritten by ``UpdateChartCommand``.
+    model_name = model_cls.__name__
+    special_columns = {
+        "Slice": ("params",),
+        "Dashboard": ("json_metadata", "position_json"),
+    }
+    special = frozenset(special_columns.get(model_name, ()))
+    audit = frozenset(
+        ("last_saved_at", "last_saved_by_fk") if model_name == "Slice" else ()
+    )
+    fields = scalar_fields_for(model_cls, special=special, audit=audit)
+    _SCALAR_FIELDS_CACHE[model_cls] = fields
+    return fields
+
+
+def jsonable(value: Any) -> Any:
+    """Return *value* in a form the JSON encoder accepts.
+
+    ``from_value`` / ``to_value`` on ``version_changes`` are JSON
+    columns, and Python's default JSON encoder rejects ``datetime`` /
+    ``UUID`` / ``bytes`` / ``Decimal`` — a single such value would fail
+    the whole bulk insert, so coercion happens when the record is built:
+
+    * ``datetime`` / ``date`` -> ISO string (``datetime`` subclasses
+      ``date``, so one check covers both)
+    * ``UUID`` -> ``str()``
+    * ``bytes`` -> hex string
+    * ``Decimal`` -> ``str()`` rather than ``float()`` to preserve
+      precision; the diff engine compares both sides in the same form
+
+    Anything else is returned unchanged.
+    """
+    if isinstance(value, date):
+        return value.isoformat()
+    if isinstance(value, UUID):
+        return str(value)
+    if isinstance(value, bytes):
+        return value.hex()
+    return str(value) if isinstance(value, Decimal) else value
+
+
+def _orm_to_post_state(obj: Any) -> dict[str, Any]:
+    """Snapshot the column attributes of ORM object *obj* as a dict.
+
+    Relationships and hybrid properties are skipped on purpose: only
+    mapped column attributes are read, since the diff engine works on
+    scalar values. Each value goes through :func:`jsonable`, making the
+    snapshot JSON-safe end-to-end.
+    """
+    snapshot: dict[str, Any] = {}
+    for attr in sa.inspect(obj).mapper.column_attrs:
+        snapshot[attr.key] = jsonable(getattr(obj, attr.key))
+    return snapshot
+
+
+def _read_pre_state(
+    session: Session, model_cls: type[Model], entity_id: int
+) -> dict[str, Any] | None:
+    """Load the entity's pre-flush row from the DB in JSON-safe form.
+
+    The autoflush-suppressed read is delegated to
+    :func:`superset.versioning.utils.read_row_outside_flush`; every
+    value is then run through :func:`jsonable` so non-JSON-safe types
+    (datetime, UUID, bytes, Decimal) become strings. That keeps both
+    sides of the diff on the same form and makes anything that lands in
+    ``from_value`` / ``to_value`` acceptable to the JSON column.
+
+    Returns ``None`` when the row is missing — not expected for a dirty
+    existing object, but a defensive guard against race conditions.
+    """
+    source_table = model_cls.__table__
+    row = read_row_outside_flush(session, source_table, entity_id)
+    if row is not None:
+        return {column: jsonable(raw) for column, raw in row.items()}
+    return None
+
+
+def compute_records_for_entity(session: Session, obj: Any) -> list[ChangeRecord]:
+    """Produce change records by diffing DB pre-state against *obj*.
+
+    The pre-state is read from the database, the post-state from the
+    in-memory object. The differ is picked by model class *name*
+    (:func:`diff_slice` / :func:`diff_dashboard` / :func:`diff_dataset`)
+    so this module needs no hard imports of the three entity classes
+    and stays clear of import-order coupling at app-init time.
+
+    An empty list is returned when *obj* has no ``id``, when the
+    pre-state read fails (logged) or finds no row, or when the class
+    has no registered differ.
+    """
+    entity_id = getattr(obj, "id", None)
+    if entity_id is None:
+        return []
+    model_cls = type(obj)
+    name = model_cls.__name__
+    try:
+        pre_state = _read_pre_state(session, model_cls, entity_id)
+    except Exception:  # pylint: disable=broad-except
+        logger.exception(
+            "version_changes: pre-state read failed for %s id=%s",
+            name,
+            entity_id,
+        )
+        return []
+    if pre_state is None:
+        return []
+
+    post_state = _orm_to_post_state(obj)
+    fields = _cached_scalar_fields(model_cls)
+    differ = {
+        "Slice": diff_slice,
+        "Dashboard": diff_dashboard,
+        "SqlaTable": diff_dataset,
+    }.get(name)
+    return differ(pre_state, post_state, fields=fields) if differ else []
+
+
+def bulk_insert_records(
+    session: Session,
+    transaction_id: int,
+    buffered: dict[tuple[str, int], list[ChangeRecord]],
+) -> None:
+    """Write one transaction's ``version_changes`` rows with a Core insert.
+
+    The insert goes through the module-level
+    :data:`version_changes_table` Table object — unlike
+    ``sa.table(...)`` it carries JSON column types, so the connection
+    marshals the ``path`` / ``from_value`` / ``to_value`` Python
+    structures into JSON. Going through the connection also avoids the
+    ORM flush round ``session.bulk_insert_mappings`` would cost inside
+    an already-active flush.
+
+    ``buffered`` maps ``(entity_kind, entity_id)`` to that entity's
+    records, so scalars from ``before_flush`` and children collected in
+    ``after_flush`` merge naturally under one key. ``sequence`` restarts
+    at 0 for every entity, giving each one a self-contained replay
+    sequence. Nothing is executed when there are no records.
+    """
+    if not buffered:
+        return
+    rows = [
+        {
+            "transaction_id": transaction_id,
+            "entity_kind": entity_kind,
+            "entity_id": entity_id,
+            "sequence": position,
+            "kind": record.kind,
+            "operation": record.operation,
+            "path": record.path,
+            "from_value": record.from_value,
+            "to_value": record.to_value,
+        }
+        for (entity_kind, entity_id), records in buffered.items()
+        for position, record in enumerate(records)
+    ]
+    if rows:
+        session.connection().execute(version_changes_table.insert(), rows)
diff --git a/superset/versioning/changes/table.py b/superset/versioning/changes/table.py
new file mode 100644
index 000000000000..174d37269e38
--- /dev/null
+++ b/superset/versioning/changes/table.py
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Schema definition for ``version_changes``.
+
+Declared against the shared ``Model.metadata`` so integration tests
+that build schema via ``metadata.create_all()`` pick it up without the
+Alembic migration running. Mirrors the shape of the
+``56cd24c07170_add_versioning_tables`` migration, except that the FK
+on ``transaction_id`` is declared only in the migration. Typed columns
+(``sa.JSON`` for path / values) let the connection's bulk-insert path
+marshal Python lists/dicts into JSON — a lightweight ``sa.table(...)``
+lacks the type info and SQLite's driver would reject the ``list`` bind.
+
+The schema lives in its own module to keep the listener and the
+shadow-table-query helpers free of schema-construction boilerplate at
+import time.
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from flask_appbuilder import Model
+
+_metadata = Model.metadata # pylint: disable=no-member
+
+version_changes_table = sa.Table(
+ "version_changes",
+ _metadata,
+ sa.Column("id", sa.BigInteger, primary_key=True, autoincrement=True),
+ # ``transaction_id`` references ``version_transaction.id`` at the DB
+ # level only — the FK + ON DELETE CASCADE live in the Alembic
+ # migration. ``version_transaction`` is built dynamically by
+ # SQLAlchemy-Continuum at mapper-configuration time, so an FK declared
+ # here would raise ``NoReferencedTableError`` when integration tests
+ # call ``metadata.create_all`` first. Same as the other versioning tables.
+ sa.Column("transaction_id", sa.BigInteger, nullable=False),
+ sa.Column("entity_kind", sa.String(32), nullable=False),
+ sa.Column("entity_id", sa.Integer, nullable=False),
+ # Integer, not SmallInteger: matches the migration — per-entity
+ # sequence within a transaction is assigned by unbounded enumerate().
+ sa.Column("sequence", sa.Integer, nullable=False),
+ sa.Column("kind", sa.String(32), nullable=False),
+ sa.Column("operation", sa.String(16), nullable=False),
+ sa.Column("path", sa.JSON, nullable=False),
+ sa.Column("from_value", sa.JSON, nullable=True),
+ sa.Column("to_value", sa.JSON, nullable=True),
+ sa.UniqueConstraint(
+ "transaction_id",
+ "entity_kind",
+ "entity_id",
+ "sequence",
+ name="uq_version_changes_tx_entity_sequence",
+ ),
+ sa.Index("ix_version_changes_kind", "kind"),
+ # No standalone transaction_id index: the UNIQUE constraint above
+ # leads with transaction_id, so its backing index already serves
+ # transaction_id-prefix lookups on every dialect.
+ sa.Index("ix_version_changes_entity", "entity_kind", "entity_id"),
+ # ``extend_existing``: if the metadata already holds this table, update
+ # it in place instead of raising — NOTE(review): confirm when that occurs.
+ extend_existing=True,
+)
+
+# Mapping from Python class name to the ``entity_kind`` value written
+# to ``version_changes.entity_kind`` (a ``String(32)`` column). The API
+# filters change records by this value (``WHERE entity_kind = 'chart'``
+# for the chart history endpoint, etc.) — kept short and user-facing-ish
+# so downstream tools consuming the raw table read sensibly.
+ENTITY_KIND_BY_CLASS_NAME: dict[str, str] = {
+ "Slice": "chart",
+ "Dashboard": "dashboard",
+ "SqlaTable": "dataset",
+}
diff --git a/superset/versioning/diff.py b/superset/versioning/diff.py
new file mode 100644
index 000000000000..3dce7b81e91f
--- /dev/null
+++ b/superset/versioning/diff.py
@@ -0,0 +1,1036 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Diff engine for the ``version_changes`` table (FR-016..FR-019).
+
+Hand-rolled because:
+
+- The on-disk ``path`` shape (array of segments) is a direct
+ representation of our chosen format; external diff libraries
+ return string paths or JSON-Pointer forms that would need
+ translation.
+- Kind classification (``filter`` vs ``metric`` vs ``field`` etc.)
+ is co-located with diff walking, avoiding a second classification
+ pass over the generic diff output.
+- Child-collection identity uses natural keys (``column_name``,
+ ``metric_name``, slice ``uuid``) — the same identity model
+ ``DatasetDAO.update_columns`` settled on (ADR-004). External
+ libraries default to list-index matching, which is wrong for our
+ data.
+
+See ADR (plan.md §"Key Design Decision: Hand-rolled diff engine") for
+the full rationale.
+
+All functions in this module are pure: they take dicts (or lists of
+dicts) and return a list of :class:`ChangeRecord`. The ORM->dict
+conversion and Continuum transaction lookup happen in the capture
+listener, not here. This keeps the engine unit-testable without
+an app context or DB.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Callable, Iterable
+from dataclasses import dataclass
+from typing import Any
+
+from superset.utils import json as _json
+
+logger = logging.getLogger(__name__)
+
+# Per-field recursion depth caps for the leaf-level diff walker.
+# A cap is a usefulness bound, not a safety bound: it controls how deep
+# into a nested JSON value the engine emits per-leaf records before
+# stopping and treating the sub-tree as an opaque value. Values are
+# tuned to the field's semantic shape — layout meta is shallow
+# (text/sizes/colors; cap 3), while json_metadata and chart params can
+# carry deep structures (native filters, adhoc sub-queries; cap 6).
+_LAYOUT_META_DIFF_DEPTH = 3
+_JSON_METADATA_DIFF_DEPTH = 6
+_SLICE_PARAMS_DIFF_DEPTH = 6
+
+# Columns that :func:`scalar_fields_for` always excludes from change
+# records, regardless of what ``__versioned__`` says. ``id`` / ``uuid``
+# are stable identifiers (not edited in normal flows). The four audit
+# fields (``created_on`` / ``changed_on`` / ``*_by_fk``) would double
+# every history entry with meaningless "timestamp changed, user
+# stamped" rows that the UI would have to filter out anyway.
+_AUDIT_FIELDS: frozenset[str] = frozenset(
+ {
+ "id",
+ "uuid",
+ "created_on",
+ "changed_on",
+ "created_by_fk",
+ "changed_by_fk",
+ }
+)
+
+# Fields that ``_strip_opaque_fields`` removes from child-collection
+# dict items (TableColumn, SqlMetric) before comparison and emission.
+# ``changed_on`` / ``created_on`` / ``*_by_fk`` are audit fields that
+# update on every save of the parent — without this filter, saving a
+# dataset to add one column produces a record per existing column too
+# (because their ``changed_on`` timestamps all refreshed). ``id`` and
+# ``table_id`` are implementation details — ``id`` can change under
+# the ``override_columns`` delete-and-reinsert pattern (ADR-004) even
+# when the column is semantically unchanged; ``table_id`` is the
+# parent FK and never meaningfully differs within one dataset's
+# history. ``uuid`` is deliberately absent: it stays stable across
+# normal saves, so the renderer can use it for identity if needed.
+_CHILD_ITEM_OPAQUE_FIELDS: frozenset[str] = frozenset(
+ {
+ "id",
+ "table_id",
+ "changed_on",
+ "created_on",
+ "changed_by_fk",
+ "created_by_fk",
+ }
+)
+
+
+def _strip_opaque_fields(item: Any) -> Any:
+    """Drop child-item audit/implementation fields from a dict *item*.
+
+    Non-dict values (scalars, strings) are returned untouched.
+    """
+    if isinstance(item, dict):
+        opaque = _CHILD_ITEM_OPAQUE_FIELDS
+        return {key: val for key, val in item.items() if key not in opaque}
+    return item
+
+
+# Maps chart ``params`` sub-keys to the first-class kind they are
+# promoted to (``groupby`` and ``columns`` share ``dimension``). Every
+# other params sub-key falls through to ``kind="field"``.
+_CHART_PARAMS_KIND_BY_KEY: dict[str, str] = {
+ "adhoc_filters": "filter",
+ "time_range": "time_range",
+ "color_scheme": "color_palette",
+ "metrics": "metric",
+ "groupby": "dimension",
+ "columns": "dimension",
+}
+
+# Chart ``params`` sub-keys that are machine-stamped on save and carry
+# no user-authored signal — same category as ``last_saved_at`` on the
+# scalar side. ``slice_id`` is a self-reference to the chart's own
+# primary id; save paths add or refresh it on every save, producing a
+# spurious "field" record on the first save after params lacked it.
+_CHART_PARAMS_AUDIT_KEYS: frozenset[str] = frozenset({"slice_id"})
+
+
+def scalar_fields_for(
+    model_cls: Any,
+    *,
+    special: frozenset[str] = frozenset(),
+    audit: frozenset[str] = frozenset(),
+) -> frozenset[str]:
+    """Return the column names of ``model_cls`` that yield change records.
+
+    The set is computed from the model's own ``__table__`` on every
+    call, so nobody has to maintain a parallel whitelist here: a column
+    added to ``Dashboard``, ``Slice``, or ``SqlaTable`` — upstream or in
+    a fork — reaches ``version_changes`` on the next save. A class
+    without ``__table__`` yields an empty set.
+
+    Removed from the table's columns:
+
+    * ``__versioned__["exclude"]`` of the model, keeping change records
+      in line with Continuum's shadow tables — what Continuum doesn't
+      track, the change log doesn't either.
+    * :data:`_AUDIT_FIELDS` — ``id``, ``uuid``, and the audit timestamp
+      / user-id columns shared by the three entity types.
+    * ``audit`` (caller-supplied) — model-specific columns written as a
+      save side effect rather than authored by a user, e.g.
+      ``Slice.last_saved_at`` / ``last_saved_by_fk``, which
+      ``UpdateChartCommand`` stamps on every chart save.
+    * ``special`` (caller-supplied) — columns that have a dedicated
+      differ elsewhere, e.g. ``Slice.params``, which
+      :func:`diff_slice_params` walks to emit first-class ``filter`` /
+      ``time_range`` / ``metric`` / ``dimension`` records.
+    """
+    absent = object()
+    table = getattr(model_cls, "__table__", absent)
+    if table is absent:
+        return frozenset()
+    column_names = frozenset(column.name for column in table.columns)
+    versioned_config = getattr(model_cls, "__versioned__", {})
+    not_diffed = (
+        frozenset(versioned_config.get("exclude", []) or [])
+        | _AUDIT_FIELDS
+        | audit
+        | special
+    )
+    return column_names - not_diffed
+
+
+@dataclass(frozen=True)
+class ChangeRecord:
+ """One atomic change, as stored in ``version_changes``.
+
+ Fields match the ``version_changes`` payload columns one-to-one so
+ the capture path can turn a list of these into insert rows without
+ translation (see ``bulk_insert_records`` in ``changes/state.py``).
+
+ Three orthogonal dimensions:
+ * ``kind`` — what type of thing changed (``filter`` / ``column`` /
+ ``header`` / ``field`` / etc.). Content category.
+ * ``operation`` — what happened to it (``add`` / ``remove`` /
+ ``move`` / ``edit``). ``move`` only fires for layout records.
+ * ``path`` — pure navigation address; no verb encoded.
+
+ The transaction-level fourth dimension (``trigger``: ``restore`` /
+ ``import`` / ``clone``) lives on ``version_transaction``, not here.
+ """
+
+ kind: str
+ operation: str
+ path: list[Any]
+ from_value: Any
+ to_value: Any
+
+
+# Identity used to match list elements across two revisions: a natural
+# key (typically a string such as a column name) or, when the element has
+# no usable natural key, its integer position in the list.
+Key = str | int
+
+
+def _operation_from_values(from_value: Any, to_value: Any) -> str:
+    """Map a ``from_value`` / ``to_value`` pair onto an ``operation`` verb.
+
+    The verb depends purely on which side is ``None``:
+
+    * only ``from_value`` is ``None`` → ``"add"``
+    * only ``to_value`` is ``None`` → ``"remove"``
+    * anything else (both set, or both ``None``) → ``"edit"``
+
+    ``move`` is never returned: reparenting cannot be inferred from
+    nullability, so ``_diff_layout_node`` sets that verb itself.
+    """
+    existed_before = from_value is not None
+    exists_after = to_value is not None
+    if exists_after and not existed_before:
+        return "add"
+    if existed_before and not exists_after:
+        return "remove"
+    return "edit"
+
+
+def _values_equivalent(from_value: Any, to_value: Any) -> bool:
+    """Return ``True`` when ``from_value`` → ``to_value`` is a non-change.
+
+    Two cases count as "no change" and therefore produce no record:
+
+    * the values compare equal with ``==``;
+    * both values are "blank", i.e. each is ``None`` or ``""``.
+      Superset's save paths normalize nullable strings to ``""`` on first
+      write (e.g. ``Dashboard.css``, ``certified_by``,
+      ``certification_details``), so a null ↔ empty-string flip carries
+      no user-authored signal.
+    """
+    if from_value == to_value:
+        return True
+    blank = (None, "")
+    return from_value in blank and to_value in blank
+
+
+def _diff_scalar(
+    field_name: str,
+    from_value: Any,
+    to_value: Any,
+) -> ChangeRecord | None:
+    """Build the ``kind="field"`` record for one scalar column.
+
+    Returns ``None`` when the two values are equivalent according to
+    ``_values_equivalent``; otherwise a record whose path is
+    ``[field_name]`` and whose verb comes from ``_operation_from_values``.
+    """
+    unchanged = _values_equivalent(from_value, to_value)
+    if not unchanged:
+        verb = _operation_from_values(from_value, to_value)
+        return ChangeRecord(
+            kind="field",
+            operation=verb,
+            path=[field_name],
+            from_value=from_value,
+            to_value=to_value,
+        )
+    return None
+
+
+def _recursive_leaf_diff(
+    kind: str,
+    path_prefix: list[Any],
+    pre: Any,
+    post: Any,
+    *,
+    max_depth: int,
+) -> list[ChangeRecord]:
+    """Descend through matching dicts and report every changed leaf.
+
+    For each pair of values visited:
+
+    * equivalent values (see ``_values_equivalent``) yield nothing;
+    * two dicts, while the current depth is below ``max_depth``, are
+      walked key by key (sorted union of both key sets), appending the
+      key to the path;
+    * everything else — scalars, lists, mismatched types, or two dicts
+      at the depth cap — yields a single record carrying the raw
+      pre/post values at the current path.
+
+    Lists are deliberately not walked: positional paths are unstable
+    under reordering, and the lists that matter (adhoc filters, metrics,
+    dataset columns) have dedicated natural-key differs.
+
+    When the depth cap turns a dict pair into an opaque leaf a debug
+    message is logged, so an overly tight cap is visible in production.
+    """
+
+    def _descend(
+        before: Any, after: Any, here: list[Any], level: int
+    ) -> list[ChangeRecord]:
+        if _values_equivalent(before, after):
+            return []
+        both_dicts = isinstance(before, dict) and isinstance(after, dict)
+        if both_dicts and level < max_depth:
+            # Sorted key union keeps the record order deterministic.
+            return [
+                record
+                for key in sorted(before.keys() | after.keys())
+                for record in _descend(
+                    before.get(key), after.get(key), [*here, key], level + 1
+                )
+            ]
+        if both_dicts:
+            logger.debug(
+                "version_changes: depth cap %d hit at path=%s — sub-tree "
+                "emitted as opaque leaf",
+                max_depth,
+                here,
+            )
+        leaf = ChangeRecord(
+            kind=kind,
+            operation=_operation_from_values(before, after),
+            path=list(here),
+            from_value=before,
+            to_value=after,
+        )
+        return [leaf]
+
+    return _descend(pre, post, path_prefix, 0)
+
+
+def _diff_list_by_natural_key(
+    kind: str,
+    path_prefix: list[Any],
+    from_list: list[Any] | None,
+    to_list: list[Any] | None,
+    key_fn: Callable[[Any], Key | None],
+) -> list[ChangeRecord]:
+    """Diff two lists, matching elements by natural key.
+
+    Emits one record per removed, modified, or added element. Records
+    for keys present in ``from_list`` come first (in ``from_list``
+    order), followed by keys that exist only in ``to_list``, so the
+    sequence is deterministic across runs.
+
+    Args:
+        kind: ``ChangeRecord.kind`` stamped on every emitted record.
+        path_prefix: Path segments placed before the element key.
+        from_list: Elements before the change; ``None`` means empty.
+        to_list: Elements after the change; ``None`` means empty.
+        key_fn: Returns an element's natural key. When it returns
+            ``None`` or ``""`` the element's list position is used as a
+            synthetic key, at the cost of position-dependent identity.
+
+    Returns:
+        The change records; empty when nothing differs.
+
+    Limitation: elements sharing a key within one list overwrite each
+    other in the lookup table, so only the last occurrence is diffed.
+    """
+
+    def _index(items: list[Any] | None) -> dict[Key, Any]:
+        """Map effective key → element, falling back to position."""
+        indexed: dict[Key, Any] = {}
+        for idx, item in enumerate(items or []):
+            raw = key_fn(item)
+            indexed[idx if raw is None or raw == "" else raw] = item
+        return indexed
+
+    from_by_key = _index(from_list)
+    to_by_key = _index(to_list)
+
+    records: list[ChangeRecord] = []
+    # For dict items (dataset columns / metrics) audit/implementation
+    # fields are stripped before comparing AND before emitting —
+    # otherwise a save that only adds a new column would also emit
+    # "changed" records for every existing column, because their
+    # ``changed_on`` timestamps all refreshed. The per-column audit trail
+    # is already aggregated at the transaction level in
+    # ``version_transaction`` (``user_id`` + ``issued_at``).
+    for k, from_item in from_by_key.items():
+        stripped_from = _strip_opaque_fields(from_item)
+        # Membership test, not ``.get(k) is None``: a list may contain a
+        # literal null element, which must not be mistaken for a removed
+        # key (that produced a spurious ``remove`` for unchanged nulls).
+        if k not in to_by_key:
+            records.append(
+                ChangeRecord(
+                    kind=kind,
+                    operation="remove",
+                    path=[*path_prefix, k],
+                    from_value=stripped_from,
+                    to_value=None,
+                )
+            )
+            continue
+        stripped_to = _strip_opaque_fields(to_by_key[k])
+        if stripped_from != stripped_to:
+            records.append(
+                ChangeRecord(
+                    kind=kind,
+                    # ``edit`` for two populated values; degrades to
+                    # ``add`` / ``remove`` when one side is null
+                    # (assumes ``_strip_opaque_fields`` passes ``None``
+                    # through unchanged — TODO confirm).
+                    operation=_operation_from_values(stripped_from, stripped_to),
+                    path=[*path_prefix, k],
+                    from_value=stripped_from,
+                    to_value=stripped_to,
+                )
+            )
+    for k, to_item in to_by_key.items():
+        if k not in from_by_key:
+            records.append(
+                ChangeRecord(
+                    kind=kind,
+                    operation="add",
+                    path=[*path_prefix, k],
+                    from_value=None,
+                    to_value=_strip_opaque_fields(to_item),
+                )
+            )
+    return records
+
+
+def _filter_key(f: Any) -> Key | None:
+    """Natural key for an adhoc filter: the value of its ``subject`` entry.
+
+    Returns ``None`` for non-dict input or when ``subject`` is absent, in
+    which case the caller decides how to identify the element.
+    NOTE(review): filters sharing a subject share a key — confirm the
+    list differ handles that as intended.
+    """
+    if not isinstance(f, dict):
+        return None
+    return f.get("subject")
+
+
+def _metric_key(m: Any) -> Key | None:
+    """Natural key for a metric dict.
+
+    A truthy ``label`` wins. Otherwise the key is ``"<aggregate>(<column_name>)"``
+    when both ``aggregate`` and ``column.column_name`` are truthy. Anything
+    else — including non-dict metrics — yields ``None``.
+    """
+    if not isinstance(m, dict):
+        return None
+    label = m.get("label")
+    if label:
+        return label
+    column = m.get("column")
+    if not isinstance(column, dict):
+        return None
+    col_name = column.get("column_name")
+    agg = m.get("aggregate")
+    return f"{agg}({col_name})" if col_name and agg else None
+
+
+def _dimension_key(d: Any) -> Key | None:
+    """Natural key for a groupby/columns element.
+
+    A plain string is its own key. A dict contributes its ``label`` or,
+    when that is falsy, its ``column_name``. Any other type has no key.
+    """
+    if isinstance(d, dict):
+        return d.get("label") or d.get("column_name")
+    return d if isinstance(d, str) else None
+
+
+def _coerce_params(p: Any) -> dict[str, Any]:
+    """Coerce a JSON-dict blob into a ``dict``, never raising.
+
+    * a ``dict`` is returned as-is (same object, not a copy);
+    * a ``str`` is JSON-decoded and returned only if the result is a dict;
+    * everything else — ``None``, undecodable strings, JSON that is not
+      an object, other types — becomes ``{}``.
+    """
+    if isinstance(p, dict):
+        return p
+    if not isinstance(p, str):
+        return {}
+    try:
+        decoded = _json.loads(p)
+    except _json.JSONDecodeError:
+        return {}
+    if isinstance(decoded, dict):
+        return decoded
+    return {}
+
+
+def diff_slice_params(
+    from_params: Any,
+    to_params: Any,
+) -> list[ChangeRecord]:
+    """Diff the ``Slice.params`` JSON blob, promoting known keys to kinds.
+
+    Both sides are coerced with ``_coerce_params``. Keys listed in
+    ``_CHART_PARAMS_AUDIT_KEYS`` are ignored, and the rest are visited
+    in sorted order. For each key whose values are not equivalent:
+
+    * if ``_CHART_PARAMS_KIND_BY_KEY`` maps it to ``filter`` / ``metric``
+      / ``dimension`` and both sides are lists, elements are matched by
+      natural key and one record is emitted per element change;
+    * otherwise the value is diffed leaf-by-leaf (depth-capped at
+      ``_SLICE_PARAMS_DIFF_DEPTH``) under the mapped kind, or under
+      ``"field"`` when the key has no mapped kind.
+
+    Every record's path starts with ``["params", <key>]``, where
+    ``<key>`` is the params key actually being diffed. The filter and
+    metric branches used to hard-code ``"adhoc_filters"`` / ``"metrics"``
+    here, which would misaddress (and collide) any other key mapped to
+    those kinds.
+
+    Returns:
+        The change records, ordered by params key.
+    """
+    from_p = _coerce_params(from_params)
+    to_p = _coerce_params(to_params)
+    # Kinds whose list values are matched element-by-element.
+    list_key_fns: dict[str, Callable[[Any], Key | None]] = {
+        "filter": _filter_key,
+        "metric": _metric_key,
+        "dimension": _dimension_key,
+    }
+    records: list[ChangeRecord] = []
+    all_keys = (set(from_p) | set(to_p)) - _CHART_PARAMS_AUDIT_KEYS
+    for key in sorted(all_keys):
+        from_v = from_p.get(key)
+        to_v = to_p.get(key)
+        if _values_equivalent(from_v, to_v):
+            continue
+        kind = _CHART_PARAMS_KIND_BY_KEY.get(key)
+        both_lists = isinstance(from_v, list) and isinstance(to_v, list)
+        if kind in list_key_fns and both_lists:
+            records.extend(
+                _diff_list_by_natural_key(
+                    kind,
+                    ["params", key],
+                    from_v,
+                    to_v,
+                    list_key_fns[kind],
+                )
+            )
+            continue
+        # Scalar first-class kinds (time_range, color_palette), list
+        # kinds where one side is not a list, and unknown keys all take
+        # the leaf walk: a genuine scalar yields one record, while a
+        # dict value (custom viz params) is recursed so a deep option
+        # doesn't ship its whole sub-tree on both sides.
+        records.extend(
+            _recursive_leaf_diff(
+                kind=kind or "field",
+                path_prefix=["params", key],
+                pre=from_v,
+                post=to_v,
+                max_depth=_SLICE_PARAMS_DIFF_DEPTH,
+            )
+        )
+    return records
+
+
+def diff_scalar_fields(
+    pre: dict[str, Any],
+    post: dict[str, Any],
+    *,
+    fields: Iterable[str],
+) -> list[ChangeRecord]:
+    """Compare ``pre`` and ``post`` on each name in ``fields``.
+
+    Fields are visited in sorted order and read with ``dict.get``, so a
+    name missing from either snapshot is treated as ``None``. Each
+    differing field contributes one ``kind="field"`` record.
+
+    The caller owns the field list (typically built by
+    :func:`scalar_fields_for` at listener wiring time), so adding a
+    model column never requires touching this function.
+    """
+    candidates = (
+        _diff_scalar(name, pre.get(name), post.get(name)) for name in sorted(fields)
+    )
+    return [record for record in candidates if record is not None]
+
+
+def diff_slice(
+    pre: dict[str, Any],
+    post: dict[str, Any],
+    *,
+    fields: Iterable[str],
+) -> list[ChangeRecord]:
+    """Diff a Slice (chart): scalar columns first, then ``params``.
+
+    ``fields`` should exclude ``params`` — e.g.
+    ``scalar_fields_for(Slice, special=frozenset({"params"}))`` — because
+    the ``params`` blob is classified separately by
+    :func:`diff_slice_params`, whose records are appended after the
+    scalar ones.
+    """
+    scalar_records = diff_scalar_fields(pre, post, fields=fields)
+    params_records = diff_slice_params(pre.get("params"), post.get("params"))
+    return [*scalar_records, *params_records]
+
+
+def diff_json_field(
+    field_name: str,
+    from_value: Any,
+    to_value: Any,
+    *,
+    exclude_keys: frozenset[str] = frozenset(),
+    max_depth: int = _JSON_METADATA_DIFF_DEPTH,
+) -> list[ChangeRecord]:
+    """Leaf-level diff of a column that stores a JSON dict.
+
+    Both values are coerced with ``_coerce_params``; each top-level key
+    (sorted, minus ``exclude_keys``) is then handed to
+    ``_recursive_leaf_diff`` with ``kind="field"``.
+
+    Used for ``Dashboard.json_metadata`` (``position_json`` has its own
+    structural diff in :func:`diff_dashboard_layout`). Recording the
+    whole blob on both sides would flood the change log with multi-KB
+    strings on every save; walking the parsed dict narrows each record
+    to the leaf that actually changed.
+
+    ``exclude_keys`` lists sub-keys that are frontend-derived or
+    auto-stamped on save and carry no user-authored signal — the same
+    idea as the ``audit`` argument of :func:`scalar_fields_for`, one
+    level down.
+
+    Record paths are ``[field_name, key, ...]``, matching the
+    ``["params", key, ...]`` shape produced by :func:`diff_slice_params`
+    so renderers can share one addressing scheme.
+    """
+    before = _coerce_params(from_value)
+    after = _coerce_params(to_value)
+    tracked_keys = [
+        key for key in sorted(set(before) | set(after)) if key not in exclude_keys
+    ]
+    records: list[ChangeRecord] = []
+    for key in tracked_keys:
+        records += _recursive_leaf_diff(
+            kind="field",
+            path_prefix=[field_name, key],
+            pre=before.get(key),
+            post=after.get(key),
+            max_depth=max_depth,
+        )
+    return records
+
+
+# json_metadata sub-keys that the frontend auto-stamps / auto-derives
+# on save. They mirror dashboard membership and chart inventory, not
+# user-authored content, so they noise up the change log without
+# carrying intent. The records produced for these keys can be ~50KB
+# (full label-colour dict) for a one-chart save.
+#
+# ``diff_dashboard`` passes this set as ``exclude_keys`` to
+# ``diff_json_field``, so these top-level keys never produce records.
+#
+#   chart_configuration:        per-chart cross-filter scope state,
+#                               re-derived when charts are added/removed.
+#   global_chart_configuration: dashboard-wide filter scope; the
+#                               ``chartsInScope`` list mirrors live
+#                               dashboard membership.
+#   map_label_colors:           label → colour map, re-stamped on save
+#                               from currently-visible filter values.
+#   shared_label_colors:        cross-chart shared-label colour list,
+#                               rewritten by the DAO when a dashboard is
+#                               merely *viewed* — producing phantom
+#                               "Properties updated" records with no
+#                               user edit (surfaced by the
+#                               version-history UI, PR #40988). The
+#                               view-time write itself is a separate
+#                               round-trip-asymmetry issue (cf. #39706);
+#                               this exclusion stops the change-record
+#                               noise regardless.
+#   show_chart_timestamps:      frontend toggle, defaults applied on
+#                               save when missing.
+#   color_namespace:            scoped colour-scheme namespace, frontend-
+#                               derived from the chart set.
+DASHBOARD_JSON_METADATA_AUDIT_KEYS: frozenset[str] = frozenset(
+    {
+        "chart_configuration",
+        "global_chart_configuration",
+        "map_label_colors",
+        "shared_label_colors",
+        "show_chart_timestamps",
+        "color_namespace",
+    }
+)
+
+
+# Layout component types and how they map to record ``kind`` strings.
+# ``HEADER_ID`` is excluded — that's the dashboard's title bar, mirrored
+# from ``dashboard_title``. ``ROOT_ID`` and ``GRID_ID`` are structural
+# singletons whose only deltas are children lists, which we infer from
+# the moves of the children themselves.
+#
+# A node whose ``type`` is missing from this table yields no records at
+# all: ``_diff_layout_node`` returns an empty list for unknown types.
+_LAYOUT_TYPE_TO_KIND: dict[str, str] = {
+    "CHART": "chart",
+    "ROW": "row",
+    "COLUMN": "column",
+    "TAB": "tab",
+    "TABS": "tabs",
+    "HEADER": "header",
+    "MARKDOWN": "markdown",
+    "DIVIDER": "divider",
+}
+
+# Layout components we never emit records for: ROOT_ID is the layout
+# root (always present, never moves); GRID_ID is the singleton vertical
+# stack inside ROOT_ID; HEADER_ID is the dashboard's title bar (already
+# covered by the ``dashboard_title`` scalar field).
+# These are node *ids* (keys of the layout dict), filtered out by
+# ``_layout_nodes`` before any per-node diffing happens.
+_LAYOUT_SUPPRESSED_IDS: frozenset[str] = frozenset({"ROOT_ID", "GRID_ID", "HEADER_ID"})
+
+
+def _layout_component_label(node: dict[str, Any]) -> str | None:
+    """Pick a display label for a layout node, if it has one.
+
+    Looks at ``meta["sliceName"]``, ``meta["label"]`` and ``meta["text"]``
+    in that order and returns the first that is a string with
+    non-whitespace content (returned unstripped). Returns ``None`` when
+    ``meta`` is not a dict or none of the three qualifies.
+
+    The label goes into ``from_value`` / ``to_value`` payloads so the UI
+    can render messages like "Added chart 'Foo'" without fetching the
+    related entity.
+    """
+    meta = node.get("meta")
+    if not isinstance(meta, dict):
+        return None
+    candidates = (meta.get(name) for name in ("sliceName", "label", "text"))
+    return next(
+        (text for text in candidates if isinstance(text, str) and text.strip()),
+        None,
+    )
+
+
+def _layout_node_payload(node: dict[str, Any]) -> dict[str, Any]:
+    """Build the compact description of a layout node stored in records.
+
+    Always contains ``id`` and ``type`` (``None`` when absent from the
+    node). ``name`` is added when ``_layout_component_label`` finds a
+    label, and ``chartId`` / ``uuid`` are copied from ``meta`` when they
+    are not ``None``. Keeping the payload this small avoids dragging the
+    full layout snippet (which can be ~1KB per row when CHART nodes carry
+    colour configs) into the change log.
+    """
+    meta = node.get("meta")
+    if not isinstance(meta, dict):
+        meta = {}
+    payload: dict[str, Any] = {"id": node.get("id"), "type": node.get("type")}
+    label = _layout_component_label(node)
+    if label is not None:
+        payload["name"] = label
+    # ``uuid`` (slice uuid for CHART nodes) lets the M2M-vs-layout dedupe
+    # in :func:`fold_dashboard_layout_with_chart_changes` match on the
+    # same key — :func:`diff_dashboard_slices` keys its records by uuid,
+    # not chartId.
+    for meta_key in ("chartId", "uuid"):
+        value = meta.get(meta_key)
+        if value is not None:
+            payload[meta_key] = value
+    return payload
+
+
+def _layout_parent_id(node: dict[str, Any]) -> Any:
+    """Return the id of a layout component's immediate parent.
+
+    That is the last element of the node's ``parents`` list; ``None`` is
+    returned when ``parents`` is missing, empty, or not a list. Move
+    detection compares this value between the two revisions of a node.
+    """
+    parents = node.get("parents")
+    if isinstance(parents, list) and parents:
+        return parents[-1]
+    return None
+
+
+def _meta_excluding_position(node: dict[str, Any]) -> dict[str, Any]:
+    """Return a shallow copy of the node's ``meta`` dict for edit detection.
+
+    Yields ``{}`` when ``meta`` is missing or not a dict. Position is
+    tracked through the node-level ``parents`` list (see
+    ``_layout_parent_id``), which is not part of ``meta``.
+    NOTE(review): despite the name, no keys are removed from ``meta``
+    here — confirm whether any ``meta`` entries should be filtered out.
+    """
+    meta = node.get("meta")
+    if isinstance(meta, dict):
+        return dict(meta)
+    return {}
+
+
+def _diff_layout_node(
+    node_id: str,
+    pre_node: dict[str, Any] | None,
+    post_node: dict[str, Any] | None,
+) -> list[ChangeRecord]:
+    """Diff a single layout slot and describe what happened to it.
+
+    The record ``kind`` comes from the node's ``type`` (post side
+    preferred) via ``_LAYOUT_TYPE_TO_KIND``; unknown types produce no
+    records. Outcomes, checked in this order:
+
+    * only ``post_node`` exists → one ``add`` record;
+    * only ``pre_node`` exists → one ``remove`` record;
+    * immediate parent differs → one ``move`` record whose payloads also
+      carry the old / new ``parent``; ``meta`` is not diffed in this case;
+    * otherwise → one record per changed ``meta`` leaf, depth-capped at
+      ``_LAYOUT_META_DIFF_DEPTH`` (empty when nothing changed).
+
+    add / remove / move records use path ``[node_id]`` and the minimal
+    payload from ``_layout_node_payload``.
+    """
+    representative = post_node or pre_node or {}
+    kind = _LAYOUT_TYPE_TO_KIND.get(representative.get("type") or "")
+    if kind is None:
+        return []  # unknown component type — skip rather than emit garbage
+
+    def _structural(operation: str, before: Any, after: Any) -> list[ChangeRecord]:
+        return [
+            ChangeRecord(
+                kind=kind,
+                operation=operation,
+                path=[node_id],
+                from_value=before,
+                to_value=after,
+            )
+        ]
+
+    if pre_node is None and post_node is not None:
+        return _structural("add", None, _layout_node_payload(post_node))
+    if post_node is None and pre_node is not None:
+        return _structural("remove", _layout_node_payload(pre_node), None)
+
+    # Both sides present from here on — a move takes precedence over an edit.
+    assert pre_node is not None
+    assert post_node is not None
+    old_parent = _layout_parent_id(pre_node)
+    new_parent = _layout_parent_id(post_node)
+    if old_parent != new_parent:
+        return _structural(
+            "move",
+            {**_layout_node_payload(pre_node), "parent": old_parent},
+            {**_layout_node_payload(post_node), "parent": new_parent},
+        )
+
+    # Edit: walk ``meta`` and emit one record per changed leaf, with
+    # path shape ``[node_id, <meta_key>, ...]``. Each leaf's verb comes
+    # from ``_operation_from_values`` inside the recursion, so a leaf
+    # added inside an existing node gets ``add`` and so on. That the node
+    # itself was "edited" is implied by the path having segments after
+    # ``node_id``.
+    return _recursive_leaf_diff(
+        kind=kind,
+        path_prefix=[node_id],
+        pre=_meta_excluding_position(pre_node),
+        post=_meta_excluding_position(post_node),
+        max_depth=_LAYOUT_META_DIFF_DEPTH,
+    )
+
+
+def diff_dashboard_layout(
+    pre: Any,
+    post: Any,
+) -> list[ChangeRecord]:
+    """Structural diff of a dashboard's ``position_json``.
+
+    Both sides are reduced to ``{node_id: node}`` by ``_layout_nodes``
+    and compared per component id (e.g. ``"CHART-mkPZLOnWCElgL0Udp1gVK"``),
+    in sorted id order:
+
+    * id only in *post* → ``operation="add"``, ``from_value=None``,
+      ``to_value=<node payload>``
+    * id only in *pre* → ``operation="remove"``, payload on the
+      ``from_value`` side
+    * id in both, immediate parent differs → ``operation="move"``,
+      payloads carry the old / new parent
+    * id in both, same parent, ``meta`` differs → one record per changed
+      ``meta`` leaf
+    * id in both, nothing differs → no record
+
+    The verb lives in each record's ``operation``, not in the path: the
+    path is ``[node_id]`` for add / remove / move and
+    ``[node_id, <meta_key>, ...]`` for meta edits.
+
+    ``ROOT_ID`` / ``GRID_ID`` / ``HEADER_ID`` are suppressed (see
+    :data:`_LAYOUT_SUPPRESSED_IDS`).
+    """
+    before = _layout_nodes(pre)
+    after = _layout_nodes(post)
+    return [
+        record
+        for node_id in sorted(before.keys() | after.keys())
+        for record in _diff_layout_node(
+            node_id, before.get(node_id), after.get(node_id)
+        )
+    ]
+
+
+def _layout_nodes(raw: Any) -> dict[str, dict[str, Any]]:
+    """Turn a ``position_json`` value into ``{node_id: node_dict}``.
+
+    *raw* may be the JSON text or an already-parsed dict; it is coerced
+    with ``_coerce_params``. Entries whose value is not a dict, or whose
+    id is in ``_LAYOUT_SUPPRESSED_IDS``, are dropped.
+    """
+    nodes: dict[str, dict[str, Any]] = {}
+    for node_id, node in _coerce_params(raw).items():
+        if node_id in _LAYOUT_SUPPRESSED_IDS or not isinstance(node, dict):
+            continue
+        nodes[node_id] = node
+    return nodes
+
+
+def diff_dashboard(
+    pre: dict[str, Any],
+    post: dict[str, Any],
+    *,
+    fields: Iterable[str],
+) -> list[ChangeRecord]:
+    """Dashboard diff: scalar fields, ``json_metadata`` leaves, layout.
+
+    Records are concatenated in that order:
+
+    1. scalar columns named in ``fields`` (``kind="field"``);
+    2. ``json_metadata`` diffed leaf-by-leaf as ``kind="field"``,
+       skipping :data:`DASHBOARD_JSON_METADATA_AUDIT_KEYS`;
+    3. ``position_json`` diffed structurally by
+       :func:`diff_dashboard_layout`, whose records carry the layout
+       component's own kind.
+
+    Promoting ``json_metadata.native_filter_configuration`` to
+    ``kind="filter"`` is deferred to Phase 2 alongside the UI that would
+    render it (spec Clarifications §Session 2026-04-24); until then it
+    is reported as ``kind="field"`` records keyed by sub-key.
+    """
+    scalar_records = diff_scalar_fields(pre, post, fields=fields)
+    metadata_records = diff_json_field(
+        "json_metadata",
+        pre.get("json_metadata"),
+        post.get("json_metadata"),
+        exclude_keys=DASHBOARD_JSON_METADATA_AUDIT_KEYS,
+    )
+    layout_records = diff_dashboard_layout(
+        pre.get("position_json"), post.get("position_json")
+    )
+    return [*scalar_records, *metadata_records, *layout_records]
+
+
+def _layout_chart_uuids_by_verb(
+    records: list[ChangeRecord],
+) -> tuple[set[Any], set[Any]]:
+    """Collect the chart uuids that the layout diff reports as added or removed.
+
+    Only ``kind="chart"`` records with a single-segment path are
+    considered: layout add / remove / move records have
+    ``path == [node_id]``, whereas meta edits and M2M slice records have
+    longer paths. The verb is read from ``operation``, and the uuid from
+    the dict payload on the populated side (``to_value`` for ``add``,
+    ``from_value`` for ``remove``). Records without a dict payload or
+    without a ``uuid`` are ignored.
+
+    Returns:
+        ``(added_uuids, removed_uuids)``.
+    """
+    added: set[Any] = set()
+    removed: set[Any] = set()
+    for record in records:
+        if record.kind != "chart" or len(record.path) != 1:
+            continue
+        if record.operation == "add":
+            payload, bucket = record.to_value, added
+        elif record.operation == "remove":
+            payload, bucket = record.from_value, removed
+        else:
+            continue
+        if not isinstance(payload, dict):
+            continue
+        uuid_ = payload.get("uuid")
+        if uuid_ is not None:
+            bucket.add(uuid_)
+    return added, removed
+
+
+def _is_redundant_m2m_chart_record(
+    r: ChangeRecord, added_uuids: set[Any], removed_uuids: set[Any]
+) -> bool:
+    """Tell whether *r* is an M2M slice record already covered by the layout diff.
+
+    An M2M slice record is a ``kind="chart"`` record whose path is
+    ``["slices", <uuid>]``. It is redundant when it is an addition
+    (``from_value`` is ``None``, ``to_value`` is not) and its uuid is in
+    *added_uuids*, or a removal (the reverse) and its uuid is in
+    *removed_uuids*. The layout record for the same chart carries more
+    information (name, parent), so the M2M one is dropped on dedup.
+    """
+    is_m2m = r.kind == "chart" and len(r.path) == 2 and r.path[0] == "slices"
+    if not is_m2m:
+        return False
+    slice_uuid = r.path[1]
+    absent_before = r.from_value is None
+    absent_after = r.to_value is None
+    if absent_before and not absent_after:
+        return slice_uuid in added_uuids
+    if absent_after and not absent_before:
+        return slice_uuid in removed_uuids
+    return False
+
+
+def fold_dashboard_layout_with_chart_changes(
+    records: list[ChangeRecord],
+) -> list[ChangeRecord]:
+    """Drop M2M chart records that duplicate a layout add / remove.
+
+    When a dashboard save adds or removes charts, the ``slices`` M2M
+    diff and the layout diff each emit a record for the same logical
+    action. The layout-side record carries more information (chart
+    name, parent container), so the M2M ``kind="chart"`` record is the
+    one removed.
+
+    Matching is by slice uuid: ``diff_dashboard_slices`` produces
+    records with path ``["slices", <uuid>]``, and the layout payloads
+    carry the same uuid (sourced from ``position_json.CHART-x.meta.uuid``).
+
+    Called from the change-records listener after the M2M and layout
+    diffs are both merged into the per-entity buffer. Returns a new
+    list; relative order of the surviving records is unchanged.
+    """
+    added_uuids, removed_uuids = _layout_chart_uuids_by_verb(records)
+    kept: list[ChangeRecord] = []
+    for record in records:
+        if _is_redundant_m2m_chart_record(record, added_uuids, removed_uuids):
+            continue
+        kept.append(record)
+    return kept
+
+
+def diff_dataset(
+    pre: dict[str, Any],
+    post: dict[str, Any],
+    *,
+    fields: Iterable[str],
+) -> list[ChangeRecord]:
+    """Diff a SqlaTable's scalar columns; every record is ``kind="field"``.
+
+    Child collections are not handled here: columns and metrics go
+    through :func:`diff_dataset_columns` / :func:`diff_dataset_metrics`,
+    which the listener feeds from Continuum shadow tables
+    (``table_columns_version`` / ``sql_metrics_version``) rather than by
+    walking the ORM collection.
+    """
+    scalar_records = diff_scalar_fields(pre, post, fields=fields)
+    return scalar_records
+
+
+def diff_dataset_columns(
+    from_columns: list[dict[str, Any]] | None,
+    to_columns: list[dict[str, Any]] | None,
+) -> list[ChangeRecord]:
+    """Diff TableColumn rows as a child collection matched on ``column_name``.
+
+    Records are ``kind="column"`` with paths under ``["columns"]``.
+    """
+
+    def _column_name(column: Any) -> Key | None:
+        return column.get("column_name") if isinstance(column, dict) else None
+
+    return _diff_list_by_natural_key(
+        kind="column",
+        path_prefix=["columns"],
+        from_list=from_columns,
+        to_list=to_columns,
+        key_fn=_column_name,
+    )
+
+
+def diff_dataset_metrics(
+    from_metrics: list[dict[str, Any]] | None,
+    to_metrics: list[dict[str, Any]] | None,
+) -> list[ChangeRecord]:
+    """Diff SqlMetric rows as a child collection matched on ``metric_name``.
+
+    Records are ``kind="metric"`` with paths under ``["metrics"]``.
+    """
+
+    def _metric_name(metric: Any) -> Key | None:
+        return metric.get("metric_name") if isinstance(metric, dict) else None
+
+    return _diff_list_by_natural_key(
+        kind="metric",
+        path_prefix=["metrics"],
+        from_list=from_metrics,
+        to_list=to_metrics,
+        key_fn=_metric_name,
+    )
+
+
+def diff_dashboard_slices(
+    from_slice_uuids: list[str] | None,
+    to_slice_uuids: list[str] | None,
+) -> list[ChangeRecord]:
+    """Diff a dashboard's chart membership as a set of slice uuids.
+
+    Removed uuids come first, then added ones, each group in sorted
+    order. A removal has ``from_value=<uuid>, to_value=None``; an
+    addition is the inverse. There is no "edit" case because membership
+    is identity-only (the list element IS the uuid). ``None`` inputs are
+    treated as empty.
+    """
+    before = set(from_slice_uuids or [])
+    after = set(to_slice_uuids or [])
+    removals = [
+        ChangeRecord(
+            kind="chart",
+            operation="remove",
+            path=["slices", gone],
+            from_value=gone,
+            to_value=None,
+        )
+        for gone in sorted(before - after)
+    ]
+    additions = [
+        ChangeRecord(
+            kind="chart",
+            operation="add",
+            path=["slices", new],
+            from_value=None,
+            to_value=new,
+        )
+        for new in sorted(after - before)
+    ]
+    return removals + additions
diff --git a/superset/versioning/etag.py b/superset/versioning/etag.py
new file mode 100644
index 000000000000..fc51d9357666
--- /dev/null
+++ b/superset/versioning/etag.py
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""ETag header emission for the entity-versioning API surface."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from uuid import UUID
+
+import sqlalchemy as sa
+from flask_appbuilder import Model
+
+from superset.extensions import db
+
+if TYPE_CHECKING:
+ from flask import Response
+
+
+def set_version_etag(
+    response: "Response", version_uuid: UUID | str | None
+) -> "Response":
+    """Attach ``ETag: "<version_uuid>"`` to *response* and return it.
+
+    The value is emitted in RFC 7232 strong-validator form (no leading
+    ``W/``), wrapped in double quotes as the spec requires. *version_uuid*
+    may be a ``UUID`` or an already-stringified uuid (the write endpoints
+    carry the latter). When it is ``None`` — the entity has no version
+    rows yet — the response is returned untouched.
+    """
+    if version_uuid is None:
+        return response
+    etag = f'"{version_uuid}"'
+    response.headers["ETag"] = etag
+    return response
+
+
+def set_version_etag_by_uuid(
+    response: "Response",
+    model_cls: type[Model],
+    entity_uuid: UUID,
+    *,
+    entity_id: int | None = None,
+) -> "Response":
+    """Attach an ``ETag`` for the current live version of *entity_uuid*.
+
+    When *entity_id* is omitted, the id is looked up by uuid on
+    *model_cls* first. Call sites that already hold the entity (e.g. via
+    :func:`superset.versioning.api_helpers.resolve_endpoint_path_entity`)
+    should pass *entity_id* so that lookup is not repeated on every
+    list/get versions request.
+
+    The response is returned unchanged when the uuid resolves to no
+    entity, or when the entity has no version rows yet.
+    """
+    # pylint: disable=import-outside-toplevel
+    from superset.daos.version import VersionDAO
+
+    resolved_id = entity_id
+    if resolved_id is None:
+        lookup = sa.select(model_cls.id).where(model_cls.uuid == entity_uuid)
+        resolved_id = db.session.scalar(lookup)
+        if resolved_id is None:
+            return response
+    live_version_uuid = VersionDAO.current_live_version_uuid(
+        model_cls, resolved_id, entity_uuid
+    )
+    return set_version_etag(response, live_version_uuid)
diff --git a/superset/versioning/factory.py b/superset/versioning/factory.py
new file mode 100644
index 000000000000..2e03b8244e33
--- /dev/null
+++ b/superset/versioning/factory.py
@@ -0,0 +1,312 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from __future__ import annotations
+
+import logging
+from collections.abc import Callable
+from typing import Any
+
+import sqlalchemy as sa
+import sqlalchemy.orm as sa_orm
+from sqlalchemy_continuum import is_modified, version_class
+from sqlalchemy_continuum.operation import Operation
+from sqlalchemy_continuum.plugins.base import Plugin
+from sqlalchemy_continuum.plugins.flask import FlaskPlugin
+from sqlalchemy_continuum.transaction import TransactionFactory
+from sqlalchemy_continuum.utils import versioned_column_properties
+
+from superset.utils import json
+from superset.versioning.diff import DASHBOARD_JSON_METADATA_AUDIT_KEYS
+
+logger = logging.getLogger(__name__)
+
+
+def _normalize_dashboard_json_metadata(value: Any) -> Any:
+ """Parse ``dashboards.json_metadata`` and drop frontend-stamped audit
+ sub-keys so a save that only re-stamps ``map_label_colors`` (etc.)
+ compares equal to its predecessor.
+
+ ``map_label_colors`` is regenerated client-side from the
+ ``LabelsColorMap`` singleton on every save (see
+ ``saveDashboardRequest`` in
+ ``superset-frontend/src/dashboard/actions/dashboardState.ts``).
+ The singleton's contents depend on which charts have rendered in
+ the page session, so two saves with no user-authored change produce
+ different bytes. The diff engine ignores the same audit sub-keys
+ (``DASHBOARD_JSON_METADATA_AUDIT_KEYS`` in
+ ``superset/versioning/diff.py``); aligning the skip-plugin's
+ comparison with that filter keeps the two paths consistent.
+ """
+ if value is None or value == "":
+ return value
+ try:
+ parsed = json.loads(value)
+ except (TypeError, ValueError):
+ return value
+ if not isinstance(parsed, dict):
+ return parsed
+ return {
+ k: v for k, v in parsed.items() if k not in DASHBOARD_JSON_METADATA_AUDIT_KEYS
+ }
+
+
+# Per-class column normalizers, keyed on (class_name, column_name). Class
+# name is used (rather than class itself) so importing the model classes
+# at module load is unnecessary — keeps the plugin importable before
+# ``make_versioned()`` has registered the version classes.
+#
+# Each value is a one-argument callable looked up by
+# ``_normalize_for_compare``; (class, column) pairs with no entry here
+# pass through that helper unchanged.
+_COLUMN_NORMALIZERS: dict[tuple[str, str], Callable[[Any], Any]] = {
+ ("Dashboard", "json_metadata"): _normalize_dashboard_json_metadata,
+}
+
+
+def _normalize_for_compare(target: Any, col_name: str, value: Any) -> Any:
+ """Return *value* run through any per-class column normalizer registered
+ in ``_COLUMN_NORMALIZERS``, else *value* unchanged.
+ """
+ normalizer = _COLUMN_NORMALIZERS.get((type(target).__name__, col_name))
+ return normalizer(value) if normalizer is not None else value
+
+
+def _has_dirty_versioned_children(target: Any, uow: Any) -> bool:
+ """Return ``True`` when *uow* contains an operation for a versioned
+ child of *target* (e.g. a ``TableColumn`` whose ``table`` is *target*).
+
+ Used by :meth:`SkipUnmodifiedPlugin._is_no_op_update` so a parent
+ UPDATE that was force-flagged by
+ :func:`baseline.force_parent_dirty_on_child_change` is preserved
+ even though the parent's own scalars match the previous version.
+ """
+ # pylint: disable=import-outside-toplevel
+ from superset.versioning.baseline import child_to_parent_registry
+
+ child_map = child_to_parent_registry()
+ target_cls = type(target)
+ for _key, op in uow.operations.items():
+ entry = child_map.get(type(op.target))
+ if entry is None:
+ continue
+ parent_attr, parent_cls = entry
+ if parent_cls is not target_cls:
+ continue
+ parent = getattr(op.target, parent_attr, None)
+ if parent is target:
+ return True
+ return False
+
+
+class VersionTransactionFactory(TransactionFactory):
+ """TransactionFactory that renames the transaction table and adds a bare
+ ``user_id`` integer column so the FlaskPlugin can record the acting user
+ without requiring a FK relationship to ``ab_user``.
+
+ Continuum only adds ``user_id`` when ``user_cls`` is set on the manager.
+ We add it unconditionally (no FK) so that both the FlaskPlugin's
+ ``transaction_args()`` and our ``baseline.py`` direct inserts can record
+ which user triggered the version event.
+
+ A nullable ``action_kind`` string column is attached in the same way.
+ """
+
+ def create_class(self, manager: Any) -> Any:
+ """Return Continuum's transaction class after renaming its table to
+ ``version_transaction`` and mapping the extra nullable ``user_id``
+ and ``action_kind`` columns onto it. *manager* is forwarded
+ unchanged to the parent factory.
+ """
+ cls = super().create_class(manager)
+ cls.__table__.name = "version_transaction"
+ # Rename the PostgreSQL sequence for consistent naming.
+ for col in cls.__table__.columns:
+ if col.name == "id" and col.default is not None:
+ col.default.name = "version_transaction_id_seq"
+ # Add user_id INTEGER (no FK) for user tracking. The mapper has not
+ # been configured yet at this point, so append_column + add_property
+ # is safe here.
+ user_id_col = sa.Column("user_id", sa.Integer, nullable=True)
+ cls.__table__.append_column(user_id_col)
+ cls.__mapper__.add_property("user_id", sa_orm.column_property(user_id_col))
+ # ``action_kind`` — high-level avenue that produced this commit
+ # (``restore`` / ``import`` / ``clone`` / ``NULL`` for ordinary
+ # saves). The DDL is in the consolidated Alembic migration; we
+ # also declare it on the SQLAlchemy Table here so consumers
+ # can reference ``versioning_manager.transaction_cls.__table__
+ # .c.action_kind`` at runtime, and so the change-record
+ # listener's ``sa.update()`` stamp emits the correctly-quoted
+ # identifier per dialect.
+ action_kind_col = sa.Column("action_kind", sa.String(32), nullable=True)
+ cls.__table__.append_column(action_kind_col)
+ cls.__mapper__.add_property(
+ "action_kind", sa_orm.column_property(action_kind_col)
+ )
+ return cls
+
+
+class VersioningFlaskPlugin(FlaskPlugin):
+ """FlaskPlugin subclass that uses Superset's :func:`get_user_id` (which
+ reads ``g.user``) instead of Flask-Login's ``current_user``. Superset's
+ JWT auth for API routes populates ``g.user`` but leaves
+ ``flask_login.current_user`` anonymous, so the upstream plugin would
+ record ``user_id=NULL`` on version_transaction rows created by API
+ calls. Returns an empty dict (so the transaction row is written
+ anyway) when no user is available — e.g. CLI, Celery, import/export.
+ """
+
+ def transaction_args(self, uow: Any, session: Any) -> dict[str, Any]:
+ # pylint: disable=import-outside-toplevel
+ from flask import has_request_context, request
+
+ from superset.utils.core import get_user_id
+
+ user_id = get_user_id()
+ # get_user_id() returns an int in a real request, or None for a
+ # context-less save (CLI, Celery, import/export). Guard against any
+ # non-int so a bogus value never reaches the integer
+ # ``version_transaction.user_id`` column — notably a mocked ``g`` in
+ # tests, where ``g.user.id`` is a Mock rather than None and would
+ # otherwise blow up the flush with a SQL bind error.
+ if not isinstance(user_id, int):
+ return {}
+
+ remote_addr: str | None
+ try:
+ remote_addr = request.remote_addr if has_request_context() else None
+ except RuntimeError:
+ remote_addr = None
+
+ return {"user_id": user_id, "remote_addr": remote_addr}
+
+
+class SkipUnmodifiedPlugin(Plugin):
+ """Skip creating version rows for UPDATE operations whose post-flush
+ column values are byte-identical to the previous live version row.
+
+ Continuum creates a version row for every entity in ``session.dirty``,
+ including saves where the SQLAlchemy ORM marked a column dirty (because
+ Superset re-serialised ``json_metadata`` via ``json.dumps`` on the save
+ path, or AuditMixin auto-bumped ``changed_on``) but the resulting value
+ is unchanged from the previous version. Those rows pollute the version
+ history with no-op entries.
+
+ ``is_modified()`` from Continuum is not enough: it consults SQLAlchemy's
+ attribute history, which is "did setattr produce a different value?",
+ not "did the final stored value change?". So we compare each
+ non-excluded versioned column on ``operation.target`` against the
+ previous live version row's value; if all are equal, the operation
+ is marked ``processed`` and Continuum skips it (see
+ ``UnitOfWork.create_version_objects``).
+
+ Columns with an entry in ``_COLUMN_NORMALIZERS`` are compared after
+ normalisation rather than byte-for-byte (both sides go through
+ ``_normalize_for_compare``).
+
+ The associated transaction is not removed; if every operation is a
+ no-op the transaction becomes an orphan in ``version_transaction``
+ and is swept by the retention task at cutoff. Deleting the row
+ inline (in this hook) was considered and rejected: it would couple
+ this plugin to the change-records listener's buffer state — both
+ would have to agree that the flush produced nothing before we
+ could safely DROP the tx row, since ``version_changes.transaction_id``
+ has an ON DELETE CASCADE FK that would silently drop any buffered
+ diff records the listener was about to insert. The orphan's storage
+ cost (~40 bytes/row) is small enough that the coordination isn't
+ worth it; retention handles the cleanup correctly by construction
+ (orphans have no parent shadow → they're never "preserved" by the
+ "preserve transactions whose shadow has the live row" rule and
+ age out with the rest of the history).
+ """
+
+ def before_create_version_objects(self, uow: Any, session: Any) -> None:
+ """Flag every no-op UPDATE in *uow* as ``processed``.
+
+ Operations that are already processed, and operations that are not
+ UPDATEs, are left alone. If the no-op check itself raises, the
+ exception is logged and the operation stays unflagged.
+ """
+ # ``uow.operations`` is a custom Continuum ``Operations`` collection;
+ # use its ``.items()`` method (not ``.values()``) to iterate.
+ # INSERTs always create a row (no prior to compare against);
+ # DELETEs can't be no-ops. Only UPDATE operations are candidates.
+ for _key, operation in uow.operations.items():
+ if operation.processed or operation.type != Operation.UPDATE:
+ continue
+ try:
+ if self._is_no_op_update(operation.target, session, uow):
+ operation.processed = True
+ except Exception: # pylint: disable=broad-except
+ # Defensive — if introspection fails for any reason, fall
+ # back to creating the version row.
+ logger.exception(
+ "SkipUnmodifiedPlugin: skip-check raised for %s",
+ type(operation.target).__name__,
+ )
+
+ @classmethod
+ def _is_no_op_update(cls, target: Any, session: Any, uow: Any) -> bool:
+ """Return ``True`` when this UPDATE produces no observable change to
+ any non-excluded versioned column **and** no versioned children of
+ *target* are being modified in this flush.
+
+ Stages:
+
+ 1. If any versioned child (e.g. a ``TableColumn`` whose ``table``
+ is *target*) has an operation in ``uow.operations``, the parent
+ is being force-touched by
+ ``baseline.force_parent_dirty_on_child_change`` to anchor the
+ child changes against a parent shadow row. Keep the row.
+ 2. ``is_modified(target)`` — cheap SQLAlchemy attribute-history
+ check. Returns ``False`` when only excluded columns/relationships
+ (``owners``, ``changed_on``, …) are dirty. This is the common
+ case (every save auto-bumps ``changed_on``); short-circuiting
+ here saves the DB round-trip in stage 3.
+ 3. Compare post-flush column values against the previous live
+ version row's stored values. Catches the case where SQLAlchemy
+ sees a column as dirty (e.g. ``set_dash_metadata`` re-serialised
+ ``json_metadata`` to a different byte sequence) but the
+ resulting parsed content matches the prior version.
+ """
+ # Stage 1: dirty children mean the parent row must be kept.
+ if _has_dirty_versioned_children(target, uow):
+ return False
+ # Stage 2: nothing versioned is dirty, so this is a no-op.
+ if not is_modified(target):
+ return True
+ # Stage 3: fall back to a value comparison against the shadow row.
+ return cls._matches_previous_version(target, session)
+
+ @staticmethod
+ def _matches_previous_version(target: Any, session: Any) -> bool:
+ """Return ``True`` when every non-excluded versioned column on
+ *target* matches the value stored in its previous live version row
+ (i.e., the row with ``end_transaction_id IS NULL``).
+
+ Returns ``False`` for entities with no prior version row — letting
+ Continuum create the first one. In practice this case is rare:
+ ``register_baseline_listener`` (in ``superset.versioning.baseline``)
+ runs ahead of Continuum's ``before_flush`` and inserts a baseline
+ row for any entity being saved for the first time, so the second
+ save (and beyond) is what flows through this path.
+
+ Also returns ``False`` when *target*'s class has no version class
+ or exposes no versioned column properties.
+ """
+ cls = type(target)
+ try:
+ ver_cls = version_class(cls)
+ except Exception: # pylint: disable=broad-except
+ return False
+ ver_table = ver_cls.__table__
+
+ col_keys = [prop.key for prop in versioned_column_properties(target)]
+ if not col_keys:
+ return False
+
+ # The SELECT lists columns in ``col_keys`` order so the result row
+ # can be zipped positionally with ``col_keys`` below.
+ select_stmt = (
+ sa.select(*[ver_table.c[c] for c in col_keys])
+ .where(ver_table.c.id == target.id)
+ .where(ver_table.c.end_transaction_id.is_(None))
+ .order_by(ver_table.c.transaction_id.desc())
+ .limit(1)
+ )
+ row = session.connection().execute(select_stmt).first()
+ if row is None:
+ return False # no previous version → let Continuum create one
+
+ for col_name, prev_value in zip(col_keys, row, strict=False):
+ # Both sides pass through the same per-column normalizer (if any)
+ # before being compared.
+ post = _normalize_for_compare(
+ target, col_name, getattr(target, col_name, None)
+ )
+ pre = _normalize_for_compare(target, col_name, prev_value)
+ if post != pre:
+ return False
+ return True
diff --git a/superset/versioning/queries.py b/superset/versioning/queries.py
new file mode 100644
index 000000000000..ca9f58625827
--- /dev/null
+++ b/superset/versioning/queries.py
@@ -0,0 +1,516 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Read-side queries for the entity-versioning API.
+
+Pure-read helpers that translate Continuum shadow rows and
+``version_changes`` records into the shapes the API endpoints return.
+The corresponding write side (restore) lives in
+:mod:`superset.versioning.restore`. The backward-compat ``VersionDAO``
+façade in :mod:`superset.daos.version` re-exports both.
+
+Also exposes the deterministic version-UUID derivation
+(:data:`VERSION_UUID_NAMESPACE` + :func:`derive_version_uuid`) used by
+both the read endpoints and the ETag emission path in
+:mod:`superset.versioning.etag`.
+"""
+
+from __future__ import annotations
+
+import uuid
+from typing import Any
+from uuid import UUID
+
+import sqlalchemy as sa
+from flask_appbuilder import Model
+from sqlalchemy_continuum import version_class
+
+from superset.extensions import db
+from superset.versioning.baseline import CONTINUUM_BOOKKEEPING_COLUMNS
+
+# Fixed UUIDv5 namespace under which per-(entity, transaction) version UUIDs
+# are derived. Never change this constant — changing it invalidates every
+# version_uuid that clients may have cached, bookmarked, or stored.
+VERSION_UUID_NAMESPACE = UUID("7a6f5d9b-4c3b-5d8e-9a1c-0e2b4c6d8f10")
+
+# Continuum's integer ``operation_type`` mapped to the string the API
+# returns. Kept short and stable for downstream tooling consuming the
+# raw response. Continuum guarantees 0/1/2; anything else is a Continuum
+# version mismatch and surfaces as ``str(int)`` rather than crashing.
+# Code 0 is labelled "baseline" because an op=0 row may be either
+# Continuum's own INSERT row or a synthetic baseline row.
+_OP_TYPE_LABELS: dict[int, str] = {0: "baseline", 1: "update", 2: "delete"}
+
+
+def derive_version_uuid(entity_uuid: UUID, transaction_id: int) -> UUID:
+ """Derive a deterministic UUIDv5 identifying one version row.
+
+ The UUID is a function of the owning entity's UUID and the Continuum
+ ``transaction_id`` of the version row, so it is stable across retention
+ pruning (which never changes ``transaction_id``) and portable across
+ replicas. It is not randomly generated — two Supersets with identical
+ ``(entity.uuid, transaction_id)`` will compute the same version_uuid.
+ """
+ return uuid.uuid5(VERSION_UUID_NAMESPACE, f"{entity_uuid}:{transaction_id}")
+
+
+def _resolve_version_tables(
+ model_cls: type[Model],
+) -> tuple[sa.Table, sa.Table, sa.Table]:
+ """Return the (version, transaction, user) ``Table`` objects used by the
+ listing and snapshot queries.
+
+ All three lookups happen inside this module on every read; centralising
+ the trio (a) keeps the imports in one place and (b) makes the join helper
+ below take a uniform signature.
+ """
+ # pylint: disable=import-outside-toplevel
+ from sqlalchemy_continuum import versioning_manager
+
+ from superset import security_manager
+
+ ver_tbl = version_class(model_cls).__table__
+ tx_tbl = versioning_manager.transaction_cls.__table__
+ user_tbl = security_manager.user_model.__table__
+ return ver_tbl, tx_tbl, user_tbl
+
+
+def _version_with_tx_user_join(
+ ver_tbl: sa.Table, tx_tbl: sa.Table, user_tbl: sa.Table
+) -> Any:
+ """Build the version → transaction → user left-join used by both
+ :func:`list_versions` and :func:`get_version`. The user-side join is
+ a left-outer so saves with no Flask user context (CLI, Celery, import)
+ still surface in the result with ``changed_by = None``.
+ """
+ return ver_tbl.join(tx_tbl, ver_tbl.c.transaction_id == tx_tbl.c.id).outerjoin(
+ user_tbl, tx_tbl.c.user_id == user_tbl.c.id
+ )
+
+
+def _baseline_first_ordering(ver_tbl: sa.Table) -> tuple[Any, ...]:
+ """Order ``(operation_type != 0).asc(), transaction_id.asc()`` so any
+ op=0 row — Continuum's INSERT or our synthetic baseline — sorts to
+ position 0 regardless of its transaction_id. A single entity never has
+ more than one op=0 row (Continuum tracks one creation per live entity;
+ our baseline listener only fires when no prior version rows exist), so
+ this gives a stable chronological order with the "original" version
+ always first.
+ """
+ return (
+ (ver_tbl.c.operation_type != 0).asc(),
+ ver_tbl.c.transaction_id.asc(),
+ )
+
+
+def _user_select_cols(user_tbl: sa.Table) -> list[Any]:
+ """Columns to select from ``user_tbl`` to build a ``changed_by`` dict.
+ Labels ``user_tbl.c.id`` as ``"user_id"`` so callers can read the row
+ by a stable key regardless of whether they also select the version
+ table's ``id`` column.
+ """
+ return [
+ user_tbl.c.id.label("user_id"),
+ user_tbl.c.username,
+ user_tbl.c.first_name,
+ user_tbl.c.last_name,
+ ]
+
+
+def _changed_by_from_row(row: Any) -> dict[str, Any] | None:
+ """Project the user columns from a query row onto the API's
+ ``changed_by`` shape, or ``None`` for saves with no Flask user context
+ (CLI / Celery / import / unauthenticated). Expects the user columns to
+ have been selected via :func:`_user_select_cols` so the row keys are
+ ``user_id`` / ``username`` / ``first_name`` / ``last_name``.
+ """
+ if row["user_id"] is None:
+ return None
+ return {
+ "id": row["user_id"],
+ "username": row["username"],
+ "first_name": row["first_name"],
+ "last_name": row["last_name"],
+ }
+
+
+def _entity_kind_for(model_cls: type[Model]) -> str | None:
+ """Return the ``version_changes.entity_kind`` value for *model_cls*, or
+ ``None`` when the class isn't in the change-records taxonomy."""
+ # pylint: disable=import-outside-toplevel
+ from superset.versioning.changes import ENTITY_KIND_BY_CLASS_NAME
+
+ return ENTITY_KIND_BY_CLASS_NAME.get(model_cls.__name__)
+
+
+def find_active_by_uuid(model_cls: type[Model], entity_uuid: UUID) -> Any | None:
+ """Return the live entity matching *entity_uuid*, or None if not found."""
+ return (
+ db.session.query(model_cls).filter(model_cls.uuid == entity_uuid).one_or_none()
+ )
+
+
+def _get_version_count(model_cls: type[Model], entity_id: int) -> int:
+ """Return the number of historical version rows for *entity_id*."""
+ ver_cls = version_class(model_cls)
+ return (
+ db.session.query(sa.func.count())
+ .select_from(ver_cls)
+ .filter(ver_cls.id == entity_id)
+ .scalar()
+ or 0
+ )
+
+
+def current_version_number(model_cls: type[Model], entity_id: int) -> int | None:
+ """Return the 0-based ``version_number`` of the live row for *entity_id*
+ — equivalent to the index of the most recent entry that
+ :func:`list_versions` would return, or ``None`` when the entity has no
+ version rows yet.
+
+ Note: this index is *unstable under retention pruning*. The scheduled
+ retention task drops shadow rows older than the configured
+ retention window, so the same integer can refer to different rows
+ before and after a prune cycle. Use
+ :func:`current_live_transaction_id` for a stable identifier.
+ """
+ count = _get_version_count(model_cls, entity_id)
+ return count - 1 if count > 0 else None
+
+
+def current_live_transaction_id(model_cls: type[Model], entity_id: int) -> int | None:
+ """Return the Continuum ``transaction_id`` of the live row for
+ *entity_id* — stable across retention pruning, unlike the index
+ returned by :func:`current_version_number`.
+ """
+ ver_cls = version_class(model_cls)
+ row = (
+ db.session.query(ver_cls.transaction_id)
+ .filter(ver_cls.id == entity_id)
+ .filter(ver_cls.end_transaction_id.is_(None))
+ .order_by(ver_cls.transaction_id.desc())
+ .limit(1)
+ .first()
+ )
+ return row[0] if row else None
+
+
+def current_live_version_uuid(
+ model_cls: type[Model], entity_id: int, entity_uuid: UUID
+) -> UUID | None:
+ """Return the deterministic ``version_uuid`` of the live row, or
+ ``None`` when the entity has no version rows yet."""
+ tx_id = current_live_transaction_id(model_cls, entity_id)
+ if tx_id is None:
+ return None
+ return derive_version_uuid(entity_uuid, tx_id)
+
+
+def list_change_records_batch(
+ entity_kind: str,
+ entity_id: int,
+ transaction_ids: list[int],
+) -> dict[int, list[dict[str, Any]]]:
+ """Return ``version_changes`` rows keyed by ``transaction_id``.
+
+ Batches the lookup across multiple transactions with a single
+ ``WHERE transaction_id IN (...) AND entity_kind = ? AND entity_id = ?``
+ query so the list endpoint avoids N+1 round-trips. Rows are
+ distributed into per-tx lists sorted by ``sequence`` ascending
+ (matching the replay order the diff engine emits). Missing
+ transactions are represented by an empty list in the result so
+ callers can use ``result.get(tx_id, [])`` without guarding.
+
+ If the ``version_changes`` table is missing (pre-migration or
+ freshly downgraded), returns an empty dict rather than propagating
+ the error — consistent with this being a descriptive layer that
+ should not break the list endpoint.
+ """
+ # pylint: disable=import-outside-toplevel
+ from superset.versioning.changes import version_changes_table
+
+ if not transaction_ids:
+ return {}
+
+ # SAVEPOINT so a missing-table failure can't poison the enclosing
+ # transaction: on PostgreSQL a failed statement aborts the tx, and
+ # every later query in the request would raise InFailedSqlTransaction
+ # even though the exception below was caught.
+ try:
+ with db.session.connection().begin_nested():
+ rows = (
+ db.session.connection()
+ .execute(
+ sa.select(
+ version_changes_table.c.transaction_id,
+ version_changes_table.c.sequence,
+ version_changes_table.c.kind,
+ version_changes_table.c.path,
+ version_changes_table.c.from_value,
+ version_changes_table.c.to_value,
+ )
+ .where(
+ version_changes_table.c.entity_kind == entity_kind,
+ version_changes_table.c.entity_id == entity_id,
+ version_changes_table.c.transaction_id.in_(transaction_ids),
+ )
+ .order_by(
+ version_changes_table.c.transaction_id.asc(),
+ version_changes_table.c.sequence.asc(),
+ )
+ )
+ .mappings()
+ .all()
+ )
+ except (sa.exc.OperationalError, sa.exc.ProgrammingError):
+ # Missing version_changes table: OperationalError on SQLite/MySQL,
+ # ProgrammingError (UndefinedTable) on PostgreSQL.
+ return {}
+
+ grouped: dict[int, list[dict[str, Any]]] = {tx: [] for tx in transaction_ids}
+ for row in rows:
+ grouped[row["transaction_id"]].append(
+ {
+ "kind": row["kind"],
+ "path": row["path"],
+ "from_value": row["from_value"],
+ "to_value": row["to_value"],
+ }
+ )
+ return grouped
+
+
+def list_versions(
+ model_cls: type[Model],
+ entity_uuid: UUID,
+ *,
+ entity: Any | None = None,
+) -> list[dict[str, Any]] | None:
+ """Return the version history for the entity identified by *entity_uuid*.
+
+ Returns ``None`` when no active entity matches the UUID — callers should
+ translate that into a 404. Returns an empty list when the entity exists
+ but has no version rows yet (pre-migration, or never edited).
+
+ The list is ordered by ``transaction_id`` ascending and each entry is
+ assigned a 0-based sequential ``version_number``. ``operation_type`` is
+ mapped from Continuum's integer constants to a string (``0`` → baseline,
+ ``1`` → update, ``2`` → delete). ``changed_by`` is the User row keyed
+ off ``version_transaction.user_id``, or ``None`` when the save had no
+ Flask user context (CLI, import, etc.).
+
+ Pass *entity* to skip the ``find_active_by_uuid`` lookup when the
+ caller has already resolved the entity (API handlers do this to enforce
+ ``raise_for_ownership`` before calling here). The skip saves one
+ ``WHERE uuid = ?`` query — that lookup isn't identity-map-cacheable
+ because ``uuid`` is a unique non-PK column.
+ """
+ if entity is None:
+ entity = find_active_by_uuid(model_cls, entity_uuid)
+ if entity is None:
+ return None
+
+ ver_tbl, tx_tbl, user_tbl = _resolve_version_tables(model_cls)
+ stmt = (
+ sa.select(
+ ver_tbl.c.transaction_id,
+ ver_tbl.c.operation_type,
+ tx_tbl.c.issued_at,
+ *_user_select_cols(user_tbl),
+ )
+ .select_from(_version_with_tx_user_join(ver_tbl, tx_tbl, user_tbl))
+ .where(ver_tbl.c.id == entity.id)
+ .order_by(*_baseline_first_ordering(ver_tbl))
+ )
+ rows = db.session.execute(stmt).mappings().all()
+
+ # Batch-load change records for every listed transaction in one query.
+ # ``entity_kind`` is derived from the model class so the API
+ # filter ``WHERE entity_kind = 'chart' AND entity_id = ?`` can be
+ # precise when multiple versioned entities share a flush.
+ changes_by_tx: dict[int, list[dict[str, Any]]] = {}
+ if (entity_kind := _entity_kind_for(model_cls)) is not None:
+ tx_ids = [row["transaction_id"] for row in rows]
+ changes_by_tx = list_change_records_batch(entity_kind, entity.id, tx_ids)
+
+ return [
+ {
+ "version_uuid": derive_version_uuid(entity_uuid, row["transaction_id"]),
+ "version_number": version_number,
+ "transaction_id": row["transaction_id"],
+ "operation_type": _OP_TYPE_LABELS.get(
+ row["operation_type"], str(row["operation_type"])
+ ),
+ "issued_at": row["issued_at"],
+ "changed_by": _changed_by_from_row(row),
+ "changes": changes_by_tx.get(row["transaction_id"], []),
+ }
+ for version_number, row in enumerate(rows)
+ ]
+
+
+def resolve_version_uuid(
+ model_cls: type[Model],
+ entity_uuid: UUID,
+ version_uuid: UUID,
+ *,
+ entity: Any | None = None,
+) -> int | None:
+ """Translate a ``version_uuid`` into the 0-based ``version_number`` that
+ :func:`superset.versioning.restore.restore_version` accepts, or ``None``
+ when the UUID does not match any version row of the given entity.
+
+ Ordering matches :func:`list_versions` — op=0 rows first, then by
+ transaction_id — so the version_number returned here is the same index
+ a client would see in the list response.
+
+ Implementation note: the loop re-derives ``version_uuid`` per
+ transaction in Python because there's no portable SQL form for a
+ UUIDv5 derivation across PostgreSQL / MySQL / SQLite (Postgres has
+ ``uuid_generate_v5``; the other two do not). The iteration count is
+ bounded by the configured retention window worth of edits — the
+ retention task ages older shadow rows out — so the
+ practical N is at most a few hundred. If retention is ever
+ disabled on a heavily-edited entity, this loop is the
+ place to revisit.
+
+ Pass *entity* to skip the ``find_active_by_uuid`` lookup; see
+ :func:`list_versions` for the rationale.
+ """
+ if entity is None:
+ entity = find_active_by_uuid(model_cls, entity_uuid)
+ if entity is None:
+ return None
+
+ ver_cls = version_class(model_cls)
+ tx_ids = (
+ db.session.query(ver_cls.transaction_id)
+ .filter(ver_cls.id == entity.id)
+ .order_by(
+ (ver_cls.operation_type != 0).asc(),
+ ver_cls.transaction_id.asc(),
+ )
+ .all()
+ )
+ for version_number, (tx_id,) in enumerate(tx_ids):
+ if derive_version_uuid(entity_uuid, tx_id) == version_uuid:
+ return version_number
+ return None
+
+
+def get_version(
+ model_cls: type[Model],
+ entity_uuid: UUID,
+ version_uuid: UUID,
+ *,
+ entity: Any | None = None,
+) -> dict[str, Any] | None:
+ """Return the entity's state at the specified version as a dict.
+
+ Read-only — nothing in the live database is modified. The returned
+ shape is intended to mirror a regular single-entity GET response
+ (scalar columns plus restored ``columns`` / ``metrics`` lists for
+ ``SqlaTable``), with a ``_version`` key holding the version-level
+ metadata (version_uuid, version_number, transaction_id,
+ operation_type, issued_at, changed_by, changes) so callers can tell
+ which version they're looking at.
+
+ Returns ``None`` when either *entity_uuid* or *version_uuid* does not
+ match — callers should translate to 404.
+
+ Pass *entity* to skip the ``find_active_by_uuid`` lookup; see
+ :func:`list_versions` for the rationale. The same *entity* is threaded
+ into :func:`resolve_version_uuid` to eliminate a second redundant
+ lookup on the same request.
+ """
+ # pylint: disable=import-outside-toplevel
+ from superset.connectors.sqla.models import SqlaTable
+
+ if entity is None:
+ entity = find_active_by_uuid(model_cls, entity_uuid)
+ if entity is None:
+ return None
+
+ version_num = resolve_version_uuid(
+ model_cls, entity_uuid, version_uuid, entity=entity
+ )
+ if version_num is None:
+ return None
+
+ # Fetch the single shadow row sitting at index ``version_num`` under the
+ # baseline-first ordering, together with its transaction timestamp and
+ # the acting user's columns.
+ ver_tbl, tx_tbl, user_tbl = _resolve_version_tables(model_cls)
+ stmt = (
+ sa.select(
+ ver_tbl,
+ tx_tbl.c.issued_at,
+ *_user_select_cols(user_tbl),
+ )
+ .select_from(_version_with_tx_user_join(ver_tbl, tx_tbl, user_tbl))
+ .where(ver_tbl.c.id == entity.id)
+ .order_by(*_baseline_first_ordering(ver_tbl))
+ .offset(version_num)
+ .limit(1)
+ )
+ row = db.session.execute(stmt).mappings().first()
+ if row is None:
+ # NOTE(review): presumably only reachable if the history changed
+ # between the resolve query and this one — confirm.
+ return None
+
+ # Project the entity's own scalar fields, skipping versioning
+ # metadata columns.
+ result: dict[str, Any] = {}
+ for col in ver_tbl.columns:
+ if col.name in CONTINUUM_BOOKKEEPING_COLUMNS:
+ continue
+ value = row[col.name]
+ # uuid columns come back as UUID instances; make them JSON-safe.
+ if isinstance(value, UUID):
+ value = str(value)
+ result[col.name] = value
+
+ # Change records for this one transaction; stays empty when the model
+ # class has no entity kind or the transaction has no records.
+ changes: list[dict[str, Any]] = []
+ if (entity_kind := _entity_kind_for(model_cls)) is not None:
+ changes = list_change_records_batch(
+ entity_kind, entity.id, [row["transaction_id"]]
+ ).get(row["transaction_id"], [])
+
+ result["_version"] = {
+ "version_uuid": str(version_uuid),
+ "version_number": version_num,
+ "transaction_id": row["transaction_id"],
+ "operation_type": _OP_TYPE_LABELS.get(
+ row["operation_type"], str(row["operation_type"])
+ ),
+ "issued_at": row["issued_at"],
+ "changed_by": _changed_by_from_row(row),
+ "changes": changes,
+ }
+
+ # For datasets, attach the columns/metrics as they were at this
+ # transaction by reading from Continuum's child shadow tables
+ # (``table_columns_version`` / ``sql_metrics_version``). Empty lists
+ # when the dataset had no children at this tx.
+ if model_cls is SqlaTable:
+ # pylint: disable=import-outside-toplevel
+ from superset.connectors.sqla.models import SqlMetric, TableColumn
+ from superset.versioning.changes import shadow_rows_valid_at
+
+ target_tx = row["transaction_id"]
+ cols_tbl = version_class(TableColumn).__table__
+ metrics_tbl = version_class(SqlMetric).__table__
+ result["columns"] = shadow_rows_valid_at(
+ db.session, cols_tbl, "table_id", entity.id, target_tx
+ )
+ result["metrics"] = shadow_rows_valid_at(
+ db.session, metrics_tbl, "table_id", entity.id, target_tx
+ )
+
+ return result
diff --git a/superset/versioning/schemas.py b/superset/versioning/schemas.py
new file mode 100644
index 000000000000..9fa51a8432e7
--- /dev/null
+++ b/superset/versioning/schemas.py
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Shared Marshmallow schemas for entity version history endpoints.
+
+Consumed by ChartRestApi, DashboardRestApi, and DatasetRestApi — the response
+shape is identical across all three resources, so the schemas live here to
+avoid triplicated definitions.
+"""
+
+from __future__ import annotations
+
+from marshmallow import fields, Schema
+
+
+class VersionChangedBySchema(Schema):
+ """Subset of the User model included in each version history entry."""
+
+ id = fields.Integer()
+ username = fields.String()
+ first_name = fields.String()
+ last_name = fields.String()
+
+
+class VersionChangeRecordSchema(Schema):
+ """One field-level diff hunk from ``version_changes``.
+
+ The frontend renders human-readable prose from (``kind``,
+ ``from_value``, ``to_value``) via Flask-Babel. Server-side the
+ shape is deliberately machine-readable only — see spec FR-019.
+ """
+
+ kind = fields.String(
+ metadata={
+ "description": (
+ "Semantic category of the change. First-class values in V1: "
+ "'filter', 'metric', 'dimension', 'column', 'chart', "
+ "'time_range', 'color_palette'. Falls back to 'field' for "
+ "generic scalar changes that don't map to a named kind."
+ )
+ },
+ )
+ path = fields.Raw(
+ metadata={
+ "description": (
+ "Array of segments locating the change in the entity's state. "
+ "Example: ['params', 'adhoc_filters', 'country']."
+ )
+ },
+ )
+ from_value = fields.Raw(
+ allow_none=True,
+ metadata={
+ "description": (
+ "Value at path before the save; null when the field did not exist."
+ ),
+ },
+ )
+ to_value = fields.Raw(
+ allow_none=True,
+ metadata={
+ "description": (
+ "Value at path after the save; null when the field was removed."
+ ),
+ },
+ )
+
+
+class VersionListItemSchema(Schema):
+ """A single version row in the version history response."""
+
+ version_uuid = fields.UUID(
+ metadata={
+ "description": (
+ "Deterministic UUIDv5 derived from the entity UUID and the "
+ "Continuum transaction id — stable across replicas and "
+ "retention pruning. The handle accepted by the get/restore "
+ "version endpoints."
+ )
+ },
+ )
+ version_number = fields.Integer(
+ metadata={"description": "0-based position in the history, oldest first"},
+ )
+ transaction_id = fields.Integer(
+ metadata={"description": "Underlying Continuum transaction id"},
+ )
+ operation_type = fields.String(
+ metadata={
+ "description": (
+ "One of 'baseline', 'update', or 'delete', derived from the "
+ "Continuum integer constant. Restore is not a distinct "
+ "operation_type: a restore surfaces as an ordinary 'update' "
+ "transaction."
+ )
+ },
+ )
+ issued_at = fields.DateTime(
+ metadata={"description": "UTC timestamp of the commit that produced the row"},
+ )
+ changed_by = fields.Nested(
+ VersionChangedBySchema,
+ allow_none=True,
+ metadata={
+ "description": (
+ "User who produced the version, or null when the commit had no "
+ "authenticated Flask user (CLI, Celery, import)."
+ )
+ },
+ )
+ changes = fields.List(
+ fields.Nested(VersionChangeRecordSchema),
+ metadata={
+ "description": (
+ "Structured diff records describing the atomic field-level "
+ "changes at this version, ordered by emission sequence. "
+ "Empty for baseline (op=0) transactions per spec M4."
+ )
+ },
+ )
+
+
+class VersionListResponseSchema(Schema):
+ """Envelope for version list responses."""
+
+ result = fields.List(fields.Nested(VersionListItemSchema))
+ count = fields.Integer()
diff --git a/superset/versioning/utils.py b/superset/versioning/utils.py
new file mode 100644
index 000000000000..e09f133bf1dd
--- /dev/null
+++ b/superset/versioning/utils.py
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Shared session helpers used by the entity-versioning machinery."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from contextlib import contextmanager
+from typing import Any
+
+import sqlalchemy as sa
+from sqlalchemy.orm import Session
+
+
+@contextmanager
+def single_flush_scope(session: Session) -> Iterator[None]:
+ """Suppress autoflushes inside the block, flush once on clean exit.
+
+ Intended for operations that (a) make multiple mutations across
+ relationships and (b) issue intermediate queries which would
+ otherwise autoflush. Iterating from one relationship to another
+ inside SQLAlchemy-Continuum's ``Reverter`` is the canonical case:
+ a mid-iteration autoflush transitions pending DELETEs to
+ ``state.deleted=True``, and the subsequent
+ ``session.add(version_parent)`` cascade walk trips on the
+ deleted-state instances with ``InvalidRequestError``. Wrapping the
+ whole revert keeps marked-for-deletion instances in
+ ``state.persistent`` until the trailing flush drains DELETEs +
+ INSERTs in one atomic step. That single flush is also load-bearing
+ for the ``after_flush`` change-records listener — splitting the
+ work across multiple flushes would split it across multiple
+ Continuum transactions, and the listener's tx-dedup guard would
+ silently drop the second pass's records.
+
+ On exception, the trailing flush is skipped — the session's normal
+ rollback flow handles cleanup, and flushing a partially-mutated
+ state would be wrong.
+ """
+ with session.no_autoflush:
+ yield
+ session.flush()
+
+
+def read_row_outside_flush(
+ session: Session, table: sa.Table, entity_id: int
+) -> dict[str, Any] | None:
+ """Read the row with ``id == entity_id`` from *table* without triggering
+ an autoflush. Returns the row as a plain dict, or ``None`` when no row
+ matches.
+
+ The companion read primitive to :func:`single_flush_scope`. Listeners
+ that need pre-flush state (the row as it existed *before* the in-flight
+ edit was staged) use this — without ``no_autoflush``, the
+ ``session.connection().execute(...)`` would itself trigger a flush of
+ the pending edit, leaving "pre" and "post" indistinguishable.
+
+ Returns ``dict[str, Any]`` rather than ``RowMapping`` so callers don't
+ accidentally hold a cursor-bound object past the listener boundary.
+ """
+ with session.no_autoflush:
+ result = (
+ session.connection()
+ .execute(sa.select(table).where(table.c.id == entity_id))
+ .mappings()
+ .one_or_none()
+ )
+ return dict(result) if result else None
diff --git a/tests/integration_tests/migrations/composite_pk_association_tables__tests.py b/tests/integration_tests/migrations/composite_pk_association_tables__tests.py
new file mode 100644
index 000000000000..05097ef308d6
--- /dev/null
+++ b/tests/integration_tests/migrations/composite_pk_association_tables__tests.py
@@ -0,0 +1,137 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Schema-shape assertion tests for the composite-PK association-tables
+migration (revision 2bee73611e32).
+
+Builds the pre-migration shape against an isolated in-memory SQLite engine,
+runs the migration's ``upgrade()``, and asserts the resulting shape: no
+``id`` column, composite PK on the two FK columns, and no redundant
+``UNIQUE(fk1, fk2)`` on the two tables that previously carried one.
+
+Continuum-restore verification is OUT OF SCOPE; that work lives in the
+entity-versioning follow-up. Cross-backend verification (PostgreSQL,
+MySQL) is handled by CI's test-postgres / test-mysql shards.
+"""
+
+from importlib import import_module
+
+import pytest
+import sqlalchemy as sa
+from alembic.migration import MigrationContext
+from alembic.operations import Operations
+from sqlalchemy import inspect
+
+# Import the migration module under test.
+_migration = import_module(
+ "superset.migrations.versions."
+ "2026-05-01_23-36_2bee73611e32_composite_pk_association_tables"
+)
+AFFECTED_TABLES = _migration.AFFECTED_TABLES
+TABLES_WITH_PRE_EXISTING_UNIQUE = _migration.TABLES_WITH_PRE_EXISTING_UNIQUE
+TABLES_WITH_NULLABLE_FKS = _migration.TABLES_WITH_NULLABLE_FKS
+
+
+@pytest.fixture(scope="module")
+def post_upgrade_engine() -> sa.engine.Engine:
+ """An isolated in-memory SQLite engine with the migration applied to a
+ pre-migration-shaped seed schema. Used by the post-upgrade assertions
+ below. Module-scoped so the upgrade only runs once per module.
+
+ FK columns are NULLABLE on the six tables that historically allowed
+ NULLs — with ``nullable=False`` here, ``test_fk_columns_not_null``
+ would pass trivially rather than because the migration promoted
+ anything."""
+ engine = sa.create_engine("sqlite:///:memory:")
+ md = sa.MetaData()
+ for t in AFFECTED_TABLES:
+ nullable = t.name in TABLES_WITH_NULLABLE_FKS
+ cols: list[sa.SchemaItem] = [
+ sa.Column("id", sa.Integer, primary_key=True),
+ sa.Column(t.fk1, sa.Integer, nullable=nullable),
+ sa.Column(t.fk2, sa.Integer, nullable=nullable),
+ ]
+ constraints: list[sa.SchemaItem] = []
+ if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE:
+ constraints.append(sa.UniqueConstraint(t.fk1, t.fk2))
+ sa.Table(t.name, md, *cols, *constraints)
+ md.create_all(engine)
+
+ # Apply the migration's upgrade() against this engine via Alembic's
+ # MigrationContext, patching the migration module's ``op`` reference.
+ with engine.connect() as conn:
+ ctx = MigrationContext.configure(conn)
+ ops = Operations(ctx)
+ original_op = _migration.op
+ _migration.op = ops # type: ignore[attr-defined]
+ try:
+ _migration.upgrade()
+ finally:
+ _migration.op = original_op # type: ignore[attr-defined]
+ return engine
+
+
+@pytest.mark.parametrize("t", AFFECTED_TABLES, ids=lambda t: t.name)
+def test_no_id_column(post_upgrade_engine: sa.engine.Engine, t) -> None:
+ """The synthetic ``id`` column is gone from each affected table."""
+ insp = inspect(post_upgrade_engine)
+ column_names = {c["name"] for c in insp.get_columns(t.name)}
+ assert "id" not in column_names, (
+ f"{t.name} still has an 'id' column after migration; "
+ f"composite-PK conversion incomplete"
+ )
+
+
+@pytest.mark.parametrize("t", AFFECTED_TABLES, ids=lambda t: t.name)
+def test_primary_key_is_composite_fks(post_upgrade_engine: sa.engine.Engine, t) -> None:
+ """The primary key of each affected table is exactly ``(fk1, fk2)``."""
+ insp = inspect(post_upgrade_engine)
+ pk_cols = set(insp.get_pk_constraint(t.name).get("constrained_columns", []))
+ assert pk_cols == {t.fk1, t.fk2}, (
+ f"{t.name} primary key is {pk_cols}, expected {{{t.fk1}, {t.fk2}}}"
+ )
+
+
+@pytest.mark.parametrize(
+ "t",
+ [t for t in AFFECTED_TABLES if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE],
+ ids=lambda t: t.name,
+)
+def test_redundant_unique_dropped(post_upgrade_engine: sa.engine.Engine, t) -> None:
+ """For the two tables that previously carried a UNIQUE(fk1, fk2), that
+ constraint is now subsumed by the composite PK and must not appear
+ separately in the unique-constraint list."""
+ insp = inspect(post_upgrade_engine)
+ redundant_pair = {t.fk1, t.fk2}
+ for uc in insp.get_unique_constraints(t.name):
+ cols = set(uc.get("column_names", []))
+ assert cols != redundant_pair, (
+ f"{t.name} still carries a redundant UniqueConstraint over "
+ f"{redundant_pair} (name={uc.get('name')!r}); "
+ f"composite-PK conversion incomplete"
+ )
+
+
+@pytest.mark.parametrize("t", AFFECTED_TABLES, ids=lambda t: t.name)
+def test_fk_columns_not_null(post_upgrade_engine: sa.engine.Engine, t) -> None:
+ """PK promotion implicitly tightens the FK columns to NOT NULL."""
+ insp = inspect(post_upgrade_engine)
+ cols_by_name = {c["name"]: c for c in insp.get_columns(t.name)}
+ for col in (t.fk1, t.fk2):
+ assert col in cols_by_name, f"{t.name} missing column {col}"
+ assert cols_by_name[col].get("nullable") is False, (
+ f"{t.name}.{col} is nullable; expected NOT NULL after PK promotion"
+ )
diff --git a/tests/integration_tests/migrations/composite_pk_round_trip__tests.py b/tests/integration_tests/migrations/composite_pk_round_trip__tests.py
new file mode 100644
index 000000000000..3a010889799f
--- /dev/null
+++ b/tests/integration_tests/migrations/composite_pk_round_trip__tests.py
@@ -0,0 +1,200 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Schema round-trip tests for the composite-PK association-tables migration
+(revision 2bee73611e32). Builds the pre-migration shape against an in-memory
+SQLite engine, runs the migration's ``upgrade()``, asserts the post-upgrade
+shape, runs ``downgrade()``, asserts the prior shape is restored (modulo the
+documented FK NOT NULL asymmetry), and re-runs ``upgrade()`` to verify
+idempotency.
+
+This is run against an isolated in-memory engine via Alembic's
+``MigrationContext`` so the test does not perturb the project's test DB.
+
+Cross-backend (Postgres/MySQL) verification is handled by CI's
+test-postgres / test-mysql shards running ``superset db upgrade``. This
+file covers the SQLite slice.
+"""
+
+from importlib import import_module
+from typing import Any
+
+import sqlalchemy as sa
+from alembic.migration import MigrationContext
+from alembic.operations import Operations
+from sqlalchemy import inspect
+
+# Import the migration module under test.
+_migration = import_module(
+ "superset.migrations.versions."
+ "2026-05-01_23-36_2bee73611e32_composite_pk_association_tables"
+)
+AFFECTED_TABLES = _migration.AFFECTED_TABLES
+TABLES_WITH_PRE_EXISTING_UNIQUE = _migration.TABLES_WITH_PRE_EXISTING_UNIQUE
+TABLES_WITH_NULLABLE_FKS = _migration.TABLES_WITH_NULLABLE_FKS
+
+
+def _build_pre_migration_schema(engine: sa.engine.Engine) -> None:
+ """Recreate the eight tables in their pre-migration shape (surrogate
+ ``id INTEGER PRIMARY KEY`` plus an optional ``UNIQUE(fk1, fk2)`` on the
+ two tables that previously carried one). FK columns are NULLABLE on
+ the six tables that historically allowed NULLs — fidelity matters:
+ with ``nullable=False`` here, the post-upgrade NOT NULL assertions
+ pass trivially rather than because the migration promoted anything,
+ and the NULL-row cleanup path can't be exercised. FKs to parent
+ tables are omitted to keep the test self-contained — we're testing
+ schema transformations, not FK enforcement."""
+ md = sa.MetaData()
+ for t in AFFECTED_TABLES:
+ nullable = t.name in TABLES_WITH_NULLABLE_FKS
+ cols: list[sa.Column] = [
+ sa.Column("id", sa.Integer, primary_key=True),
+ sa.Column(t.fk1, sa.Integer, nullable=nullable),
+ sa.Column(t.fk2, sa.Integer, nullable=nullable),
+ ]
+ constraints: list[sa.SchemaItem] = []
+ if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE:
+ constraints.append(sa.UniqueConstraint(t.fk1, t.fk2))
+ sa.Table(t.name, md, *cols, *constraints)
+ md.create_all(engine)
+
+
+def _shape(engine: sa.engine.Engine, table: str) -> dict[str, Any]:
+ """Return a structural summary for asserting equality across runs."""
+ insp = inspect(engine)
+ pk = insp.get_pk_constraint(table).get("constrained_columns", [])
+ columns = sorted(c["name"] for c in insp.get_columns(table))
+ uniques = sorted(
+ tuple(sorted(uc.get("column_names", [])))
+ for uc in insp.get_unique_constraints(table)
+ )
+ return {"columns": columns, "pk": sorted(pk), "uniques": uniques}
+
+
+def _run_with_alembic_context(engine: sa.engine.Engine, fn) -> None:
+ """Run ``fn()`` (the migration's upgrade/downgrade body) inside a fresh
+ Alembic ``MigrationContext`` bound to ``engine``. Patches the
+ migration module's ``op`` to point at this context so its
+ ``op.get_bind()`` and ``op.batch_alter_table`` calls execute against
+ the in-memory engine."""
+ with engine.connect() as conn:
+ ctx = MigrationContext.configure(conn)
+ ops = Operations(ctx)
+ original_op = _migration.op
+ _migration.op = ops # type: ignore[attr-defined]
+ try:
+ fn()
+ finally:
+ _migration.op = original_op # type: ignore[attr-defined]
+
+
+def test_round_trip_against_in_memory_sqlite() -> None:
+ """Round-trip: pre-migration → upgrade → downgrade → upgrade again.
+
+ Asserts:
+ - Post-upgrade shape: no ``id``, composite PK on (fk1, fk2), no
+ UNIQUE(fk1, fk2) on the two tables that previously carried one.
+ - Post-downgrade shape: ``id`` restored, PK back on (id), UNIQUE
+ re-added on the two tables. (FK columns remain NOT NULL — the
+ documented intentional asymmetry.)
+ - Post-re-upgrade idempotency: shape matches the first post-upgrade.
+ """
+ engine = sa.create_engine("sqlite:///:memory:")
+ _build_pre_migration_schema(engine)
+
+ _run_with_alembic_context(engine, _migration.upgrade)
+
+ for t in AFFECTED_TABLES:
+ s = _shape(engine, t.name)
+ assert "id" not in s["columns"], f"{t.name}: id still present post-upgrade: {s}"
+ assert s["pk"] == sorted([t.fk1, t.fk2]), (
+ f"{t.name}: PK is {s['pk']}, expected {sorted([t.fk1, t.fk2])}"
+ )
+ assert tuple(sorted([t.fk1, t.fk2])) not in s["uniques"], (
+ f"{t.name}: redundant UNIQUE not dropped post-upgrade: {s['uniques']}"
+ )
+
+ post_upgrade_shape = {t.name: _shape(engine, t.name) for t in AFFECTED_TABLES}
+
+ _run_with_alembic_context(engine, _migration.downgrade)
+
+ for t in AFFECTED_TABLES:
+ s = _shape(engine, t.name)
+ assert "id" in s["columns"], f"{t.name}: id not restored post-downgrade: {s}"
+ assert s["pk"] == ["id"], f"{t.name}: PK is {s['pk']}, expected ['id']"
+ if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE:
+ assert tuple(sorted([t.fk1, t.fk2])) in s["uniques"], (
+ f"{t.name}: UNIQUE not restored post-downgrade: {s['uniques']}"
+ )
+
+ _run_with_alembic_context(engine, _migration.upgrade)
+
+ re_upgrade_shape = {t.name: _shape(engine, t.name) for t in AFFECTED_TABLES}
+ assert re_upgrade_shape == post_upgrade_shape, (
+ "Re-upgrade shape differs from initial upgrade shape — "
+ "migration is not idempotent. "
+ f"diff: {set(re_upgrade_shape.items()) ^ set(post_upgrade_shape.items())}"
+ )
+
+
+def test_upgrade_scrubs_null_fks_and_duplicates() -> None:
+ """The pre-flight data surgery is the migration's riskiest half, so
+ its removal must be detectable: this test fails if
+ ``_delete_null_fk_rows`` or ``_dedupe_by_min_id`` is removed from
+ ``upgrade()``.
+
+ Seeds a nullable-FK junction (``slice_user``) with NULL-FK rows and
+ duplicate ``(fk1, fk2)`` pairs in the true pre-migration shape, runs
+ the upgrade, and asserts exactly the distinct non-NULL pairs survive
+ (the composite PK could not even be created otherwise).
+ """
+ engine = sa.create_engine("sqlite:///:memory:")
+ _build_pre_migration_schema(engine)
+
+ md = sa.MetaData()
+ slice_user = sa.Table("slice_user", md, autoload_with=engine)
+ with engine.begin() as conn:
+ conn.execute(
+ slice_user.insert(),
+ [
+ {"id": 1, "user_id": 1, "slice_id": 1}, # keeper (MIN id)
+ {"id": 2, "user_id": 1, "slice_id": 1}, # duplicate pair
+ {"id": 3, "user_id": 1, "slice_id": 1}, # duplicate pair
+ {"id": 4, "user_id": 2, "slice_id": 2}, # distinct keeper
+ {"id": 5, "user_id": None, "slice_id": 3}, # NULL fk1
+ {"id": 6, "user_id": 3, "slice_id": None}, # NULL fk2
+ ],
+ )
+
+ _run_with_alembic_context(engine, _migration.upgrade)
+
+ with engine.connect() as conn:
+ survivors = sorted(
+ conn.execute(sa.text("SELECT user_id, slice_id FROM slice_user")).fetchall()
+ )
+ assert survivors == [(1, 1), (2, 2)], (
+ f"expected the two distinct non-NULL pairs to survive, got {survivors}"
+ )
+
+
+def test_migration_module_constants_are_consistent() -> None:
+ """Sanity-check the migration module's exported constants. Catches
+ accidental edits that misalign AFFECTED_TABLES with the auxiliary sets."""
+ affected_names = {t.name for t in AFFECTED_TABLES}
+ assert _migration.TABLES_WITH_PRE_EXISTING_UNIQUE.issubset(affected_names)
+ assert _migration.TABLES_WITH_NULLABLE_FKS.issubset(affected_names)
+ # Order is alphabetical (deterministic for review/bisection).
+ assert [t.name for t in AFFECTED_TABLES] == sorted(affected_names)
diff --git a/tests/integration_tests/superset_test_config.py b/tests/integration_tests/superset_test_config.py
index 56ab8ddd1941..a6dfd4d01a4d 100644
--- a/tests/integration_tests/superset_test_config.py
+++ b/tests/integration_tests/superset_test_config.py
@@ -89,6 +89,14 @@ def GET_FEATURE_FLAGS_FUNC(ff): # noqa: N802
TALISMAN_ENABLED = False
WTF_CSRF_ENABLED = False
+# Production ships entity-version capture OFF (see ``config.py``); the test
+# suite turns it ON so the capture pipeline (Continuum shadow rows + baseline
+# + ``version_changes``) is actually exercised. The dark/kill-switch contract
+# is proven separately by
+# ``tests/integration_tests/versioning/capture_disabled_tests.py``, which
+# detaches the listeners within the test.
+ENABLE_VERSIONING_CAPTURE = True
+
FAB_ROLES = {"TestRole": [["Security", "menu_access"], ["List Users", "menu_access"]]}
PUBLIC_ROLE_LIKE = "Gamma"
diff --git a/tests/integration_tests/versioning/__init__.py b/tests/integration_tests/versioning/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/tests/integration_tests/versioning/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/tests/integration_tests/versioning/capture_disabled_tests.py b/tests/integration_tests/versioning/capture_disabled_tests.py
new file mode 100644
index 000000000000..80079979ed7c
--- /dev/null
+++ b/tests/integration_tests/versioning/capture_disabled_tests.py
@@ -0,0 +1,172 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Behavioral proof for the versioning kill-switch / dark-launch contract.
+
+``ENABLE_VERSIONING_CAPTURE=False`` MUST cause a real save to write
+**zero** ``version_transaction`` rows and **zero** ``*_version`` shadow
+rows — Continuum is wired at import (``make_versioned()``), so suppressing
+only the custom listeners would leave Continuum's own listeners minting
+empty transaction rows on every flush. ``init_versioning`` detaches those
+via ``_remove_continuum_write_listeners()``; this test pins that the
+*behavioral* result is genuinely nothing-written. The structural unit tests
+in ``tests/unit_tests/initialization_test.py`` (``TestInitVersioning``) drive
+the config-flag branch of ``init_versioning`` with mocks; this exercises the
+detach for real, against a database, and proves a control save under capture
+*on* writes both a shadow row and a ``version_changes`` record (so the
+zero-rows assertions are not vacuously true).
+
+This is the acceptance gate for shipping versioning dark in the
+base-infra rollout PR.
+"""
+
+from __future__ import annotations
+
+import pytest
+import sqlalchemy as sa
+from sqlalchemy.engine import Engine
+from sqlalchemy.orm import Mapper, Session
+from sqlalchemy_continuum import version_class, versioning_manager
+
+from superset.extensions import db
+from superset.initialization import SupersetAppInitializer
+from superset.models.slice import Slice
+from tests.integration_tests.base_tests import SupersetTestCase
+from tests.integration_tests.constants import ADMIN_USERNAME
+from tests.integration_tests.fixtures.birth_names_dashboard import ( # noqa: F401
+ load_birth_names_dashboard_with_slices,
+ load_birth_names_data,
+)
+
+
+def _transaction_row_count() -> int:
+ """Total rows in the shared ``version_transaction`` table."""
+ return db.session.query(versioning_manager.transaction_cls).count()
+
+
+def _slice_version_count(slice_id: int) -> int:
+ ver_cls = version_class(Slice)
+ return db.session.query(ver_cls).filter(ver_cls.id == slice_id).count()
+
+
+def _version_changes_count() -> int:
+ """Total rows in the ``version_changes`` table — the custom diff records,
+ distinct from Continuum's shadow rows. Proves the full capture pipeline
+ (not just Continuum) ran."""
+ return (
+ db.session.execute(sa.text("SELECT COUNT(*) FROM version_changes")).scalar()
+ or 0
+ )
+
+
+def _reattach_continuum_write_listeners() -> None:
+ """Inverse of ``SupersetAppInitializer._remove_continuum_write_listeners``
+ so this test restores process-global capture state for the rest of the
+ suite (which runs with ``ENABLE_VERSIONING_CAPTURE`` on). Idempotent: it
+ returns early when a representative listener is already attached. Also
+ restores ``options['versioning']`` — the detach flips it off (and the
+ baseline listener honors it), so the re-attach must flip it back on or
+ subsequent saves would silently stop capturing."""
+ versioning_manager.options["versioning"] = True
+ if sa.event.contains(Mapper, "after_insert", versioning_manager.track_inserts):
+ return # already attached
+ versioning_manager.track_operations(Mapper)
+ versioning_manager.track_session(Session)
+ sa.event.listen(
+ Engine, "before_execute", versioning_manager.track_association_operations
+ )
+ sa.event.listen(Engine, "rollback", versioning_manager.clear_connection)
+ sa.event.listen(
+ Engine,
+ "set_connection_execution_options",
+ versioning_manager.track_cloned_connections,
+ )
+
+class TestVersioningCaptureDisabled(SupersetTestCase):
+ @pytest.mark.usefixtures("load_birth_names_dashboard_with_slices")
+ def test_capture_off_writes_no_version_or_transaction_rows(self) -> None:
+ """With Continuum's write listeners detached (the capture-off path),
+ a real content change MUST write neither a shadow row nor a
+ ``version_transaction`` row."""
+ db.session.commit()
+ chart = db.session.query(Slice).filter(Slice.slice_name == "Girls").first()
+ assert chart is not None
+ chart_id = chart.id
+
+ self.login(ADMIN_USERNAME)
+
+ # Simulate the ENABLE_VERSIONING_CAPTURE=False branch of init_versioning.
+ SupersetAppInitializer._remove_continuum_write_listeners()
+ try:
+ tx_before = _transaction_row_count()
+ ver_before = _slice_version_count(chart_id)
+
+ rv = self.client.put(
+ f"/api/v1/chart/{chart_id}",
+ json={"slice_name": "capture-off-renamed"},
+ )
+ assert rv.status_code == 200, rv.data
+ db.session.expire_all()
+
+ assert _transaction_row_count() == tx_before, (
+ "capture off MUST write zero version_transaction rows "
+ f"(before={tx_before}, after={_transaction_row_count()})"
+ )
+ assert _slice_version_count(chart_id) == ver_before, (
+ "capture off MUST write zero shadow rows "
+ f"(before={ver_before}, after={_slice_version_count(chart_id)})"
+ )
+ finally:
+ # Restore the chart and re-attach Continuum so the rest of the
+ # suite runs with capture on.
+ self.client.put(f"/api/v1/chart/{chart_id}", json={"slice_name": "Girls"})
+ _reattach_continuum_write_listeners()
+
+ @pytest.mark.usefixtures("load_birth_names_dashboard_with_slices")
+ def test_control_capture_on_writes_version_and_change_rows(self) -> None:
+ """Control: with capture on (the suite default), the same edit DOES
+ mint a shadow row AND a ``version_changes`` record — proving the
+ disabled-path assertions are not vacuously true and that the full
+ capture pipeline (Continuum shadow rows + the custom change-record
+ listener) runs end-to-end, not just Continuum's own writes."""
+ db.session.commit()
+ chart = db.session.query(Slice).filter(Slice.slice_name == "Boys").first()
+ if chart is None: # birth_names fixture not loaded for this test
+ pytest.skip("Boys slice not present")
+ chart_id = chart.id
+
+ self.login(ADMIN_USERNAME)
+ _reattach_continuum_write_listeners() # belt-and-suspenders: suite is on
+ # ``>`` rather than ``== before + 1``: the first edit to a not-yet-
+ # versioned entity also mints a synthetic baseline shadow row.
+ ver_before = _slice_version_count(chart_id)
+ changes_before = _version_changes_count()
+ try:
+ rv = self.client.put(
+ f"/api/v1/chart/{chart_id}",
+ json={"slice_name": "capture-on-renamed"},
+ )
+ assert rv.status_code == 200, rv.data
+ db.session.expire_all()
+ assert _slice_version_count(chart_id) > ver_before, (
+ "capture on MUST write at least one shadow row"
+ )
+ assert _version_changes_count() > changes_before, (
+ "capture on MUST write at least one version_changes record"
+ )
+ finally:
+ self.client.put(f"/api/v1/chart/{chart_id}", json={"slice_name": "Boys"})
diff --git a/tests/integration_tests/versioning/snapshot_projection_tests.py b/tests/integration_tests/versioning/snapshot_projection_tests.py
new file mode 100644
index 000000000000..b7be4ce4c36e
--- /dev/null
+++ b/tests/integration_tests/versioning/snapshot_projection_tests.py
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Drift guard for the version-snapshot projection.
+
+``queries.get_version`` projects an entity's historical state by copying
+*every* shadow column except Continuum's three bookkeeping columns — a
+denylist, not an allowlist. That is safe today (nothing sensitive on the
+versioned models survives ``__versioned__['exclude']``), but a future
+sensitive column added to a versioned model would be exposed through the
+read-only ``/versions/`` endpoint by default unless someone remembers to
+exclude it. This test fails when such a column appears, forcing the
+exclusion decision to be made consciously.
+"""
+
+from __future__ import annotations
+
+from sqlalchemy_continuum import version_class
+
+from superset.connectors.sqla.models import SqlaTable
+from superset.models.dashboard import Dashboard
+from superset.models.slice import Slice
+from superset.versioning.baseline.shadow import CONTINUUM_BOOKKEEPING_COLUMNS
+from tests.integration_tests.base_tests import SupersetTestCase
+
+# Substrings marking a column a version snapshot must never echo back. Keep
+# entries lowercase: names are lowercased before the case-insensitive match.
+_SENSITIVE_SUBSTRINGS = (
+    "password",
+    "secret",
+    "encrypted",
+    "private_key",
+    "api_key",
+    "access_token",
+)
+
+
+class TestSnapshotProjectionSafety(SupersetTestCase):
+    def test_version_snapshot_exposes_no_sensitive_columns(self) -> None:
+        """The columns ``get_version`` would project for each versioned model
+        must contain no sensitive-looking column name.
+
+        Offenders from all models are collected before the single assert, and
+        each is reported together with the substring(s) it matched."""
+        violations: list[str] = []
+        for model_cls in (Slice, Dashboard, SqlaTable):
+            for col in version_class(model_cls).__table__.columns:
+                lowered = col.name.lower()
+                hits = [s for s in _SENSITIVE_SUBSTRINGS if s in lowered]
+                # Bookkeeping columns are never projected, so never offenders.
+                if hits and col.name not in CONTINUUM_BOOKKEEPING_COLUMNS:
+                    violations.append(f"{model_cls.__name__}.{col.name} {hits}")
+        assert not violations, (
+            "Version snapshot would expose sensitive-looking column(s): "
+            f"{violations}. Add each to the model's __versioned__['exclude'] "
+            "set, or confirm it is safe and relax this guard."
+        )
diff --git a/tests/unit_tests/initialization_test.py b/tests/unit_tests/initialization_test.py
index 65d2ea4c96d0..7f19e0056d57 100644
--- a/tests/unit_tests/initialization_test.py
+++ b/tests/unit_tests/initialization_test.py
@@ -190,6 +190,68 @@ def test_database_uri_doesnt_cache_fallback_values(self):
)
+class TestInitVersioning:
+    """Structural coverage for the ``ENABLE_VERSIONING_CAPTURE`` gate.
+
+    Drives ``init_versioning`` through the real config branch (rather than
+    calling the detach helper directly) so a future inversion of the gate —
+    or of its default — is caught cheaply, without a DB. The behavioral
+    "zero rows when off" proof lives in
+    ``tests/integration_tests/versioning/capture_disabled_tests.py``.
+    """
+
+    _REG_BASELINE = "superset.versioning.baseline.register_baseline_listener"
+    _REG_CHANGES = "superset.versioning.changes.register_change_record_listener"
+
+    def test_capture_flag_off_detaches_and_skips_registration(self):
+        """Flag explicitly False → detach Continuum, register nothing."""
+        fake = MagicMock(config={"ENABLE_VERSIONING_CAPTURE": False})
+
+        with (
+            patch(self._REG_BASELINE) as reg_baseline,
+            patch(self._REG_CHANGES) as reg_changes,
+        ):
+            SupersetAppInitializer.init_versioning(fake)
+
+        fake._remove_continuum_write_listeners.assert_called_once()
+        reg_baseline.assert_not_called()
+        reg_changes.assert_not_called()
+
+    def test_capture_flag_absent_defaults_to_off(self):
+        """Flag absent → fallback MUST be off, so any app-factory path that
+        doesn't load ``superset.config`` stays inert rather than silently
+        enabling capture."""
+        fake = MagicMock(config={})
+
+        # Stubbed like the explicit-off case: a regressed default then fails
+        # the assertions instead of first calling the real registration code.
+        with (
+            patch(self._REG_BASELINE) as reg_baseline,
+            patch(self._REG_CHANGES) as reg_changes,
+        ):
+            SupersetAppInitializer.init_versioning(fake)
+
+        fake._remove_continuum_write_listeners.assert_called_once()
+        reg_baseline.assert_not_called()
+        reg_changes.assert_not_called()
+
+    def test_capture_flag_on_registers_listeners_without_detaching(self):
+        """Flag True → register both before-flush listeners, never detach."""
+        fake = MagicMock(config={"ENABLE_VERSIONING_CAPTURE": True})
+
+        with (
+            patch(self._REG_BASELINE) as reg_baseline,
+            patch("superset.versioning.baseline.VERSIONED_MODELS", []),
+            patch(self._REG_CHANGES) as reg_changes,
+            patch("sqlalchemy_continuum.version_class"),
+        ):
+            SupersetAppInitializer.init_versioning(fake)
+
+        reg_baseline.assert_called_once()
+        reg_changes.assert_called_once()
+        fake._remove_continuum_write_listeners.assert_not_called()
+
+
class TestCreateAppRoot:
"""Test app root resolution precedence in create_app."""
diff --git a/tests/unit_tests/migrations/composite_pk_association_tables_test.py b/tests/unit_tests/migrations/composite_pk_association_tables_test.py
new file mode 100644
index 000000000000..05a69293a23b
--- /dev/null
+++ b/tests/unit_tests/migrations/composite_pk_association_tables_test.py
@@ -0,0 +1,144 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Unit tests for the composite-PK association-tables migration (revision
+2bee73611e32). Verifies the post-migration constraint enforcement: duplicate
+``(fk1, fk2)`` insertions fail with IntegrityError, distinct pairs succeed.
+
+Each test hand-builds a minimal stand-in schema (two id-only parent tables plus
+the composite-PK junction table) in a fresh ``MetaData`` and creates it via
+``metadata.create_all(engine)`` against in-memory SQLite. Neither the ORM models
+nor the Alembic migration is exercised; keep ``AFFECTED_TABLES`` in sync by hand.
+"""
+
+import pytest
+import sqlalchemy as sa
+from sqlalchemy.exc import IntegrityError
+
+# Per table: (table_name, fk1_col, fk2_col, fk1_parent_table, fk2_parent_table).
+# The parent-table names are the FK targets used by the in-memory test schema.
+AFFECTED_TABLES = [
+    ("dashboard_roles", "dashboard_id", "role_id", "dashboards", "ab_role"),
+    ("dashboard_slices", "dashboard_id", "slice_id", "dashboards", "slices"),
+    ("dashboard_user", "user_id", "dashboard_id", "ab_user", "dashboards"),
+    (
+        "report_schedule_user",
+        "user_id",
+        "report_schedule_id",
+        "ab_user",
+        "report_schedule",
+    ),
+    (
+        "rls_filter_roles",
+        "role_id",
+        "rls_filter_id",
+        "ab_role",
+        "row_level_security_filters",
+    ),
+    (
+        "rls_filter_tables",
+        "table_id",
+        "rls_filter_id",
+        "tables",
+        "row_level_security_filters",
+    ),
+    ("slice_user", "user_id", "slice_id", "ab_user", "slices"),
+    ("sqlatable_user", "user_id", "table_id", "ab_user", "tables"),
+]
+
+
+def _build_in_memory_schema(
+    table_name: str, fk1: str, fk2: str, fk1_parent: str, fk2_parent: str
+) -> tuple[sa.engine.Engine, sa.Table]:
+    """Create a throwaway in-memory SQLite database for one junction table.
+
+    The schema holds the parent table(s), each reduced to an integer ``id``
+    primary key, and the junction table in its composite-PK shape: two integer
+    FK columns that together form the primary key. Each parent is seeded with
+    ids 1 and 2 so every id a test inserts refers to an existing parent row.
+
+    The arguments are one ``AFFECTED_TABLES`` row: the junction table name, its
+    two FK column names, and the table each FK references (via ``id``).
+
+    Returns:
+        The engine and the junction ``Table`` object to build inserts from.
+    """
+    metadata = sa.MetaData()
+    # ``dict.fromkeys`` de-duplicates while keeping order, so a junction whose
+    # two FKs share one parent still defines (and seeds) that parent only once.
+    parents = [
+        sa.Table(name, metadata, sa.Column("id", sa.Integer, primary_key=True))
+        for name in dict.fromkeys((fk1_parent, fk2_parent))
+    ]
+    junction = sa.Table(
+        table_name,
+        metadata,
+        sa.Column(
+            fk1,
+            sa.Integer,
+            sa.ForeignKey(f"{fk1_parent}.id"),
+            primary_key=True,
+        ),
+        sa.Column(
+            fk2,
+            sa.Integer,
+            sa.ForeignKey(f"{fk2_parent}.id"),
+            primary_key=True,
+        ),
+    )
+    engine = sa.create_engine("sqlite:///:memory:")
+    metadata.create_all(engine)
+    # Seed via Core inserts rather than f-string SQL: SQLAlchemy renders the
+    # identifiers itself and no ``noqa: S608`` suppression is needed.
+    with engine.begin() as conn:
+        for parent in parents:
+            conn.execute(parent.insert(), [{"id": 1}, {"id": 2}])
+    return engine, junction
+
+
+@pytest.mark.parametrize("table,fk1,fk2,fk1_parent,fk2_parent", AFFECTED_TABLES)
+def test_duplicate_insert_rejected(
+    table: str, fk1: str, fk2: str, fk1_parent: str, fk2_parent: str
+) -> None:
+    """A repeated ``(fk1, fk2)`` pair is rejected with ``IntegrityError``.
+
+    Covers SC-004 / FR-007: on every affected table the composite primary
+    key enforces uniqueness of the pair at the database level."""
+    engine, junction = _build_in_memory_schema(table, fk1, fk2, fk1_parent, fk2_parent)
+    same_pair = junction.insert().values({fk1: 1, fk2: 1})
+    with engine.begin() as conn:
+        conn.execute(same_pair)  # first occurrence is accepted
+        with pytest.raises(IntegrityError):
+            conn.execute(same_pair)
+
+
+@pytest.mark.parametrize("table,fk1,fk2,fk1_parent,fk2_parent", AFFECTED_TABLES)
+def test_distinct_pairs_accepted(
+    table: str, fk1: str, fk2: str, fk1_parent: str, fk2_parent: str
+) -> None:
+    """Distinct ``(fk1, fk2)`` pairs are all accepted.
+
+    Guards against the PK accidentally being a single-column constraint:
+    ``(1, 2)`` repeats the first column of ``(1, 1)`` and ``(2, 1)`` repeats
+    its second, so a key on either column alone would reject one of them.
+    """
+    engine, junction = _build_in_memory_schema(table, fk1, fk2, fk1_parent, fk2_parent)
+    pairs = [(1, 1), (1, 2), (2, 1)]  # each shares one value with (1, 1)
+    with engine.begin() as conn:
+        conn.execute(junction.insert(), [{fk1: a, fk2: b} for a, b in pairs])
+        # Count via the Table object rather than interpolating its name into SQL.
+        row_count = conn.scalar(sa.select(sa.func.count()).select_from(junction))
+    assert row_count == len(pairs)