Skip to content

Commit 9ff8496

Browse files

Merge commit: "Merged latest main" — 2 parents (9700b23 + a99bfaa), resulting commit 9ff8496

File tree

8 files changed

+197
-25
lines changed

8 files changed

+197
-25
lines changed

.circleci/continue_config.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,9 @@ jobs:
9393
- run:
9494
name: Run linters and code style checks
9595
command: make py-style
96-
- run:
97-
name: Exercise the benchmarks
98-
command: make benchmark-ci
96+
# - run:
97+
# name: Exercise the benchmarks
98+
# command: make benchmark-ci
9999
- run:
100100
name: Run cicd tests
101101
command: make cicd-test

sqlmesh/core/context.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2279,6 +2279,7 @@ def audit(
22792279
snapshot=snapshot,
22802280
start=start,
22812281
end=end,
2282+
execution_time=execution_time,
22822283
snapshots=self.snapshots,
22832284
):
22842285
audit_id = f"{audit_result.audit.name}"

sqlmesh/core/renderer.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,14 @@ def _resolve_table(table: str | exp.Table) -> str:
196196
**kwargs,
197197
}
198198

199+
if this_model:
200+
render_kwargs["this_model"] = this_model
201+
202+
macro_evaluator.locals.update(render_kwargs)
203+
199204
variables = kwargs.pop("variables", {})
205+
if variables:
206+
macro_evaluator.locals.setdefault(c.SQLMESH_VARS, {}).update(variables)
200207

201208
expressions = [self._expression]
202209
if isinstance(self._expression, d.Jinja):
@@ -268,14 +275,6 @@ def _resolve_table(table: str | exp.Table) -> str:
268275
f"Could not parse the rendered jinja at '{self._path}'.\n{ex}"
269276
) from ex
270277

271-
if this_model:
272-
render_kwargs["this_model"] = this_model
273-
274-
macro_evaluator.locals.update(render_kwargs)
275-
276-
if variables:
277-
macro_evaluator.locals.setdefault(c.SQLMESH_VARS, {}).update(variables)
278-
279278
for definition in self._macro_definitions:
280279
try:
281280
macro_evaluator.evaluate(definition)

sqlmesh/core/scheduler.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,7 @@ def _dag(
659659
}
660660
snapshots_to_create = snapshots_to_create or set()
661661
original_snapshots_to_create = snapshots_to_create.copy()
662+
upstream_dependencies_cache: t.Dict[SnapshotId, t.Set[SchedulingUnit]] = {}
662663

663664
snapshot_dag = snapshot_dag or snapshots_to_dag(batches)
664665
dag = DAG[SchedulingUnit]()
@@ -670,12 +671,15 @@ def _dag(
670671
snapshot = self.snapshots_by_name[snapshot_id.name]
671672
intervals = intervals_per_snapshot.get(snapshot.name, [])
672673

673-
upstream_dependencies: t.List[SchedulingUnit] = []
674+
upstream_dependencies: t.Set[SchedulingUnit] = set()
674675

675676
for p_sid in snapshot.parents:
676-
upstream_dependencies.extend(
677+
upstream_dependencies.update(
677678
self._find_upstream_dependencies(
678-
p_sid, intervals_per_snapshot, original_snapshots_to_create
679+
p_sid,
680+
intervals_per_snapshot,
681+
original_snapshots_to_create,
682+
upstream_dependencies_cache,
679683
)
680684
)
681685

@@ -726,29 +730,42 @@ def _find_upstream_dependencies(
726730
parent_sid: SnapshotId,
727731
intervals_per_snapshot: t.Dict[str, Intervals],
728732
snapshots_to_create: t.Set[SnapshotId],
729-
) -> t.List[SchedulingUnit]:
733+
cache: t.Dict[SnapshotId, t.Set[SchedulingUnit]],
734+
) -> t.Set[SchedulingUnit]:
730735
if parent_sid not in self.snapshots:
731-
return []
736+
return set()
737+
if parent_sid in cache:
738+
return cache[parent_sid]
732739

733740
p_intervals = intervals_per_snapshot.get(parent_sid.name, [])
734741

742+
parent_node: t.Optional[SchedulingUnit] = None
735743
if p_intervals:
736744
if len(p_intervals) > 1:
737-
return [DummyNode(snapshot_name=parent_sid.name)]
738-
interval = p_intervals[0]
739-
return [EvaluateNode(snapshot_name=parent_sid.name, interval=interval, batch_index=0)]
740-
if parent_sid in snapshots_to_create:
741-
return [CreateNode(snapshot_name=parent_sid.name)]
745+
parent_node = DummyNode(snapshot_name=parent_sid.name)
746+
else:
747+
interval = p_intervals[0]
748+
parent_node = EvaluateNode(
749+
snapshot_name=parent_sid.name, interval=interval, batch_index=0
750+
)
751+
elif parent_sid in snapshots_to_create:
752+
parent_node = CreateNode(snapshot_name=parent_sid.name)
753+
754+
if parent_node is not None:
755+
cache[parent_sid] = {parent_node}
756+
return {parent_node}
757+
742758
# This snapshot has no intervals and doesn't need creation which means
743759
# that it can be a transitive dependency
744-
transitive_deps: t.List[SchedulingUnit] = []
760+
transitive_deps: t.Set[SchedulingUnit] = set()
745761
parent_snapshot = self.snapshots[parent_sid]
746762
for grandparent_sid in parent_snapshot.parents:
747-
transitive_deps.extend(
763+
transitive_deps.update(
748764
self._find_upstream_dependencies(
749-
grandparent_sid, intervals_per_snapshot, snapshots_to_create
765+
grandparent_sid, intervals_per_snapshot, snapshots_to_create, cache
750766
)
751767
)
768+
cache[parent_sid] = transitive_deps
752769
return transitive_deps
753770

754771
def _run_or_audit(

sqlmesh/core/test/definition.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,7 @@ def runTest(self) -> None:
807807
actual_df.reset_index(drop=True, inplace=True)
808808
expected = self._create_df(values, columns=self.model.columns_to_types, partial=partial)
809809

810-
self.assert_equal(expected, actual_df, sort=False, partial=partial)
810+
self.assert_equal(expected, actual_df, sort=True, partial=partial)
811811

812812
def _execute_model(self) -> pd.DataFrame:
813813
"""Executes the python model and returns a DataFrame."""

tests/core/test_model.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12160,6 +12160,24 @@ def test_grants_table_type(kind: t.Union[str, _ModelKind], expected: DataObjectT
1216012160
assert model.grants_table_type == expected
1216112161

1216212162

12163+
def test_model_macro_using_locals_called_from_jinja(assert_exp_eq) -> None:
12164+
@macro()
12165+
def execution_date(evaluator):
12166+
return f"""'{evaluator.locals.get("execution_date")}'"""
12167+
12168+
expressions = d.parse(
12169+
"""
12170+
MODEL (name db.table);
12171+
12172+
JINJA_QUERY_BEGIN;
12173+
SELECT {{ execution_date() }} AS col;
12174+
JINJA_END;
12175+
"""
12176+
)
12177+
model = load_sql_based_model(expressions)
12178+
assert_exp_eq(model.render_query(), '''SELECT '1970-01-01' AS "col"''')
12179+
12180+
1216312181
def test_audits_in_embedded_model():
1216412182
expression = d.parse(
1216512183
"""

tests/core/test_scheduler.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,3 +1126,90 @@ def test_dag_multiple_chain_transitive_deps(mocker: MockerFixture, make_snapshot
11261126
)
11271127
},
11281128
}
1129+
1130+
1131+
def test_dag_upstream_dependency_caching_with_complex_diamond(mocker: MockerFixture, make_snapshot):
1132+
r"""
1133+
Test that the upstream dependency caching correctly handles a complex diamond dependency graph.
1134+
1135+
Dependency graph:
1136+
A (has intervals)
1137+
/ \
1138+
B C (no intervals - transitive)
1139+
/ \ / \
1140+
D E F (no intervals - transitive)
1141+
\ / \ /
1142+
G H (has intervals - selected)
1143+
1144+
This creates multiple paths from G and H to A. Without caching, A's dependencies would be
1145+
computed multiple times (once for each path). With caching, they should be computed once
1146+
and reused.
1147+
"""
1148+
snapshots = {}
1149+
1150+
for name in ["a", "b", "c", "d", "e", "f", "g", "h"]:
1151+
snapshots[name] = make_snapshot(SqlModel(name=name, query=parse_one("SELECT 1 as id")))
1152+
snapshots[name].categorize_as(SnapshotChangeCategory.BREAKING)
1153+
1154+
# A is the root
1155+
snapshots["b"] = snapshots["b"].model_copy(update={"parents": (snapshots["a"].snapshot_id,)})
1156+
snapshots["c"] = snapshots["c"].model_copy(update={"parents": (snapshots["a"].snapshot_id,)})
1157+
1158+
# Middle layer: D, E, F depend on B and/or C
1159+
snapshots["d"] = snapshots["d"].model_copy(update={"parents": (snapshots["b"].snapshot_id,)})
1160+
snapshots["e"] = snapshots["e"].model_copy(
1161+
update={"parents": (snapshots["b"].snapshot_id, snapshots["c"].snapshot_id)}
1162+
)
1163+
snapshots["f"] = snapshots["f"].model_copy(update={"parents": (snapshots["c"].snapshot_id,)})
1164+
1165+
# Bottom layer: G and H depend on D/E and E/F respectively
1166+
snapshots["g"] = snapshots["g"].model_copy(
1167+
update={"parents": (snapshots["d"].snapshot_id, snapshots["e"].snapshot_id)}
1168+
)
1169+
snapshots["h"] = snapshots["h"].model_copy(
1170+
update={"parents": (snapshots["e"].snapshot_id, snapshots["f"].snapshot_id)}
1171+
)
1172+
1173+
scheduler = Scheduler(
1174+
snapshots=list(snapshots.values()),
1175+
snapshot_evaluator=mocker.Mock(),
1176+
state_sync=mocker.Mock(),
1177+
default_catalog=None,
1178+
)
1179+
1180+
batched_intervals = {
1181+
snapshots["a"]: [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))],
1182+
snapshots["g"]: [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))],
1183+
snapshots["h"]: [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))],
1184+
}
1185+
1186+
full_dag = snapshots_to_dag(snapshots.values())
1187+
dag = scheduler._dag(batched_intervals, snapshot_dag=full_dag)
1188+
1189+
# Verify the DAG structure:
1190+
# 1. A should be evaluated first (no dependencies)
1191+
# 2. Both G and H should depend on A (through transitive dependencies)
1192+
# 3. Transitive nodes (B, C, D, E, F) should not appear as separate evaluation nodes
1193+
expected_a_node = EvaluateNode(
1194+
snapshot_name='"a"',
1195+
interval=(to_timestamp("2023-01-01"), to_timestamp("2023-01-02")),
1196+
batch_index=0,
1197+
)
1198+
1199+
expected_g_node = EvaluateNode(
1200+
snapshot_name='"g"',
1201+
interval=(to_timestamp("2023-01-01"), to_timestamp("2023-01-02")),
1202+
batch_index=0,
1203+
)
1204+
1205+
expected_h_node = EvaluateNode(
1206+
snapshot_name='"h"',
1207+
interval=(to_timestamp("2023-01-01"), to_timestamp("2023-01-02")),
1208+
batch_index=0,
1209+
)
1210+
1211+
assert dag.graph == {
1212+
expected_a_node: set(),
1213+
expected_g_node: {expected_a_node},
1214+
expected_h_node: {expected_a_node},
1215+
}

tests/core/test_test.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3367,6 +3367,56 @@ def execute(context: ExecutionContext, **kwargs: t.Any) -> pd.DataFrame:
33673367
_check_successful_or_raise(test_default_vars.run())
33683368

33693369

3370+
def test_python_model_sorting(tmp_path: Path) -> None:
3371+
py_model = tmp_path / "models" / "test_sort_model.py"
3372+
py_model.parent.mkdir(parents=True, exist_ok=True)
3373+
py_model.write_text(
3374+
"""
3375+
import pandas as pd # noqa: TID253
3376+
from sqlmesh import model, ExecutionContext
3377+
import typing as t
3378+
3379+
@model(
3380+
name="test_sort_model",
3381+
columns={"id": "int", "value": "varchar"},
3382+
)
3383+
def execute(context: ExecutionContext, **kwargs: t.Any) -> pd.DataFrame:
3384+
# Return rows in a potentially non-deterministic order
3385+
# (simulating a model that doesn't guarantee order)
3386+
return pd.DataFrame([
3387+
{"id": 3, "value": "c"},
3388+
{"id": 1, "value": "a"},
3389+
{"id": 2, "value": "b"},
3390+
])"""
3391+
)
3392+
3393+
config = Config(model_defaults=ModelDefaultsConfig(dialect="duckdb"))
3394+
context = Context(config=config, paths=tmp_path)
3395+
3396+
python_model = context.models['"test_sort_model"']
3397+
3398+
_check_successful_or_raise(
3399+
_create_test(
3400+
body=load_yaml("""
3401+
test_without_sort:
3402+
model: test_sort_model
3403+
outputs:
3404+
query:
3405+
rows:
3406+
- id: 1
3407+
value: "a"
3408+
- id: 2
3409+
value: "b"
3410+
- id: 3
3411+
value: "c"
3412+
"""),
3413+
test_name="test_without_sort",
3414+
model=python_model,
3415+
context=context,
3416+
).run()
3417+
)
3418+
3419+
33703420
@use_terminal_console
33713421
def test_cte_failure(tmp_path: Path) -> None:
33723422
models_dir = tmp_path / "models"

0 commit comments

Comments (0)