From 32e602f74f1bccf295b89f1b7cc85465ce7cdfff Mon Sep 17 00:00:00 2001 From: Max Dubrinsky Date: Wed, 10 Jun 2026 17:13:44 -0400 Subject: [PATCH] fix(entities): backend-safe JSON numeric comparison + absent-key $eq, with Postgres parity leg (AIRCORE-749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses items (2), (4), and (5) of AIRCORE-749. Item (1) ($like) is in the parent PR this stacks on. (2) Numeric comparison no longer 500s on PostgreSQL. _cast_json_to_numeric emitted an unconditional CAST(... AS FLOAT); on PostgreSQL a $gt/$lt against a JSON field holding non-numeric text raised "invalid input syntax for type double precision" (a latent prod 500), while SQLite silently coerced it to 0.0. A new dialect- dispatched _JsonNumeric construct casts only when the JSON value is actually a number (json_type IN ('integer','real') on SQLite; json_typeof = 'number' on PostgreSQL), yielding SQL NULL — no match — for non-numeric, null, and absent values on both backends, matching InMemoryFilterRepository's ordered-comparison contract. (4) $eq None now matches absent JSON keys on PostgreSQL too. The old cast(col, String) == 'null' matched absent keys only on SQLite (which renders them as the text 'null'); on PostgreSQL an absent key is SQL NULL, so it did not match. Added an IS NULL test so both backends match absent + explicit null, as the in-memory backend does. (5) The SQL-parity suite now runs against real PostgreSQL via testcontainers (skipped without Docker), not just SQLite — which is what let these cross-backend divergences hide. Added cases pin the numeric-guard and absent-key behavior; they error/diverge on PostgreSQL before this change and pass after. Remaining 749 item (3) — boolean text rendering under $like/$in/$nin — is still a documented divergence, untouched here. Signed-off-by: Max Dubrinsky --- .../app/repository/sqlalchemy/filter.py | 98 +++++++++++++--- .../tests/test_filter_matches_sql_parity.py | 108 ++++++++++++++---- 2 files changed, 170 insertions(+), 36 deletions(-) diff --git a/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py b/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py index a874afa8ed..7e1ae97ab7 100644 --- a/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py +++ b/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py @@ -7,8 +7,73 @@ from typing import Any, List, Optional, Set from nmp.common.api.filter import FilterOperation, FilterRepository -from sqlalchemy import JSON, ColumnElement, DateTime, String, and_, cast, false, func, not_, or_, select +from sqlalchemy import ( + ARRAY, + JSON, + ColumnElement, + DateTime, + Float, + String, + and_, + bindparam, + cast, + false, + func, + not_, + or_, + select, +) +from sqlalchemy.ext.compiler import compiles from sqlalchemy.orm import aliased +from sqlalchemy.sql.visitors import InternalTraversal + + +class _JsonNumeric(ColumnElement): + """A JSON value at a dotted path coerced to a number, or SQL ``NULL`` when it + is not a JSON number (non-numeric text, JSON ``null``, or an absent key). + + Ordered comparisons against SQL ``NULL`` are never true, so non-numeric and + absent values become "no match" on BOTH backends — matching + ``InMemoryFilterRepository``'s ordered-comparison contract — and PostgreSQL + never hits the hard ``invalid input syntax for type double precision`` error + that an unconditional ``CAST(text AS FLOAT)`` raises on non-numeric JSON text + (AIRCORE-749). The path is bound as a parameter, so the emitted SQL text is + constant regardless of path; ``_traverse_internals`` gives a path-distinct + cache key so SQLAlchemy can cache these statements (a bare ``inherit_cache`` + would yield a ``None`` key and silently disable statement caching). + """ + + type = Float() + _traverse_internals = [ + ("data_column", InternalTraversal.dp_clauseelement), + ("path", InternalTraversal.dp_string_list), + ] + + def __init__(self, data_column: Any, path: List[str]): + self.data_column = data_column + self.path = list(path) + + +@compiles(_JsonNumeric, "sqlite") +def _compile_json_numeric_sqlite(element: _JsonNumeric, compiler: Any, **kw: Any) -> str: + col = compiler.process(element.data_column, **kw) + json_path = "$" + "".join('."' + seg.replace('"', '""') + '"' for seg in element.path) + path_param = compiler.process(bindparam(None, json_path), **kw) + # json_extract returns the native value; restrict to actual numbers so text / + # null / absent fall through to NULL (no match), as SQLite would otherwise + # coerce e.g. "abc" -> 0.0. + return f"CASE WHEN json_type({col}, {path_param}) IN ('integer', 'real') THEN json_extract({col}, {path_param}) END" + + +@compiles(_JsonNumeric, "postgresql") +def _compile_json_numeric_postgresql(element: _JsonNumeric, compiler: Any, **kw: Any) -> str: + col = compiler.process(element.data_column, **kw) + path_param = compiler.process(bindparam(None, element.path, type_=ARRAY(String)), **kw) + json_elem = f"({col} #> {path_param})" + json_text = f"({col} #>> {path_param})" + # Only cast when the JSON value is actually a number; otherwise NULL (no match) + # so a non-numeric/absent value never reaches the CAST and errors. + return f"CASE WHEN json_typeof({json_elem}) = 'number' THEN CAST({json_text} AS DOUBLE PRECISION) END" def _escape_like(value: str) -> str: @@ -107,21 +172,19 @@ def _cast_json_to_text(self, column: Any) -> Any: # This handles both SQLite and PostgreSQL JSON string extraction return func.trim(cast(column, String), '"') - def _cast_json_to_numeric(self, column: Any) -> Any: - """Cast a JSON column element to a float for numeric comparisons. + def _json_comparison(self, field: str, value: Any, op: str) -> Any: + """Build an ordered comparison ($lt/$lte/$gt/$gte) for a field. - Uses CAST(... AS FLOAT) which works on both SQLite (REAL) and PostgreSQL. + For JSON fields, a numeric filter value compares against the JSON value as a + number — but only when it actually is a JSON number; non-numeric text, JSON + null, and absent keys are no-match on both backends (see ``_JsonNumeric``), + matching the in-memory contract and avoiding PostgreSQL's CAST error. + Non-numeric filter values compare as trimmed text. """ - from sqlalchemy import Float - - return cast(self._cast_json_to_text(column), Float) - - def _json_comparison(self, field: str, value: Any, op: str) -> Any: - """Build a comparison for JSON fields, using numeric cast when value is numeric.""" column, is_json = self._get_column(field) if is_json: - if isinstance(value, (int, float)): - casted = self._cast_json_to_numeric(column) + if isinstance(value, (int, float)) and not isinstance(value, bool): + casted: Any = _JsonNumeric(getattr(self.model, "data"), field.split(".")[1:]) else: casted = self._cast_json_to_text(column) value = str(value) @@ -132,11 +195,14 @@ def eq(self, field: str, value: Any) -> Any: """Equal comparison.""" column, is_json = self._get_column(field) if is_json: - # Handle None/null: match both missing JSON keys and explicit null values. - # SQLAlchemy's JSON subscript IS NULL doesn't work reliably across backends, - # but cast to String returns "null" for both cases on SQLite and PostgreSQL. + # Handle None/null: match both missing JSON keys and explicit null values, + # matching InMemoryFilterRepository (absent == None == matches $eq None). + # SQLite renders both an absent key and an explicit null as the text "null" + # (json_quote(json_extract(...))), so the cast covers both there. On + # PostgreSQL an absent key extracts to SQL NULL (not the text "null"), so + # we also test IS NULL to match absent keys (AIRCORE-749). if value is None: - return cast(column, String) == "null" + return or_(column.is_(None), cast(column, String) == "null") # Handle boolean values specially: # - SQLite stores JSON booleans as integers (0/1), json_extract returns "0" or "1" # - PostgreSQL stores them as "false"/"true" diff --git a/services/core/entities/tests/test_filter_matches_sql_parity.py b/services/core/entities/tests/test_filter_matches_sql_parity.py index 9223a3bb0a..b2d1bc4b79 100644 --- a/services/core/entities/tests/test_filter_matches_sql_parity.py +++ b/services/core/entities/tests/test_filter_matches_sql_parity.py @@ -3,21 +3,28 @@ """SQL-parity safety net for in-memory filter evaluation. -Each parametrized FilterOperation tree is run two ways against the same seeded -SQLite rows, both through the same ``op.apply(repo)`` front door: +Each parametrized FilterOperation tree is run against the same seeded rows on +every available SQL backend (SQLite always; PostgreSQL via testcontainers when +Docker is available) and through ``InMemoryFilterRepository``, all via the same +``op.apply(repo)`` front door: 1. ``SQLAlchemyFilterRepository`` (the SQL source of truth). 2. ``InMemoryFilterRepository`` over the ORM instances (the in-memory backend). -The two must select exactly the same set of row ids. ``InMemoryFilterRepository`` -is a native-Python evaluator, NOT a byte-for-byte SQL mirror (see its class -docstring), so this suite only covers the cases where native -and SQL semantics agree — strings, real numbers, native booleans (via ``$eq``), -and logical trees. It deliberately excludes the cases where the SQL backends -disagree with each other or rely on JSON-to-text coercion (e.g. int-vs-string -``$eq``, boolean text rendering, non-numeric-text numeric casts); those are -pinned as documented divergences in the plugin's test_filter_matches.py. +All three must select exactly the same set of row ids. Production runs +PostgreSQL while dev/CI default to SQLite, so the PostgreSQL leg is what catches +cross-backend divergences a SQLite-only run would miss (AIRCORE-749): the +non-numeric/null/absent numeric-cast (a hard error on PostgreSQL) and the +absent-JSON-key semantics for ``$eq None`` are both covered here. + +``InMemoryFilterRepository`` is a native-Python evaluator, NOT a byte-for-byte +SQL mirror (see its class docstring). The remaining documented divergences not +yet reconciled in the SQL layer — int-vs-string ``$eq`` and boolean text +rendering under ``$like``/``$in``/``$nin`` — are excluded here and pinned in the +plugin's test_filter_matches.py. """ +from importlib.util import find_spec + import pytest from nmp.common.api.filter import ComparisonOperation, FilterOperator, LogicalOperation from nmp.common.api.in_memory_filter import InMemoryFilterRepository @@ -49,10 +56,15 @@ class FakeEntity(Base): # substring (``_``/``%`` are ordinary characters), agreeing with the in-memory # backend. All keep score/tier/flag present so no absent-key divergence is # introduced into the existing cases. +# +# ``data.n2`` is numeric on one row, non-numeric text on another, explicit null on a +# third, and absent elsewhere — to pin the AIRCORE-749 numeric-comparison contract: +# only an actual JSON number participates in $gt/$lt; non-numeric/null/absent are +# no-match on both backends (and PostgreSQL must not error casting the text row). SEED = [ - dict(id=1, name="llama", data={"score": 5, "tier": "free", "flag": True, "k": None}), - dict(id=2, name="Llama-2", data={"score": 9, "tier": "pro", "flag": False}), - dict(id=3, name="zephyr", data={"score": 10, "tier": "pro", "flag": True, "k": "v"}), + dict(id=1, name="llama", data={"score": 5, "tier": "free", "flag": True, "k": None, "n2": 5}), + dict(id=2, name="Llama-2", data={"score": 9, "tier": "pro", "flag": False, "n2": "notnum"}), + dict(id=3, name="zephyr", data={"score": 10, "tier": "pro", "flag": True, "k": "v", "n2": None}), dict(id=4, name="mistral", data={"score": 100, "tier": "enterprise", "flag": False}), dict(id=5, name=None, data={"score": 1, "tier": "free", "flag": False}), # `_` is a single-char wildcard under LIKE; "prod_db" must not match "prodXdb". @@ -65,14 +77,65 @@ class FakeEntity(Base): ] -@pytest.fixture -def db(): - engine = create_engine("sqlite:///:memory:") +def _docker_available() -> bool: + """Whether a usable Docker daemon is reachable (for the testcontainers PG leg).""" + if find_spec("docker") is None: + return False + import docker + from docker.errors import DockerException + + try: + client = docker.from_env() + try: + client.ping() + finally: + client.close() + return True + except (DockerException, OSError): + return False + + +@pytest.fixture(scope="session") +def _pg_engine(): + """Session-scoped PostgreSQL engine via testcontainers; skipped without Docker. + + Production runs PostgreSQL while dev/CI default to SQLite, so the SQLite-only + leg cannot catch cross-backend divergences (AIRCORE-749). This runs the same + cases against a real Postgres (matching the prod ``json`` column type). + """ + if not _docker_available(): + pytest.skip("Docker unavailable; skipping PostgreSQL parity leg") + from testcontainers.postgres import PostgresContainer + + with PostgresContainer("postgres:16") as pg: + engine = create_engine(pg.get_connection_url()) + try: + yield engine + finally: + engine.dispose() + + +@pytest.fixture(params=["sqlite", "postgres"]) +def db(request): + """Seed FakeEntity in SQLite and (when Docker is available) PostgreSQL. + + The same cases run against both backends so SQL↔SQL and SQL↔in-memory + divergences are caught. The table is recreated per test so the session-scoped + Postgres engine stays clean between cases. + """ + if request.param == "sqlite": + engine = create_engine("sqlite:///:memory:") + else: + engine = request.getfixturevalue("_pg_engine") + Base.metadata.drop_all(engine) Base.metadata.create_all(engine) - with Session(engine) as session: - session.add_all([FakeEntity(**row) for row in SEED]) - session.commit() - yield session + try: + with Session(engine) as session: + session.add_all([FakeEntity(**row) for row in SEED]) + session.commit() + yield session + finally: + Base.metadata.drop_all(engine) def C(operator, field, value): @@ -120,6 +183,11 @@ def NOT(op): ("lte_data_score", C(FilterOperator.LTE, "data.score", 9)), ("gt_data_tier_text", C(FilterOperator.GT, "data.tier", "a")), ("lt_data_tier_text", C(FilterOperator.LT, "data.tier", "g")), + # AIRCORE-749: ordered comparison only matches actual JSON numbers; non-numeric + # text / null / absent are no-match on both backends (PostgreSQL must not error + # casting the "notnum" row). + ("gt_data_n2_numeric_only", C(FilterOperator.GT, "data.n2", 4)), + ("lt_data_n2_numeric_only", C(FilterOperator.LT, "data.n2", 10)), ("and_tree", AND(C(FilterOperator.EQ, "data.tier", "pro"), C(FilterOperator.GT, "data.score", 9))), ("or_tree", OR(C(FilterOperator.EQ, "name", "llama"), C(FilterOperator.EQ, "name", "zephyr"))), ("not_tree", NOT(C(FilterOperator.EQ, "data.tier", "pro"))),