Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,73 @@
from typing import Any, List, Optional, Set

from nmp.common.api.filter import FilterOperation, FilterRepository
from sqlalchemy import JSON, ColumnElement, DateTime, String, and_, cast, false, func, not_, or_, select
from sqlalchemy import (
ARRAY,
JSON,
ColumnElement,
DateTime,
Float,
String,
and_,
bindparam,
cast,
false,
func,
not_,
or_,
select,
)
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.orm import aliased
from sqlalchemy.sql.visitors import InternalTraversal


class _JsonNumeric(ColumnElement):
"""A JSON value at a dotted path coerced to a number, or SQL ``NULL`` when it
is not a JSON number (non-numeric text, JSON ``null``, or an absent key).

Ordered comparisons against SQL ``NULL`` are never true, so non-numeric and
absent values become "no match" on BOTH backends — matching
``InMemoryFilterRepository``'s ordered-comparison contract — and PostgreSQL
never hits the hard ``invalid input syntax for type double precision`` error
that an unconditional ``CAST(text AS FLOAT)`` raises on non-numeric JSON text
(AIRCORE-749). The path is bound as a parameter, so the emitted SQL text is
constant regardless of path; ``_traverse_internals`` gives a path-distinct
cache key so SQLAlchemy can cache these statements (a bare ``inherit_cache``
would yield a ``None`` key and silently disable statement caching).
"""

type = Float()
_traverse_internals = [
("data_column", InternalTraversal.dp_clauseelement),
("path", InternalTraversal.dp_string_list),
]

def __init__(self, data_column: Any, path: List[str]):
self.data_column = data_column
self.path = list(path)


@compiles(_JsonNumeric, "sqlite")
def _compile_json_numeric_sqlite(element: _JsonNumeric, compiler: Any, **kw: Any) -> str:
col = compiler.process(element.data_column, **kw)
json_path = "$" + "".join('."' + seg.replace('"', '""') + '"' for seg in element.path)
path_param = compiler.process(bindparam(None, json_path), **kw)
# json_extract returns the native value; restrict to actual numbers so text /
# null / absent fall through to NULL (no match), as SQLite would otherwise
# coerce e.g. "abc" -> 0.0.
return f"CASE WHEN json_type({col}, {path_param}) IN ('integer', 'real') THEN json_extract({col}, {path_param}) END"


@compiles(_JsonNumeric, "postgresql")
def _compile_json_numeric_postgresql(element: _JsonNumeric, compiler: Any, **kw: Any) -> str:
col = compiler.process(element.data_column, **kw)
path_param = compiler.process(bindparam(None, element.path, type_=ARRAY(String)), **kw)
json_elem = f"({col} #> {path_param})"
json_text = f"({col} #>> {path_param})"
# Only cast when the JSON value is actually a number; otherwise NULL (no match)
# so a non-numeric/absent value never reaches the CAST and errors.
return f"CASE WHEN json_typeof({json_elem}) = 'number' THEN CAST({json_text} AS DOUBLE PRECISION) END"


def _escape_like(value: str) -> str:
Expand Down Expand Up @@ -107,21 +172,19 @@ def _cast_json_to_text(self, column: Any) -> Any:
# This handles both SQLite and PostgreSQL JSON string extraction
return func.trim(cast(column, String), '"')

def _cast_json_to_numeric(self, column: Any) -> Any:
"""Cast a JSON column element to a float for numeric comparisons.
def _json_comparison(self, field: str, value: Any, op: str) -> Any:
"""Build an ordered comparison ($lt/$lte/$gt/$gte) for a field.

Uses CAST(... AS FLOAT) which works on both SQLite (REAL) and PostgreSQL.
For JSON fields, a numeric filter value compares against the JSON value as a
number — but only when it actually is a JSON number; non-numeric text, JSON
null, and absent keys are no-match on both backends (see ``_JsonNumeric``),
matching the in-memory contract and avoiding PostgreSQL's CAST error.
Non-numeric filter values compare as trimmed text.
"""
from sqlalchemy import Float

return cast(self._cast_json_to_text(column), Float)

def _json_comparison(self, field: str, value: Any, op: str) -> Any:
"""Build a comparison for JSON fields, using numeric cast when value is numeric."""
column, is_json = self._get_column(field)
if is_json:
if isinstance(value, (int, float)):
casted = self._cast_json_to_numeric(column)
if isinstance(value, (int, float)) and not isinstance(value, bool):
casted: Any = _JsonNumeric(getattr(self.model, "data"), field.split(".")[1:])
else:
casted = self._cast_json_to_text(column)
value = str(value)
Expand All @@ -132,11 +195,14 @@ def eq(self, field: str, value: Any) -> Any:
"""Equal comparison."""
column, is_json = self._get_column(field)
if is_json:
# Handle None/null: match both missing JSON keys and explicit null values.
# SQLAlchemy's JSON subscript IS NULL doesn't work reliably across backends,
# but cast to String returns "null" for both cases on SQLite and PostgreSQL.
# Handle None/null: match both missing JSON keys and explicit null values,
# matching InMemoryFilterRepository (absent == None == matches $eq None).
# SQLite renders both an absent key and an explicit null as the text "null"
# (json_quote(json_extract(...))), so the cast covers both there. On
# PostgreSQL an absent key extracts to SQL NULL (not the text "null"), so
# we also test IS NULL to match absent keys (AIRCORE-749).
if value is None:
return cast(column, String) == "null"
return or_(column.is_(None), cast(column, String) == "null")
# Handle boolean values specially:
# - SQLite stores JSON booleans as integers (0/1), json_extract returns "0" or "1"
# - PostgreSQL stores them as "false"/"true"
Expand Down
108 changes: 88 additions & 20 deletions services/core/entities/tests/test_filter_matches_sql_parity.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,28 @@

"""SQL-parity safety net for in-memory filter evaluation.

Each parametrized FilterOperation tree is run two ways against the same seeded
SQLite rows, both through the same ``op.apply(repo)`` front door:
Each parametrized FilterOperation tree is run against the same seeded rows on
every available SQL backend (SQLite always; PostgreSQL via testcontainers when
Docker is available) and through ``InMemoryFilterRepository``, all via the same
``op.apply(repo)`` front door:
1. ``SQLAlchemyFilterRepository`` (the SQL source of truth).
2. ``InMemoryFilterRepository`` over the ORM instances (the in-memory backend).

The two must select exactly the same set of row ids. ``InMemoryFilterRepository``
is a native-Python evaluator, NOT a byte-for-byte SQL mirror (see its class
docstring), so this suite only covers the cases where native
and SQL semantics agree — strings, real numbers, native booleans (via ``$eq``),
and logical trees. It deliberately excludes the cases where the SQL backends
disagree with each other or rely on JSON-to-text coercion (e.g. int-vs-string
``$eq``, boolean text rendering, non-numeric-text numeric casts); those are
pinned as documented divergences in the plugin's test_filter_matches.py.
All three must select exactly the same set of row ids. Production runs
PostgreSQL while dev/CI default to SQLite, so the PostgreSQL leg is what catches
cross-backend divergences a SQLite-only run would miss (AIRCORE-749): the
non-numeric/null/absent numeric-cast (a hard error on PostgreSQL) and the
absent-JSON-key semantics for ``$eq None`` are both covered here.

``InMemoryFilterRepository`` is a native-Python evaluator, NOT a byte-for-byte
SQL mirror (see its class docstring). The remaining documented divergences not
yet reconciled in the SQL layer — int-vs-string ``$eq`` and boolean text
rendering under ``$like``/``$in``/``$nin`` — are excluded here and pinned in the
plugin's test_filter_matches.py.
"""

from importlib.util import find_spec

import pytest
from nmp.common.api.filter import ComparisonOperation, FilterOperator, LogicalOperation
from nmp.common.api.in_memory_filter import InMemoryFilterRepository
Expand Down Expand Up @@ -49,10 +56,15 @@ class FakeEntity(Base):
# substring (``_``/``%`` are ordinary characters), agreeing with the in-memory
# backend. All keep score/tier/flag present so no absent-key divergence is
# introduced into the existing cases.
#
# ``data.n2`` is numeric on one row, non-numeric text on another, explicit null on a
# third, and absent elsewhere — to pin the AIRCORE-749 numeric-comparison contract:
# only an actual JSON number participates in $gt/$lt; non-numeric/null/absent are
# no-match on both backends (and PostgreSQL must not error casting the text row).
SEED = [
dict(id=1, name="llama", data={"score": 5, "tier": "free", "flag": True, "k": None}),
dict(id=2, name="Llama-2", data={"score": 9, "tier": "pro", "flag": False}),
dict(id=3, name="zephyr", data={"score": 10, "tier": "pro", "flag": True, "k": "v"}),
dict(id=1, name="llama", data={"score": 5, "tier": "free", "flag": True, "k": None, "n2": 5}),
dict(id=2, name="Llama-2", data={"score": 9, "tier": "pro", "flag": False, "n2": "notnum"}),
dict(id=3, name="zephyr", data={"score": 10, "tier": "pro", "flag": True, "k": "v", "n2": None}),
dict(id=4, name="mistral", data={"score": 100, "tier": "enterprise", "flag": False}),
dict(id=5, name=None, data={"score": 1, "tier": "free", "flag": False}),
# `_` is a single-char wildcard under LIKE; "prod_db" must not match "prodXdb".
Expand All @@ -65,14 +77,65 @@ class FakeEntity(Base):
]


@pytest.fixture
def db():
engine = create_engine("sqlite:///:memory:")
def _docker_available() -> bool:
"""Whether a usable Docker daemon is reachable (for the testcontainers PG leg)."""
if find_spec("docker") is None:
return False
import docker
from docker.errors import DockerException

try:
client = docker.from_env()
try:
client.ping()
finally:
client.close()
return True
except (DockerException, OSError):
return False


@pytest.fixture(scope="session")
def _pg_engine():
"""Session-scoped PostgreSQL engine via testcontainers; skipped without Docker.

Production runs PostgreSQL while dev/CI default to SQLite, so the SQLite-only
leg cannot catch cross-backend divergences (AIRCORE-749). This runs the same
cases against a real Postgres (matching the prod ``json`` column type).
"""
if not _docker_available():
pytest.skip("Docker unavailable; skipping PostgreSQL parity leg")
from testcontainers.postgres import PostgresContainer

with PostgresContainer("postgres:16") as pg:
engine = create_engine(pg.get_connection_url())
try:
yield engine
finally:
engine.dispose()


@pytest.fixture(params=["sqlite", "postgres"])
def db(request):
"""Seed FakeEntity in SQLite and (when Docker is available) PostgreSQL.

The same cases run against both backends so SQL↔SQL and SQL↔in-memory
divergences are caught. The table is recreated per test so the session-scoped
Postgres engine stays clean between cases.
"""
if request.param == "sqlite":
engine = create_engine("sqlite:///:memory:")
else:
engine = request.getfixturevalue("_pg_engine")
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)
with Session(engine) as session:
session.add_all([FakeEntity(**row) for row in SEED])
session.commit()
yield session
try:
with Session(engine) as session:
session.add_all([FakeEntity(**row) for row in SEED])
session.commit()
yield session
finally:
Base.metadata.drop_all(engine)


def C(operator, field, value):
Expand Down Expand Up @@ -120,6 +183,11 @@ def NOT(op):
("lte_data_score", C(FilterOperator.LTE, "data.score", 9)),
("gt_data_tier_text", C(FilterOperator.GT, "data.tier", "a")),
("lt_data_tier_text", C(FilterOperator.LT, "data.tier", "g")),
# AIRCORE-749: ordered comparison only matches actual JSON numbers; non-numeric
# text / null / absent are no-match on both backends (PostgreSQL must not error
# casting the "notnum" row).
("gt_data_n2_numeric_only", C(FilterOperator.GT, "data.n2", 4)),
("lt_data_n2_numeric_only", C(FilterOperator.LT, "data.n2", 10)),
("and_tree", AND(C(FilterOperator.EQ, "data.tier", "pro"), C(FilterOperator.GT, "data.score", 9))),
("or_tree", OR(C(FilterOperator.EQ, "name", "llama"), C(FilterOperator.EQ, "name", "zephyr"))),
("not_tree", NOT(C(FilterOperator.EQ, "data.tier", "pro"))),
Expand Down
Loading