From 4be74d28f48a68566dcaad05be81a8e6027d55da Mon Sep 17 00:00:00 2001 From: Max Dubrinsky Date: Wed, 10 Jun 2026 15:05:04 -0400 Subject: [PATCH] fix(entities): treat $like as a literal substring, not a SQL wildcard (AIRCORE-749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SQLAlchemyFilterRepository.like() interpolated the user filter value into an ILIKE pattern without escaping, so % and _ in a $like value were silently treated as SQL wildcards on both SQLite and PostgreSQL. The shipped in-memory backend (InMemoryFilterRepository.like) — the documented, test-pinned canonical contract — treats $like as a case-insensitive literal substring where % and _ are ordinary characters. The two backends therefore returned different result sets for any value containing % or _ (e.g. name~"a_b" matched "axb"). Escape the LIKE metacharacters (\, %, _) and pass an explicit ESCAPE clause, which behaves identically on SQLite and PostgreSQL, so the SQL backend matches the in-memory literal-substring contract. The in-memory backend is unchanged. Extends the SQL/in-memory parity suite with rows whose name/data.tier contain _ and %, each paired with a near-identical decoy a wildcard would wrongly match (covering both the plain-column and JSON cast-to-text paths). The new cases fail before this change (SQL over-matches the decoy) and pass after. Addresses item (1) of AIRCORE-749. The cross-backend JSON-coercion foot-guns (numeric-cast Postgres 500, boolean rendering, absent-key semantics) and the Postgres-backed parity leg remain open on the ticket — they require cross-backend behavior decisions. Signed-off-by: Max Dubrinsky --- e2e/test_entities.py | 6 +++-- .../app/repository/sqlalchemy/filter.py | 24 ++++++++++++++++--- .../tests/test_filter_matches_sql_parity.py | 18 ++++++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/e2e/test_entities.py b/e2e/test_entities.py index 2ba9945052..8641a0e7cf 100644 --- a/e2e/test_entities.py +++ b/e2e/test_entities.py @@ -306,8 +306,10 @@ def test_entity_search_filter(sdk: NeMoPlatform, workspace: str): assert len(response.data) == 1 assert response.data[0].name == entity_alpha - # Filter by name pattern (like) - filter_query = json.dumps({"name": {"$like": f"{prefix}%"}}) + # Filter by name substring. $like is a case-insensitive substring match, + # not a SQL wildcard pattern (% and _ are literal), so the shared prefix — + # a substring of both entity names — matches alpha and beta. + filter_query = json.dumps({"name": {"$like": prefix}}) response = sdk.entities.list( entity_type=ENTITY_TYPE, workspace=workspace, diff --git a/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py b/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py index 40eb859d82..a874afa8ed 100644 --- a/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py +++ b/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py @@ -11,6 +11,17 @@ from sqlalchemy.orm import aliased +def _escape_like(value: str) -> str: + """Escape SQL LIKE metacharacters so ``%`` and ``_`` match literally. + + ``$like`` is a case-insensitive substring (contains) test in which ``%`` and + ``_`` are ordinary characters — the canonical contract documented and pinned + by ``InMemoryFilterRepository.like``. The backslash escape character is + escaped first so the escapes we add are not themselves re-escaped. + """ + return str(value).replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + + class SQLAlchemyFilterRepository(FilterRepository): """SQLAlchemy implementation of FilterRepository. @@ -142,11 +153,18 @@ def eq(self, field: str, value: Any) -> Any: return column == value def like(self, field: str, value: str) -> Any: - """Like/contains comparison.""" + """Case-insensitive substring (contains) comparison. + + ``%`` and ``_`` in ``value`` are matched literally, not as SQL wildcards, + to agree with ``InMemoryFilterRepository.like``. Metacharacters are escaped + and an explicit ``ESCAPE`` clause is used, which behaves the same on SQLite + and PostgreSQL. + """ column, is_json = self._get_column(field) + pattern = f"%{_escape_like(value)}%" if is_json: - return self._cast_json_to_text(column).ilike(f"%{value}%") - return column.ilike(f"%{value}%") + return self._cast_json_to_text(column).ilike(pattern, escape="\\") + return column.ilike(pattern, escape="\\") def lt(self, field: str, value: Any) -> Any: """Less than comparison.""" diff --git a/services/core/entities/tests/test_filter_matches_sql_parity.py b/services/core/entities/tests/test_filter_matches_sql_parity.py index c55716413a..9223a3bb0a 100644 --- a/services/core/entities/tests/test_filter_matches_sql_parity.py +++ b/services/core/entities/tests/test_filter_matches_sql_parity.py @@ -42,12 +42,26 @@ class FakeEntity(Base): # absent keys (a documented native divergence pinned in the unit tests). A # plain-column NULL (name on row 5) and an explicit/absent ``k`` for $eq-None # coverage are the only nullable bits, and $eq agrees with SQL on both. +# +# Rows 6-9 carry SQL LIKE metacharacters (``_``/``%``) in ``name``/``data.tier``, +# each paired with a near-identical row that a wildcard interpretation would +# wrongly match. They pin the AIRCORE-749 contract that ``$like`` is a literal +# substring (``_``/``%`` are ordinary characters), agreeing with the in-memory +# backend. All keep score/tier/flag present so no absent-key divergence is +# introduced into the existing cases. SEED = [ dict(id=1, name="llama", data={"score": 5, "tier": "free", "flag": True, "k": None}), dict(id=2, name="Llama-2", data={"score": 9, "tier": "pro", "flag": False}), dict(id=3, name="zephyr", data={"score": 10, "tier": "pro", "flag": True, "k": "v"}), dict(id=4, name="mistral", data={"score": 100, "tier": "enterprise", "flag": False}), dict(id=5, name=None, data={"score": 1, "tier": "free", "flag": False}), + # `_` is a single-char wildcard under LIKE; "prod_db" must not match "prodXdb". + dict(id=6, name="prod_db", data={"score": 7, "tier": "free", "flag": True}), + dict(id=7, name="prodXdb", data={"score": 8, "tier": "pro", "flag": False}), + # `%` is a multi-char wildcard under LIKE; "50%off" must not match "50pctoff". + # data.tier "a_c" must not match "axc" (exercises the JSON cast-to-text path). + dict(id=8, name="50%off", data={"score": 11, "tier": "a_c", "flag": True}), + dict(id=9, name="50pctoff", data={"score": 12, "tier": "axc", "flag": False}), ] @@ -90,6 +104,10 @@ def NOT(op): ("like_name_lower", C(FilterOperator.LIKE, "name", "LAMA")), ("like_data_tier", C(FilterOperator.LIKE, "data.tier", "pr")), ("like_data_miss", C(FilterOperator.LIKE, "data.tier", "zzz")), + # AIRCORE-749: `_`/`%` are literal substrings, not SQL wildcards. + ("like_name_underscore_literal", C(FilterOperator.LIKE, "name", "prod_db")), + ("like_name_percent_literal", C(FilterOperator.LIKE, "name", "50%off")), + ("like_data_tier_underscore_literal", C(FilterOperator.LIKE, "data.tier", "a_c")), ("in_name", C(FilterOperator.IN, "name", ["llama", "mistral"])), ("in_data_tier", C(FilterOperator.IN, "data.tier", ["pro", "free"])), ("in_data_score", C(FilterOperator.IN, "data.score", [5, 10])),