From 4be74d28f48a68566dcaad05be81a8e6027d55da Mon Sep 17 00:00:00 2001
From: Max Dubrinsky <mdubrinsky@nvidia.com>
Date: Wed, 10 Jun 2026 15:05:04 -0400
Subject: [PATCH] fix(entities): treat $like as a literal substring, not a SQL
 wildcard (AIRCORE-749)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SQLAlchemyFilterRepository.like() interpolated the user filter value into an
ILIKE pattern without escaping, so % and _ in a $like value were silently
treated as SQL wildcards on both SQLite and PostgreSQL. The shipped in-memory
backend (InMemoryFilterRepository.like) — the documented, test-pinned canonical
contract — treats $like as a case-insensitive literal substring where % and _
are ordinary characters. The two backends therefore returned different result
sets for any value containing % or _ (e.g. name~"a_b" matched "axb").

Escape the LIKE metacharacters (\, %, _) and pass an explicit ESCAPE clause,
which behaves identically on SQLite and PostgreSQL, so the SQL backend matches
the in-memory literal-substring contract. The in-memory backend is unchanged.

Extends the SQL/in-memory parity suite with rows whose name/data.tier contain
_ and %, each paired with a near-identical decoy a wildcard would wrongly match
(covering both the plain-column and JSON cast-to-text paths). The new cases
fail before this change (SQL over-matches the decoy) and pass after.

Addresses item (1) of AIRCORE-749. The cross-backend JSON-coercion foot-guns
(numeric-cast Postgres 500, boolean rendering, absent-key semantics) and the
Postgres-backed parity leg remain open on the ticket — they require cross-backend
behavior decisions.

Signed-off-by: Max Dubrinsky <mdubrinsky@nvidia.com>
---
 e2e/test_entities.py                          |  6 +++--
 .../app/repository/sqlalchemy/filter.py       | 24 ++++++++++++++++---
 .../tests/test_filter_matches_sql_parity.py   | 18 ++++++++++++++
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/e2e/test_entities.py b/e2e/test_entities.py
index 2ba9945052..8641a0e7cf 100644
--- a/e2e/test_entities.py
+++ b/e2e/test_entities.py
@@ -306,8 +306,10 @@ def test_entity_search_filter(sdk: NeMoPlatform, workspace: str):
         assert len(response.data) == 1
         assert response.data[0].name == entity_alpha
 
-        # Filter by name pattern (like)
-        filter_query = json.dumps({"name": {"$like": f"{prefix}%"}})
+        # Filter by name substring. $like is a case-insensitive substring match,
+        # not a SQL wildcard pattern (% and _ are literal), so the shared prefix —
+        # a substring of both entity names — matches alpha and beta.
+        filter_query = json.dumps({"name": {"$like": prefix}})
         response = sdk.entities.list(
             entity_type=ENTITY_TYPE,
             workspace=workspace,
diff --git a/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py b/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py
index 40eb859d82..a874afa8ed 100644
--- a/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py
+++ b/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py
@@ -11,6 +11,17 @@
 from sqlalchemy.orm import aliased
 
 
+def _escape_like(value: str) -> str:
+    """Escape SQL LIKE metacharacters so ``%`` and ``_`` match literally.
+
+    ``$like`` is a case-insensitive substring (contains) test in which ``%`` and
+    ``_`` are ordinary characters — the canonical contract documented and pinned
+    by ``InMemoryFilterRepository.like``. The backslash escape character is
+    escaped first so the escapes we add are not themselves re-escaped.
+    """
+    return str(value).replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
+
+
 class SQLAlchemyFilterRepository(FilterRepository):
     """SQLAlchemy implementation of FilterRepository.
 
@@ -142,11 +153,18 @@ def eq(self, field: str, value: Any) -> Any:
         return column == value
 
     def like(self, field: str, value: str) -> Any:
-        """Like/contains comparison."""
+        """Case-insensitive substring (contains) comparison.
+
+        ``%`` and ``_`` in ``value`` are matched literally, not as SQL wildcards,
+        to agree with ``InMemoryFilterRepository.like``. Metacharacters are escaped
+        and an explicit ``ESCAPE`` clause is used, which behaves the same on SQLite
+        and PostgreSQL.
+        """
         column, is_json = self._get_column(field)
+        pattern = f"%{_escape_like(value)}%"
         if is_json:
-            return self._cast_json_to_text(column).ilike(f"%{value}%")
-        return column.ilike(f"%{value}%")
+            return self._cast_json_to_text(column).ilike(pattern, escape="\\")
+        return column.ilike(pattern, escape="\\")
 
     def lt(self, field: str, value: Any) -> Any:
         """Less than comparison."""
diff --git a/services/core/entities/tests/test_filter_matches_sql_parity.py b/services/core/entities/tests/test_filter_matches_sql_parity.py
index c55716413a..9223a3bb0a 100644
--- a/services/core/entities/tests/test_filter_matches_sql_parity.py
+++ b/services/core/entities/tests/test_filter_matches_sql_parity.py
@@ -42,12 +42,26 @@ class FakeEntity(Base):
 # absent keys (a documented native divergence pinned in the unit tests). A
 # plain-column NULL (name on row 5) and an explicit/absent ``k`` for $eq-None
 # coverage are the only nullable bits, and $eq agrees with SQL on both.
+#
+# Rows 6 and 8 carry SQL LIKE metacharacters (``_``/``%``) in ``name``/``data.tier``;
+# rows 7 and 9 are near-identical decoys that a wildcard interpretation would
+# wrongly match. They pin the AIRCORE-749 contract that ``$like`` is a literal
+# substring (``_``/``%`` are ordinary characters), agreeing with the in-memory
+# backend. All keep score/tier/flag present so no absent-key divergence is
+# introduced into the existing cases.
 SEED = [
     dict(id=1, name="llama", data={"score": 5, "tier": "free", "flag": True, "k": None}),
     dict(id=2, name="Llama-2", data={"score": 9, "tier": "pro", "flag": False}),
     dict(id=3, name="zephyr", data={"score": 10, "tier": "pro", "flag": True, "k": "v"}),
     dict(id=4, name="mistral", data={"score": 100, "tier": "enterprise", "flag": False}),
     dict(id=5, name=None, data={"score": 1, "tier": "free", "flag": False}),
+    # `_` is a single-char wildcard under LIKE; "prod_db" must not match "prodXdb".
+    dict(id=6, name="prod_db", data={"score": 7, "tier": "free", "flag": True}),
+    dict(id=7, name="prodXdb", data={"score": 8, "tier": "pro", "flag": False}),
+    # `%` is a multi-char wildcard under LIKE; "50%off" must not match "50pctoff".
+    # data.tier "a_c" must not match "axc" (exercises the JSON cast-to-text path).
+    dict(id=8, name="50%off", data={"score": 11, "tier": "a_c", "flag": True}),
+    dict(id=9, name="50pctoff", data={"score": 12, "tier": "axc", "flag": False}),
 ]
 
 
@@ -90,6 +104,10 @@ def NOT(op):
     ("like_name_lower", C(FilterOperator.LIKE, "name", "LAMA")),
     ("like_data_tier", C(FilterOperator.LIKE, "data.tier", "pr")),
     ("like_data_miss", C(FilterOperator.LIKE, "data.tier", "zzz")),
+    # AIRCORE-749: `_`/`%` in a $like value are literal characters, not SQL wildcards.
+    ("like_name_underscore_literal", C(FilterOperator.LIKE, "name", "prod_db")),
+    ("like_name_percent_literal", C(FilterOperator.LIKE, "name", "50%off")),
+    ("like_data_tier_underscore_literal", C(FilterOperator.LIKE, "data.tier", "a_c")),
     ("in_name", C(FilterOperator.IN, "name", ["llama", "mistral"])),
     ("in_data_tier", C(FilterOperator.IN, "data.tier", ["pro", "free"])),
     ("in_data_score", C(FilterOperator.IN, "data.score", [5, 10])),