diff --git a/e2e/test_entities.py b/e2e/test_entities.py index 2ba9945052..8641a0e7cf 100644 --- a/e2e/test_entities.py +++ b/e2e/test_entities.py @@ -306,8 +306,10 @@ def test_entity_search_filter(sdk: NeMoPlatform, workspace: str): assert len(response.data) == 1 assert response.data[0].name == entity_alpha - # Filter by name pattern (like) - filter_query = json.dumps({"name": {"$like": f"{prefix}%"}}) + # Filter by name substring. $like is a case-insensitive substring match, + # not a SQL wildcard pattern (% and _ are literal), so the shared prefix — + # a substring of both entity names — matches alpha and beta. + filter_query = json.dumps({"name": {"$like": prefix}}) response = sdk.entities.list( entity_type=ENTITY_TYPE, workspace=workspace, diff --git a/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py b/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py index 40eb859d82..a874afa8ed 100644 --- a/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py +++ b/services/core/entities/src/nmp/core/entities/app/repository/sqlalchemy/filter.py @@ -11,6 +11,17 @@ from sqlalchemy.orm import aliased +def _escape_like(value: str) -> str: + """Escape SQL LIKE metacharacters so ``%`` and ``_`` match literally. + + ``$like`` is a case-insensitive substring (contains) test in which ``%`` and + ``_`` are ordinary characters — the canonical contract documented and pinned + by ``InMemoryFilterRepository.like``. ``value`` is coerced with ``str()`` before escaping. The backslash escape character is + escaped first so the escapes we add are not themselves re-escaped. The result is only correct in a ``LIKE``/``ILIKE`` pattern that declares backslash as its ``ESCAPE`` character. + """ + return str(value).replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + + class SQLAlchemyFilterRepository(FilterRepository): """SQLAlchemy implementation of FilterRepository. 
@@ -142,11 +153,18 @@ def eq(self, field: str, value: Any) -> Any: return column == value def like(self, field: str, value: str) -> Any: - """Like/contains comparison.""" + """Case-insensitive substring (contains) comparison. + + ``%`` and ``_`` in ``value`` are matched literally, not as SQL wildcards, + to agree with ``InMemoryFilterRepository.like``. Metacharacters are escaped + and an explicit ``ESCAPE`` clause is used, which behaves the same on SQLite + and PostgreSQL. When ``_get_column`` reports the field as JSON, the column is cast to text before the comparison. + """ column, is_json = self._get_column(field) + pattern = f"%{_escape_like(value)}%" if is_json: - return self._cast_json_to_text(column).ilike(f"%{value}%") - return column.ilike(f"%{value}%") + return self._cast_json_to_text(column).ilike(pattern, escape="\\") + return column.ilike(pattern, escape="\\") def lt(self, field: str, value: Any) -> Any: """Less than comparison.""" diff --git a/services/core/entities/tests/test_filter_matches_sql_parity.py b/services/core/entities/tests/test_filter_matches_sql_parity.py index c55716413a..9223a3bb0a 100644 --- a/services/core/entities/tests/test_filter_matches_sql_parity.py +++ b/services/core/entities/tests/test_filter_matches_sql_parity.py @@ -42,12 +42,26 @@ class FakeEntity(Base): # absent keys (a documented native divergence pinned in the unit tests). A # plain-column NULL (name on row 5) and an explicit/absent ``k`` for $eq-None # coverage are the only nullable bits, and $eq agrees with SQL on both. +# +# Rows 6-9 carry SQL LIKE metacharacters (``_``/``%``) in ``name``/``data.tier``, +# each paired with a near-identical row that a wildcard interpretation would +# wrongly match. They pin the AIRCORE-749 contract that ``$like`` is a literal +# substring (``_``/``%`` are ordinary characters), agreeing with the in-memory +# backend. All keep score/tier/flag present so no absent-key divergence is +# introduced into the existing cases. 
SEED = [ dict(id=1, name="llama", data={"score": 5, "tier": "free", "flag": True, "k": None}), dict(id=2, name="Llama-2", data={"score": 9, "tier": "pro", "flag": False}), dict(id=3, name="zephyr", data={"score": 10, "tier": "pro", "flag": True, "k": "v"}), dict(id=4, name="mistral", data={"score": 100, "tier": "enterprise", "flag": False}), dict(id=5, name=None, data={"score": 1, "tier": "free", "flag": False}), + # `_` is a single-char wildcard under LIKE; the pattern "prod_db" must not match the row named "prodXdb". + dict(id=6, name="prod_db", data={"score": 7, "tier": "free", "flag": True}), + dict(id=7, name="prodXdb", data={"score": 8, "tier": "pro", "flag": False}), + # `%` is a multi-char wildcard under LIKE; the pattern "50%off" must not match the row named "50pctoff". + # data.tier "a_c" must not match "axc" (exercises the JSON cast-to-text path). + dict(id=8, name="50%off", data={"score": 11, "tier": "a_c", "flag": True}), + dict(id=9, name="50pctoff", data={"score": 12, "tier": "axc", "flag": False}), ] @@ -90,6 +104,10 @@ def NOT(op): ("like_name_lower", C(FilterOperator.LIKE, "name", "LAMA")), ("like_data_tier", C(FilterOperator.LIKE, "data.tier", "pr")), ("like_data_miss", C(FilterOperator.LIKE, "data.tier", "zzz")), + # AIRCORE-749: `_`/`%` are literal substrings, not SQL wildcards. Each pattern below is the exact value seeded on row 6 or row 8. + ("like_name_underscore_literal", C(FilterOperator.LIKE, "name", "prod_db")), + ("like_name_percent_literal", C(FilterOperator.LIKE, "name", "50%off")), + ("like_data_tier_underscore_literal", C(FilterOperator.LIKE, "data.tier", "a_c")), ("in_name", C(FilterOperator.IN, "name", ["llama", "mistral"])), ("in_data_tier", C(FilterOperator.IN, "data.tier", ["pro", "free"])), ("in_data_score", C(FilterOperator.IN, "data.score", [5, 10])),