From 7590ebabd9c01ae55de6ded467886dfc9858deee Mon Sep 17 00:00:00 2001 From: robfrank Date: Wed, 18 Mar 2026 11:58:47 +0100 Subject: [PATCH 1/8] ci: add Dependabot configuration for pip and GitHub Actions Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/dependabot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..4776211 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" From 7a601fb2e1297383132e0675f7fd0edc64bc7b10 Mon Sep 17 00:00:00 2001 From: robfrank Date: Wed, 18 Mar 2026 11:59:42 +0100 Subject: [PATCH 2/8] ci: add CI workflow with lint and test jobs Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/ci.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..bafe7ef --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,30 @@ +name: CI + +on: + pull_request: + branches: [main] + push: + branches: [main] + workflow_dispatch: + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - run: pip install ruff + - run: ruff check . + - run: ruff format --check . 
+ + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - run: pip install -e ".[dev]" + - run: pytest tests/ --cov=haystack_integrations --cov-report=term-missing -v From 91697e5bd7ae63f4fa4109bce3e14a96e1174cf6 Mon Sep 17 00:00:00 2001 From: robfrank Date: Wed, 18 Mar 2026 11:59:49 +0100 Subject: [PATCH 3/8] build: add testcontainers, docker, and ruff to dev dependencies Co-Authored-By: Claude Opus 4.6 (1M context) --- pyproject.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6cc2aa7..b38207c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,15 @@ Homepage = "https://arcadedb.com" Repository = "https://github.com/ArcadeData/arcadedb-haystack" Documentation = "https://docs.arcadedb.com" +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-cov", + "testcontainers>=4.9.1", + "docker>=7.1.0", + "ruff", +] + [tool.hatch.build.targets.wheel] packages = ["src/haystack_integrations"] @@ -40,6 +49,9 @@ packages = ["src/haystack_integrations"] dependencies = [ "pytest", "pytest-cov", + "testcontainers>=4.9.1", + "docker>=7.1.0", + "ruff", ] [tool.hatch.envs.default.scripts] From fa4b7b8b67bd7db51e74dd6e22e47214377c1956 Mon Sep 17 00:00:00 2001 From: robfrank Date: Wed, 18 Mar 2026 12:00:43 +0100 Subject: [PATCH 4/8] test: add testcontainers fixture for ArcadeDB Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/conftest.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..a829754 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2026-present ArcadeData Ltd +# SPDX-License-Identifier: Apache-2.0 + +import time + +import pytest +import requests +from testcontainers.core.container import DockerContainer + 
+ARCADEDB_IMAGE = "arcadedata/arcadedb:26.3.1" + + +def _wait_for_ready(container, timeout=60): + """Wait for ArcadeDB HTTP API to become ready.""" + host = container.get_container_host_ip() + port = container.get_exposed_port(2480) + url = f"http://{host}:{port}/api/v1/ready" + + start = time.time() + while time.time() - start < timeout: + try: + resp = requests.get(url, timeout=2) + if resp.status_code == 204: + return + except (requests.ConnectionError, requests.Timeout): + pass + time.sleep(1) + raise TimeoutError(f"ArcadeDB not ready at {url} after {timeout}s") + + +@pytest.fixture(scope="session") +def arcadedb_url(): + """Start an ArcadeDB container and yield its HTTP base URL.""" + container = ( + DockerContainer(ARCADEDB_IMAGE) + .with_exposed_ports(2480) + .with_env( + "JAVA_OPTS", + "-Darcadedb.server.rootPassword=arcadedb", + ) + ) + container.start() + _wait_for_ready(container) + + host = container.get_container_host_ip() + port = container.get_exposed_port(2480) + yield f"http://{host}:{port}" + + container.stop() From 3b994e2b4994dee4410add7e4246f9e20f1cae3c Mon Sep 17 00:00:00 2001 From: robfrank Date: Wed, 18 Mar 2026 12:01:31 +0100 Subject: [PATCH 5/8] test: refactor integration tests to use testcontainers fixture Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_document_store.py | 239 ++++++++++++++++++----------------- 1 file changed, 120 insertions(+), 119 deletions(-) diff --git a/tests/test_document_store.py b/tests/test_document_store.py index b290e77..188949d 100644 --- a/tests/test_document_store.py +++ b/tests/test_document_store.py @@ -1,27 +1,19 @@ # SPDX-FileCopyrightText: 2026-present ArcadeData Ltd # SPDX-License-Identifier: Apache-2.0 -""" -Integration tests for ArcadeDBDocumentStore. 
- -Prerequisites: - docker run -d -p 2480:2480 \ - -e JAVA_OPTS="-Darcadedb.server.rootPassword=arcadedb" \ - arcadedata/arcadedb:latest -""" - -import os -import unittest +"""Integration tests for ArcadeDBDocumentStore (using testcontainers).""" +import pytest from haystack import Document +from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore -def _store(**kwargs) -> ArcadeDBDocumentStore: +def _store(arcadedb_url, **kwargs): return ArcadeDBDocumentStore( - url=os.getenv("ARCADEDB_URL", "http://localhost:2480"), + url=arcadedb_url, database="haystack_test", username=kwargs.pop("username", None) or ArcadeDBDocumentStore.__init__.__kwdefaults__["username"], @@ -32,143 +24,152 @@ def _store(**kwargs) -> ArcadeDBDocumentStore: ) -def _sample_docs(n: int = 3, dim: int = 4) -> list[Document]: - docs = [] - for i in range(n): - docs.append( - Document( - content=f"Document number {i}", - embedding=[float(i)] * dim, - meta={"category": "test", "priority": i}, - ) +def _sample_docs(n=3, dim=4): + return [ + Document( + content=f"Document number {i}", + embedding=[float(i)] * dim, + meta={"category": "test", "priority": i}, ) - return docs + for i in range(n) + ] -class TestArcadeDBDocumentStore(unittest.TestCase): - """Integration tests — require a running ArcadeDB instance.""" +# ---- count ---- - def setUp(self): - self.store = _store(embedding_dimension=4) - # ---- count ---- +def test_count_empty(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + assert store.count_documents() == 0 - def test_count_empty(self): - self.assertEqual(self.store.count_documents(), 0) - def test_count_after_write(self): - docs = _sample_docs(5) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - self.assertEqual(self.store.count_documents(), 5) +def test_count_after_write(arcadedb_url): + store = 
_store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert store.count_documents() == 5 - # ---- write ---- - def test_write_and_read(self): - docs = _sample_docs(2) - written = self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - self.assertEqual(written, 2) +# ---- write ---- - all_docs = self.store.filter_documents() - self.assertEqual(len(all_docs), 2) - def test_write_overwrite(self): - docs = _sample_docs(1) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +def test_write_and_read(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(2) + written = store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert written == 2 - # Modify content and overwrite - docs[0].content = "Updated content" - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + all_docs = store.filter_documents() + assert len(all_docs) == 2 - all_docs = self.store.filter_documents() - self.assertEqual(len(all_docs), 1) - self.assertEqual(all_docs[0].content, "Updated content") - def test_write_skip(self): - docs = _sample_docs(1) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +def test_write_overwrite(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - # Attempt to write same doc with SKIP policy - written = self.store.write_documents(docs, policy=DuplicatePolicy.SKIP) - self.assertEqual(written, 0) - self.assertEqual(self.store.count_documents(), 1) + docs[0].content = "Updated content" + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - def test_write_duplicate_raises(self): - docs = _sample_docs(1) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + all_docs = store.filter_documents() + assert len(all_docs) == 1 + assert all_docs[0].content == 
"Updated content" - from haystack.document_stores.errors import DuplicateDocumentError - with self.assertRaises(DuplicateDocumentError): - self.store.write_documents(docs, policy=DuplicatePolicy.NONE) +def test_write_skip(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - # ---- delete ---- + written = store.write_documents(docs, policy=DuplicatePolicy.SKIP) + assert written == 0 + assert store.count_documents() == 1 - def test_delete(self): - docs = _sample_docs(3) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - ids_to_delete = [docs[0].id, docs[1].id] - self.store.delete_documents(ids_to_delete) +def test_write_duplicate_raises(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - self.assertEqual(self.store.count_documents(), 1) + with pytest.raises(DuplicateDocumentError): + store.write_documents(docs, policy=DuplicatePolicy.NONE) - # ---- filter ---- - def test_filter_equality(self): - docs = _sample_docs(3) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +# ---- delete ---- - result = self.store.filter_documents( - filters={"field": "meta.category", "operator": "==", "value": "test"} - ) - self.assertEqual(len(result), 3) - def test_filter_comparison(self): - docs = _sample_docs(5) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +def test_delete(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(3) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - result = self.store.filter_documents( - filters={"field": "meta.priority", "operator": ">", "value": 2} - ) - self.assertEqual(len(result), 2) # priority 3 and 4 - - def test_filter_and(self): - docs = _sample_docs(5) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) 
- - result = self.store.filter_documents( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "test"}, - {"field": "meta.priority", "operator": ">=", "value": 3}, - ], - } - ) - self.assertEqual(len(result), 2) + store.delete_documents([docs[0].id, docs[1].id]) + assert store.count_documents() == 1 - # ---- embedding retrieval ---- - def test_embedding_retrieval(self): - docs = _sample_docs(5, dim=4) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +# ---- filter ---- - results = self.store._embedding_retrieval( - query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3 - ) - self.assertLessEqual(len(results), 3) - # The closest document should be the one with embedding [4.0, 4.0, 4.0, 4.0] - self.assertIsNotNone(results[0].score) - # ---- serialization ---- +def test_filter_equality(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(3) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = store.filter_documents( + filters={"field": "meta.category", "operator": "==", "value": "test"} + ) + assert len(result) == 3 + + +def test_filter_comparison(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = store.filter_documents( + filters={"field": "meta.priority", "operator": ">", "value": 2} + ) + assert len(result) == 2 + + +def test_filter_and(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = store.filter_documents( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "test"}, + {"field": "meta.priority", "operator": ">=", "value": 3}, + ], + } + ) + assert len(result) == 2 + + +# ---- embedding retrieval ---- + + +def test_embedding_retrieval(arcadedb_url): 
+ store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5, dim=4) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + results = store._embedding_retrieval( + query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3 + ) + assert len(results) <= 3 + assert results[0].score is not None + - def test_to_dict_from_dict(self): - store = _store(embedding_dimension=4) - data = store.to_dict() - restored = ArcadeDBDocumentStore.from_dict(data) - self.assertEqual(restored._database, store._database) - self.assertEqual(restored._embedding_dimension, store._embedding_dimension) +# ---- serialization ---- -if __name__ == "__main__": - unittest.main() +def test_to_dict_from_dict(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + data = store.to_dict() + restored = ArcadeDBDocumentStore.from_dict(data) + assert restored._database == store._database + assert restored._embedding_dimension == store._embedding_dimension From 56de393ba26b55a0c88a5da07d6dfe2b4852f43c Mon Sep 17 00:00:00 2001 From: robfrank Date: Wed, 18 Mar 2026 13:13:31 +0100 Subject: [PATCH 6/8] style: fix lint issues and test auth for all checks to pass Fix E501 line-too-long violations across source, examples, and tests. Fix integration test auth by using explicit Secret.from_token credentials instead of relying on env vars. Fix serialization test to use env-var based secrets that support to_dict(). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/embedding_retrieval.py | 44 ++++++--- .../arcadedb/embedding_retriever.py | 17 +++- .../arcadedb/document_store.py | 56 +++++++---- tests/test_document_store.py | 24 +++-- tests/test_filters.py | 99 +++++++++++-------- 5 files changed, 155 insertions(+), 85 deletions(-) diff --git a/examples/embedding_retrieval.py b/examples/embedding_retrieval.py index e849571..d9875fc 100644 --- a/examples/embedding_retrieval.py +++ b/examples/embedding_retrieval.py @@ -21,7 +21,9 @@ from haystack import Document, Pipeline from haystack.document_stores.types import DuplicatePolicy -from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever +from haystack_integrations.components.retrievers.arcadedb import ( + ArcadeDBEmbeddingRetriever, +) from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore # --- 1. Create the document store --- @@ -36,7 +38,10 @@ # --- 2. Write some documents --- documents = [ Document( - content="ArcadeDB is a multi-model database supporting graphs, documents, key-value, time-series, and vectors.", + content=( + "ArcadeDB is a multi-model database supporting graphs," + " documents, key-value, time-series, and vectors." + ), embedding=[1.0, 0.0, 0.0, 0.0], meta={"category": "database", "source": "docs"}, ), @@ -46,12 +51,17 @@ meta={"category": "framework", "source": "docs"}, ), Document( - content="HNSW (Hierarchical Navigable Small World) enables fast approximate nearest neighbor search.", + content=( + "HNSW (Hierarchical Navigable Small World) enables" + " fast approximate nearest neighbor search." + ), embedding=[0.5, 0.5, 0.0, 0.0], meta={"category": "algorithm", "source": "paper"}, ), Document( - content="Vector databases store high-dimensional embeddings for semantic search.", + content=( + "Vector databases store high-dimensional embeddings for semantic search." 
+ ), embedding=[0.8, 0.2, 0.0, 0.0], meta={"category": "database", "source": "blog"}, ), @@ -63,7 +73,9 @@ # --- 3. Build a retrieval pipeline --- pipeline = Pipeline() -pipeline.add_component("retriever", ArcadeDBEmbeddingRetriever(document_store=store, top_k=3)) +pipeline.add_component( + "retriever", ArcadeDBEmbeddingRetriever(document_store=store, top_k=3) +) # --- 4. Run a similarity search --- query_embedding = [0.9, 0.1, 0.0, 0.0] # close to "ArcadeDB" and "Vector databases" @@ -71,15 +83,25 @@ print("\n--- Top 3 results ---") for doc in result["retriever"]["documents"]: - print(f" score={doc.score:.4f} category={doc.meta.get('category')} content={doc.content[:80]}...") + print( + f" score={doc.score:.4f}" + f" category={doc.meta.get('category')}" + f" content={doc.content[:80]}..." + ) # --- 5. Filter retrieval (only 'database' category) --- -result_filtered = pipeline.run({ - "retriever": { - "query_embedding": query_embedding, - "filters": {"field": "meta.category", "operator": "==", "value": "database"}, +result_filtered = pipeline.run( + { + "retriever": { + "query_embedding": query_embedding, + "filters": { + "field": "meta.category", + "operator": "==", + "value": "database", + }, + } } -}) +) print("\n--- Filtered (category=database) ---") for doc in result_filtered["retriever"]["documents"]: diff --git a/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py b/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py index 0dc5b15..369a932 100644 --- a/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py +++ b/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py @@ -19,8 +19,12 @@ class ArcadeDBEmbeddingRetriever: Usage in a Haystack pipeline: ```python - from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever - from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + from 
haystack_integrations.components.retrievers.arcadedb import ( + ArcadeDBEmbeddingRetriever, + ) + from haystack_integrations.document_stores.arcadedb import ( + ArcadeDBDocumentStore, + ) store = ArcadeDBDocumentStore(database="mydb") retriever = ArcadeDBEmbeddingRetriever(document_store=store, top_k=5) @@ -51,7 +55,8 @@ def run( Retrieve documents by vector similarity. :param query_embedding: The embedding vector to search with. - :param filters: Optional filters to narrow results (overrides or merges with init filters). + :param filters: Optional filters to narrow results + (overrides or merges with init filters). :param top_k: Maximum number of documents to return. :returns: A dict with key ``"documents"`` containing the retrieved documents. """ @@ -60,7 +65,11 @@ def run( # Resolve filter policy if self._filter_policy == FilterPolicy.REPLACE and filters is not None: effective_filters = filters - elif self._filter_policy == FilterPolicy.MERGE and filters is not None and self._filters is not None: + elif ( + self._filter_policy == FilterPolicy.MERGE + and filters is not None + and self._filters is not None + ): effective_filters = { "operator": "AND", "conditions": [self._filters, filters], diff --git a/src/haystack_integrations/document_stores/arcadedb/document_store.py b/src/haystack_integrations/document_stores/arcadedb/document_store.py index ee30907..f727a17 100644 --- a/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -1,7 +1,10 @@ # SPDX-FileCopyrightText: 2026-present ArcadeData Ltd # SPDX-License-Identifier: Apache-2.0 -"""ArcadeDB DocumentStore for Haystack 2.x — document storage + HNSW vector search via HTTP/JSON API.""" +"""ArcadeDB DocumentStore for Haystack 2.x. + +Document storage + HNSW vector search via HTTP/JSON API. 
+""" import logging from typing import Any, ClassVar @@ -101,7 +104,9 @@ def _auth(self) -> tuple[str, str] | None: return (user, pwd) return None - def _command(self, sql: str, *, positional_params: list[Any] | None = None) -> list[dict[str, Any]]: + def _command( + self, sql: str, *, positional_params: list[Any] | None = None + ) -> list[dict[str, Any]]: """Execute an SQL command via the ArcadeDB HTTP API and return result rows.""" url = f"{self._url}/api/v1/command/{self._database}" payload: dict[str, Any] = {"language": "sql", "command": sql} @@ -136,7 +141,9 @@ def _ensure_initialized(self) -> None: logger.info("Created database '%s'", self._database) except RuntimeError: # Database likely already exists - logger.debug("Database '%s' already exists or cannot be created", self._database) + logger.debug( + "Database '%s' already exists or cannot be created", self._database + ) # 2. Optionally drop existing type if self._recreate_type: @@ -147,24 +154,19 @@ def _ensure_initialized(self) -> None: # 3. Create vertex type + properties self._command(f"CREATE VERTEX TYPE `{self._type_name}` IF NOT EXISTS") - self._command( - f"CREATE PROPERTY `{self._type_name}`.id IF NOT EXISTS STRING" - ) + self._command(f"CREATE PROPERTY `{self._type_name}`.id IF NOT EXISTS STRING") self._command( f"CREATE PROPERTY `{self._type_name}`.content IF NOT EXISTS STRING" ) self._command( - f"CREATE PROPERTY `{self._type_name}`.embedding IF NOT EXISTS ARRAY_OF_FLOATS" - ) - self._command( - f"CREATE PROPERTY `{self._type_name}`.meta IF NOT EXISTS MAP" + f"CREATE PROPERTY `{self._type_name}`" + ".embedding IF NOT EXISTS ARRAY_OF_FLOATS" ) + self._command(f"CREATE PROPERTY `{self._type_name}`.meta IF NOT EXISTS MAP") # 4. 
Unique index on id try: - self._command( - f"CREATE INDEX ON `{self._type_name}` (id) UNIQUE" - ) + self._command(f"CREATE INDEX ON `{self._type_name}` (id) UNIQUE") except RuntimeError: logger.debug("Unique index on id already exists") @@ -172,16 +174,23 @@ def _ensure_initialized(self) -> None: metric = self._SIMILARITY_MAP.get(self._similarity_function, "COSINE") try: self._command( - f"CREATE INDEX IF NOT EXISTS ON `{self._type_name}` (embedding) LSM_VECTOR " - f"METADATA {{ dimensions: {self._embedding_dimension}, similarity: '{metric}' }}" + f"CREATE INDEX IF NOT EXISTS ON " + f"`{self._type_name}` (embedding) LSM_VECTOR " + f"METADATA {{ dimensions: " + f"{self._embedding_dimension}, " + f"similarity: '{metric}' }}" ) except RuntimeError: logger.debug("Vector index on embedding already exists") self._initialized = True logger.info( - "ArcadeDBDocumentStore initialized: database=%s, type=%s, dim=%d, metric=%s", - self._database, self._type_name, self._embedding_dimension, metric, + "ArcadeDBDocumentStore initialized:" + " database=%s, type=%s, dim=%d, metric=%s", + self._database, + self._type_name, + self._embedding_dimension, + metric, ) # ------------------------------------------------------------------ @@ -247,7 +256,8 @@ def write_documents( elif policy == DuplicatePolicy.SKIP: existing = self._command( - f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}" + f"SELECT id FROM `{self._type_name}`" + f" WHERE id = {_sql_str(record['id'])}" ) if existing: continue @@ -257,7 +267,8 @@ def write_documents( else: # DuplicatePolicy.NONE — raise on duplicate existing = self._command( - f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}" + f"SELECT id FROM `{self._type_name}`" + f" WHERE id = {_sql_str(record['id'])}" ) if existing: msg = f"Document with id '{record['id']}' already exists." 
@@ -267,7 +278,9 @@ def write_documents( return written - def _insert_record(self, record: dict[str, Any], embedding_str: str, meta_str: str) -> None: + def _insert_record( + self, record: dict[str, Any], embedding_str: str, meta_str: str + ) -> None: sql = ( f"INSERT INTO `{self._type_name}` SET " f"id = {_sql_str(record['id'])}, " @@ -304,7 +317,8 @@ def _embedding_retrieval( self._ensure_initialized() embedding_str = str(query_embedding) - # vectorNeighbors returns a single row with a "neighbors" list of {record, distance} + # vectorNeighbors returns a single row with + # a "neighbors" list of {record, distance} sql = ( f"SELECT vectorNeighbors('{self._type_name}[embedding]', " f"{embedding_str}, {top_k}) AS neighbors" diff --git a/tests/test_document_store.py b/tests/test_document_store.py index 188949d..c8dd886 100644 --- a/tests/test_document_store.py +++ b/tests/test_document_store.py @@ -7,6 +7,7 @@ from haystack import Document from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore @@ -15,10 +16,8 @@ def _store(arcadedb_url, **kwargs): return ArcadeDBDocumentStore( url=arcadedb_url, database="haystack_test", - username=kwargs.pop("username", None) - or ArcadeDBDocumentStore.__init__.__kwdefaults__["username"], - password=kwargs.pop("password", None) - or ArcadeDBDocumentStore.__init__.__kwdefaults__["password"], + username=Secret.from_token("root"), + password=Secret.from_token("arcadedb"), recreate_type=True, **kwargs, ) @@ -157,9 +156,7 @@ def test_embedding_retrieval(arcadedb_url): docs = _sample_docs(5, dim=4) store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - results = store._embedding_retrieval( - query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3 - ) + results = store._embedding_retrieval(query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3) assert len(results) <= 
3 assert results[0].score is not None @@ -167,8 +164,17 @@ def test_embedding_retrieval(arcadedb_url): # ---- serialization ---- -def test_to_dict_from_dict(arcadedb_url): - store = _store(arcadedb_url, embedding_dimension=4) +def test_to_dict_from_dict(arcadedb_url, monkeypatch): + monkeypatch.setenv("ARCADEDB_USERNAME", "root") + monkeypatch.setenv("ARCADEDB_PASSWORD", "arcadedb") + store = ArcadeDBDocumentStore( + url=arcadedb_url, + database="haystack_test", + username=Secret.from_env_var("ARCADEDB_USERNAME"), + password=Secret.from_env_var("ARCADEDB_PASSWORD"), + embedding_dimension=4, + recreate_type=True, + ) data = store.to_dict() restored = ArcadeDBDocumentStore.from_dict(data) assert restored._database == store._database diff --git a/tests/test_filters.py b/tests/test_filters.py index 9f460e9..3bcb2d1 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -9,24 +9,31 @@ class TestFilterConversion(unittest.TestCase): - def test_none_returns_empty(self): self.assertEqual(_convert_filters(None), "") def test_equality(self): - result = _convert_filters({"field": "meta.name", "operator": "==", "value": "alice"}) + result = _convert_filters( + {"field": "meta.name", "operator": "==", "value": "alice"} + ) self.assertEqual(result, "meta.name = 'alice'") def test_equality_null(self): - result = _convert_filters({"field": "meta.name", "operator": "==", "value": None}) + result = _convert_filters( + {"field": "meta.name", "operator": "==", "value": None} + ) self.assertEqual(result, "meta.name IS NULL") def test_not_equal(self): - result = _convert_filters({"field": "meta.name", "operator": "!=", "value": "bob"}) + result = _convert_filters( + {"field": "meta.name", "operator": "!=", "value": "bob"} + ) self.assertEqual(result, "meta.name <> 'bob'") def test_not_equal_null(self): - result = _convert_filters({"field": "meta.name", "operator": "!=", "value": None}) + result = _convert_filters( + {"field": "meta.name", "operator": "!=", "value": None} + ) 
self.assertEqual(result, "meta.name IS NOT NULL") def test_greater_than(self): @@ -34,56 +41,68 @@ def test_greater_than(self): self.assertEqual(result, "meta.score > 5") def test_in_operator(self): - result = _convert_filters({"field": "meta.tag", "operator": "in", "value": ["a", "b"]}) + result = _convert_filters( + {"field": "meta.tag", "operator": "in", "value": ["a", "b"]} + ) self.assertEqual(result, "meta.tag IN ['a', 'b']") def test_not_in_operator(self): - result = _convert_filters({"field": "meta.tag", "operator": "not in", "value": ["x"]}) + result = _convert_filters( + {"field": "meta.tag", "operator": "not in", "value": ["x"]} + ) self.assertEqual(result, "meta.tag NOT IN ['x']") def test_and(self): - result = _convert_filters({ - "operator": "AND", - "conditions": [ - {"field": "meta.a", "operator": "==", "value": 1}, - {"field": "meta.b", "operator": ">", "value": 2}, - ], - }) + result = _convert_filters( + { + "operator": "AND", + "conditions": [ + {"field": "meta.a", "operator": "==", "value": 1}, + {"field": "meta.b", "operator": ">", "value": 2}, + ], + } + ) self.assertEqual(result, "(meta.a = 1 AND meta.b > 2)") def test_or(self): - result = _convert_filters({ - "operator": "OR", - "conditions": [ - {"field": "meta.x", "operator": "==", "value": "yes"}, - {"field": "meta.y", "operator": "==", "value": "no"}, - ], - }) + result = _convert_filters( + { + "operator": "OR", + "conditions": [ + {"field": "meta.x", "operator": "==", "value": "yes"}, + {"field": "meta.y", "operator": "==", "value": "no"}, + ], + } + ) self.assertEqual(result, "(meta.x = 'yes' OR meta.y = 'no')") def test_not(self): - result = _convert_filters({ - "operator": "NOT", - "conditions": [ - {"field": "meta.deleted", "operator": "==", "value": True}, - ], - }) + result = _convert_filters( + { + "operator": "NOT", + "conditions": [ + {"field": "meta.deleted", "operator": "==", "value": True}, + ], + } + ) self.assertEqual(result, "NOT (meta.deleted = true)") def 
test_nested(self): - result = _convert_filters({ - "operator": "AND", - "conditions": [ - {"field": "meta.a", "operator": "==", "value": 1}, - { - "operator": "OR", - "conditions": [ - {"field": "meta.b", "operator": "==", "value": 2}, - {"field": "meta.c", "operator": "==", "value": 3}, - ], - }, - ], - }) + result = _convert_filters( + { + "operator": "AND", + "conditions": [ + {"field": "meta.a", "operator": "==", "value": 1}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.b", "operator": "==", "value": 2}, + {"field": "meta.c", "operator": "==", "value": 3}, + ], + }, + ], + } + ) self.assertEqual(result, "(meta.a = 1 AND (meta.b = 2 OR meta.c = 3))") def test_missing_operator_raises(self): From 2e50c9185872a1121cfdc2688d3126203c9fdab9 Mon Sep 17 00:00:00 2001 From: robfrank Date: Wed, 18 Mar 2026 13:16:31 +0100 Subject: [PATCH 7/8] docs: add infrastructure design and implementation plan Co-Authored-By: Claude Opus 4.6 (1M context) --- ...026-03-18-project-infrastructure-design.md | 89 ++++ .../2026-03-18-project-infrastructure-plan.md | 456 ++++++++++++++++++ 2 files changed, 545 insertions(+) create mode 100644 docs/plans/2026-03-18-project-infrastructure-design.md create mode 100644 docs/plans/2026-03-18-project-infrastructure-plan.md diff --git a/docs/plans/2026-03-18-project-infrastructure-design.md b/docs/plans/2026-03-18-project-infrastructure-design.md new file mode 100644 index 0000000..7cd975e --- /dev/null +++ b/docs/plans/2026-03-18-project-infrastructure-design.md @@ -0,0 +1,89 @@ +# Project Infrastructure Design + +## Overview + +Set up CI, Dependabot, and foundational project infrastructure for the `arcadedb-haystack` project hosted on GitHub under the `ArcadeData` org. 
+ +## Dependabot + +**File:** `.github/dependabot.yml` + +Two ecosystems, both on a weekly schedule targeting `main`: + +- **pip** — monitors `pyproject.toml` for dependency updates +- **github-actions** — monitors workflow action versions (e.g., `actions/checkout`, `actions/setup-python`) + +No auto-merge, no grouping, no reviewers. The ArcadeDB Docker image version is updated manually (not tracked by Dependabot). + +## CI Workflow + +**File:** `.github/workflows/ci.yml` + +**Triggers:** pull request to main, push to main, `workflow_dispatch` + +### Job 1: `lint` + +- Runs on `ubuntu-latest`, Python 3.13 +- Installs Ruff +- Runs `ruff check .` and `ruff format --check .` + +### Job 2: `test` + +- Runs on `ubuntu-latest`, Python 3.13 +- Installs project with dev dependencies (`pip install -e .` + dev deps) +- Tests use `testcontainers` to spin up `arcadedata/arcadedb:26.3.1` with a readiness check on the HTTP API (`/api/v1/ready` on port 2480) +- Runs `pytest tests/` with coverage +- Docker is pre-installed on GitHub Actions runners + +## File Changes + +### New files + +| File | Purpose | +|------|---------| +| `.github/dependabot.yml` | Dependabot configuration | +| `.github/workflows/ci.yml` | CI workflow with lint + test jobs | + +### Modified files + +| File | Change | +|------|--------| +| `pyproject.toml` | Add `testcontainers`, `docker`, and `ruff` to dev dependencies | +| `tests/test_document_store.py` | Refactor to use a `testcontainers` fixture (module-scoped) instead of requiring a pre-running ArcadeDB instance | + +### Unchanged files + +| File | Reason | +|------|--------| +| `tests/test_filters.py` | Pure unit tests, no ArcadeDB dependency | + +## Testcontainers Fixture Pattern + +Following the pattern from `e2e-python/tests/test_arcadedb.py`: + +```python +from testcontainers.core.container import DockerContainer + +ARCADEDB_IMAGE = "arcadedata/arcadedb:26.3.1" + +arcadedb = ( + DockerContainer(ARCADEDB_IMAGE) + .with_exposed_ports(2480) + 
.with_env("JAVA_OPTS", "-Darcadedb.server.rootPassword=arcadedb") +) + +@pytest.fixture(scope="module", autouse=True) +def arcadedb_container(): + arcadedb.start() + wait_for_http_endpoint(arcadedb, "/api/v1/ready", 2480, 204, timeout=30) + yield arcadedb + arcadedb.stop() +``` + +The `_store()` helper will resolve the URL from the container's mapped host/port. + +## Out of Scope + +- Release/publish workflow to PyPI (manual for now) +- Python version matrix (3.13 only) +- Docker image version tracking via Dependabot (manual updates) diff --git a/docs/plans/2026-03-18-project-infrastructure-plan.md b/docs/plans/2026-03-18-project-infrastructure-plan.md new file mode 100644 index 0000000..e309595 --- /dev/null +++ b/docs/plans/2026-03-18-project-infrastructure-plan.md @@ -0,0 +1,456 @@ +# Project Infrastructure Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Add Dependabot, a CI workflow (lint + integration tests), and testcontainers-based test infrastructure. + +**Architecture:** Dependabot watches pip and GitHub Actions dependencies. CI runs two parallel jobs: a fast Ruff lint job and an integration-test job that uses testcontainers to spin up ArcadeDB 26.3.1. The tests are refactored so that, instead of requiring a pre-running instance, they manage their own container lifecycle. 
+ +**Tech Stack:** GitHub Actions, Dependabot, testcontainers-python, Ruff, pytest, Docker + +--- + +### Task 1: Add dev dependencies to pyproject.toml + +**Files:** +- Modify: `pyproject.toml:39-43` + +**Step 1: Update dev dependencies** + +Replace the existing `[tool.hatch.envs.default] dependencies` block with: + +```toml +[tool.hatch.envs.default] +dependencies = [ + "pytest", + "pytest-cov", + "testcontainers>=4.9.1", + "docker>=7.1.0", + "ruff", +] +``` + +Also add an `[project.optional-dependencies]` section after `[project.urls]` for pip-based installs (used by CI): + +```toml +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-cov", + "testcontainers>=4.9.1", + "docker>=7.1.0", + "ruff", +] +``` + +**Step 2: Verify pyproject.toml is valid** + +Run: `python -c "import tomllib; tomllib.load(open('pyproject.toml', 'rb'))"` +Expected: No output (success) + +**Step 3: Commit** + +```bash +git add pyproject.toml +git commit -m "build: add testcontainers, docker, and ruff to dev dependencies" +``` + +--- + +### Task 2: Create Dependabot configuration + +**Files:** +- Create: `.github/dependabot.yml` + +**Step 1: Create the file** + +```yaml +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" +``` + +**Step 2: Validate YAML syntax** + +Run: `python -c "import yaml; yaml.safe_load(open('.github/dependabot.yml'))" 2>/dev/null || python -c "print('yaml module not available, skipping validation')"` + +**Step 3: Commit** + +```bash +git add .github/dependabot.yml +git commit -m "ci: add Dependabot configuration for pip and GitHub Actions" +``` + +--- + +### Task 3: Create CI workflow + +**Files:** +- Create: `.github/workflows/ci.yml` + +**Step 1: Create the workflow file** + +```yaml +name: CI + +on: + pull_request: + branches: [main] + push: + branches: [main] + workflow_dispatch: + +jobs: + lint: + runs-on: 
ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - run: pip install ruff + - run: ruff check . + - run: ruff format --check . + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - run: pip install -e ".[dev]" + - run: pytest tests/ --cov=haystack_integrations --cov-report=term-missing -v +``` + +**Step 2: Validate YAML syntax** + +Run: `python -c "import yaml; yaml.safe_load(open('.github/workflows/ci.yml'))" 2>/dev/null || python -c "print('yaml module not available, skipping validation')"` + +**Step 3: Commit** + +```bash +git add .github/workflows/ci.yml +git commit -m "ci: add CI workflow with lint and test jobs" +``` + +--- + +### Task 4: Add testcontainers fixture to conftest.py + +**Files:** +- Create: `tests/conftest.py` + +**Step 1: Create the shared fixture** + +This fixture starts ArcadeDB once per test session, waits for readiness, and provides the base URL to all tests. 
+ +```python +# SPDX-FileCopyrightText: 2026-present ArcadeData Ltd +# SPDX-License-Identifier: Apache-2.0 + +import time + +import pytest +import requests +from testcontainers.core.container import DockerContainer + +ARCADEDB_IMAGE = "arcadedata/arcadedb:26.3.1" + + +def _wait_for_ready(container, timeout=60): + """Wait for ArcadeDB HTTP API to become ready.""" + host = container.get_container_host_ip() + port = container.get_exposed_port(2480) + url = f"http://{host}:{port}/api/v1/ready" + + start = time.time() + while time.time() - start < timeout: + try: + resp = requests.get(url, timeout=2) + if resp.status_code == 204: + return + except (requests.ConnectionError, requests.Timeout): + pass + time.sleep(1) + raise TimeoutError(f"ArcadeDB not ready at {url} after {timeout}s") + + +@pytest.fixture(scope="session") +def arcadedb_url(): + """Start an ArcadeDB container and yield its HTTP base URL.""" + container = ( + DockerContainer(ARCADEDB_IMAGE) + .with_exposed_ports(2480) + .with_env( + "JAVA_OPTS", + "-Darcadedb.server.rootPassword=arcadedb", + ) + ) + container.start() + _wait_for_ready(container) + + host = container.get_container_host_ip() + port = container.get_exposed_port(2480) + yield f"http://{host}:{port}" + + container.stop() +``` + +**Step 2: Verify the file is syntactically valid** + +Run: `python -c "import ast; ast.parse(open('tests/conftest.py').read()); print('OK')"` +Expected: `OK` + +**Step 3: Commit** + +```bash +git add tests/conftest.py +git commit -m "test: add testcontainers fixture for ArcadeDB" +``` + +--- + +### Task 5: Refactor test_document_store.py to use testcontainers + +**Files:** +- Modify: `tests/test_document_store.py` + +**Step 1: Rewrite the test file** + +Key changes: +- Remove `import os` and `import unittest` +- Add `import pytest` +- Replace `_store()` helper to accept `arcadedb_url` fixture instead of env var +- Convert from `unittest.TestCase` class to plain pytest functions +- Each test receives `arcadedb_url` 
fixture and creates a fresh store with `recreate_type=True` + +Replace the entire file with: + +```python +# SPDX-FileCopyrightText: 2026-present ArcadeData Ltd +# SPDX-License-Identifier: Apache-2.0 + +"""Integration tests for ArcadeDBDocumentStore (using testcontainers).""" + +import pytest +from haystack import Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + + +def _store(arcadedb_url, **kwargs): + return ArcadeDBDocumentStore( + url=arcadedb_url, + database="haystack_test", + username=kwargs.pop("username", None) + or ArcadeDBDocumentStore.__init__.__kwdefaults__["username"], + password=kwargs.pop("password", None) + or ArcadeDBDocumentStore.__init__.__kwdefaults__["password"], + recreate_type=True, + **kwargs, + ) + + +def _sample_docs(n=3, dim=4): + return [ + Document( + content=f"Document number {i}", + embedding=[float(i)] * dim, + meta={"category": "test", "priority": i}, + ) + for i in range(n) + ] + + +# ---- count ---- + + +def test_count_empty(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + assert store.count_documents() == 0 + + +def test_count_after_write(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert store.count_documents() == 5 + + +# ---- write ---- + + +def test_write_and_read(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(2) + written = store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert written == 2 + + all_docs = store.filter_documents() + assert len(all_docs) == 2 + + +def test_write_overwrite(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + docs[0].content 
= "Updated content" + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + all_docs = store.filter_documents() + assert len(all_docs) == 1 + assert all_docs[0].content == "Updated content" + + +def test_write_skip(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + written = store.write_documents(docs, policy=DuplicatePolicy.SKIP) + assert written == 0 + assert store.count_documents() == 1 + + +def test_write_duplicate_raises(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + with pytest.raises(DuplicateDocumentError): + store.write_documents(docs, policy=DuplicatePolicy.NONE) + + +# ---- delete ---- + + +def test_delete(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(3) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + store.delete_documents([docs[0].id, docs[1].id]) + assert store.count_documents() == 1 + + +# ---- filter ---- + + +def test_filter_equality(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(3) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = store.filter_documents( + filters={"field": "meta.category", "operator": "==", "value": "test"} + ) + assert len(result) == 3 + + +def test_filter_comparison(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = store.filter_documents( + filters={"field": "meta.priority", "operator": ">", "value": 2} + ) + assert len(result) == 2 + + +def test_filter_and(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = 
store.filter_documents( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "test"}, + {"field": "meta.priority", "operator": ">=", "value": 3}, + ], + } + ) + assert len(result) == 2 + + +# ---- embedding retrieval ---- + + +def test_embedding_retrieval(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5, dim=4) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + results = store._embedding_retrieval( + query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3 + ) + assert len(results) <= 3 + assert results[0].score is not None + + +# ---- serialization ---- + + +def test_to_dict_from_dict(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + data = store.to_dict() + restored = ArcadeDBDocumentStore.from_dict(data) + assert restored._database == store._database + assert restored._embedding_dimension == store._embedding_dimension +``` + +**Step 2: Verify syntax** + +Run: `python -c "import ast; ast.parse(open('tests/test_document_store.py').read()); print('OK')"` +Expected: `OK` + +**Step 3: Run the unit tests (filters) to confirm no breakage** + +Run: `pytest tests/test_filters.py -v` +Expected: All tests pass + +**Step 4: Commit** + +```bash +git add tests/test_document_store.py +git commit -m "test: refactor integration tests to use testcontainers fixture" +``` + +--- + +### Task 6: Run full test suite locally (if Docker available) + +**Step 1: Run all tests** + +Run: `pytest tests/ -v --tb=short` +Expected: All tests pass (filter unit tests + document store integration tests) + +**Step 2: Run linting** + +Run: `ruff check . && ruff format --check .` +Expected: No errors + +**Step 3: Fix any lint issues if found** + +Run: `ruff format .` (if formatting issues) then re-run checks. 
+ +**Step 4: Commit any fixes** + +```bash +git add -A +git commit -m "style: fix lint issues" +``` From d0372b784663b82f3429d6f8a468affc560c82c1 Mon Sep 17 00:00:00 2001 From: robfrank Date: Wed, 18 Mar 2026 13:42:22 +0100 Subject: [PATCH 8/8] ci: add pip caching, use project ruff version, use HttpWaitStrategy Address code review feedback: - Add pip cache to setup-python steps - Install ruff from project deps for version consistency - Add step names for clarity - Replace custom _wait_for_ready with testcontainers HttpWaitStrategy Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/ci.yml | 25 ++++++++++++++++--------- tests/conftest.py | 28 ++++++---------------------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bafe7ef..e4c6e02 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,20 +11,27 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: "3.13" - - run: pip install ruff - - run: ruff check . - - run: ruff format --check . + cache: "pip" + - name: Install dependencies + run: pip install -e ".[dev]" + - name: Check linting + run: ruff check . + - name: Check formatting + run: ruff format --check . 
test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: "3.13" - - run: pip install -e ".[dev]" - - run: pytest tests/ --cov=haystack_integrations --cov-report=term-missing -v + cache: "pip" + - name: Install dependencies + run: pip install -e ".[dev]" + - name: Run tests + run: pytest tests/ --cov=haystack_integrations --cov-report=term-missing -v diff --git a/tests/conftest.py b/tests/conftest.py index a829754..b8a083c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,33 +1,13 @@ # SPDX-FileCopyrightText: 2026-present ArcadeData Ltd # SPDX-License-Identifier: Apache-2.0 -import time - import pytest -import requests from testcontainers.core.container import DockerContainer +from testcontainers.core.wait_strategies import HttpWaitStrategy ARCADEDB_IMAGE = "arcadedata/arcadedb:26.3.1" -def _wait_for_ready(container, timeout=60): - """Wait for ArcadeDB HTTP API to become ready.""" - host = container.get_container_host_ip() - port = container.get_exposed_port(2480) - url = f"http://{host}:{port}/api/v1/ready" - - start = time.time() - while time.time() - start < timeout: - try: - resp = requests.get(url, timeout=2) - if resp.status_code == 204: - return - except (requests.ConnectionError, requests.Timeout): - pass - time.sleep(1) - raise TimeoutError(f"ArcadeDB not ready at {url} after {timeout}s") - - @pytest.fixture(scope="session") def arcadedb_url(): """Start an ArcadeDB container and yield its HTTP base URL.""" @@ -38,9 +18,13 @@ def arcadedb_url(): "JAVA_OPTS", "-Darcadedb.server.rootPassword=arcadedb", ) + .waiting_for( + HttpWaitStrategy(2480, "/api/v1/ready") + .for_status_code(204) + .with_startup_timeout(60) + ) ) container.start() - _wait_for_ready(container) host = container.get_container_host_ip() port = 
container.get_exposed_port(2480)