diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..4776211 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..e4c6e02 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,41 @@ +name: CI + +on: + pull_request: + branches: [main] + push: + branches: [main] + workflow_dispatch: + +# Least privilege: both jobs only need to read the repository contents. +permissions: + contents: read + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: "3.13" + cache: "pip" + - name: Install dependencies + run: pip install -e ".[dev]" + - name: Check linting + run: ruff check . + - name: Check formatting + run: ruff format --check . + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: "3.13" + cache: "pip" + - name: Install dependencies + run: pip install -e ".[dev]" + - name: Run tests + run: pytest tests/ --cov=haystack_integrations --cov-report=term-missing -v diff --git a/docs/plans/2026-03-18-project-infrastructure-design.md b/docs/plans/2026-03-18-project-infrastructure-design.md new file mode 100644 index 0000000..7cd975e --- /dev/null +++ b/docs/plans/2026-03-18-project-infrastructure-design.md @@ -0,0 +1,89 @@ +# Project Infrastructure Design + +## Overview + +Set up CI, Dependabot, and foundational project infrastructure for the `arcadedb-haystack` project hosted on GitHub under the `ArcadeData` org.
+ +## Dependabot + +**File:** `.github/dependabot.yml` + +Two ecosystems, both on a weekly schedule targeting `main`: + +- **pip** — monitors `pyproject.toml` for dependency updates +- **github-actions** — monitors workflow action versions (e.g., `actions/checkout`, `actions/setup-python`) + +No auto-merge, no grouping, no reviewers. The ArcadeDB Docker image version is updated manually (not tracked by Dependabot). + +## CI Workflow + +**File:** `.github/workflows/ci.yml` + +**Triggers:** pull request to main, push to main, `workflow_dispatch` + +### Job 1: `lint` + +- Runs on `ubuntu-latest`, Python 3.13 +- Installs Ruff +- Runs `ruff check .` and `ruff format --check .` + +### Job 2: `test` + +- Runs on `ubuntu-latest`, Python 3.13 +- Installs project with dev dependencies (`pip install -e .` + dev deps) +- Tests use `testcontainers` to spin up `arcadedata/arcadedb:26.3.1` with a readiness check on the HTTP API (`/api/v1/ready` on port 2480) +- Runs `pytest tests/` with coverage +- Docker is pre-installed on GitHub Actions runners + +## File Changes + +### New files + +| File | Purpose | +|------|---------| +| `.github/dependabot.yml` | Dependabot configuration | +| `.github/workflows/ci.yml` | CI workflow with lint + test jobs | + +### Modified files + +| File | Change | +|------|--------| +| `pyproject.toml` | Add `testcontainers`, `docker`, and `ruff` to dev dependencies | +| `tests/test_document_store.py` | Refactor to use a `testcontainers` fixture (module-scoped) instead of requiring a pre-running ArcadeDB instance | + +### Unchanged files + +| File | Reason | +|------|--------| +| `tests/test_filters.py` | Pure unit tests, no ArcadeDB dependency | + +## Testcontainers Fixture Pattern + +Following the pattern from `e2e-python/tests/test_arcadedb.py`: + +```python +from testcontainers.core.container import DockerContainer + +ARCADEDB_IMAGE = "arcadedata/arcadedb:26.3.1" + +arcadedb = ( + DockerContainer(ARCADEDB_IMAGE) + .with_exposed_ports(2480) + 
.with_env("JAVA_OPTS", "-Darcadedb.server.rootPassword=arcadedb") +) + +@pytest.fixture(scope="module", autouse=True) +def arcadedb_container(): + arcadedb.start() + wait_for_http_endpoint(arcadedb, "/api/v1/ready", 2480, 204, timeout=30) + yield arcadedb + arcadedb.stop() +``` + +The `_store()` helper will resolve the URL from the container's mapped host/port. + +## Out of Scope + +- Release/publish workflow to PyPI (manual for now) +- Python version matrix (3.13 only) +- Docker image version tracking via Dependabot (manual updates) diff --git a/docs/plans/2026-03-18-project-infrastructure-plan.md b/docs/plans/2026-03-18-project-infrastructure-plan.md new file mode 100644 index 0000000..e309595 --- /dev/null +++ b/docs/plans/2026-03-18-project-infrastructure-plan.md @@ -0,0 +1,456 @@ +# Project Infrastructure Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Add Dependabot, CI workflow (lint + integration tests), and testcontainers-based test infrastructure. + +**Architecture:** Dependabot watches pip and GitHub Actions dependencies. CI runs two parallel jobs: fast Ruff lint and integration tests using testcontainers to spin up ArcadeDB 26.3.1. Tests are refactored from requiring a pre-running instance to self-contained container lifecycle. 
+ +**Tech Stack:** GitHub Actions, Dependabot, testcontainers-python, Ruff, pytest, Docker + +--- + +### Task 1: Add dev dependencies to pyproject.toml + +**Files:** +- Modify: `pyproject.toml:39-43` + +**Step 1: Update dev dependencies** + +Replace the existing `[tool.hatch.envs.default] dependencies` block with: + +```toml +[tool.hatch.envs.default] +dependencies = [ + "pytest", + "pytest-cov", + "testcontainers>=4.9.1", + "docker>=7.1.0", + "ruff", +] +``` + +Also add an `[project.optional-dependencies]` section after `[project.urls]` for pip-based installs (used by CI): + +```toml +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-cov", + "testcontainers>=4.9.1", + "docker>=7.1.0", + "ruff", +] +``` + +**Step 2: Verify pyproject.toml is valid** + +Run: `python -c "import tomllib; tomllib.load(open('pyproject.toml', 'rb'))"` +Expected: No output (success) + +**Step 3: Commit** + +```bash +git add pyproject.toml +git commit -m "build: add testcontainers, docker, and ruff to dev dependencies" +``` + +--- + +### Task 2: Create Dependabot configuration + +**Files:** +- Create: `.github/dependabot.yml` + +**Step 1: Create the file** + +```yaml +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" +``` + +**Step 2: Validate YAML syntax** + +Run: `python -c "import yaml; yaml.safe_load(open('.github/dependabot.yml'))" 2>/dev/null || python -c "print('yaml module not available, skipping validation')"` + +**Step 3: Commit** + +```bash +git add .github/dependabot.yml +git commit -m "ci: add Dependabot configuration for pip and GitHub Actions" +``` + +--- + +### Task 3: Create CI workflow + +**Files:** +- Create: `.github/workflows/ci.yml` + +**Step 1: Create the workflow file** + +```yaml +name: CI + +on: + pull_request: + branches: [main] + push: + branches: [main] + workflow_dispatch: + +jobs: + lint: + runs-on: 
ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - run: pip install ruff + - run: ruff check . + - run: ruff format --check . + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - run: pip install -e ".[dev]" + - run: pytest tests/ --cov=haystack_integrations --cov-report=term-missing -v +``` + +**Step 2: Validate YAML syntax** + +Run: `python -c "import yaml; yaml.safe_load(open('.github/workflows/ci.yml'))" 2>/dev/null || python -c "print('yaml module not available, skipping validation')"` + +**Step 3: Commit** + +```bash +git add .github/workflows/ci.yml +git commit -m "ci: add CI workflow with lint and test jobs" +``` + +--- + +### Task 4: Add testcontainers fixture to conftest.py + +**Files:** +- Create: `tests/conftest.py` + +**Step 1: Create the shared fixture** + +This fixture starts ArcadeDB once per test session, waits for readiness, and provides the base URL to all tests. 
+ +```python +# SPDX-FileCopyrightText: 2026-present ArcadeData Ltd +# SPDX-License-Identifier: Apache-2.0 + +import time + +import pytest +import requests +from testcontainers.core.container import DockerContainer + +ARCADEDB_IMAGE = "arcadedata/arcadedb:26.3.1" + + +def _wait_for_ready(container, timeout=60): + """Wait for ArcadeDB HTTP API to become ready.""" + host = container.get_container_host_ip() + port = container.get_exposed_port(2480) + url = f"http://{host}:{port}/api/v1/ready" + + start = time.time() + while time.time() - start < timeout: + try: + resp = requests.get(url, timeout=2) + if resp.status_code == 204: + return + except (requests.ConnectionError, requests.Timeout): + pass + time.sleep(1) + raise TimeoutError(f"ArcadeDB not ready at {url} after {timeout}s") + + +@pytest.fixture(scope="session") +def arcadedb_url(): + """Start an ArcadeDB container and yield its HTTP base URL.""" + container = ( + DockerContainer(ARCADEDB_IMAGE) + .with_exposed_ports(2480) + .with_env( + "JAVA_OPTS", + "-Darcadedb.server.rootPassword=arcadedb", + ) + ) + container.start() + _wait_for_ready(container) + + host = container.get_container_host_ip() + port = container.get_exposed_port(2480) + yield f"http://{host}:{port}" + + container.stop() +``` + +**Step 2: Verify the file is syntactically valid** + +Run: `python -c "import ast; ast.parse(open('tests/conftest.py').read()); print('OK')"` +Expected: `OK` + +**Step 3: Commit** + +```bash +git add tests/conftest.py +git commit -m "test: add testcontainers fixture for ArcadeDB" +``` + +--- + +### Task 5: Refactor test_document_store.py to use testcontainers + +**Files:** +- Modify: `tests/test_document_store.py` + +**Step 1: Rewrite the test file** + +Key changes: +- Remove `import os` and `import unittest` +- Add `import pytest` +- Replace `_store()` helper to accept `arcadedb_url` fixture instead of env var +- Convert from `unittest.TestCase` class to plain pytest functions +- Each test receives `arcadedb_url` 
fixture and creates a fresh store with `recreate_type=True` + +Replace the entire file with: + +```python +# SPDX-FileCopyrightText: 2026-present ArcadeData Ltd +# SPDX-License-Identifier: Apache-2.0 + +"""Integration tests for ArcadeDBDocumentStore (using testcontainers).""" + +import pytest +from haystack import Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + + +def _store(arcadedb_url, **kwargs): + return ArcadeDBDocumentStore( + url=arcadedb_url, + database="haystack_test", + username=kwargs.pop("username", None) + or ArcadeDBDocumentStore.__init__.__kwdefaults__["username"], + password=kwargs.pop("password", None) + or ArcadeDBDocumentStore.__init__.__kwdefaults__["password"], + recreate_type=True, + **kwargs, + ) + + +def _sample_docs(n=3, dim=4): + return [ + Document( + content=f"Document number {i}", + embedding=[float(i)] * dim, + meta={"category": "test", "priority": i}, + ) + for i in range(n) + ] + + +# ---- count ---- + + +def test_count_empty(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + assert store.count_documents() == 0 + + +def test_count_after_write(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert store.count_documents() == 5 + + +# ---- write ---- + + +def test_write_and_read(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(2) + written = store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert written == 2 + + all_docs = store.filter_documents() + assert len(all_docs) == 2 + + +def test_write_overwrite(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + docs[0].content 
= "Updated content" + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + all_docs = store.filter_documents() + assert len(all_docs) == 1 + assert all_docs[0].content == "Updated content" + + +def test_write_skip(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + written = store.write_documents(docs, policy=DuplicatePolicy.SKIP) + assert written == 0 + assert store.count_documents() == 1 + + +def test_write_duplicate_raises(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + with pytest.raises(DuplicateDocumentError): + store.write_documents(docs, policy=DuplicatePolicy.NONE) + + +# ---- delete ---- + + +def test_delete(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(3) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + store.delete_documents([docs[0].id, docs[1].id]) + assert store.count_documents() == 1 + + +# ---- filter ---- + + +def test_filter_equality(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(3) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = store.filter_documents( + filters={"field": "meta.category", "operator": "==", "value": "test"} + ) + assert len(result) == 3 + + +def test_filter_comparison(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = store.filter_documents( + filters={"field": "meta.priority", "operator": ">", "value": 2} + ) + assert len(result) == 2 + + +def test_filter_and(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = 
store.filter_documents( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "test"}, + {"field": "meta.priority", "operator": ">=", "value": 3}, + ], + } + ) + assert len(result) == 2 + + +# ---- embedding retrieval ---- + + +def test_embedding_retrieval(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5, dim=4) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + results = store._embedding_retrieval( + query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3 + ) + assert len(results) <= 3 + assert results[0].score is not None + + +# ---- serialization ---- + + +def test_to_dict_from_dict(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + data = store.to_dict() + restored = ArcadeDBDocumentStore.from_dict(data) + assert restored._database == store._database + assert restored._embedding_dimension == store._embedding_dimension +``` + +**Step 2: Verify syntax** + +Run: `python -c "import ast; ast.parse(open('tests/test_document_store.py').read()); print('OK')"` +Expected: `OK` + +**Step 3: Run the unit tests (filters) to confirm no breakage** + +Run: `pytest tests/test_filters.py -v` +Expected: All tests pass + +**Step 4: Commit** + +```bash +git add tests/test_document_store.py +git commit -m "test: refactor integration tests to use testcontainers fixture" +``` + +--- + +### Task 6: Run full test suite locally (if Docker available) + +**Step 1: Run all tests** + +Run: `pytest tests/ -v --tb=short` +Expected: All tests pass (filter unit tests + document store integration tests) + +**Step 2: Run linting** + +Run: `ruff check . && ruff format --check .` +Expected: No errors + +**Step 3: Fix any lint issues if found** + +Run: `ruff format .` (if formatting issues) then re-run checks. 
+ +**Step 4: Commit any fixes** + +```bash +git add -A +git commit -m "style: fix lint issues" +``` diff --git a/examples/embedding_retrieval.py b/examples/embedding_retrieval.py index e849571..d9875fc 100644 --- a/examples/embedding_retrieval.py +++ b/examples/embedding_retrieval.py @@ -21,7 +21,9 @@ from haystack import Document, Pipeline from haystack.document_stores.types import DuplicatePolicy -from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever +from haystack_integrations.components.retrievers.arcadedb import ( + ArcadeDBEmbeddingRetriever, +) from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore # --- 1. Create the document store --- @@ -36,7 +38,10 @@ # --- 2. Write some documents --- documents = [ Document( - content="ArcadeDB is a multi-model database supporting graphs, documents, key-value, time-series, and vectors.", + content=( + "ArcadeDB is a multi-model database supporting graphs," + " documents, key-value, time-series, and vectors." + ), embedding=[1.0, 0.0, 0.0, 0.0], meta={"category": "database", "source": "docs"}, ), @@ -46,12 +51,17 @@ meta={"category": "framework", "source": "docs"}, ), Document( - content="HNSW (Hierarchical Navigable Small World) enables fast approximate nearest neighbor search.", + content=( + "HNSW (Hierarchical Navigable Small World) enables" + " fast approximate nearest neighbor search." + ), embedding=[0.5, 0.5, 0.0, 0.0], meta={"category": "algorithm", "source": "paper"}, ), Document( - content="Vector databases store high-dimensional embeddings for semantic search.", + content=( + "Vector databases store high-dimensional embeddings for semantic search." + ), embedding=[0.8, 0.2, 0.0, 0.0], meta={"category": "database", "source": "blog"}, ), @@ -63,7 +73,9 @@ # --- 3. 
Build a retrieval pipeline --- pipeline = Pipeline() -pipeline.add_component("retriever", ArcadeDBEmbeddingRetriever(document_store=store, top_k=3)) +pipeline.add_component( + "retriever", ArcadeDBEmbeddingRetriever(document_store=store, top_k=3) +) # --- 4. Run a similarity search --- query_embedding = [0.9, 0.1, 0.0, 0.0] # close to "ArcadeDB" and "Vector databases" @@ -71,15 +83,25 @@ print("\n--- Top 3 results ---") for doc in result["retriever"]["documents"]: - print(f" score={doc.score:.4f} category={doc.meta.get('category')} content={doc.content[:80]}...") + print( + f" score={doc.score:.4f}" + f" category={doc.meta.get('category')}" + f" content={doc.content[:80]}..." + ) # --- 5. Filter retrieval (only 'database' category) --- -result_filtered = pipeline.run({ - "retriever": { - "query_embedding": query_embedding, - "filters": {"field": "meta.category", "operator": "==", "value": "database"}, +result_filtered = pipeline.run( + { + "retriever": { + "query_embedding": query_embedding, + "filters": { + "field": "meta.category", + "operator": "==", + "value": "database", + }, + } } -}) +) print("\n--- Filtered (category=database) ---") for doc in result_filtered["retriever"]["documents"]: diff --git a/pyproject.toml b/pyproject.toml index 6cc2aa7..b38207c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,15 @@ Homepage = "https://arcadedb.com" Repository = "https://github.com/ArcadeData/arcadedb-haystack" Documentation = "https://docs.arcadedb.com" +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-cov", + "testcontainers>=4.9.1", + "docker>=7.1.0", + "ruff", +] + [tool.hatch.build.targets.wheel] packages = ["src/haystack_integrations"] @@ -40,6 +49,9 @@ packages = ["src/haystack_integrations"] dependencies = [ "pytest", "pytest-cov", + "testcontainers>=4.9.1", + "docker>=7.1.0", + "ruff", ] [tool.hatch.envs.default.scripts] diff --git a/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py 
b/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py index 0dc5b15..369a932 100644 --- a/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py +++ b/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py @@ -19,8 +19,12 @@ class ArcadeDBEmbeddingRetriever: Usage in a Haystack pipeline: ```python - from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever - from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + from haystack_integrations.components.retrievers.arcadedb import ( + ArcadeDBEmbeddingRetriever, + ) + from haystack_integrations.document_stores.arcadedb import ( + ArcadeDBDocumentStore, + ) store = ArcadeDBDocumentStore(database="mydb") retriever = ArcadeDBEmbeddingRetriever(document_store=store, top_k=5) @@ -51,7 +55,8 @@ def run( Retrieve documents by vector similarity. :param query_embedding: The embedding vector to search with. - :param filters: Optional filters to narrow results (overrides or merges with init filters). + :param filters: Optional filters to narrow results + (overrides or merges with init filters). :param top_k: Maximum number of documents to return. :returns: A dict with key ``"documents"`` containing the retrieved documents. 
""" @@ -60,7 +65,11 @@ def run( # Resolve filter policy if self._filter_policy == FilterPolicy.REPLACE and filters is not None: effective_filters = filters - elif self._filter_policy == FilterPolicy.MERGE and filters is not None and self._filters is not None: + elif ( + self._filter_policy == FilterPolicy.MERGE + and filters is not None + and self._filters is not None + ): effective_filters = { "operator": "AND", "conditions": [self._filters, filters], diff --git a/src/haystack_integrations/document_stores/arcadedb/document_store.py b/src/haystack_integrations/document_stores/arcadedb/document_store.py index ee30907..f727a17 100644 --- a/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -1,7 +1,10 @@ # SPDX-FileCopyrightText: 2026-present ArcadeData Ltd # SPDX-License-Identifier: Apache-2.0 -"""ArcadeDB DocumentStore for Haystack 2.x — document storage + HNSW vector search via HTTP/JSON API.""" +"""ArcadeDB DocumentStore for Haystack 2.x. + +Document storage + HNSW vector search via HTTP/JSON API. 
+""" import logging from typing import Any, ClassVar @@ -101,7 +104,9 @@ def _auth(self) -> tuple[str, str] | None: return (user, pwd) return None - def _command(self, sql: str, *, positional_params: list[Any] | None = None) -> list[dict[str, Any]]: + def _command( + self, sql: str, *, positional_params: list[Any] | None = None + ) -> list[dict[str, Any]]: """Execute an SQL command via the ArcadeDB HTTP API and return result rows.""" url = f"{self._url}/api/v1/command/{self._database}" payload: dict[str, Any] = {"language": "sql", "command": sql} @@ -136,7 +141,9 @@ def _ensure_initialized(self) -> None: logger.info("Created database '%s'", self._database) except RuntimeError: # Database likely already exists - logger.debug("Database '%s' already exists or cannot be created", self._database) + logger.debug( + "Database '%s' already exists or cannot be created", self._database + ) # 2. Optionally drop existing type if self._recreate_type: @@ -147,24 +154,19 @@ def _ensure_initialized(self) -> None: # 3. Create vertex type + properties self._command(f"CREATE VERTEX TYPE `{self._type_name}` IF NOT EXISTS") - self._command( - f"CREATE PROPERTY `{self._type_name}`.id IF NOT EXISTS STRING" - ) + self._command(f"CREATE PROPERTY `{self._type_name}`.id IF NOT EXISTS STRING") self._command( f"CREATE PROPERTY `{self._type_name}`.content IF NOT EXISTS STRING" ) self._command( - f"CREATE PROPERTY `{self._type_name}`.embedding IF NOT EXISTS ARRAY_OF_FLOATS" - ) - self._command( - f"CREATE PROPERTY `{self._type_name}`.meta IF NOT EXISTS MAP" + f"CREATE PROPERTY `{self._type_name}`" + ".embedding IF NOT EXISTS ARRAY_OF_FLOATS" ) + self._command(f"CREATE PROPERTY `{self._type_name}`.meta IF NOT EXISTS MAP") # 4. 
Unique index on id try: - self._command( - f"CREATE INDEX ON `{self._type_name}` (id) UNIQUE" - ) + self._command(f"CREATE INDEX ON `{self._type_name}` (id) UNIQUE") except RuntimeError: logger.debug("Unique index on id already exists") @@ -172,16 +174,23 @@ def _ensure_initialized(self) -> None: metric = self._SIMILARITY_MAP.get(self._similarity_function, "COSINE") try: self._command( - f"CREATE INDEX IF NOT EXISTS ON `{self._type_name}` (embedding) LSM_VECTOR " - f"METADATA {{ dimensions: {self._embedding_dimension}, similarity: '{metric}' }}" + f"CREATE INDEX IF NOT EXISTS ON " + f"`{self._type_name}` (embedding) LSM_VECTOR " + f"METADATA {{ dimensions: " + f"{self._embedding_dimension}, " + f"similarity: '{metric}' }}" ) except RuntimeError: logger.debug("Vector index on embedding already exists") self._initialized = True logger.info( - "ArcadeDBDocumentStore initialized: database=%s, type=%s, dim=%d, metric=%s", - self._database, self._type_name, self._embedding_dimension, metric, + "ArcadeDBDocumentStore initialized:" + " database=%s, type=%s, dim=%d, metric=%s", + self._database, + self._type_name, + self._embedding_dimension, + metric, ) # ------------------------------------------------------------------ @@ -247,7 +256,8 @@ def write_documents( elif policy == DuplicatePolicy.SKIP: existing = self._command( - f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}" + f"SELECT id FROM `{self._type_name}`" + f" WHERE id = {_sql_str(record['id'])}" ) if existing: continue @@ -257,7 +267,8 @@ def write_documents( else: # DuplicatePolicy.NONE — raise on duplicate existing = self._command( - f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}" + f"SELECT id FROM `{self._type_name}`" + f" WHERE id = {_sql_str(record['id'])}" ) if existing: msg = f"Document with id '{record['id']}' already exists." 
@@ -267,7 +278,9 @@ def write_documents( return written - def _insert_record(self, record: dict[str, Any], embedding_str: str, meta_str: str) -> None: + def _insert_record( + self, record: dict[str, Any], embedding_str: str, meta_str: str + ) -> None: sql = ( f"INSERT INTO `{self._type_name}` SET " f"id = {_sql_str(record['id'])}, " @@ -304,7 +317,8 @@ def _embedding_retrieval( self._ensure_initialized() embedding_str = str(query_embedding) - # vectorNeighbors returns a single row with a "neighbors" list of {record, distance} + # vectorNeighbors returns a single row with + # a "neighbors" list of {record, distance} sql = ( f"SELECT vectorNeighbors('{self._type_name}[embedding]', " f"{embedding_str}, {top_k}) AS neighbors" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..b8a083c --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: 2026-present ArcadeData Ltd +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from testcontainers.core.container import DockerContainer +from testcontainers.core.wait_strategies import HttpWaitStrategy + +ARCADEDB_IMAGE = "arcadedata/arcadedb:26.3.1" + + +@pytest.fixture(scope="session") +def arcadedb_url(): + """Start an ArcadeDB container and yield its HTTP base URL. + + The container is shared by the whole test session and is always stopped + on teardown, including when setup fails after the container has started. + """ + container = ( + DockerContainer(ARCADEDB_IMAGE) + .with_exposed_ports(2480) + .with_env( + "JAVA_OPTS", + "-Darcadedb.server.rootPassword=arcadedb", + ) + .waiting_for( + HttpWaitStrategy(2480, "/api/v1/ready") + .for_status_code(204) + .with_startup_timeout(60) + ) + ) + container.start() + try: + host = container.get_container_host_ip() + port = container.get_exposed_port(2480) + yield f"http://{host}:{port}" + finally: + container.stop() diff --git a/tests/test_document_store.py b/tests/test_document_store.py index b290e77..c8dd886 100644 --- a/tests/test_document_store.py +++ b/tests/test_document_store.py @@ -1,174 +1,181 @@ # SPDX-FileCopyrightText: 2026-present ArcadeData Ltd # SPDX-License-Identifier:
Apache-2.0 -""" -Integration tests for ArcadeDBDocumentStore. - -Prerequisites: - docker run -d -p 2480:2480 \ - -e JAVA_OPTS="-Darcadedb.server.rootPassword=arcadedb" \ - arcadedata/arcadedb:latest -""" - -import os -import unittest +"""Integration tests for ArcadeDBDocumentStore (using testcontainers).""" +import pytest from haystack import Document +from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore -def _store(**kwargs) -> ArcadeDBDocumentStore: +def _store(arcadedb_url, **kwargs): return ArcadeDBDocumentStore( - url=os.getenv("ARCADEDB_URL", "http://localhost:2480"), + url=arcadedb_url, database="haystack_test", - username=kwargs.pop("username", None) - or ArcadeDBDocumentStore.__init__.__kwdefaults__["username"], - password=kwargs.pop("password", None) - or ArcadeDBDocumentStore.__init__.__kwdefaults__["password"], + username=Secret.from_token("root"), + password=Secret.from_token("arcadedb"), recreate_type=True, **kwargs, ) -def _sample_docs(n: int = 3, dim: int = 4) -> list[Document]: - docs = [] - for i in range(n): - docs.append( - Document( - content=f"Document number {i}", - embedding=[float(i)] * dim, - meta={"category": "test", "priority": i}, - ) +def _sample_docs(n=3, dim=4): + return [ + Document( + content=f"Document number {i}", + embedding=[float(i)] * dim, + meta={"category": "test", "priority": i}, ) - return docs + for i in range(n) + ] -class TestArcadeDBDocumentStore(unittest.TestCase): - """Integration tests — require a running ArcadeDB instance.""" +# ---- count ---- - def setUp(self): - self.store = _store(embedding_dimension=4) - # ---- count ---- +def test_count_empty(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + assert store.count_documents() == 0 - def test_count_empty(self): - 
self.assertEqual(self.store.count_documents(), 0) - def test_count_after_write(self): - docs = _sample_docs(5) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - self.assertEqual(self.store.count_documents(), 5) +def test_count_after_write(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert store.count_documents() == 5 - # ---- write ---- - def test_write_and_read(self): - docs = _sample_docs(2) - written = self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - self.assertEqual(written, 2) +# ---- write ---- - all_docs = self.store.filter_documents() - self.assertEqual(len(all_docs), 2) - def test_write_overwrite(self): - docs = _sample_docs(1) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +def test_write_and_read(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(2) + written = store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert written == 2 - # Modify content and overwrite - docs[0].content = "Updated content" - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + all_docs = store.filter_documents() + assert len(all_docs) == 2 - all_docs = self.store.filter_documents() - self.assertEqual(len(all_docs), 1) - self.assertEqual(all_docs[0].content, "Updated content") - def test_write_skip(self): - docs = _sample_docs(1) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +def test_write_overwrite(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - # Attempt to write same doc with SKIP policy - written = self.store.write_documents(docs, policy=DuplicatePolicy.SKIP) - self.assertEqual(written, 0) - self.assertEqual(self.store.count_documents(), 1) + docs[0].content = "Updated content" + 
store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - def test_write_duplicate_raises(self): - docs = _sample_docs(1) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + all_docs = store.filter_documents() + assert len(all_docs) == 1 + assert all_docs[0].content == "Updated content" - from haystack.document_stores.errors import DuplicateDocumentError - with self.assertRaises(DuplicateDocumentError): - self.store.write_documents(docs, policy=DuplicatePolicy.NONE) +def test_write_skip(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - # ---- delete ---- + written = store.write_documents(docs, policy=DuplicatePolicy.SKIP) + assert written == 0 + assert store.count_documents() == 1 - def test_delete(self): - docs = _sample_docs(3) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - ids_to_delete = [docs[0].id, docs[1].id] - self.store.delete_documents(ids_to_delete) +def test_write_duplicate_raises(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(1) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - self.assertEqual(self.store.count_documents(), 1) + with pytest.raises(DuplicateDocumentError): + store.write_documents(docs, policy=DuplicatePolicy.NONE) - # ---- filter ---- - def test_filter_equality(self): - docs = _sample_docs(3) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +# ---- delete ---- - result = self.store.filter_documents( - filters={"field": "meta.category", "operator": "==", "value": "test"} - ) - self.assertEqual(len(result), 3) - def test_filter_comparison(self): - docs = _sample_docs(5) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +def test_delete(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(3) + store.write_documents(docs, 
policy=DuplicatePolicy.OVERWRITE) - result = self.store.filter_documents( - filters={"field": "meta.priority", "operator": ">", "value": 2} - ) - self.assertEqual(len(result), 2) # priority 3 and 4 - - def test_filter_and(self): - docs = _sample_docs(5) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - - result = self.store.filter_documents( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "test"}, - {"field": "meta.priority", "operator": ">=", "value": 3}, - ], - } - ) - self.assertEqual(len(result), 2) + store.delete_documents([docs[0].id, docs[1].id]) + assert store.count_documents() == 1 - # ---- embedding retrieval ---- - def test_embedding_retrieval(self): - docs = _sample_docs(5, dim=4) - self.store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +# ---- filter ---- - results = self.store._embedding_retrieval( - query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3 - ) - self.assertLessEqual(len(results), 3) - # The closest document should be the one with embedding [4.0, 4.0, 4.0, 4.0] - self.assertIsNotNone(results[0].score) - # ---- serialization ---- +def test_filter_equality(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(3) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = store.filter_documents( + filters={"field": "meta.category", "operator": "==", "value": "test"} + ) + assert len(result) == 3 + + +def test_filter_comparison(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = store.filter_documents( + filters={"field": "meta.priority", "operator": ">", "value": 2} + ) + assert len(result) == 2 + + +def test_filter_and(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = 
store.filter_documents( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "test"}, + {"field": "meta.priority", "operator": ">=", "value": 3}, + ], + } + ) + assert len(result) == 2 + + +# ---- embedding retrieval ---- + + +def test_embedding_retrieval(arcadedb_url): + store = _store(arcadedb_url, embedding_dimension=4) + docs = _sample_docs(5, dim=4) + store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - def test_to_dict_from_dict(self): - store = _store(embedding_dimension=4) - data = store.to_dict() - restored = ArcadeDBDocumentStore.from_dict(data) - self.assertEqual(restored._database, store._database) - self.assertEqual(restored._embedding_dimension, store._embedding_dimension) + results = store._embedding_retrieval(query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3) + assert len(results) <= 3 + assert results[0].score is not None -if __name__ == "__main__": - unittest.main() +# ---- serialization ---- + + +def test_to_dict_from_dict(arcadedb_url, monkeypatch): + monkeypatch.setenv("ARCADEDB_USERNAME", "root") + monkeypatch.setenv("ARCADEDB_PASSWORD", "arcadedb") + store = ArcadeDBDocumentStore( + url=arcadedb_url, + database="haystack_test", + username=Secret.from_env_var("ARCADEDB_USERNAME"), + password=Secret.from_env_var("ARCADEDB_PASSWORD"), + embedding_dimension=4, + recreate_type=True, + ) + data = store.to_dict() + restored = ArcadeDBDocumentStore.from_dict(data) + assert restored._database == store._database + assert restored._embedding_dimension == store._embedding_dimension diff --git a/tests/test_filters.py b/tests/test_filters.py index 9f460e9..3bcb2d1 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -9,24 +9,31 @@ class TestFilterConversion(unittest.TestCase): - def test_none_returns_empty(self): self.assertEqual(_convert_filters(None), "") def test_equality(self): - result = _convert_filters({"field": "meta.name", "operator": "==", "value": "alice"}) + result = 
_convert_filters( + {"field": "meta.name", "operator": "==", "value": "alice"} + ) self.assertEqual(result, "meta.name = 'alice'") def test_equality_null(self): - result = _convert_filters({"field": "meta.name", "operator": "==", "value": None}) + result = _convert_filters( + {"field": "meta.name", "operator": "==", "value": None} + ) self.assertEqual(result, "meta.name IS NULL") def test_not_equal(self): - result = _convert_filters({"field": "meta.name", "operator": "!=", "value": "bob"}) + result = _convert_filters( + {"field": "meta.name", "operator": "!=", "value": "bob"} + ) self.assertEqual(result, "meta.name <> 'bob'") def test_not_equal_null(self): - result = _convert_filters({"field": "meta.name", "operator": "!=", "value": None}) + result = _convert_filters( + {"field": "meta.name", "operator": "!=", "value": None} + ) self.assertEqual(result, "meta.name IS NOT NULL") def test_greater_than(self): @@ -34,56 +41,68 @@ def test_greater_than(self): self.assertEqual(result, "meta.score > 5") def test_in_operator(self): - result = _convert_filters({"field": "meta.tag", "operator": "in", "value": ["a", "b"]}) + result = _convert_filters( + {"field": "meta.tag", "operator": "in", "value": ["a", "b"]} + ) self.assertEqual(result, "meta.tag IN ['a', 'b']") def test_not_in_operator(self): - result = _convert_filters({"field": "meta.tag", "operator": "not in", "value": ["x"]}) + result = _convert_filters( + {"field": "meta.tag", "operator": "not in", "value": ["x"]} + ) self.assertEqual(result, "meta.tag NOT IN ['x']") def test_and(self): - result = _convert_filters({ - "operator": "AND", - "conditions": [ - {"field": "meta.a", "operator": "==", "value": 1}, - {"field": "meta.b", "operator": ">", "value": 2}, - ], - }) + result = _convert_filters( + { + "operator": "AND", + "conditions": [ + {"field": "meta.a", "operator": "==", "value": 1}, + {"field": "meta.b", "operator": ">", "value": 2}, + ], + } + ) self.assertEqual(result, "(meta.a = 1 AND meta.b > 2)") def 
test_or(self): - result = _convert_filters({ - "operator": "OR", - "conditions": [ - {"field": "meta.x", "operator": "==", "value": "yes"}, - {"field": "meta.y", "operator": "==", "value": "no"}, - ], - }) + result = _convert_filters( + { + "operator": "OR", + "conditions": [ + {"field": "meta.x", "operator": "==", "value": "yes"}, + {"field": "meta.y", "operator": "==", "value": "no"}, + ], + } + ) self.assertEqual(result, "(meta.x = 'yes' OR meta.y = 'no')") def test_not(self): - result = _convert_filters({ - "operator": "NOT", - "conditions": [ - {"field": "meta.deleted", "operator": "==", "value": True}, - ], - }) + result = _convert_filters( + { + "operator": "NOT", + "conditions": [ + {"field": "meta.deleted", "operator": "==", "value": True}, + ], + } + ) self.assertEqual(result, "NOT (meta.deleted = true)") def test_nested(self): - result = _convert_filters({ - "operator": "AND", - "conditions": [ - {"field": "meta.a", "operator": "==", "value": 1}, - { - "operator": "OR", - "conditions": [ - {"field": "meta.b", "operator": "==", "value": 2}, - {"field": "meta.c", "operator": "==", "value": 3}, - ], - }, - ], - }) + result = _convert_filters( + { + "operator": "AND", + "conditions": [ + {"field": "meta.a", "operator": "==", "value": 1}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.b", "operator": "==", "value": 2}, + {"field": "meta.c", "operator": "==", "value": 3}, + ], + }, + ], + } + ) self.assertEqual(result, "(meta.a = 1 AND (meta.b = 2 OR meta.c = 3))") def test_missing_operator_raises(self):