From 0ca011d1b4c283fa593cfb55817585fb2e48e065 Mon Sep 17 00:00:00 2001 From: WU Leizhi Date: Thu, 5 Feb 2026 20:07:59 +0800 Subject: [PATCH] feat: add sqlite vector store --- .gitignore | 3 ++ README.md | 21 ++++++++++ docs/ARCHITECTURE.md | 14 +++++++ docs/SPEC.md | 12 ++++++ pyproject.toml | 23 +++++++++++ src/memomind/__init__.py | 9 +++++ src/memomind/config/__init__.py | 5 +++ src/memomind/config/settings.py | 12 ++++++ src/memomind/core/__init__.py | 6 +++ src/memomind/core/agent.py | 51 +++++++++++++++++++++++ src/memomind/core/knowledge.py | 19 +++++++++ src/memomind/core/memory.py | 31 ++++++++++++++ src/memomind/llm/__init__.py | 6 +++ src/memomind/llm/client.py | 13 ++++++ src/memomind/llm/embeddings.py | 16 ++++++++ src/memomind/models/__init__.py | 7 ++++ src/memomind/models/document.py | 17 ++++++++ src/memomind/models/memory.py | 17 ++++++++ src/memomind/models/message.py | 15 +++++++ src/memomind/storage/__init__.py | 7 ++++ src/memomind/storage/base.py | 13 ++++++ src/memomind/storage/serialization.py | 20 +++++++++ src/memomind/storage/sqlite_store.py | 58 +++++++++++++++++++++++++++ src/memomind/storage/vector_store.py | 35 ++++++++++++++++ tests/conftest.py | 9 +++++ tests/test_vector_store.py | 27 +++++++++++++ 26 files changed, 466 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/SPEC.md create mode 100644 pyproject.toml create mode 100644 src/memomind/__init__.py create mode 100644 src/memomind/config/__init__.py create mode 100644 src/memomind/config/settings.py create mode 100644 src/memomind/core/__init__.py create mode 100644 src/memomind/core/agent.py create mode 100644 src/memomind/core/knowledge.py create mode 100644 src/memomind/core/memory.py create mode 100644 src/memomind/llm/__init__.py create mode 100644 src/memomind/llm/client.py create mode 100644 src/memomind/llm/embeddings.py create mode 100644 src/memomind/models/__init__.py create mode 100644 src/memomind/models/document.py create mode 100644 src/memomind/models/memory.py create mode 100644 src/memomind/models/message.py create mode 100644 src/memomind/storage/__init__.py create mode 100644 src/memomind/storage/base.py create mode 100644 src/memomind/storage/serialization.py create mode 100644 src/memomind/storage/sqlite_store.py create mode 100644 src/memomind/storage/vector_store.py create mode 100644 tests/conftest.py create mode 100644 tests/test_vector_store.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..00f2d38 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +*.pyc +.venv/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..adc1669 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +# MemoMind + +MemoMind is a local-first starter kit for building a multimodal personal knowledge agent. It provides a minimal set of Python modules for ingesting content, representing memories, and querying vector stores. + +## Quick start + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -e . +``` + +## Project layout + +- `src/memomind` - core package +- `docs` - specifications and architecture notes + +## Storage options + +- `InMemoryVectorStore` for prototyping. +- `SqliteVectorStore` for local persistence. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..424b092 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,14 @@ +# MemoMind Architecture (Draft) + +## Components +- `core.agent`: orchestrates ingestion, retrieval, and LLM calls. +- `core.memory`: manages short/long-term memory stores. +- `storage.vector_store`: in-memory similarity search. +- `storage.sqlite_store`: SQLite-backed vector persistence. +- `llm`: pluggable LLM and embedding clients. + +## Data Flow +1. Raw input -> `models.Document` +2. Document -> embeddings via `llm.embeddings` +3. Embeddings -> `storage.vector_store` or `storage.sqlite_store` +4. Retrieval results -> `core.agent` response diff --git a/docs/SPEC.md b/docs/SPEC.md new file mode 100644 index 0000000..3154a0d --- /dev/null +++ b/docs/SPEC.md @@ -0,0 +1,12 @@ +# MemoMind Specification (Draft) + +## Goals +- Collect multimodal data (text, image, audio, structured) into a personal knowledge base. +- Provide short/long-term memory with retrieval for Q&A. +- Run locally with optional pluggable LLM backends. + +## MVP Scope +- Text ingestion pipeline. +- Memory data models (document, memory, message). +- In-memory or SQLite-backed vector search interface. +- Simple agent orchestrator for storing and retrieving memories. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7e144fe --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[project] +name = "memomind" +version = "0.1.0" +description = "Local-first multimodal memory agent starter" +readme = "README.md" +requires-python = ">=3.10" +license = { text = "MIT" } +authors = [ + { name = "Project-Test Contributors" } +] + +[project.optional-dependencies] +dev = ["pytest"] + +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] diff --git a/src/memomind/__init__.py b/src/memomind/__init__.py new file mode 100644 index 0000000..c9f9be2 --- /dev/null +++ b/src/memomind/__init__.py @@ -0,0 +1,9 @@ +"""MemoMind core package.""" + +from memomind.core.agent import MemoAgent +from memomind.core.memory import MemoryStore +from memomind.models.document import Document +from memomind.models.memory import Memory +from memomind.models.message import Message + +__all__ = ["Document", "Memory", "Message", "MemoryStore", "MemoAgent"] diff --git a/src/memomind/config/__init__.py b/src/memomind/config/__init__.py new file mode 100644 index 0000000..9f5ed5a --- /dev/null +++ b/src/memomind/config/__init__.py @@ -0,0 +1,5 @@ +"""Configuration helpers.""" + +from memomind.config.settings import Settings + +__all__ = ["Settings"] diff --git a/src/memomind/config/settings.py b/src/memomind/config/settings.py new file mode 100644 index 0000000..19caf38 --- /dev/null +++ b/src/memomind/config/settings.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Settings: + """Runtime configuration for MemoMind.""" + + embedding_model: str = "local-embedding" + llm_model: str = "local-llm" + max_context_messages: int = 12 diff --git a/src/memomind/core/__init__.py b/src/memomind/core/__init__.py new file mode 100644 index 0000000..3e9eab5 --- /dev/null +++ b/src/memomind/core/__init__.py @@ -0,0 +1,6 @@ +"""Core orchestration modules.""" + +from memomind.core.agent import MemoAgent +from memomind.core.memory import MemoryStore + +__all__ = ["MemoAgent", "MemoryStore"] diff --git a/src/memomind/core/agent.py b/src/memomind/core/agent.py new file mode 100644 index 0000000..8182746 --- /dev/null +++ b/src/memomind/core/agent.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import List, Optional + +from memomind.config.settings import Settings +from memomind.core.knowledge import KnowledgeBase +from memomind.core.memory import MemoryStore +from memomind.llm.client import LlmClient +from memomind.llm.embeddings import EmbeddingClient +from memomind.models.document import Document +from memomind.models.memory import Memory + + +@dataclass +class MemoAgent: + """Coordinates ingestion, memory storage, and retrieval.""" + + settings: Settings = field(default_factory=Settings) + knowledge_base: KnowledgeBase = field(default_factory=KnowledgeBase) + memory_store: MemoryStore = field(default_factory=MemoryStore) + llm_client: LlmClient = field(default_factory=LlmClient) + embedding_client: EmbeddingClient = field(default_factory=EmbeddingClient) + + def ingest(self, document: Document) -> None: + embedding = self.embedding_client.embed(document.content) + enriched = Document( + content=document.content, + source=document.source, + created_at=document.created_at, + metadata=document.metadata, + tags=document.tags, + embedding=embedding, + ) + self.knowledge_base.add(enriched) + + def remember(self, memory_id: str, memory: Memory) -> None: + embedding = self.embedding_client.embed(memory.summary) + self.memory_store.add_memory(memory_id, memory, embedding=embedding) + + def recall(self, query: str, limit: int = 5) -> List[Memory]: + embedding = self.embedding_client.embed(query) + return list(self.memory_store.search(embedding, limit=limit)) + + def answer(self, query: str) -> Optional[str]: + memories = self.recall(query) + if not memories: + return None + context = "\n".join(memory.summary for memory in memories) + prompt = f"Use the following memories to answer the question:\n{context}\nQuestion: {query}" + return self.llm_client.complete(prompt) diff --git a/src/memomind/core/knowledge.py b/src/memomind/core/knowledge.py new file mode 100644 index 0000000..b168662 --- /dev/null +++ b/src/memomind/core/knowledge.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import List + +from memomind.models.document import Document + + +@dataclass +class KnowledgeBase: + """Tracks ingested documents for later processing.""" + + documents: List[Document] = field(default_factory=list) + + def add(self, document: Document) -> None: + self.documents.append(document) + + def list_sources(self) -> List[str]: + return [doc.source for doc in self.documents] diff --git a/src/memomind/core/memory.py b/src/memomind/core/memory.py new file mode 100644 index 0000000..6441f7c --- /dev/null +++ b/src/memomind/core/memory.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Optional + +from memomind.models.memory import Memory +from memomind.storage.base import VectorStore +from memomind.storage.vector_store import InMemoryVectorStore + + +@dataclass +class MemoryStore: + """Manages short and long-term memory collections.""" + + vector_store: VectorStore = field(default_factory=InMemoryVectorStore) + memories: Dict[str, Memory] = field(default_factory=dict) + + def add_memory(self, memory_id: str, memory: Memory, embedding: Optional[List[float]] = None) -> None: + self.memories[memory_id] = memory + if embedding is not None: + self.vector_store.upsert(memory_id, embedding, payload={"summary": memory.summary}) + + def get_memory(self, memory_id: str) -> Optional[Memory]: + return self.memories.get(memory_id) + + def search(self, embedding: List[float], limit: int = 5) -> Iterable[Memory]: + results = self.vector_store.search(embedding, limit=limit) + for memory_id, _score in results: + memory = self.memories.get(memory_id) + if memory: + yield memory diff --git a/src/memomind/llm/__init__.py b/src/memomind/llm/__init__.py new file mode 100644 index 0000000..4be540b --- /dev/null +++ b/src/memomind/llm/__init__.py @@ -0,0 +1,6 @@ +"""LLM integrations.""" + +from memomind.llm.client import LlmClient +from memomind.llm.embeddings import EmbeddingClient + +__all__ = ["LlmClient", "EmbeddingClient"] diff --git a/src/memomind/llm/client.py b/src/memomind/llm/client.py new file mode 100644 index 0000000..3ab8573 --- /dev/null +++ b/src/memomind/llm/client.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass +class LlmClient: + """Placeholder LLM client implementation.""" + + model: str = "local-llm" + + def complete(self, prompt: str) -> str: + return f"[stubbed response from {self.model}] {prompt}" diff --git a/src/memomind/llm/embeddings.py b/src/memomind/llm/embeddings.py new file mode 100644 index 0000000..8f8950a --- /dev/null +++ b/src/memomind/llm/embeddings.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import List + + +@dataclass +class EmbeddingClient: + """Placeholder embedding client implementation.""" + + model: str = "local-embedding" + + def embed(self, text: str) -> List[float]: + if not text: + return [0.0] + return [float(sum(bytearray(text, "utf-8")) % 997) / 997.0] diff --git a/src/memomind/models/__init__.py b/src/memomind/models/__init__.py new file mode 100644 index 0000000..618a494 --- /dev/null +++ b/src/memomind/models/__init__.py @@ -0,0 +1,7 @@ +"""Data models for MemoMind.""" + +from memomind.models.document import Document +from memomind.models.memory import Memory +from memomind.models.message import Message + +__all__ = ["Document", "Memory", "Message"] diff --git a/src/memomind/models/document.py b/src/memomind/models/document.py new file mode 100644 index 0000000..6232701 --- /dev/null +++ b/src/memomind/models/document.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any, Dict, List, Optional + + +@dataclass(frozen=True) +class Document: + """Raw content ingested into the system.""" + + content: str + source: str + created_at: datetime = field(default_factory=datetime.utcnow) + metadata: Dict[str, Any] = field(default_factory=dict) + tags: List[str] = field(default_factory=list) + embedding: Optional[List[float]] = None diff --git a/src/memomind/models/memory.py b/src/memomind/models/memory.py new file mode 100644 index 0000000..0971a1b --- /dev/null +++ b/src/memomind/models/memory.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any, Dict, Optional + + +@dataclass(frozen=True) +class Memory: + """Normalized memory representation stored for retrieval.""" + + summary: str + memory_type: str + created_at: datetime = field(default_factory=datetime.utcnow) + importance: float = 0.0 + metadata: Dict[str, Any] = field(default_factory=dict) + related_document_id: Optional[str] = None diff --git a/src/memomind/models/message.py b/src/memomind/models/message.py new file mode 100644 index 0000000..b6abf08 --- /dev/null +++ b/src/memomind/models/message.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime +from typing import Dict + + +@dataclass(frozen=True) +class Message: + """Conversation message for working memory context.""" + + role: str + content: str + created_at: datetime = field(default_factory=datetime.utcnow) + metadata: Dict[str, str] = field(default_factory=dict) diff --git a/src/memomind/storage/__init__.py b/src/memomind/storage/__init__.py new file mode 100644 index 0000000..08cbe40 --- /dev/null +++ b/src/memomind/storage/__init__.py @@ -0,0 +1,7 @@ +"""Storage backends.""" + +from memomind.storage.base import VectorStore +from memomind.storage.sqlite_store import SqliteVectorStore +from memomind.storage.vector_store import InMemoryVectorStore + +__all__ = ["InMemoryVectorStore", "SqliteVectorStore", "VectorStore"] diff --git a/src/memomind/storage/base.py b/src/memomind/storage/base.py new file mode 100644 index 0000000..56f678e --- /dev/null +++ b/src/memomind/storage/base.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from typing import Dict, List, Protocol, Tuple + + +class VectorStore(Protocol): + """Protocol for vector storage backends.""" + + def upsert(self, item_id: str, vector: List[float], payload: Dict[str, str]) -> None: + ... + + def search(self, vector: List[float], limit: int = 5) -> List[Tuple[str, float]]: + ... diff --git a/src/memomind/storage/serialization.py b/src/memomind/storage/serialization.py new file mode 100644 index 0000000..4ac288d --- /dev/null +++ b/src/memomind/storage/serialization.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +import json +from typing import Any, Dict, List + + +def dumps_vector(vector: List[float]) -> str: + return json.dumps(vector) + + +def loads_vector(value: str) -> List[float]: + return [float(item) for item in json.loads(value)] + + +def dumps_payload(payload: Dict[str, str]) -> str: + return json.dumps(payload) + + +def loads_payload(value: str) -> Dict[str, str]: + return {str(key): str(val) for key, val in json.loads(value).items()} diff --git a/src/memomind/storage/sqlite_store.py b/src/memomind/storage/sqlite_store.py new file mode 100644 index 0000000..e56bdd3 --- /dev/null +++ b/src/memomind/storage/sqlite_store.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import sqlite3 +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, List, Tuple + +from memomind.storage.serialization import dumps_payload, dumps_vector, loads_vector +from memomind.storage.vector_store import cosine_similarity + + +@dataclass +class SqliteVectorStore: + """SQLite-backed vector store for local persistence.""" + + db_path: Path = field(default_factory=lambda: Path("memomind.db")) + + def __post_init__(self) -> None: + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._init_db() + + def upsert(self, item_id: str, vector: List[float], payload: Dict[str, str]) -> None: + with self._connect() as conn: + conn.execute( + """ + INSERT INTO vectors (id, vector, payload) + VALUES (?, ?, ?) + ON CONFLICT(id) DO UPDATE SET vector=excluded.vector, payload=excluded.payload + """, + (item_id, dumps_vector(vector), dumps_payload(payload)), + ) + conn.commit() + + def search(self, vector: List[float], limit: int = 5) -> List[Tuple[str, float]]: + with self._connect() as conn: + rows = conn.execute("SELECT id, vector FROM vectors").fetchall() + scored = [] + for item_id, stored_vector in rows: + score = cosine_similarity(vector, loads_vector(stored_vector)) + scored.append((item_id, score)) + scored.sort(key=lambda item: item[1], reverse=True) + return scored[:limit] + + def _init_db(self) -> None: + with self._connect() as conn: + conn.execute( + """ + CREATE TABLE IF NOT EXISTS vectors ( + id TEXT PRIMARY KEY, + vector TEXT NOT NULL, + payload TEXT NOT NULL + ) + """ + ) + conn.commit() + + def _connect(self) -> sqlite3.Connection: + return sqlite3.connect(self.db_path) diff --git a/src/memomind/storage/vector_store.py b/src/memomind/storage/vector_store.py new file mode 100644 index 0000000..656fd62 --- /dev/null +++ b/src/memomind/storage/vector_store.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Dict, List, Tuple + + +def cosine_similarity(a: List[float], b: List[float]) -> float: + if not a or not b: + return 0.0 + dot = sum(x * y for x, y in zip(a, b)) + norm_a = sum(x * x for x in a) ** 0.5 + norm_b = sum(y * y for y in b) ** 0.5 + if norm_a == 0.0 or norm_b == 0.0: + return 0.0 + return dot / (norm_a * norm_b) + + +@dataclass +class InMemoryVectorStore: + """Minimal vector store for prototyping similarity search.""" + + vectors: Dict[str, List[float]] = field(default_factory=dict) + payloads: Dict[str, Dict[str, str]] = field(default_factory=dict) + + def upsert(self, item_id: str, vector: List[float], payload: Dict[str, str]) -> None: + self.vectors[item_id] = vector + self.payloads[item_id] = payload + + def search(self, vector: List[float], limit: int = 5) -> List[Tuple[str, float]]: + scores = [] + for item_id, candidate in self.vectors.items(): + score = cosine_similarity(vector, candidate) + scores.append((item_id, score)) + scores.sort(key=lambda pair: pair[1], reverse=True) + return scores[:limit] diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..fdcbc1f --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +SRC = ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) diff --git a/tests/test_vector_store.py b/tests/test_vector_store.py new file mode 100644 index 0000000..6c758fe --- /dev/null +++ b/tests/test_vector_store.py @@ -0,0 +1,27 @@ +from pathlib import Path + +from memomind.storage.sqlite_store import SqliteVectorStore +from memomind.storage.vector_store import InMemoryVectorStore + + +def test_in_memory_vector_store_search_orders_results(): + store = InMemoryVectorStore() + store.upsert("a", [1.0, 0.0], payload={"summary": "first"}) + store.upsert("b", [0.0, 1.0], payload={"summary": "second"}) + + results = store.search([1.0, 0.0], limit=2) + + assert results[0][0] == "a" + assert results[0][1] >= results[1][1] + + +def test_sqlite_vector_store_persists_and_searches(tmp_path: Path): + db_path = tmp_path / "memomind.db" + store = SqliteVectorStore(db_path=db_path) + store.upsert("memory", [0.5, 0.5], payload={"summary": "hello"}) + + new_store = SqliteVectorStore(db_path=db_path) + results = new_store.search([0.5, 0.5], limit=1) + + assert results + assert results[0][0] == "memory"