From cbebf7e9c24e5edc96d101e7d197d1e95e7b7af5 Mon Sep 17 00:00:00 2001
From: pvasilek
Date: Mon, 23 Feb 2026 12:28:53 +0200
Subject: [PATCH 1/2] added llama-server embedding inference support

---
 src/memory/core.py                |  6 ++++++
 src/memory/embeddings/__init__.py |  2 ++
 src/memory/embeddings/llama.py    | 39 +++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+)
 create mode 100644 src/memory/embeddings/llama.py

diff --git a/src/memory/core.py b/src/memory/core.py
index 8fd032b..44e0202 100644
--- a/src/memory/core.py
+++ b/src/memory/core.py
@@ -107,6 +107,12 @@ def _create_embedding_provider(self) -> EmbeddingProvider:
                 model=self.config.embedding.model,
                 base_url=self.config.embedding.base_url or "http://localhost:11434",
             )
+        elif provider == "llama":
+            from memory.embeddings.llama import LlamaEmbedding
+            return LlamaEmbedding(
+                model=self.config.embedding.model,
+                base_url=self.config.embedding.base_url or "http://localhost:11435",
+            )
         elif provider == "openai":
             from memory.embeddings.openai_embed import OpenAIEmbedding
             return OpenAIEmbedding(
diff --git a/src/memory/embeddings/__init__.py b/src/memory/embeddings/__init__.py
index fa53c6d..c8d311a 100644
--- a/src/memory/embeddings/__init__.py
+++ b/src/memory/embeddings/__init__.py
@@ -1,4 +1,5 @@
 from memory.embeddings.base import EmbeddingProvider
+from memory.embeddings.llama import LlamaEmbedding
 from memory.embeddings.ollama import OllamaEmbedding
 from memory.embeddings.openai_embed import OpenAIEmbedding
 
@@ -6,4 +7,5 @@
     "EmbeddingProvider",
     "OllamaEmbedding",
     "OpenAIEmbedding",
+    "LlamaEmbedding"
 ]
diff --git a/src/memory/embeddings/llama.py b/src/memory/embeddings/llama.py
new file mode 100644
index 0000000..6488e14
--- /dev/null
+++ b/src/memory/embeddings/llama.py
@@ -0,0 +1,39 @@
+import httpx
+from memory.embeddings.base import EmbeddingProvider
+
+
+def _normalize_model_name(name: str) -> str:
+    return name.split(":", 1)[0] if name else ""
+
+
+def is_model_loaded(model: str, base_url: str, timeout: float = 0.5) -> bool:
+    try:
+        resp = httpx.get(f"{base_url.rstrip('/')}/ps", timeout=timeout)
+        resp.raise_for_status()
+        data = resp.json()
+    except Exception:
+        return False
+
+    models = data.get("models") or []
+    target = _normalize_model_name(model)
+    for entry in models:
+        name = _normalize_model_name(entry.get("name") or entry.get("model") or "")
+        if name == target:
+            return True
+    return False
+
+
+class LlamaEmbedding(EmbeddingProvider):
+    def __init__(self, model: str = "text-embedder",
+                 base_url: str = "http://localhost:11435"):
+        self.model = model
+        self.base_url = base_url
+
+    def embed(self, text: str) -> list[float]:
+        resp = httpx.post(
+            f"{self.base_url}/embeddings",
+            json={"model": self.model, "content": text},
+            timeout=30.0,
+        )
+        resp.raise_for_status()
+        return resp.json()[0]["embedding"][0]

From 1ac8778163219833ff5f9a655abf06ea635083df Mon Sep 17 00:00:00 2001
From: pvasilek
Date: Mon, 23 Feb 2026 12:57:47 +0200
Subject: [PATCH 2/2] added llama-server inference support for nomic-ai embedding models

---
 src/memory/core.py                    |  6 ++++++
 src/memory/embeddings/__init__.py     |  6 ++++--
 src/memory/embeddings/base.py         |  4 ++++
 src/memory/embeddings/llama.py        | 30 +++++++++-------------------
 src/memory/embeddings/llama_nomic.py  | 27 ++++++++++++++++++++++++
 src/memory/embeddings/ollama.py       |  2 ++
 src/memory/embeddings/openai_embed.py |  2 ++
 src/memory/search.py                  |  4 ++--
 8 files changed, 56 insertions(+), 25 deletions(-)
 create mode 100644 src/memory/embeddings/llama_nomic.py

diff --git a/src/memory/core.py b/src/memory/core.py
index 44e0202..48fa35a 100644
--- a/src/memory/core.py
+++ b/src/memory/core.py
@@ -113,6 +113,12 @@ def _create_embedding_provider(self) -> EmbeddingProvider:
                 model=self.config.embedding.model,
                 base_url=self.config.embedding.base_url or "http://localhost:11435",
             )
+        elif provider == "llama-nomic":
+            from memory.embeddings.llama_nomic import LlamaNomicEmbedding
+            return LlamaNomicEmbedding(
+                model=self.config.embedding.model,
+                base_url=self.config.embedding.base_url or "http://localhost:11435",
+            )
         elif provider == "openai":
             from memory.embeddings.openai_embed import OpenAIEmbedding
             return OpenAIEmbedding(
diff --git a/src/memory/embeddings/__init__.py b/src/memory/embeddings/__init__.py
index c8d311a..e2f871e 100644
--- a/src/memory/embeddings/__init__.py
+++ b/src/memory/embeddings/__init__.py
@@ -1,11 +1,13 @@
 from memory.embeddings.base import EmbeddingProvider
-from memory.embeddings.llama import LlamaEmbedding
 from memory.embeddings.ollama import OllamaEmbedding
 from memory.embeddings.openai_embed import OpenAIEmbedding
+from memory.embeddings.llama import LlamaEmbedding
+from memory.embeddings.llama_nomic import LlamaNomicEmbedding
 
 __all__ = [
     "EmbeddingProvider",
     "OllamaEmbedding",
     "OpenAIEmbedding",
-    "LlamaEmbedding"
+    "LlamaEmbedding",
+    "LlamaNomicEmbedding"
 ]
diff --git a/src/memory/embeddings/base.py b/src/memory/embeddings/base.py
index 2768723..1ecf3f0 100644
--- a/src/memory/embeddings/base.py
+++ b/src/memory/embeddings/base.py
@@ -6,5 +6,9 @@ class EmbeddingProvider(ABC):
     def embed(self, text: str) -> list[float]:
         ...
 
+    @abstractmethod
+    def search(self, text: str) -> list[float]:
+        ...
+
     def embed_batch(self, texts: list[str]) -> list[list[float]]:
         return [self.embed(t) for t in texts]
diff --git a/src/memory/embeddings/llama.py b/src/memory/embeddings/llama.py
index 6488e14..1c53952 100644
--- a/src/memory/embeddings/llama.py
+++ b/src/memory/embeddings/llama.py
@@ -2,27 +2,6 @@
 from memory.embeddings.base import EmbeddingProvider
 
 
-def _normalize_model_name(name: str) -> str:
-    return name.split(":", 1)[0] if name else ""
-
-
-def is_model_loaded(model: str, base_url: str, timeout: float = 0.5) -> bool:
-    try:
-        resp = httpx.get(f"{base_url.rstrip('/')}/ps", timeout=timeout)
-        resp.raise_for_status()
-        data = resp.json()
-    except Exception:
-        return False
-
-    models = data.get("models") or []
-    target = _normalize_model_name(model)
-    for entry in models:
-        name = _normalize_model_name(entry.get("name") or entry.get("model") or "")
-        if name == target:
-            return True
-    return False
-
-
 class LlamaEmbedding(EmbeddingProvider):
     def __init__(self, model: str = "text-embedder",
                  base_url: str = "http://localhost:11435"):
@@ -37,3 +16,12 @@ def embed(self, text: str) -> list[float]:
         )
         resp.raise_for_status()
         return resp.json()[0]["embedding"][0]
+
+    def search(self, text: str) -> list[float]:
+        resp = httpx.post(
+            f"{self.base_url}/embeddings",
+            json={"model": self.model, "content": text},
+            timeout=30.0,
+        )
+        resp.raise_for_status()
+        return resp.json()[0]["embedding"][0]
diff --git a/src/memory/embeddings/llama_nomic.py b/src/memory/embeddings/llama_nomic.py
new file mode 100644
index 0000000..123cac6
--- /dev/null
+++ b/src/memory/embeddings/llama_nomic.py
@@ -0,0 +1,27 @@
+import httpx
+from memory.embeddings.base import EmbeddingProvider
+
+
+class LlamaNomicEmbedding(EmbeddingProvider):
+    def __init__(self, model: str = "text-embedder",
+                 base_url: str = "http://localhost:11435"):
+        self.model = model
+        self.base_url = base_url
+
+    def embed(self, text: str) -> list[float]:
+        resp = httpx.post(
+            f"{self.base_url}/embeddings",
+            json={"model": self.model, "content": 'search_document: ' + text},
+            timeout=30.0,
+        )
+        resp.raise_for_status()
+        return resp.json()[0]["embedding"][0]
+
+    def search(self, text: str) -> list[float]:
+        resp = httpx.post(
+            f"{self.base_url}/embeddings",
+            json={"model": self.model, "content": 'search_query: ' + text},
+            timeout=30.0,
+        )
+        resp.raise_for_status()
+        return resp.json()[0]["embedding"][0]
diff --git a/src/memory/embeddings/ollama.py b/src/memory/embeddings/ollama.py
index 13af8fe..1ccec52 100644
--- a/src/memory/embeddings/ollama.py
+++ b/src/memory/embeddings/ollama.py
@@ -37,3 +37,5 @@ def embed(self, text: str) -> list[float]:
         )
         resp.raise_for_status()
         return resp.json()["embedding"]
+
+    search = embed
diff --git a/src/memory/embeddings/openai_embed.py b/src/memory/embeddings/openai_embed.py
index 501d031..0ed7d58 100644
--- a/src/memory/embeddings/openai_embed.py
+++ b/src/memory/embeddings/openai_embed.py
@@ -17,3 +17,5 @@ def embed(self, text: str) -> list[float]:
         )
         resp.raise_for_status()
         return resp.json()["data"][0]["embedding"]
+
+    search = embed
diff --git a/src/memory/search.py b/src/memory/search.py
index 3d8cb10..53af742 100644
--- a/src/memory/search.py
+++ b/src/memory/search.py
@@ -99,7 +99,7 @@ def tiered_search(
 
     # FTS results are sparse — fall back to hybrid (embed + vector search + merge)
     try:
-        query_vec = embedding_provider.embed(query)
+        query_vec = embedding_provider.search(query)
         vec_results = db.vector_search(
             query_vec, limit=limit * 2, project=project, source=source
         )
@@ -144,7 +144,7 @@ def hybrid_search(
             r["score"] = r["score"] / max_score if max_score > 0 else 0.0
         return fts_results[:limit]
 
-    query_vec = embedding_provider.embed(query)
+    query_vec = embedding_provider.search(query)
     vec_results = db.vector_search(
         query_vec, limit=limit * 2, project=project, source=source
    )
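
Configuration note: a minimal sketch of how the new backend would be selected.
The config file format is not shown anywhere in this series, so the TOML below
is purely hypothetical; only the keys (embedding.provider, embedding.model,
embedding.base_url) are taken from _create_embedding_provider in core.py:

    # Hypothetical config sketch; the real file format is not part of this
    # series. Keys mirror the self.config.embedding.* fields read in core.py.
    [embedding]
    provider = "llama-nomic"             # dispatches to LlamaNomicEmbedding
    model    = "text-embedder"           # forwarded as the request "model" field
    base_url = "http://localhost:11435"  # core.py falls back to this when unset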
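
Usage note: a minimal sketch of the embed/search split the nomic provider
introduces, assuming a llama-server instance is already serving an embedding
model on port 11435 (the server invocation and the example strings below are
illustrative, not part of the patch). The asymmetric "search_document: " and
"search_query: " task prefixes are what nomic-embed models were trained with,
which is why indexing and querying must go through different methods:

    from memory.embeddings.llama_nomic import LlamaNomicEmbedding

    # Assumes llama-server was started with an embedding-capable GGUF,
    # e.g. something like: llama-server --embedding -m <model.gguf> --port 11435
    provider = LlamaNomicEmbedding(model="text-embedder",
                                   base_url="http://localhost:11435")

    # Indexing path: embed() sends "search_document: " + text, the prefix
    # nomic-embed models expect for stored documents.
    doc_vec = provider.embed("The cache is flushed on every write.")

    # Query path: search() sends "search_query: " + text instead, so the
    # model embeds queries and documents into the same space asymmetrically.
    query_vec = provider.search("when is the cache flushed?")

    assert len(doc_vec) == len(query_vec)  # both are plain vectors of one dimension

For symmetric providers (Ollama, OpenAI) the two paths are identical, which is
why the patch satisfies the new abstract method with a simple "search = embed"
alias instead of a second HTTP call.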