From 3a994e74870b1cd99daa95e9d255686b3a6c0e12 Mon Sep 17 00:00:00 2001 From: Ethan Steininger Date: Mon, 13 Apr 2026 14:52:32 -0400 Subject: [PATCH 1/2] feat: reference contextual signal encoder with model descriptor envelope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a public-lane encoder that generates ORTB-compatible contextual embeddings with a model descriptor envelope (name, version, embedding space ID, metric) for cross-party interoperability. - POST /encode: text or URL → embedding segment with model metadata - Pluggable provider interface for public/private lane pattern - sentence-transformers default (shared embedding space) - 8 tests covering wire format, model descriptor, and space identification --- src/contextual-signal-encoder/Dockerfile | 7 + src/contextual-signal-encoder/README.md | 101 +++++++++++++ src/contextual-signal-encoder/app/__init__.py | 0 .../app/engine/__init__.py | 0 .../app/engine/encoder.py | 63 ++++++++ .../app/engine/extractor.py | 21 +++ src/contextual-signal-encoder/app/main.py | 39 +++++ .../app/models/__init__.py | 0 .../app/models/encode.py | 64 +++++++++ .../app/providers/__init__.py | 0 .../app/providers/base.py | 30 ++++ .../app/providers/sentence_transformers.py | 54 +++++++ .../app/routes/__init__.py | 0 .../app/routes/encode.py | 33 +++++ .../requirements.txt | 9 ++ .../tests/__init__.py | 0 .../tests/conftest.py | 6 + .../tests/test_encode.py | 135 ++++++++++++++++++ 18 files changed, 562 insertions(+) create mode 100644 src/contextual-signal-encoder/Dockerfile create mode 100644 src/contextual-signal-encoder/README.md create mode 100644 src/contextual-signal-encoder/app/__init__.py create mode 100644 src/contextual-signal-encoder/app/engine/__init__.py create mode 100644 src/contextual-signal-encoder/app/engine/encoder.py create mode 100644 src/contextual-signal-encoder/app/engine/extractor.py create mode 100644 src/contextual-signal-encoder/app/main.py create mode 100644 src/contextual-signal-encoder/app/models/__init__.py create mode 100644 src/contextual-signal-encoder/app/models/encode.py create mode 100644 src/contextual-signal-encoder/app/providers/__init__.py create mode 100644 src/contextual-signal-encoder/app/providers/base.py create mode 100644 src/contextual-signal-encoder/app/providers/sentence_transformers.py create mode 100644 src/contextual-signal-encoder/app/routes/__init__.py create mode 100644 src/contextual-signal-encoder/app/routes/encode.py create mode 100644 src/contextual-signal-encoder/requirements.txt create mode 100644 src/contextual-signal-encoder/tests/__init__.py create mode 100644 src/contextual-signal-encoder/tests/conftest.py create mode 100644 src/contextual-signal-encoder/tests/test_encode.py diff --git a/src/contextual-signal-encoder/Dockerfile b/src/contextual-signal-encoder/Dockerfile new file mode 100644 index 0000000..e18f903 --- /dev/null +++ b/src/contextual-signal-encoder/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.12-slim +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY . . +EXPOSE 8081 +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8081"] diff --git a/src/contextual-signal-encoder/README.md b/src/contextual-signal-encoder/README.md new file mode 100644 index 0000000..ff97d7a --- /dev/null +++ b/src/contextual-signal-encoder/README.md @@ -0,0 +1,101 @@ +# Contextual Signal Encoder + +Reference implementation for generating contextual embedding signals with the [Agentic Audiences](../../specs/v1.0/) protocol. + +The [scoring service](../user-embedding-to-campaign-scoring/) consumes embeddings but the spec has no reference for **producing** them. This service fills that gap: content in, ORTB-compatible embedding out — with a **model descriptor envelope** so the receiving party knows the model name, version, embedding space, and metric needed to process the vector. + +## Public lane / private lane + +The encoder implements the "public lane" concept: a standardized open-source model ([sentence-transformers](https://sbert.net/)) that any party can use to produce and read embeddings in a shared embedding space. Private-lane providers can be added by implementing the `EmbeddingProvider` interface — the model descriptor travels with the embedding so the receiving party always knows what model was used. + +## Quick Start + +```bash +pip install -r requirements.txt +uvicorn app.main:app --port 8081 + +curl -X POST http://localhost:8081/encode \ + -H "Content-Type: application/json" \ + -d '{"text": "NFL playoff predictions: Chiefs vs Bills"}' +``` + +**Response:** +```json +{ + "data": { + "id": "contextual-signal-encoder", + "name": "contextual-embeddings", + "segment": [{ + "id": "ctx-a1b2c3d4", + "ext": { + "ver": "1.0", + "vector": [0.042, -0.118, "..."], + "model": "all-MiniLM-L6-v2", + "dimension": 384, + "type": "context", + "model_descriptor": { + "name": "all-MiniLM-L6-v2", + "version": "2.0.0", + "embedding_space_id": "aa://spaces/contextual/sentence-transformers/minilm-l6-v2", + "metric": "cosine" + } + } + }] + }, + "source": "text", + "content_length": 42 +} +``` + +The `model_descriptor` is the interoperability primitive — it tells the receiving party: +- **name** + **version**: which model to load for reading/matching +- **embedding_space_id**: two vectors are only comparable within the same space +- **metric**: how to compute similarity (cosine, dot, l2) + +The `data` object plugs directly into the scoring service: +```json +{"id": "bid-001", "user": {"id": "page", "data": [.data]}, "top_k": 5} +``` + +## Adding a private-lane provider + +Implement `EmbeddingProvider` with your proprietary model. The model descriptor ensures the receiving party knows how to process your embeddings: + +```python +from app.providers.base import EmbeddingProvider, EmbeddingResult + +class AcmeProvider(EmbeddingProvider): + async def encode_text(self, text: str) -> EmbeddingResult: + vector = my_proprietary_model.encode(text) + return EmbeddingResult( + vector=vector.tolist(), + model="acme-contextual-v3", + version="3.1.0", + dimension=len(vector), + embedding_space_id="aa://spaces/private/acme-corp/v3", + metric="dot", + ) + + async def close(self) -> None: + pass +``` + +## Development + +```bash +pytest tests/ -v +``` + +``` +├── app/ +│ ├── main.py # FastAPI app (public lane default) +│ ├── models/encode.py # ORTB models + ModelDescriptor envelope +│ ├── engine/ +│ │ ├── encoder.py # Encoding orchestrator +│ │ └── extractor.py # URL text extraction +│ ├── providers/ +│ │ ├── base.py # EmbeddingProvider interface +│ │ └── sentence_transformers.py # Public lane (open-source) +│ └── routes/encode.py +└── tests/test_encode.py +``` diff --git a/src/contextual-signal-encoder/app/__init__.py b/src/contextual-signal-encoder/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/contextual-signal-encoder/app/engine/__init__.py b/src/contextual-signal-encoder/app/engine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/contextual-signal-encoder/app/engine/encoder.py b/src/contextual-signal-encoder/app/engine/encoder.py new file mode 100644 index 0000000..67332b1 --- /dev/null +++ b/src/contextual-signal-encoder/app/engine/encoder.py @@ -0,0 +1,63 @@ +"""Core encoder — content in, ORTB segment with model descriptor out.""" + +from __future__ import annotations + +import uuid + +from app.engine.extractor import extract_text_from_url +from app.models.encode import ( + ContextualData, + EmbeddingExt, + EncodeRequest, + EncodeResponse, + ModelDescriptor, + Segment, +) +from app.providers.base import EmbeddingProvider + +MAX_WORDS = 512 + + +class ContextualEncoder: + + def __init__(self, provider: EmbeddingProvider) -> None: + self._provider = provider + + async def encode(self, request: EncodeRequest) -> EncodeResponse: + if request.url: + text = await extract_text_from_url(request.url) + source = "url" + elif request.text: + text = request.text + source = "text" + else: + raise ValueError("Either text or url is required.") + + words = text.split() + if len(words) > MAX_WORDS: + text = " ".join(words[:MAX_WORDS]) + + result = await self._provider.encode_text(text) + + segment = Segment( + id=f"ctx-{uuid.uuid4().hex[:8]}", + name=f"contextual-{source}", + ext=EmbeddingExt( + vector=result.vector, + model=result.model, + dimension=result.dimension, + type="context", + model_descriptor=ModelDescriptor( + name=result.model, + version=result.version, + embedding_space_id=result.embedding_space_id, + metric=result.metric, + ), + ), + ) + + return EncodeResponse( + data=ContextualData(segment=[segment]), + source=source, + content_length=len(text), + ) diff --git a/src/contextual-signal-encoder/app/engine/extractor.py b/src/contextual-signal-encoder/app/engine/extractor.py new file mode 100644 index 0000000..327d875 --- /dev/null +++ b/src/contextual-signal-encoder/app/engine/extractor.py @@ -0,0 +1,21 @@ +"""Content extraction from URLs.""" + +from __future__ import annotations + +import httpx +from bs4 import BeautifulSoup + + +async def extract_text_from_url(url: str, timeout: int = 10) -> str: + async with httpx.AsyncClient(timeout=timeout) as client: + resp = await client.get( + url, headers={"User-Agent": "AATech-ContextualEncoder/1.0"}, follow_redirects=True, + ) + resp.raise_for_status() + + soup = BeautifulSoup(resp.text, "html.parser") + for tag in soup(["script", "style", "nav", "footer", "header", "aside"]): + tag.decompose() + + main = soup.find("article") or soup.find("main") or soup.find("body") + return (main or soup).get_text(separator=" ", strip=True) diff --git a/src/contextual-signal-encoder/app/main.py b/src/contextual-signal-encoder/app/main.py new file mode 100644 index 0000000..dd99814 --- /dev/null +++ b/src/contextual-signal-encoder/app/main.py @@ -0,0 +1,39 @@ +"""Contextual Signal Encoder — reference "public lane" implementation.""" + +from __future__ import annotations + +from contextlib import asynccontextmanager + +from fastapi import FastAPI + +from app.engine.encoder import ContextualEncoder +from app.providers.sentence_transformers import SentenceTransformersProvider +from app.routes.encode import router as encode_router, set_encoder + +_provider = None + + +@asynccontextmanager +async def lifespan(app: FastAPI): + global _provider + _provider = SentenceTransformersProvider() + set_encoder(ContextualEncoder(_provider)) + yield + if _provider: + await _provider.close() + + +app = FastAPI( + title="Agentic Audiences — Contextual Signal Encoder", + description="Reference public-lane encoder for generating contextual embeddings " + "with model descriptor metadata, compatible with the scoring service.", + version="1.0.0", + lifespan=lifespan, +) + +app.include_router(encode_router) + + +@app.get("/health") +async def health(): + return {"status": "ok", "lane": "public", "model": "all-MiniLM-L6-v2"} diff --git a/src/contextual-signal-encoder/app/models/__init__.py b/src/contextual-signal-encoder/app/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/contextual-signal-encoder/app/models/encode.py b/src/contextual-signal-encoder/app/models/encode.py new file mode 100644 index 0000000..fd4eea4 --- /dev/null +++ b/src/contextual-signal-encoder/app/models/encode.py @@ -0,0 +1,64 @@ +"""Models for contextual signal encoding. + +The core output is an ORTB-compatible embedding segment with a model descriptor +envelope — model name, version, embedding space, and metric — so the receiving +party knows exactly how to process the vector. +""" + +from __future__ import annotations + +from pydantic import BaseModel, Field + + +class EncodeRequest(BaseModel): + text: str | None = Field(default=None, description="Raw text content.") + url: str | None = Field(default=None, description="URL to fetch and extract text from.") + + +class ModelDescriptor(BaseModel): + """Metadata envelope describing how the embedding was produced. + + This is the key interoperability primitive: the receiving party loads the + same model (or a compatible one) using this descriptor to read the vector. + """ + + name: str = Field(description="Model identifier (e.g., 'all-MiniLM-L6-v2').") + version: str = Field(description="Model version for reproducibility.") + embedding_space_id: str = Field( + description="URI identifying the embedding space (e.g., 'aa://spaces/contextual/en-v1'). " + "Two vectors are comparable only within the same space.", + ) + metric: str = Field(default="cosine", description="Similarity metric: cosine, dot, or l2.") + + +class EmbeddingExt(BaseModel): + """Matches the scoring service's EmbeddingSegmentExt schema, extended + with model descriptor for interoperability.""" + + ver: str = "1.0" + vector: list[float] + model: str + dimension: int + type: str + model_descriptor: ModelDescriptor | None = Field( + default=None, + description="Full model metadata envelope for cross-party interoperability.", + ) + + +class Segment(BaseModel): + id: str + name: str | None = None + ext: EmbeddingExt + + +class ContextualData(BaseModel): + id: str = "contextual-signal-encoder" + name: str = "contextual-embeddings" + segment: list[Segment] + + +class EncodeResponse(BaseModel): + data: ContextualData + source: str + content_length: int diff --git a/src/contextual-signal-encoder/app/providers/__init__.py b/src/contextual-signal-encoder/app/providers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/contextual-signal-encoder/app/providers/base.py b/src/contextual-signal-encoder/app/providers/base.py new file mode 100644 index 0000000..7018ddb --- /dev/null +++ b/src/contextual-signal-encoder/app/providers/base.py @@ -0,0 +1,30 @@ +"""Embedding provider interface. + +The public lane ships with sentence-transformers (open-source, standardized). +The private lane is any custom provider implementing this interface. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass + + +@dataclass +class EmbeddingResult: + vector: list[float] + model: str + version: str + dimension: int + embedding_space_id: str + metric: str = "cosine" + + +class EmbeddingProvider(ABC): + """Interface for pluggable embedding backends (public or private lane).""" + + @abstractmethod + async def encode_text(self, text: str) -> EmbeddingResult: ... + + @abstractmethod + async def close(self) -> None: ... diff --git a/src/contextual-signal-encoder/app/providers/sentence_transformers.py b/src/contextual-signal-encoder/app/providers/sentence_transformers.py new file mode 100644 index 0000000..80cd52d --- /dev/null +++ b/src/contextual-signal-encoder/app/providers/sentence_transformers.py @@ -0,0 +1,54 @@ +"""Public lane provider: sentence-transformers. + +This is the reference "public lane" encoder — a standardized, open-source +model that any party can load to produce or read embeddings in a shared +embedding space. As discussed in the working group, this provides a baseline +for interoperability without requiring proprietary model access. +""" + +from __future__ import annotations + +import numpy as np + +from app.providers.base import EmbeddingProvider, EmbeddingResult + +# Default public lane model and its embedding space identifier +DEFAULT_MODEL = "all-MiniLM-L6-v2" +DEFAULT_VERSION = "2.0.0" +DEFAULT_SPACE = "aa://spaces/contextual/sentence-transformers/minilm-l6-v2" + + +class SentenceTransformersProvider(EmbeddingProvider): + + def __init__( + self, + model_name: str = DEFAULT_MODEL, + version: str = DEFAULT_VERSION, + embedding_space_id: str = DEFAULT_SPACE, + ) -> None: + self._model_name = model_name + self._version = version + self._embedding_space_id = embedding_space_id + self._model = None + + def _load_model(self): + if self._model is None: + from sentence_transformers import SentenceTransformer + self._model = SentenceTransformer(self._model_name) + return self._model + + async def encode_text(self, text: str) -> EmbeddingResult: + model = self._load_model() + embedding = model.encode(text, normalize_embeddings=True) + vector = np.asarray(embedding, dtype=np.float32).tolist() + return EmbeddingResult( + vector=vector, + model=self._model_name, + version=self._version, + dimension=len(vector), + embedding_space_id=self._embedding_space_id, + metric="cosine", + ) + + async def close(self) -> None: + self._model = None diff --git a/src/contextual-signal-encoder/app/routes/__init__.py b/src/contextual-signal-encoder/app/routes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/contextual-signal-encoder/app/routes/encode.py b/src/contextual-signal-encoder/app/routes/encode.py new file mode 100644 index 0000000..6e3e93a --- /dev/null +++ b/src/contextual-signal-encoder/app/routes/encode.py @@ -0,0 +1,33 @@ +"""Encode route.""" + +from __future__ import annotations + +from fastapi import APIRouter, HTTPException + +from app.models.encode import EncodeRequest, EncodeResponse + +router = APIRouter() +_encoder = None + + +def set_encoder(encoder): + global _encoder + _encoder = encoder + + +@router.post("/encode", response_model=EncodeResponse) +async def encode_content(request: EncodeRequest) -> EncodeResponse: + """Generate an ORTB-compatible contextual embedding with model descriptor. + + The response includes a model_descriptor envelope so the receiving party + knows the model name, version, embedding space, and metric needed to + process the vector. + """ + if _encoder is None: + raise HTTPException(status_code=503, detail="Encoder not initialized.") + if not any([request.text, request.url]): + raise HTTPException(status_code=422, detail="Either text or url is required.") + try: + return await _encoder.encode(request) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Encoding failed: {e}") diff --git a/src/contextual-signal-encoder/requirements.txt b/src/contextual-signal-encoder/requirements.txt new file mode 100644 index 0000000..ad95773 --- /dev/null +++ b/src/contextual-signal-encoder/requirements.txt @@ -0,0 +1,9 @@ +fastapi>=0.115,<1 +uvicorn[standard]>=0.30,<1 +pydantic>=2.0,<3 +numpy>=1.26,<3 +httpx>=0.27,<1 +sentence-transformers>=3.0,<4 +beautifulsoup4>=4.12,<5 +pytest>=8.0,<9 +pytest-asyncio>=0.23,<1 diff --git a/src/contextual-signal-encoder/tests/__init__.py b/src/contextual-signal-encoder/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/contextual-signal-encoder/tests/conftest.py b/src/contextual-signal-encoder/tests/conftest.py new file mode 100644 index 0000000..af7e479 --- /dev/null +++ b/src/contextual-signal-encoder/tests/conftest.py @@ -0,0 +1,6 @@ +import pytest + + +@pytest.fixture +def anyio_backend(): + return "asyncio" diff --git a/src/contextual-signal-encoder/tests/test_encode.py b/src/contextual-signal-encoder/tests/test_encode.py new file mode 100644 index 0000000..d950d3e --- /dev/null +++ b/src/contextual-signal-encoder/tests/test_encode.py @@ -0,0 +1,135 @@ +"""Tests for the contextual signal encoder. + +Focus areas: +1. ORTB wire format compatibility with scoring service +2. Model descriptor envelope (name, version, space, metric) +3. Embedding space identification for interoperability +""" + +from __future__ import annotations + +import json +from unittest.mock import AsyncMock, patch + +import numpy as np +import pytest + +from app.engine.encoder import ContextualEncoder +from app.models.encode import EncodeRequest +from app.providers.base import EmbeddingResult + + +def _mock_provider(dim: int = 384): + provider = AsyncMock() + provider.encode_text = AsyncMock( + return_value=EmbeddingResult( + vector=np.random.randn(dim).tolist(), + model="all-MiniLM-L6-v2", + version="2.0.0", + dimension=dim, + embedding_space_id="aa://spaces/contextual/sentence-transformers/minilm-l6-v2", + metric="cosine", + ) + ) + return provider + + +class TestORTBWireFormat: + """Output must match the scoring service's EmbeddingSegmentExt schema.""" + + @pytest.mark.asyncio + async def test_segment_ext_fields(self): + encoder = ContextualEncoder(_mock_provider()) + resp = await encoder.encode(EncodeRequest(text="NFL playoff highlights")) + + ext = resp.data.segment[0].ext + assert ext.ver == "1.0" + assert ext.type == "context" + assert ext.model == "all-MiniLM-L6-v2" + assert ext.dimension == 384 + assert len(ext.vector) == 384 + + @pytest.mark.asyncio + async def test_plugs_into_score_request(self): + encoder = ContextualEncoder(_mock_provider()) + resp = await encoder.encode(EncodeRequest(text="Test")) + + score_request = { + "id": "bid-001", + "user": {"id": "page", "data": [resp.data.model_dump()]}, + "top_k": 5, + } + # Must serialize cleanly (no numpy types) + parsed = json.loads(json.dumps(score_request)) + seg = parsed["user"]["data"][0]["segment"][0] + assert seg["ext"]["dimension"] == len(seg["ext"]["vector"]) + + +class TestModelDescriptor: + """The model descriptor envelope enables cross-party interoperability.""" + + @pytest.mark.asyncio + async def test_descriptor_present(self): + encoder = ContextualEncoder(_mock_provider()) + resp = await encoder.encode(EncodeRequest(text="Test")) + desc = resp.data.segment[0].ext.model_descriptor + + assert desc is not None + assert desc.name == "all-MiniLM-L6-v2" + assert desc.version == "2.0.0" + assert desc.metric == "cosine" + + @pytest.mark.asyncio + async def test_embedding_space_id(self): + """Two vectors are comparable only within the same embedding space.""" + encoder = ContextualEncoder(_mock_provider()) + resp = await encoder.encode(EncodeRequest(text="Test")) + desc = resp.data.segment[0].ext.model_descriptor + + assert desc.embedding_space_id == "aa://spaces/contextual/sentence-transformers/minilm-l6-v2" + + @pytest.mark.asyncio + async def test_different_provider_different_space(self): + """A private-lane provider produces a different embedding space ID.""" + provider = AsyncMock() + provider.encode_text = AsyncMock( + return_value=EmbeddingResult( + vector=np.random.randn(768).tolist(), + model="proprietary-model-v3", + version="3.1.0", + dimension=768, + embedding_space_id="aa://spaces/private/acme-corp/v3", + metric="dot", + ) + ) + encoder = ContextualEncoder(provider) + resp = await encoder.encode(EncodeRequest(text="Test")) + desc = resp.data.segment[0].ext.model_descriptor + + assert desc.name == "proprietary-model-v3" + assert desc.embedding_space_id == "aa://spaces/private/acme-corp/v3" + assert desc.metric == "dot" + + +class TestBasicBehavior: + + @pytest.mark.asyncio + async def test_unique_segment_ids(self): + encoder = ContextualEncoder(_mock_provider()) + r1 = await encoder.encode(EncodeRequest(text="A")) + r2 = await encoder.encode(EncodeRequest(text="B")) + assert r1.data.segment[0].id != r2.data.segment[0].id + + @pytest.mark.asyncio + async def test_empty_request_raises(self): + encoder = ContextualEncoder(_mock_provider()) + with pytest.raises(ValueError): + await encoder.encode(EncodeRequest()) + + @pytest.mark.asyncio + async def test_url_extraction(self): + encoder = ContextualEncoder(_mock_provider()) + with patch("app.engine.encoder.extract_text_from_url", new_callable=AsyncMock) as mock: + mock.return_value = "Extracted page content" + resp = await encoder.encode(EncodeRequest(url="https://example.com")) + assert resp.source == "url" From 52be04fa00deb5723e83a3c6329a1d3ea081d8a2 Mon Sep 17 00:00:00 2001 From: Ethan Steininger Date: Mon, 13 Apr 2026 15:03:49 -0400 Subject: [PATCH 2/2] refactor: use spec-defined model metadata fields, not custom ModelDescriptor Removed the custom ModelDescriptor type. Now uses the fields already defined in specs/v1.0/embedding_format.schema.json (version, embedding_space_id, metric) directly on the embedding ext object. --- src/contextual-signal-encoder/README.md | 37 +++++----------- .../app/engine/encoder.py | 12 ++---- .../app/models/encode.py | 42 ++++++++----------- .../tests/test_encode.py | 42 +++++++++---------- 4 files changed, 53 insertions(+), 80 deletions(-) diff --git a/src/contextual-signal-encoder/README.md b/src/contextual-signal-encoder/README.md index ff97d7a..760957d 100644 --- a/src/contextual-signal-encoder/README.md +++ b/src/contextual-signal-encoder/README.md @@ -1,12 +1,12 @@ # Contextual Signal Encoder -Reference implementation for generating contextual embedding signals with the [Agentic Audiences](../../specs/v1.0/) protocol. +Reference implementation for generating contextual embedding signals conforming to the [Agentic Audiences](../../specs/v1.0/) protocol and [embedding format schema](../../specs/v1.0/embedding_format.schema.json). -The [scoring service](../user-embedding-to-campaign-scoring/) consumes embeddings but the spec has no reference for **producing** them. This service fills that gap: content in, ORTB-compatible embedding out — with a **model descriptor envelope** so the receiving party knows the model name, version, embedding space, and metric needed to process the vector. +The [scoring service](../user-embedding-to-campaign-scoring/) consumes embeddings but the spec has no reference for **producing** them. This service fills that gap: content in, ORTB-compatible embedding out — with the spec-defined model metadata fields (`version`, `embedding_space_id`, `metric`) so the receiving party knows how to process the vector. ## Public lane / private lane -The encoder implements the "public lane" concept: a standardized open-source model ([sentence-transformers](https://sbert.net/)) that any party can use to produce and read embeddings in a shared embedding space. Private-lane providers can be added by implementing the `EmbeddingProvider` interface — the model descriptor travels with the embedding so the receiving party always knows what model was used. +Ships with sentence-transformers as the "public lane" — a standardized open-source model any party can use. The `EmbeddingProvider` interface supports private-lane providers. The spec metadata fields travel with every embedding regardless of which lane produced it. ## Quick Start @@ -33,12 +33,9 @@ curl -X POST http://localhost:8081/encode \ "model": "all-MiniLM-L6-v2", "dimension": 384, "type": "context", - "model_descriptor": { - "name": "all-MiniLM-L6-v2", - "version": "2.0.0", - "embedding_space_id": "aa://spaces/contextual/sentence-transformers/minilm-l6-v2", - "metric": "cosine" - } + "version": "2.0.0", + "embedding_space_id": "aa://spaces/contextual/sentence-transformers/minilm-l6-v2", + "metric": "cosine" } }] }, @@ -47,26 +44,16 @@ curl -X POST http://localhost:8081/encode \ } ``` -The `model_descriptor` is the interoperability primitive — it tells the receiving party: -- **name** + **version**: which model to load for reading/matching -- **embedding_space_id**: two vectors are only comparable within the same space -- **metric**: how to compute similarity (cosine, dot, l2) - -The `data` object plugs directly into the scoring service: -```json -{"id": "bid-001", "user": {"id": "page", "data": [.data]}, "top_k": 5} -``` +The `version`, `embedding_space_id`, and `metric` fields come from `embedding_format.schema.json`. The `data` object plugs directly into the scoring service as a `user.data[]` entry. ## Adding a private-lane provider -Implement `EmbeddingProvider` with your proprietary model. The model descriptor ensures the receiving party knows how to process your embeddings: - ```python from app.providers.base import EmbeddingProvider, EmbeddingResult class AcmeProvider(EmbeddingProvider): async def encode_text(self, text: str) -> EmbeddingResult: - vector = my_proprietary_model.encode(text) + vector = my_model.encode(text) return EmbeddingResult( vector=vector.tolist(), model="acme-contextual-v3", @@ -75,9 +62,7 @@ class AcmeProvider(EmbeddingProvider): embedding_space_id="aa://spaces/private/acme-corp/v3", metric="dot", ) - - async def close(self) -> None: - pass + async def close(self) -> None: pass ``` ## Development @@ -89,13 +74,13 @@ pytest tests/ -v ``` ├── app/ │ ├── main.py # FastAPI app (public lane default) -│ ├── models/encode.py # ORTB models + ModelDescriptor envelope +│ ├── models/encode.py # ORTB models using spec-defined fields │ ├── engine/ │ │ ├── encoder.py # Encoding orchestrator │ │ └── extractor.py # URL text extraction │ ├── providers/ │ │ ├── base.py # EmbeddingProvider interface -│ │ └── sentence_transformers.py # Public lane (open-source) +│ │ └── sentence_transformers.py # Public lane │ └── routes/encode.py └── tests/test_encode.py ``` diff --git a/src/contextual-signal-encoder/app/engine/encoder.py b/src/contextual-signal-encoder/app/engine/encoder.py index 67332b1..edad43b 100644 --- a/src/contextual-signal-encoder/app/engine/encoder.py +++ b/src/contextual-signal-encoder/app/engine/encoder.py @@ -1,4 +1,4 @@ -"""Core encoder — content in, ORTB segment with model descriptor out.""" +"""Core encoder — content in, ORTB segment with spec-defined metadata out.""" from __future__ import annotations @@ -10,7 +10,6 @@ EmbeddingExt, EncodeRequest, EncodeResponse, - ModelDescriptor, Segment, ) from app.providers.base import EmbeddingProvider @@ -47,12 +46,9 @@ async def encode(self, request: EncodeRequest) -> EncodeResponse: model=result.model, dimension=result.dimension, type="context", - model_descriptor=ModelDescriptor( - name=result.model, - version=result.version, - embedding_space_id=result.embedding_space_id, - metric=result.metric, - ), + version=result.version, + embedding_space_id=result.embedding_space_id, + metric=result.metric, ), ) diff --git a/src/contextual-signal-encoder/app/models/encode.py b/src/contextual-signal-encoder/app/models/encode.py index fd4eea4..ef22884 100644 --- a/src/contextual-signal-encoder/app/models/encode.py +++ b/src/contextual-signal-encoder/app/models/encode.py @@ -1,8 +1,8 @@ """Models for contextual signal encoding. -The core output is an ORTB-compatible embedding segment with a model descriptor -envelope — model name, version, embedding space, and metric — so the receiving -party knows exactly how to process the vector. +The output is an ORTB-compatible embedding segment using the model descriptor +fields defined in specs/v1.0/embedding_format.schema.json — this encoder is a +reference implementation that produces embeddings conforming to the existing spec. """ from __future__ import annotations @@ -15,35 +15,29 @@ class EncodeRequest(BaseModel): url: str | None = Field(default=None, description="URL to fetch and extract text from.") -class ModelDescriptor(BaseModel): - """Metadata envelope describing how the embedding was produced. - - This is the key interoperability primitive: the receiving party loads the - same model (or a compatible one) using this descriptor to read the vector. - """ - - name: str = Field(description="Model identifier (e.g., 'all-MiniLM-L6-v2').") - version: str = Field(description="Model version for reproducibility.") - embedding_space_id: str = Field( - description="URI identifying the embedding space (e.g., 'aa://spaces/contextual/en-v1'). " - "Two vectors are comparable only within the same space.", - ) - metric: str = Field(default="cosine", description="Similarity metric: cosine, dot, or l2.") - - class EmbeddingExt(BaseModel): - """Matches the scoring service's EmbeddingSegmentExt schema, extended - with model descriptor for interoperability.""" + """Matches the scoring service's EmbeddingSegmentExt schema. + + Fields align with specs/v1.0/embedding_format.schema.json: + - model → model.id + - dimension → model.dimension + - type → signal type (context, identity, etc.) + - version → model.version + - embedding_space_id → model.embedding_space_id + - metric → model.metric + """ ver: str = "1.0" vector: list[float] model: str dimension: int type: str - model_descriptor: ModelDescriptor | None = Field( - default=None, - description="Full model metadata envelope for cross-party interoperability.", + version: str = Field(description="Model version (per embedding_format.schema.json).") + embedding_space_id: str = Field( + description="Embedding space URI (per embedding_format.schema.json). " + "Two vectors are comparable only within the same space.", ) + metric: str = Field(default="cosine", description="Similarity metric: cosine, dot, or l2.") class Segment(BaseModel): diff --git a/src/contextual-signal-encoder/tests/test_encode.py b/src/contextual-signal-encoder/tests/test_encode.py index d950d3e..27b212e 100644 --- a/src/contextual-signal-encoder/tests/test_encode.py +++ b/src/contextual-signal-encoder/tests/test_encode.py @@ -1,8 +1,8 @@ """Tests for the contextual signal encoder. -Focus areas: -1. ORTB wire format compatibility with scoring service -2. Model descriptor envelope (name, version, space, metric) +Verifies that output conforms to: +1. Scoring service EmbeddingSegmentExt wire format +2. specs/v1.0/embedding_format.schema.json model metadata fields 3. Embedding space identification for interoperability """ @@ -59,38 +59,36 @@ async def test_plugs_into_score_request(self): "user": {"id": "page", "data": [resp.data.model_dump()]}, "top_k": 5, } - # Must serialize cleanly (no numpy types) parsed = json.loads(json.dumps(score_request)) seg = parsed["user"]["data"][0]["segment"][0] assert seg["ext"]["dimension"] == len(seg["ext"]["vector"]) -class TestModelDescriptor: - """The model descriptor envelope enables cross-party interoperability.""" +class TestSpecMetadata: + """Model metadata fields per embedding_format.schema.json.""" @pytest.mark.asyncio - async def test_descriptor_present(self): + async def test_version_present(self): encoder = ContextualEncoder(_mock_provider()) resp = await encoder.encode(EncodeRequest(text="Test")) - desc = resp.data.segment[0].ext.model_descriptor - - assert desc is not None - assert desc.name == "all-MiniLM-L6-v2" - assert desc.version == "2.0.0" - assert desc.metric == "cosine" + assert resp.data.segment[0].ext.version == "2.0.0" @pytest.mark.asyncio async def test_embedding_space_id(self): - """Two vectors are comparable only within the same embedding space.""" encoder = ContextualEncoder(_mock_provider()) resp = await encoder.encode(EncodeRequest(text="Test")) - desc = resp.data.segment[0].ext.model_descriptor + assert resp.data.segment[0].ext.embedding_space_id == \ + "aa://spaces/contextual/sentence-transformers/minilm-l6-v2" - assert desc.embedding_space_id == "aa://spaces/contextual/sentence-transformers/minilm-l6-v2" + @pytest.mark.asyncio + async def test_metric(self): + encoder = ContextualEncoder(_mock_provider()) + resp = await encoder.encode(EncodeRequest(text="Test")) + assert resp.data.segment[0].ext.metric == "cosine" @pytest.mark.asyncio - async def test_different_provider_different_space(self): - """A private-lane provider produces a different embedding space ID.""" + async def test_private_lane_different_space(self): + """A private-lane provider produces different metadata.""" provider = AsyncMock() provider.encode_text = AsyncMock( return_value=EmbeddingResult( @@ -104,11 +102,11 @@ async def test_different_provider_different_space(self): ) encoder = ContextualEncoder(provider) resp = await encoder.encode(EncodeRequest(text="Test")) - desc = resp.data.segment[0].ext.model_descriptor + ext = resp.data.segment[0].ext - assert desc.name == "proprietary-model-v3" - assert desc.embedding_space_id == "aa://spaces/private/acme-corp/v3" - assert desc.metric == "dot" + assert ext.model == "proprietary-model-v3" + assert ext.embedding_space_id == "aa://spaces/private/acme-corp/v3" + assert ext.metric == "dot" class TestBasicBehavior: