diff --git a/src/contextual-signal-encoder/Dockerfile b/src/contextual-signal-encoder/Dockerfile
new file mode 100644
index 0000000..e18f903
--- /dev/null
+++ b/src/contextual-signal-encoder/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.12-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+EXPOSE 8081
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8081"]
diff --git a/src/contextual-signal-encoder/README.md b/src/contextual-signal-encoder/README.md
new file mode 100644
index 0000000..760957d
--- /dev/null
+++ b/src/contextual-signal-encoder/README.md
@@ -0,0 +1,86 @@
+# Contextual Signal Encoder
+
+Reference implementation for generating contextual embedding signals conforming to the [Agentic Audiences](../../specs/v1.0/) protocol and [embedding format schema](../../specs/v1.0/embedding_format.schema.json).
+
+The [scoring service](../user-embedding-to-campaign-scoring/) consumes embeddings but the spec has no reference for **producing** them. This service fills that gap: content in, ORTB-compatible embedding out — with the spec-defined model metadata fields (`version`, `embedding_space_id`, `metric`) so the receiving party knows how to process the vector.
+
+## Public lane / private lane
+
+Ships with sentence-transformers as the "public lane" — a standardized open-source model any party can use. The `EmbeddingProvider` interface supports private-lane providers. The spec metadata fields travel with every embedding regardless of which lane produced it.
+
+## Quick Start
+
+```bash
+pip install -r requirements.txt
+uvicorn app.main:app --port 8081
+
+curl -X POST http://localhost:8081/encode \
+  -H "Content-Type: application/json" \
+  -d '{"text": "NFL playoff predictions: Chiefs vs Bills"}'
+```
+
+**Response:**
+```json
+{
+  "data": {
+    "id": "contextual-signal-encoder",
+    "name": "contextual-embeddings",
+    "segment": [{
+      "id": "ctx-a1b2c3d4",
+      "ext": {
+        "ver": "1.0",
+        "vector": [0.042, -0.118, "..."],
+        "model": "all-MiniLM-L6-v2",
+        "dimension": 384,
+        "type": "context",
+        "version": "2.0.0",
+        "embedding_space_id": "aa://spaces/contextual/sentence-transformers/minilm-l6-v2",
+        "metric": "cosine"
+      }
+    }]
+  },
+  "source": "text",
+  "content_length": 42
+}
+```
+
+The `version`, `embedding_space_id`, and `metric` fields come from `embedding_format.schema.json`. The `data` object plugs directly into the scoring service as a `user.data[]` entry.
+
+## Adding a private-lane provider
+
+```python
+from app.providers.base import EmbeddingProvider, EmbeddingResult
+
+class AcmeProvider(EmbeddingProvider):
+    async def encode_text(self, text: str) -> EmbeddingResult:
+        vector = my_model.encode(text)
+        return EmbeddingResult(
+            vector=vector.tolist(),
+            model="acme-contextual-v3",
+            version="3.1.0",
+            dimension=len(vector),
+            embedding_space_id="aa://spaces/private/acme-corp/v3",
+            metric="dot",
+        )
+    async def close(self) -> None: pass
+```
+
+## Development
+
+```bash
+pytest tests/ -v
+```
+
+```
+├── app/
+│   ├── main.py                  # FastAPI app (public lane default)
+│   ├── models/encode.py         # ORTB models using spec-defined fields
+│   ├── engine/
+│   │   ├── encoder.py           # Encoding orchestrator
+│   │   └── extractor.py         # URL text extraction
+│   ├── providers/
+│   │   ├── base.py              # EmbeddingProvider interface
+│   │   └── sentence_transformers.py  # Public lane
+│   └── routes/encode.py
+└── tests/test_encode.py
+```
diff --git a/src/contextual-signal-encoder/app/__init__.py b/src/contextual-signal-encoder/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/contextual-signal-encoder/app/engine/__init__.py b/src/contextual-signal-encoder/app/engine/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/contextual-signal-encoder/app/engine/encoder.py b/src/contextual-signal-encoder/app/engine/encoder.py
new file mode 100644
index 0000000..edad43b
--- /dev/null
+++ b/src/contextual-signal-encoder/app/engine/encoder.py
@@ -0,0 +1,59 @@
+"""Core encoder — content in, ORTB segment with spec-defined metadata out."""
+
+from __future__ import annotations
+
+import uuid
+
+from app.engine.extractor import extract_text_from_url
+from app.models.encode import (
+    ContextualData,
+    EmbeddingExt,
+    EncodeRequest,
+    EncodeResponse,
+    Segment,
+)
+from app.providers.base import EmbeddingProvider
+
+MAX_WORDS = 512
+
+
+class ContextualEncoder:
+
+    def __init__(self, provider: EmbeddingProvider) -> None:
+        self._provider = provider
+
+    async def encode(self, request: EncodeRequest) -> EncodeResponse:
+        if request.url:
+            text = await extract_text_from_url(request.url)
+            source = "url"
+        elif request.text:
+            text = request.text
+            source = "text"
+        else:
+            raise ValueError("Either text or url is required.")
+
+        words = text.split()
+        if len(words) > MAX_WORDS:
+            text = " ".join(words[:MAX_WORDS])
+
+        result = await self._provider.encode_text(text)
+
+        segment = Segment(
+            id=f"ctx-{uuid.uuid4().hex[:8]}",
+            name=f"contextual-{source}",
+            ext=EmbeddingExt(
+                vector=result.vector,
+                model=result.model,
+                dimension=result.dimension,
+                type="context",
+                version=result.version,
+                embedding_space_id=result.embedding_space_id,
+                metric=result.metric,
+            ),
+        )
+
+        return EncodeResponse(
+            data=ContextualData(segment=[segment]),
+            source=source,
+            content_length=len(text),
+        )
diff --git a/src/contextual-signal-encoder/app/engine/extractor.py b/src/contextual-signal-encoder/app/engine/extractor.py
new file mode 100644
index 0000000..327d875
--- /dev/null
+++ b/src/contextual-signal-encoder/app/engine/extractor.py
@@ -0,0 +1,21 @@
+"""Content extraction from URLs."""
+
+from __future__ import annotations
+
+import httpx
+from bs4 import BeautifulSoup
+
+
+async def extract_text_from_url(url: str, timeout: int = 10) -> str:
+    async with httpx.AsyncClient(timeout=timeout) as client:
+        resp = await client.get(
+            url, headers={"User-Agent": "AATech-ContextualEncoder/1.0"}, follow_redirects=True,
+        )
+        resp.raise_for_status()
+
+    soup = BeautifulSoup(resp.text, "html.parser")
+    for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
+        tag.decompose()
+
+    main = soup.find("article") or soup.find("main") or soup.find("body")
+    return (main or soup).get_text(separator=" ", strip=True)
diff --git a/src/contextual-signal-encoder/app/main.py b/src/contextual-signal-encoder/app/main.py
new file mode 100644
index 0000000..dd99814
--- /dev/null
+++ b/src/contextual-signal-encoder/app/main.py
@@ -0,0 +1,39 @@
+"""Contextual Signal Encoder — reference "public lane" implementation."""
+
+from __future__ import annotations
+
+from contextlib import asynccontextmanager
+
+from fastapi import FastAPI
+
+from app.engine.encoder import ContextualEncoder
+from app.providers.sentence_transformers import SentenceTransformersProvider
+from app.routes.encode import router as encode_router, set_encoder
+
+_provider = None
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global _provider
+    _provider = SentenceTransformersProvider()
+    set_encoder(ContextualEncoder(_provider))
+    yield
+    if _provider:
+        await _provider.close()
+
+
+app = FastAPI(
+    title="Agentic Audiences — Contextual Signal Encoder",
+    description="Reference public-lane encoder for generating contextual embeddings "
+    "with model descriptor metadata, compatible with the scoring service.",
+    version="1.0.0",
+    lifespan=lifespan,
+)
+
+app.include_router(encode_router)
+
+
+@app.get("/health")
+async def health():
+    return {"status": "ok", "lane": "public", "model": "all-MiniLM-L6-v2"}
diff --git a/src/contextual-signal-encoder/app/models/__init__.py b/src/contextual-signal-encoder/app/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/contextual-signal-encoder/app/models/encode.py b/src/contextual-signal-encoder/app/models/encode.py
new file mode 100644
index 0000000..ef22884
--- /dev/null
+++ b/src/contextual-signal-encoder/app/models/encode.py
@@ -0,0 +1,58 @@
+"""Models for contextual signal encoding.
+
+The output is an ORTB-compatible embedding segment using the model descriptor
+fields defined in specs/v1.0/embedding_format.schema.json — this encoder is a
+reference implementation that produces embeddings conforming to the existing spec.
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, Field
+
+
+class EncodeRequest(BaseModel):
+    text: str | None = Field(default=None, description="Raw text content.")
+    url: str | None = Field(default=None, description="URL to fetch and extract text from.")
+
+
+class EmbeddingExt(BaseModel):
+    """Matches the scoring service's EmbeddingSegmentExt schema.
+
+    Fields align with specs/v1.0/embedding_format.schema.json:
+    - model → model.id
+    - dimension → model.dimension
+    - type → signal type (context, identity, etc.)
+    - version → model.version
+    - embedding_space_id → model.embedding_space_id
+    - metric → model.metric
+    """
+
+    ver: str = "1.0"
+    vector: list[float]
+    model: str
+    dimension: int
+    type: str
+    version: str = Field(description="Model version (per embedding_format.schema.json).")
+    embedding_space_id: str = Field(
+        description="Embedding space URI (per embedding_format.schema.json). "
+        "Two vectors are comparable only within the same space.",
+    )
+    metric: str = Field(default="cosine", description="Similarity metric: cosine, dot, or l2.")
+
+
+class Segment(BaseModel):
+    id: str
+    name: str | None = None
+    ext: EmbeddingExt
+
+
+class ContextualData(BaseModel):
+    id: str = "contextual-signal-encoder"
+    name: str = "contextual-embeddings"
+    segment: list[Segment]
+
+
+class EncodeResponse(BaseModel):
+    data: ContextualData
+    source: str
+    content_length: int
diff --git a/src/contextual-signal-encoder/app/providers/__init__.py b/src/contextual-signal-encoder/app/providers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/contextual-signal-encoder/app/providers/base.py b/src/contextual-signal-encoder/app/providers/base.py
new file mode 100644
index 0000000..7018ddb
--- /dev/null
+++ b/src/contextual-signal-encoder/app/providers/base.py
@@ -0,0 +1,30 @@
+"""Embedding provider interface.
+
+The public lane ships with sentence-transformers (open-source, standardized).
+The private lane is any custom provider implementing this interface.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+
+@dataclass
+class EmbeddingResult:
+    vector: list[float]
+    model: str
+    version: str
+    dimension: int
+    embedding_space_id: str
+    metric: str = "cosine"
+
+
+class EmbeddingProvider(ABC):
+    """Interface for pluggable embedding backends (public or private lane)."""
+
+    @abstractmethod
+    async def encode_text(self, text: str) -> EmbeddingResult: ...
+
+    @abstractmethod
+    async def close(self) -> None: ...
diff --git a/src/contextual-signal-encoder/app/providers/sentence_transformers.py b/src/contextual-signal-encoder/app/providers/sentence_transformers.py
new file mode 100644
index 0000000..80cd52d
--- /dev/null
+++ b/src/contextual-signal-encoder/app/providers/sentence_transformers.py
@@ -0,0 +1,54 @@
+"""Public lane provider: sentence-transformers.
+
+This is the reference "public lane" encoder — a standardized, open-source
+model that any party can load to produce or read embeddings in a shared
+embedding space. As discussed in the working group, this provides a baseline
+for interoperability without requiring proprietary model access.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+from app.providers.base import EmbeddingProvider, EmbeddingResult
+
+# Default public lane model and its embedding space identifier
+DEFAULT_MODEL = "all-MiniLM-L6-v2"
+DEFAULT_VERSION = "2.0.0"
+DEFAULT_SPACE = "aa://spaces/contextual/sentence-transformers/minilm-l6-v2"
+
+
+class SentenceTransformersProvider(EmbeddingProvider):
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        version: str = DEFAULT_VERSION,
+        embedding_space_id: str = DEFAULT_SPACE,
+    ) -> None:
+        self._model_name = model_name
+        self._version = version
+        self._embedding_space_id = embedding_space_id
+        self._model = None
+
+    def _load_model(self):
+        if self._model is None:
+            from sentence_transformers import SentenceTransformer
+            self._model = SentenceTransformer(self._model_name)
+        return self._model
+
+    async def encode_text(self, text: str) -> EmbeddingResult:
+        model = self._load_model()
+        embedding = model.encode(text, normalize_embeddings=True)
+        vector = np.asarray(embedding, dtype=np.float32).tolist()
+        return EmbeddingResult(
+            vector=vector,
+            model=self._model_name,
+            version=self._version,
+            dimension=len(vector),
+            embedding_space_id=self._embedding_space_id,
+            metric="cosine",
+        )
+
+    async def close(self) -> None:
+        self._model = None
diff --git a/src/contextual-signal-encoder/app/routes/__init__.py b/src/contextual-signal-encoder/app/routes/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/contextual-signal-encoder/app/routes/encode.py b/src/contextual-signal-encoder/app/routes/encode.py
new file mode 100644
index 0000000..6e3e93a
--- /dev/null
+++ b/src/contextual-signal-encoder/app/routes/encode.py
@@ -0,0 +1,33 @@
+"""Encode route."""
+
+from __future__ import annotations
+
+from fastapi import APIRouter, HTTPException
+
+from app.models.encode import EncodeRequest, EncodeResponse
+
+router = APIRouter()
+_encoder = None
+
+
+def set_encoder(encoder):
+    global _encoder
+    _encoder = encoder
+
+
+@router.post("/encode", response_model=EncodeResponse)
+async def encode_content(request: EncodeRequest) -> EncodeResponse:
+    """Generate an ORTB-compatible contextual embedding with model descriptor.
+
+    The segment ext carries the model metadata (model name, version,
+    embedding space, and metric) that the receiving party needs to
+    process the vector.
+    """
+    if _encoder is None:
+        raise HTTPException(status_code=503, detail="Encoder not initialized.")
+    if not any([request.text, request.url]):
+        raise HTTPException(status_code=422, detail="Either text or url is required.")
+    try:
+        return await _encoder.encode(request)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Encoding failed: {e}") from e
diff --git a/src/contextual-signal-encoder/requirements.txt b/src/contextual-signal-encoder/requirements.txt
new file mode 100644
index 0000000..ad95773
--- /dev/null
+++ b/src/contextual-signal-encoder/requirements.txt
@@ -0,0 +1,9 @@
+fastapi>=0.115,<1
+uvicorn[standard]>=0.30,<1
+pydantic>=2.0,<3
+numpy>=1.26,<3
+httpx>=0.27,<1
+sentence-transformers>=3.0,<4
+beautifulsoup4>=4.12,<5
+pytest>=8.0,<9
+pytest-asyncio>=0.23,<1
diff --git a/src/contextual-signal-encoder/tests/__init__.py b/src/contextual-signal-encoder/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/contextual-signal-encoder/tests/conftest.py b/src/contextual-signal-encoder/tests/conftest.py
new file mode 100644
index 0000000..af7e479
--- /dev/null
+++ b/src/contextual-signal-encoder/tests/conftest.py
@@ -0,0 +1,6 @@
+import pytest
+
+
+@pytest.fixture
+def anyio_backend():
+    return "asyncio"
diff --git a/src/contextual-signal-encoder/tests/test_encode.py b/src/contextual-signal-encoder/tests/test_encode.py
new file mode 100644
index 0000000..27b212e
--- /dev/null
+++ b/src/contextual-signal-encoder/tests/test_encode.py
@@ -0,0 +1,133 @@
+"""Tests for the contextual signal encoder.
+
+Verifies that output conforms to:
+1. Scoring service EmbeddingSegmentExt wire format
+2. specs/v1.0/embedding_format.schema.json model metadata fields
+3. Embedding space identification for interoperability
+"""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import AsyncMock, patch
+
+import numpy as np
+import pytest
+
+from app.engine.encoder import ContextualEncoder
+from app.models.encode import EncodeRequest
+from app.providers.base import EmbeddingResult
+
+
+def _mock_provider(dim: int = 384):
+    provider = AsyncMock()
+    provider.encode_text = AsyncMock(
+        return_value=EmbeddingResult(
+            vector=np.random.randn(dim).tolist(),
+            model="all-MiniLM-L6-v2",
+            version="2.0.0",
+            dimension=dim,
+            embedding_space_id="aa://spaces/contextual/sentence-transformers/minilm-l6-v2",
+            metric="cosine",
+        )
+    )
+    return provider
+
+
+class TestORTBWireFormat:
+    """Output must match the scoring service's EmbeddingSegmentExt schema."""
+
+    @pytest.mark.asyncio
+    async def test_segment_ext_fields(self):
+        encoder = ContextualEncoder(_mock_provider())
+        resp = await encoder.encode(EncodeRequest(text="NFL playoff highlights"))
+
+        ext = resp.data.segment[0].ext
+        assert ext.ver == "1.0"
+        assert ext.type == "context"
+        assert ext.model == "all-MiniLM-L6-v2"
+        assert ext.dimension == 384
+        assert len(ext.vector) == 384
+
+    @pytest.mark.asyncio
+    async def test_plugs_into_score_request(self):
+        encoder = ContextualEncoder(_mock_provider())
+        resp = await encoder.encode(EncodeRequest(text="Test"))
+
+        score_request = {
+            "id": "bid-001",
+            "user": {"id": "page", "data": [resp.data.model_dump()]},
+            "top_k": 5,
+        }
+        parsed = json.loads(json.dumps(score_request))
+        seg = parsed["user"]["data"][0]["segment"][0]
+        assert seg["ext"]["dimension"] == len(seg["ext"]["vector"])
+
+
+class TestSpecMetadata:
+    """Model metadata fields per embedding_format.schema.json."""
+
+    @pytest.mark.asyncio
+    async def test_version_present(self):
+        encoder = ContextualEncoder(_mock_provider())
+        resp = await encoder.encode(EncodeRequest(text="Test"))
+        assert resp.data.segment[0].ext.version == "2.0.0"
+
+    @pytest.mark.asyncio
+    async def test_embedding_space_id(self):
+        encoder = ContextualEncoder(_mock_provider())
+        resp = await encoder.encode(EncodeRequest(text="Test"))
+        assert resp.data.segment[0].ext.embedding_space_id == \
+            "aa://spaces/contextual/sentence-transformers/minilm-l6-v2"
+
+    @pytest.mark.asyncio
+    async def test_metric(self):
+        encoder = ContextualEncoder(_mock_provider())
+        resp = await encoder.encode(EncodeRequest(text="Test"))
+        assert resp.data.segment[0].ext.metric == "cosine"
+
+    @pytest.mark.asyncio
+    async def test_private_lane_different_space(self):
+        """A private-lane provider produces different metadata."""
+        provider = AsyncMock()
+        provider.encode_text = AsyncMock(
+            return_value=EmbeddingResult(
+                vector=np.random.randn(768).tolist(),
+                model="proprietary-model-v3",
+                version="3.1.0",
+                dimension=768,
+                embedding_space_id="aa://spaces/private/acme-corp/v3",
+                metric="dot",
+            )
+        )
+        encoder = ContextualEncoder(provider)
+        resp = await encoder.encode(EncodeRequest(text="Test"))
+        ext = resp.data.segment[0].ext
+
+        assert ext.model == "proprietary-model-v3"
+        assert ext.embedding_space_id == "aa://spaces/private/acme-corp/v3"
+        assert ext.metric == "dot"
+
+
+class TestBasicBehavior:
+
+    @pytest.mark.asyncio
+    async def test_unique_segment_ids(self):
+        encoder = ContextualEncoder(_mock_provider())
+        r1 = await encoder.encode(EncodeRequest(text="A"))
+        r2 = await encoder.encode(EncodeRequest(text="B"))
+        assert r1.data.segment[0].id != r2.data.segment[0].id
+
+    @pytest.mark.asyncio
+    async def test_empty_request_raises(self):
+        encoder = ContextualEncoder(_mock_provider())
+        with pytest.raises(ValueError):
+            await encoder.encode(EncodeRequest())
+
+    @pytest.mark.asyncio
+    async def test_url_extraction(self):
+        encoder = ContextualEncoder(_mock_provider())
+        with patch("app.engine.encoder.extract_text_from_url", new_callable=AsyncMock) as mock:
+            mock.return_value = "Extracted page content"
+            resp = await encoder.encode(EncodeRequest(url="https://example.com"))
+            assert resp.source == "url"