Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/contextual-signal-encoder/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Contextual Signal Encoder — public-lane reference service image.
FROM python:3.12-slim
WORKDIR /app
# Copy and install requirements before the source so this layer stays cached
# when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# Service port; matches the uvicorn --port below and the README quick start.
EXPOSE 8081
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8081"]
86 changes: 86 additions & 0 deletions src/contextual-signal-encoder/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Contextual Signal Encoder

Reference implementation for generating contextual embedding signals conforming to the [Agentic Audiences](../../specs/v1.0/) protocol and [embedding format schema](../../specs/v1.0/embedding_format.schema.json).

The [scoring service](../user-embedding-to-campaign-scoring/) consumes embeddings but the spec has no reference for **producing** them. This service fills that gap: content in, ORTB-compatible embedding out — with the spec-defined model metadata fields (`version`, `embedding_space_id`, `metric`) so the receiving party knows how to process the vector.

## Public lane / private lane

Ships with sentence-transformers as the "public lane" — a standardized open-source model any party can use. The `EmbeddingProvider` interface supports private-lane providers. The spec metadata fields travel with every embedding regardless of which lane produced it.

## Quick Start

```bash
pip install -r requirements.txt
uvicorn app.main:app --port 8081

curl -X POST http://localhost:8081/encode \
-H "Content-Type: application/json" \
-d '{"text": "NFL playoff predictions: Chiefs vs Bills"}'
```

**Response:**
```json
{
"data": {
"id": "contextual-signal-encoder",
"name": "contextual-embeddings",
"segment": [{
"id": "ctx-a1b2c3d4",
"ext": {
"ver": "1.0",
"vector": [0.042, -0.118, "..."],
"model": "all-MiniLM-L6-v2",
"dimension": 384,
"type": "context",
"version": "2.0.0",
"embedding_space_id": "aa://spaces/contextual/sentence-transformers/minilm-l6-v2",
"metric": "cosine"
}
}]
},
"source": "text",
  "content_length": 40
}
```

The `version`, `embedding_space_id`, and `metric` fields come from `embedding_format.schema.json`. The `data` object plugs directly into the scoring service as a `user.data[]` entry.

## Adding a private-lane provider

```python
from app.providers.base import EmbeddingProvider, EmbeddingResult

class AcmeProvider(EmbeddingProvider):
async def encode_text(self, text: str) -> EmbeddingResult:
vector = my_model.encode(text)
return EmbeddingResult(
vector=vector.tolist(),
model="acme-contextual-v3",
version="3.1.0",
dimension=len(vector),
embedding_space_id="aa://spaces/private/acme-corp/v3",
metric="dot",
)
async def close(self) -> None: pass
```

## Development

```bash
pytest tests/ -v
```

```
├── app/
│ ├── main.py # FastAPI app (public lane default)
│ ├── models/encode.py # ORTB models using spec-defined fields
│ ├── engine/
│ │ ├── encoder.py # Encoding orchestrator
│ │ └── extractor.py # URL text extraction
│ ├── providers/
│ │ ├── base.py # EmbeddingProvider interface
│ │ └── sentence_transformers.py # Public lane
│ └── routes/encode.py
└── tests/test_encode.py
```
Empty file.
Empty file.
59 changes: 59 additions & 0 deletions src/contextual-signal-encoder/app/engine/encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Core encoder — content in, ORTB segment with spec-defined metadata out."""

from __future__ import annotations

import uuid

from app.engine.extractor import extract_text_from_url
from app.models.encode import (
ContextualData,
EmbeddingExt,
EncodeRequest,
EncodeResponse,
Segment,
)
from app.providers.base import EmbeddingProvider

# Hard cap on input size: the provider never sees more than this many words.
MAX_WORDS = 512


class ContextualEncoder:
    """Turns raw content into an ORTB segment carrying the spec metadata.

    Delegates the actual embedding to a pluggable ``EmbeddingProvider`` and
    wraps the result in the ORTB-compatible models from ``app.models.encode``.
    """

    def __init__(self, provider: EmbeddingProvider) -> None:
        self._provider = provider

    async def encode(self, request: EncodeRequest) -> EncodeResponse:
        """Encode the request's text (or text extracted from its URL).

        Raises:
            ValueError: when the request carries neither text nor url.
        """
        # URL takes precedence over inline text when both are supplied.
        if request.url:
            source = "url"
            content = await extract_text_from_url(request.url)
        elif request.text:
            source = "text"
            content = request.text
        else:
            raise ValueError("Either text or url is required.")

        # Truncate overly long content on word boundaries.
        tokens = content.split()
        if len(tokens) > MAX_WORDS:
            content = " ".join(tokens[:MAX_WORDS])

        embedded = await self._provider.encode_text(content)

        ext = EmbeddingExt(
            vector=embedded.vector,
            model=embedded.model,
            dimension=embedded.dimension,
            type="context",
            version=embedded.version,
            embedding_space_id=embedded.embedding_space_id,
            metric=embedded.metric,
        )
        segment = Segment(
            id=f"ctx-{uuid.uuid4().hex[:8]}",
            name=f"contextual-{source}",
            ext=ext,
        )

        return EncodeResponse(
            data=ContextualData(segment=[segment]),
            source=source,
            # NOTE: length of the (possibly truncated) text actually encoded.
            content_length=len(content),
        )
21 changes: 21 additions & 0 deletions src/contextual-signal-encoder/app/engine/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Content extraction from URLs."""

from __future__ import annotations

import httpx
from bs4 import BeautifulSoup


async def extract_text_from_url(url: str, timeout: int = 10) -> str:
    """Fetch *url* and return its visible text content.

    Script/style and page-chrome elements are stripped, and the most
    content-bearing region (<article>, then <main>, then <body>) is preferred
    over the full document.
    """
    headers = {"User-Agent": "AATech-ContextualEncoder/1.0"}
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url, headers=headers, follow_redirects=True)
        response.raise_for_status()

    document = BeautifulSoup(response.text, "html.parser")
    # Remove non-content markup before extracting text.
    for element in document(["script", "style", "nav", "footer", "header", "aside"]):
        element.decompose()

    region = document.find("article") or document.find("main") or document.find("body")
    return (region or document).get_text(separator=" ", strip=True)
39 changes: 39 additions & 0 deletions src/contextual-signal-encoder/app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Contextual Signal Encoder — reference "public lane" implementation."""

from __future__ import annotations

from contextlib import asynccontextmanager

from fastapi import FastAPI

from app.engine.encoder import ContextualEncoder
from app.providers.sentence_transformers import SentenceTransformersProvider
from app.routes.encode import router as encode_router, set_encoder

# Module-level handle so the lifespan hook can close the provider on shutdown.
_provider = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the public-lane provider on startup; release it on shutdown."""
    global _provider
    _provider = SentenceTransformersProvider()
    # Wire the encoder into the route module before requests are served.
    set_encoder(ContextualEncoder(_provider))
    yield
    if _provider:
        await _provider.close()


app = FastAPI(
    title="Agentic Audiences — Contextual Signal Encoder",
    description="Reference public-lane encoder for generating contextual embeddings "
    "with model descriptor metadata, compatible with the scoring service.",
    version="1.0.0",
    lifespan=lifespan,
)

app.include_router(encode_router)


@app.get("/health")
async def health():
    """Liveness probe reporting the lane and the default public-lane model."""
    return {"status": "ok", "lane": "public", "model": "all-MiniLM-L6-v2"}
Empty file.
58 changes: 58 additions & 0 deletions src/contextual-signal-encoder/app/models/encode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Models for contextual signal encoding.

The output is an ORTB-compatible embedding segment using the model descriptor
fields defined in specs/v1.0/embedding_format.schema.json — this encoder is a
reference implementation that produces embeddings conforming to the existing spec.
"""

from __future__ import annotations

from pydantic import BaseModel, Field


class EncodeRequest(BaseModel):
    """Input payload. At least one of *text* or *url* must be provided
    (validated in the route); the encoder gives *url* precedence."""

    text: str | None = Field(default=None, description="Raw text content.")
    url: str | None = Field(default=None, description="URL to fetch and extract text from.")


class EmbeddingExt(BaseModel):
    """Matches the scoring service's EmbeddingSegmentExt schema.

    Fields align with specs/v1.0/embedding_format.schema.json:
    - model → model.id
    - dimension → model.dimension
    - type → signal type (context, identity, etc.)
    - version → model.version
    - embedding_space_id → model.embedding_space_id
    - metric → model.metric
    """

    # ORTB ext payload version for this segment.
    ver: str = "1.0"
    # The embedding vector itself.
    vector: list[float]
    # Model identifier, e.g. "all-MiniLM-L6-v2".
    model: str
    # Vector length; expected to equal len(vector) (not enforced here).
    dimension: int
    # Signal type produced by this encoder, e.g. "context".
    type: str
    version: str = Field(description="Model version (per embedding_format.schema.json).")
    embedding_space_id: str = Field(
        description="Embedding space URI (per embedding_format.schema.json). "
        "Two vectors are comparable only within the same space.",
    )
    metric: str = Field(default="cosine", description="Similarity metric: cosine, dot, or l2.")


class Segment(BaseModel):
    """ORTB data segment carrying one embedding in its ext."""

    # Segment identifier; the encoder generates "ctx-<8 hex chars>".
    id: str
    # Human-readable label, e.g. "contextual-text" or "contextual-url".
    name: str | None = None
    ext: EmbeddingExt


class ContextualData(BaseModel):
    """ORTB data object; plugs into the scoring service as a user.data[] entry."""

    id: str = "contextual-signal-encoder"
    name: str = "contextual-embeddings"
    segment: list[Segment]


class EncodeResponse(BaseModel):
    """Response envelope for /encode."""

    data: ContextualData
    # Which input was used: "text" or "url".
    source: str
    # Character count of the (possibly truncated) text that was encoded.
    content_length: int
Empty file.
30 changes: 30 additions & 0 deletions src/contextual-signal-encoder/app/providers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Embedding provider interface.

The public lane ships with sentence-transformers (open-source, standardized).
The private lane is any custom provider implementing this interface.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass


@dataclass
class EmbeddingResult:
    """A computed embedding plus the spec metadata describing its model."""

    # The embedding vector.
    vector: list[float]
    # Model identifier.
    model: str
    # Model version (per embedding_format.schema.json).
    version: str
    # Vector length.
    dimension: int
    # Embedding space URI; vectors are comparable only within the same space.
    embedding_space_id: str
    # Similarity metric: cosine, dot, or l2.
    metric: str = "cosine"


class EmbeddingProvider(ABC):
    """Interface for pluggable embedding backends (public or private lane)."""

    # Embed *text* and return the vector plus the spec metadata fields.
    @abstractmethod
    async def encode_text(self, text: str) -> EmbeddingResult: ...

    # Release any resources the backend holds (models, connections).
    @abstractmethod
    async def close(self) -> None: ...
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Public lane provider: sentence-transformers.

This is the reference "public lane" encoder — a standardized, open-source
model that any party can load to produce or read embeddings in a shared
embedding space. As discussed in the working group, this provides a baseline
for interoperability without requiring proprietary model access.
"""

from __future__ import annotations

import numpy as np

from app.providers.base import EmbeddingProvider, EmbeddingResult

# Default public lane model and its embedding space identifier
DEFAULT_MODEL = "all-MiniLM-L6-v2"
DEFAULT_VERSION = "2.0.0"
DEFAULT_SPACE = "aa://spaces/contextual/sentence-transformers/minilm-l6-v2"


class SentenceTransformersProvider(EmbeddingProvider):

def __init__(
self,
model_name: str = DEFAULT_MODEL,
version: str = DEFAULT_VERSION,
embedding_space_id: str = DEFAULT_SPACE,
) -> None:
self._model_name = model_name
self._version = version
self._embedding_space_id = embedding_space_id
self._model = None

def _load_model(self):
if self._model is None:
from sentence_transformers import SentenceTransformer
self._model = SentenceTransformer(self._model_name)
return self._model

async def encode_text(self, text: str) -> EmbeddingResult:
model = self._load_model()
embedding = model.encode(text, normalize_embeddings=True)
vector = np.asarray(embedding, dtype=np.float32).tolist()
return EmbeddingResult(
vector=vector,
model=self._model_name,
version=self._version,
dimension=len(vector),
embedding_space_id=self._embedding_space_id,
metric="cosine",
)

async def close(self) -> None:
self._model = None
Empty file.
33 changes: 33 additions & 0 deletions src/contextual-signal-encoder/app/routes/encode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Encode route."""

from __future__ import annotations

from fastapi import APIRouter, HTTPException

from app.models.encode import EncodeRequest, EncodeResponse

router = APIRouter()
# Encoder instance injected at startup via set_encoder() (see app.main's lifespan).
_encoder = None


def set_encoder(encoder):
    """Install the encoder this route delegates to (called once at startup)."""
    global _encoder
    _encoder = encoder


@router.post("/encode", response_model=EncodeResponse)
async def encode_content(request: EncodeRequest) -> EncodeResponse:
    """Generate an ORTB-compatible contextual embedding.

    The model metadata (model name, version, embedding_space_id, metric)
    travels in each segment's ``ext`` so the receiving party knows how to
    process the vector.

    Raises:
        HTTPException: 503 if the encoder has not been wired in at startup,
            422 when neither text nor url is supplied, 500 on encoding failure.
    """
    if _encoder is None:
        raise HTTPException(status_code=503, detail="Encoder not initialized.")
    if not any([request.text, request.url]):
        raise HTTPException(status_code=422, detail="Either text or url is required.")
    try:
        return await _encoder.encode(request)
    except HTTPException:
        # Don't re-wrap deliberate HTTP errors as 500s.
        raise
    except Exception as e:
        # Chain the cause (`from e`) so server logs show the underlying failure.
        raise HTTPException(status_code=500, detail=f"Encoding failed: {e}") from e
Loading