
Commit d826558

report job metrics back to orchestrator.
1 parent 3a0b23e · commit d826558

25 files changed

Lines changed: 1291 additions & 161 deletions

README.md

Lines changed: 7 additions & 0 deletions
```diff
@@ -40,6 +40,7 @@ def generate(ctx: ActionContext, payload: Input) -> Output:
 - **Model injection** - Dependency injection for ML models with caching
 - **Streaming output** - Support for incremental/streaming responses
 - **Progress reporting** - Built-in progress events via `ActionContext`
+- **Perf metrics** - Best-effort per-run metrics emitted to gen-orchestrator (`metrics.*` worker events)
 - **File handling** - Upload/download assets via Cozy hub file API
 - **Model caching** - LRU cache with VRAM/disk management and cache-aware routing
@@ -159,6 +160,12 @@ Local dev / advanced (not injected by orchestrator):
 | `COZY_HUB_TOKEN` | - | Local dev only: Cozy Hub bearer token (only used when `WORKER_ALLOW_COZY_HUB_API_RESOLVE=1`) |
 | `HF_TOKEN` | - | Hugging Face token (for private `hf:` refs) |
 
+## Metrics
+
+The worker can emit best-effort performance/debug metrics to gen-orchestrator via `WorkerEvent` messages.
+
+See `docs/metrics.md`.
+
 ### Hugging Face (`hf:`) download behavior
 
 By default, `hf:` model refs **do not download the full repo**. The worker uses `huggingface_hub.snapshot_download(allow_patterns=...)` to avoid pulling huge legacy weights.
```
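For illustration, a minimal sketch of the selective-download pattern described above, using the public `huggingface_hub` API. The repo id and `allow_patterns` below are illustrative placeholders, not the worker's actual configuration:

```python
# Sketch only: selective snapshot download, as the README describes.
# The exact allow_patterns the worker uses are not shown in this diff;
# these patterns are illustrative placeholders.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="black-forest-labs/FLUX.2-klein-4B",
    allow_patterns=["*.json", "*.txt", "*.safetensors"],  # skip legacy .bin/.ckpt weights
)
print(local_dir)  # local path of the cached snapshot
```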

agents/progress.json

Lines changed: 37 additions & 4 deletions
Large diffs are not rendered by default.

docs/metrics.md

Lines changed: 61 additions & 0 deletions
```diff
@@ -0,0 +1,61 @@
+# Worker-Reported Perf Metrics (v1)
+
+`gen-worker` can optionally report best-effort performance/debug metrics to `gen-orchestrator` via the existing gRPC stream `WorkerSchedulerMessage.worker_event`.
+
+These metrics are:
+
+- Best-effort: metrics emission must never fail a run.
+- Safe: numbers and small strings only. No URLs, secrets, or file paths.
+- Optional: only emit keys when known; omit unknown fields entirely.
+
+## Canonical Events
+
+These event types are designed to be stable and low-cardinality. `gen-orchestrator` can persist them into dedicated columns.
+
+- `metrics.compute.started` payload: `{ "at": "<rfc3339>" }`
+- `metrics.compute.completed` payload: `{ "at": "<rfc3339>" }`
+- `metrics.fetch` payload: `{ "ms": <int> }` (use `0` for warm disk hits)
+- `metrics.gpu_load` payload: `{ "ms": <int> }`
+- `metrics.inference` payload: `{ "ms": <int> }`
+- `metrics.tokens` payload: `{ "output_tokens": <int> }` (only when applicable)
+
+All times are milliseconds as integers.
+
+## Extended Debug Event
+
+Additionally, the worker emits one extended event at the end of each run:
+
+- `metrics.run` payload: JSON object (schema versioned)
+
+### `metrics.run` payload (schema_version=1)
+
+Top-level keys (all optional unless noted):
+
+- `schema_version` (required): `1`
+- `function_name`: string
+- `cache_state`: `hot_vram | warm_disk | cold_remote`
+- `models`: array of objects (best-effort, one per required model)
+- `pipeline_init_ms`: int
+- `gpu_load_ms`: int
+- `warmup_ms`: int (only for the first warmup run; otherwise omit)
+- `inference_ms`: int
+- diffusion extras (optional): `steps`, `iters_per_s`, `width`, `height`, `guidance`
+- post-processing (optional): `png_encode_ms`, `upload_ms`
+- resources (optional): `peak_vram_bytes`, `peak_ram_bytes`
+
+Per-model object keys (all optional unless noted):
+
+- `model_id` (required): canonical model id used by worker/scheduler
+- `variant_label`: string
+- `snapshot_digest`: string
+- `cache_state`: `hot_vram | warm_disk | cold_remote`
+- `bytes_downloaded`: int (0 if none)
+- `download_ms`: int (0 if warm disk hit)
+- `bytes_read_disk`: int
+
+## Notes
+
+- `metrics.fetch` is primarily the time spent ensuring required model blobs are present on disk (remote download vs. warm disk hit).
+- `metrics.gpu_load` is best-effort and currently reflects time spent moving injected model objects to the worker device, when supported.
+- `metrics.inference` is best-effort and currently reflects time spent executing the user function body (not including scheduler queueing).
```

examples/flux2-klein-4b/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,6 +4,7 @@ FLUX.2-klein-4B example using Cozy’s injection pattern.
 
 - The worker function only defines input/output + runs inference.
 - Model selection + downloading is handled by the worker runtime via `[tool.cozy.models]`.
+- This model is treated as a turbo model: the worker forces `num_inference_steps=8`.
 
 Config:
 
```

examples/flux2-klein-4b/pyproject.toml

Lines changed: 2 additions & 1 deletion
```diff
@@ -38,4 +38,5 @@ torch = "2.10.0"
 vram_gb = 12
 
 [tool.cozy.models]
-flux2-klein-4b = "hf:black-forest-labs/FLUX.2-klein-4B"
+# Use Cozy Hub snapshot (backed by R2) instead of pulling from Hugging Face at runtime.
+flux2-klein-4b = "black-forest-labs/flux-2-klein-4b"
```

examples/flux2-klein-4b/src/flux2_klein_4b/main.py

Lines changed: 12 additions & 3 deletions
```diff
@@ -24,7 +24,9 @@
 
 class GenerateInput(msgspec.Struct):
     prompt: str
-    num_inference_steps: int = 4
+    # FLUX.2-klein-4B is a turbo model: we always run at 8 steps.
+    # Keep the field for API compatibility, but ignore it in `generate()`.
+    num_inference_steps: int = 8
     guidance_scale: float = 1.0
     width: int = 1024
     height: int = 1024
@@ -51,7 +53,14 @@ def generate(
     if ctx.is_canceled():
         raise InterruptedError("canceled")
 
-    logger.info("[run_id=%s] flux2-klein-4b prompt=%r", ctx.run_id, payload.prompt)
+    steps = 8  # forced turbo steps
+    logger.info(
+        "[run_id=%s] flux2-klein-4b prompt=%r steps=%s (forced, requested=%s)",
+        ctx.run_id,
+        payload.prompt,
+        steps,
+        payload.num_inference_steps,
+    )
 
     # FLUX.2-klein-4B can exceed 8GB VRAM; use sequential CPU offload by default.
     if torch.cuda.is_available() and _should_enable_seq_offload():
@@ -66,7 +75,7 @@ def generate(
 
     result = pipeline(
         prompt=payload.prompt,
-        num_inference_steps=payload.num_inference_steps,
+        num_inference_steps=steps,
         guidance_scale=payload.guidance_scale,
         width=payload.width,
         height=payload.height,
```

examples/sd15/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,6 +4,7 @@ Stable Diffusion 1.5 example using Cozy’s injection pattern.
 
 - The worker function only defines input/output + runs inference.
 - Model selection + downloading is handled by the worker runtime via `[tool.cozy.models]`.
+- The worker clamps `num_inference_steps` to a minimum of 25 for quality.
 
 Config:
 
```

examples/sd15/pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 [project]
 name = "sd15"
-version = "0.1.0"
+version = "0.1.1"
 description = "Stable Diffusion 1.5 example (inference-only; models via [tool.cozy.models])"
 requires-python = ">=3.12"
 dependencies = [
```

examples/sd15/src/sd15/main.py

Lines changed: 14 additions & 3 deletions
```diff
@@ -23,7 +23,7 @@
 class GenerateInput(msgspec.Struct):
     prompt: str
     negative_prompt: str = ""
-    num_inference_steps: int = 20
+    num_inference_steps: int = 25
     guidance_scale: float = 7.5
     width: int = 512
     height: int = 512
@@ -45,7 +45,18 @@ def generate(
     if ctx.is_canceled():
         raise InterruptedError("canceled")
 
-    logger.info("[run_id=%s] sd15 prompt=%r", ctx.run_id, payload.prompt)
+    requested_steps = payload.num_inference_steps
+    steps = requested_steps
+    if steps < 25:
+        steps = 25
+
+    logger.info(
+        "[run_id=%s] sd15 prompt=%r steps=%s (requested=%s)",
+        ctx.run_id,
+        payload.prompt,
+        steps,
+        requested_steps,
+    )
 
     generator = None
     if payload.seed is not None:
@@ -57,7 +68,7 @@
     result = pipeline(
         prompt=payload.prompt,
         negative_prompt=payload.negative_prompt,
-        num_inference_steps=payload.num_inference_steps,
+        num_inference_steps=steps,
         guidance_scale=payload.guidance_scale,
         width=payload.width,
         height=payload.height,
```

examples/sd15/uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
