From 870de584a08c2150b479115fa60bc2a6aa3fcbd6 Mon Sep 17 00:00:00 2001 From: Simon Kurtz Date: Thu, 30 Apr 2026 20:59:42 -0400 Subject: [PATCH 1/4] Add support for Responses API --- README.md | 2 +- assets/APIM-Samples-Slide-Deck.html | 2 +- docs/index.html | 2 +- samples/costing/README.md | 19 +- samples/costing/_helpers.py | 258 +++++++++++++----- .../aoai-gateway-responses-operation.xml | 32 +++ samples/costing/bu-token-usage.kql | 10 +- samples/costing/costing.workbook.json | 12 +- samples/costing/create.ipynb | 147 ++++++++-- samples/costing/main.bicep | 10 + .../pf-ensure-stream-include-usage.xml | 22 +- shared/python/azure_cost.py | 6 + tests/python/test_costing_helpers.py | 168 ++++++++++++ 13 files changed, 576 insertions(+), 114 deletions(-) create mode 100644 samples/costing/aoai-gateway-responses-operation.xml create mode 100644 tests/python/test_costing_helpers.py diff --git a/README.md b/README.md index 9afe8a3..85706a6 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ It's quick and easy to get started! | [AuthX][sample-authx] | Authentication and role-based authorization in a mock HR API. | All infrastructures | | [AuthX Pro][sample-authx-pro] | Authentication and role-based authorization in a mock product with multiple APIs and policy fragments. | All infrastructures | | [Azure Maps][sample-azure-maps] | Proxying calls to Azure Maps with APIM policies. | All infrastructures | -| [Costing][sample-costing] | Track and allocate API costs per business unit using APIM subscriptions, Entra ID application tracking, and AI Gateway token/PTU tracking including streaming (SSE) token usage, which is not simple to capture correctly in APIM. | All infrastructures | +| [Costing][sample-costing] | Track and allocate API costs per business unit using APIM subscriptions, Entra ID application tracking, and AI Gateway token/PTU tracking across **both** Azure OpenAI Chat Completions and Responses APIs, including streaming (SSE) token usage which is not simple to capture correctly in APIM. | All infrastructures | | [Dynamic CORS][sample-dynamic-cors] | Dynamic per-API CORS origin validation using custom policy fragments and a maintainable origin mapping. | All infrastructures | | [Egress Control][sample-egress-control] | Control APIM outbound internet traffic by routing it through a Network Virtual Appliance (NVA) in a hub/spoke topology. | appgw-apim, appgw-apim-pe | | [General][sample-general] | Basic demo of APIM sample setup and policy usage. | All infrastructures | diff --git a/assets/APIM-Samples-Slide-Deck.html b/assets/APIM-Samples-Slide-Deck.html index 2ff64bb..2d5c2ef 100644 --- a/assets/APIM-Samples-Slide-Deck.html +++ b/assets/APIM-Samples-Slide-Deck.html @@ -1118,7 +1118,7 @@

Azure Maps

Costing

-

Track API costs per business unit via subscriptions, Entra ID apps, and AI Gateway tokens, including streaming (SSE) token usage (not simple to capture correctly in APIM).

+

Track API costs per business unit via subscriptions, Entra ID apps, and AI Gateway tokens across both Azure OpenAI Chat Completions and Responses APIs, including streaming (SSE) token usage (not simple to capture correctly in APIM).

Dynamic CORS

diff --git a/docs/index.html b/docs/index.html
index 15272f2..a5273f4 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -447,7 +447,7 @@

Azure Maps

Costing

-

Track and allocate API costs per business unit using subscriptions, Entra ID application tracking, and AI Gateway token/PTU tracking including streaming (SSE) token usage, which is not simple to capture correctly in APIM.

+

Track and allocate API costs per business unit using subscriptions, Entra ID application tracking, and AI Gateway token/PTU tracking across both Azure OpenAI Chat Completions and Responses APIs, including streaming (SSE) token usage, which is not simple to capture correctly in APIM.

All infrastructures
diff --git a/samples/costing/README.md b/samples/costing/README.md index f7e947b..b23628a 100644 --- a/samples/costing/README.md +++ b/samples/costing/README.md @@ -16,7 +16,7 @@ This sample demonstrates how to track and allocate API costs using Azure API Man 6. **Enable cost governance** - Establish patterns for consistent tagging and naming conventions 7. **Enable budget alerts** - Create scheduled query alerts when callers exceed configurable thresholds 8. **Track AI token consumption per client** - When APIM is used as an AI Gateway, capture prompt, completion, and total token usage per calling application, enabling per-client cost attribution for PTU or pay-as-you-go OpenAI deployments -9. **Real AOAI interactions via Foundry** (optional) - Deploy a full Microsoft Foundry environment (Hub + Project + Azure AI Services) and route real Azure OpenAI chat completions through APIM, demonstrating accurate token tracking for both non-streaming and streaming (SSE) responses +9. **Real AOAI interactions via Foundry** (optional) - Deploy a full Microsoft Foundry environment (Hub + Project + Azure AI Services) and route real Azure OpenAI traffic through APIM across **both the Chat Completions and Responses APIs**, demonstrating accurate token tracking for non-streaming, streaming (SSE), and stateless (`store: false`) requests > **Note on non-OpenAI models**: This sample deploys an Azure OpenAI model only (default: `gpt-5-mini`). Other model families on Azure AI Services - such as Anthropic Claude via the Azure Marketplace - are gated by separate quota that is granted through a manual approval process, which puts them beyond the scope of a self-service sample. If you have approved quota for another provider, you can extend the sample by adding a second deployment in `main.bicep`; the token-tracking policy and workbook queries are model-agnostic. @@ -86,6 +86,23 @@ The workbook surfaces **both** streaming variants side-by-side so you can see ex The **AI Gateway** tab's *Streaming vs Non-Streaming Breakdown* and the **Per-Request Detail** tab's `AI Delivery Mode` + `Usage Provenance` columns both render this distinction, so you can confirm token capture works regardless of whether the client or APIM supplied the usage option. +### AI Surface Coverage (Chat Completions + Responses API) + +The notebook exercises **six** AI request modes per business unit per model so you can see APIM token tracking work across both Azure OpenAI surfaces and every streaming variant. Mode is chosen by `j % 6` for the `j`-th request within a business unit, giving a deterministic, even mix: + +| Mode | API surface | Streaming | Notes | +| --- | --- | --- | --- | +| 0 | Chat Completions | No | Baseline non-streaming chat. | +| 1 | Chat Completions | Yes | Client sends `stream_options.include_usage = true`; APIM forwards unchanged. | +| 2 | Chat Completions | Yes | Client omits `stream_options`; the `pf-ensure-stream-include-usage.xml` fragment injects it and emits an `IncludeUsageInjected` trace. | +| 3 | Responses API | No | Stateful (`store` defaults to `true`); uses `input` + `max_output_tokens`. | +| 4 | Responses API | Yes | Streaming Responses; the policy fragment is a no-op for this surface. | +| 5 | Responses API | No | Stateless variant with `store: false` to demonstrate ephemeral usage. 
| + +The Chat Completions and Responses APIs use different api-versions (`2024-10-21` vs `2025-03-01-preview`), different routes (`/deployments/{id}/chat/completions` vs `/responses`), and different request shapes (`messages` + `max_completion_tokens` vs `input` + `max_output_tokens`). They share the same `aoai-backend` and the same APIM AI logger, so `ApiManagementGatewayLlmLog` rows from both surfaces flow into the same workspace and are split by `OperationId` (`chat-completions-create` vs `responses-create`) in the workbook. + +The `pf-ensure-stream-include-usage.xml` fragment short-circuits for the Responses API: it only inspects the body when `messages` is present, so Responses requests pass through untouched. The workbook's *Streaming vs Non-Streaming Breakdown*, *Token Counts by Business Unit & Delivery Mode* table, and *Per-Request Detail* tab all surface an `API Surface` column / slice (`Chat` vs `Responses`) so you can verify each mode produced its expected rows. + > **Business unit attribution**: Join `ApiManagementGatewayLlmLog` with `ApiManagementGatewayLogs` on `CorrelationId` to map token counts to `ApimSubscriptionId` (business unit). See `bu-token-usage.kql` for a ready-to-use query. ### Context Propagation diff --git a/samples/costing/_helpers.py b/samples/costing/_helpers.py index d081350..92daeb6 100644 --- a/samples/costing/_helpers.py +++ b/samples/costing/_helpers.py @@ -297,66 +297,115 @@ def send_aoai_traffic( chat_body: dict, stream_body: dict, stream_body_without_usage: dict | None = None, -) -> tuple[int, int, int, int, bool]: - """Send `count` AOAI requests alternating non-streaming / streaming. - - Encapsulates the inner request loop used by cell D1's per-(BU, model) loop: - even iterations send non-streaming chat completions, odd iterations send - streaming chat completions. When `stream_body_without_usage` is supplied, - streaming iterations alternate between a client body that already sets - `stream_options.include_usage = true` and one that omits it entirely so - the APIM policy fragment can prove when it injected the flag. On the first - timeout the function bails out for the rest of `count` to avoid stacking - cold-start delays into multi-minute hangs. + responses_url: str | None = None, + responses_body: dict | None = None, + responses_stream_body: dict | None = None, + responses_stateless_body: dict | None = None, +) -> tuple[dict[str, int], dict[str, int], bool]: + """Send `count` AOAI requests cycling through up to six modes. + + The dispatcher cycles `j % 6` across these modes: + + | j%6 | API | Mode | + |-----|-----------|------------------------------------------------------------| + | 0 | chat | non-streaming | + | 1 | chat | streaming WITH stream_options.include_usage | + | 2 | chat | streaming WITHOUT stream_options (APIM injects + traces) | + | 3 | responses | non-streaming, stateful (default store=true) | + | 4 | responses | streaming | + | 5 | responses | non-streaming, stateless (store=false) | + + Mode 2 is intentionally preserved - it is the only case where APIM's + `Ensure-Stream-Include-Usage` fragment mutates the request body and writes + a `TraceRecords` entry, which the workbook surfaces as proof of injection. + + Mode 5 sends `{store: false}`. Per-request token counts are identical to + mode 3; the educational point is the **stateless** behavior (no chaining + via `previous_response_id`, no server-side retrieval). 
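+
+    Illustrative mode-5 body (a sketch only; the shape is borrowed from the
+    notebook's D1 cell, and the deployment name is filled in per model):
+
+        {'model': '<deployment>', 'input': '...',
+         'max_output_tokens': 100, 'store': False}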
+ + On the first timeout the function bails out for the rest of `count` to + avoid stacking cold-start delays into multi-minute hangs. Args: session: Pre-configured `requests.Session` (built via `build_session`). chat_url: Full chat-completions URL for the target deployment. caller_headers: Per-call headers (api-key for the BU + Authorization JWT). count: Total number of requests to send for this (BU, model) cell. - chat_body: JSON body for non-streaming requests. - stream_body: JSON body for streaming requests where the client already - sets `stream_options.include_usage: True`. - stream_body_without_usage: Optional JSON body for streaming requests - that intentionally omits `stream_options.include_usage` so APIM can - inject it and emit a trace record proving the mutation. + chat_body: Non-streaming chat completions body. + stream_body: Streaming chat completions body with + `stream_options.include_usage = true` set by the client. + stream_body_without_usage: Streaming chat completions body that omits + `stream_options` so APIM can inject it and emit a trace record. + When None, mode 2 falls back to mode 1. + responses_url: Full /responses URL. When None, modes 3/4/5 are skipped + and replaced by mode-0/1/0 respectively (Chat fallback). + responses_body: Responses API non-streaming body (used for mode 3). + responses_stream_body: Responses API streaming body (used for mode 4). + responses_stateless_body: Responses API non-streaming body with + `store: false` (used for mode 5). Returns: - `(non_streaming_delivered, streaming_delivered, planned_ns, planned_s, bailed)`. - `*_delivered` counts only requests that returned an HTTP response. + `(delivered, planned, bailed)` where `delivered` and `planned` are + dicts with these keys: + chat_non_streaming + chat_stream_with_usage + chat_stream_without_usage + responses_non_streaming + responses_stream + responses_non_streaming_stateless `bailed` is True if a timeout caused the loop to exit early. """ - non_streaming_count = 0 - streaming_count = 0 - planned_non_streaming = 0 - planned_streaming = 0 + keys = ( + 'chat_non_streaming', + 'chat_stream_with_usage', + 'chat_stream_without_usage', + 'responses_non_streaming', + 'responses_stream', + 'responses_non_streaming_stateless', + ) + delivered = dict.fromkeys(keys, 0) + planned = dict.fromkeys(keys, 0) bailed = False + responses_available = ( + responses_url is not None and responses_body is not None and responses_stream_body is not None and responses_stateless_body is not None + ) + for j in range(count): if bailed: break - use_streaming = j % 2 == 1 - if use_streaming: - planned_streaming += 1 - else: - planned_non_streaming += 1 - - if use_streaming: - streaming_iteration = planned_streaming - 1 - body = stream_body_without_usage if stream_body_without_usage is not None and streaming_iteration % 2 == 0 else stream_body - else: - body = chat_body + # Resolve mode from j % 6, with safe fallbacks when optional bodies/URLs missing. 
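+        # e.g. with count=12 and all kwargs supplied, every mode runs exactly
+        # twice; without the Responses kwargs, modes 3/5 fall back to chat
+        # mode 0 and mode 4 to chat mode 1, so `count` is always honored.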
+ mode = j % 6 + if mode == 2 and stream_body_without_usage is None: + mode = 1 + if mode in (3, 4, 5) and not responses_available: + mode = 0 if mode in (3, 5) else 1 + + if mode == 0: + url, body, key, is_stream = chat_url, chat_body, 'chat_non_streaming', False + elif mode == 1: + url, body, key, is_stream = chat_url, stream_body, 'chat_stream_with_usage', True + elif mode == 2: + url, body, key, is_stream = chat_url, stream_body_without_usage, 'chat_stream_without_usage', True + elif mode == 3: + url, body, key, is_stream = responses_url, responses_body, 'responses_non_streaming', False + elif mode == 4: + url, body, key, is_stream = responses_url, responses_stream_body, 'responses_stream', True + else: # mode == 5 + url, body, key, is_stream = responses_url, responses_stateless_body, 'responses_non_streaming_stateless', False + + planned[key] += 1 try: r = session.post( - chat_url, + url, json=body, headers=caller_headers, - timeout=45 if use_streaming else 30, - stream=use_streaming, + timeout=45 if is_stream else 30, + stream=is_stream, ) - if use_streaming and r.status_code == 200: + if is_stream and r.status_code == 200: # Drain SSE stream so APIM logs the final chunk (with usage). for _ in r.iter_lines(decode_unicode=True): pass @@ -368,12 +417,9 @@ def send_aoai_traffic( continue # 4xx/5xx still count: they appear in ApiManagementGatewayLogs. - if use_streaming: - streaming_count += 1 - else: - non_streaming_count += 1 + delivered[key] += 1 - return non_streaming_count, streaming_count, planned_non_streaming, planned_streaming, bailed + return delivered, planned, bailed def print_portal_links(items: list[tuple[str, str | None]]) -> None: @@ -550,6 +596,9 @@ def build_costing_apis( if enable_foundry and enable_token_tracking and token_metric_policy_xml is not None: aoai_operation_policy_xml = Path(utils.determine_policy_path('aoai-gateway-operation.xml', sample_folder)).read_text(encoding='utf-8') + aoai_responses_operation_policy_xml = Path(utils.determine_policy_path('aoai-gateway-responses-operation.xml', sample_folder)).read_text( + encoding='utf-8' + ) paths['aoai_api_path'] = 'aoai-gateway' aoai_chat_post = APIOperation( @@ -561,6 +610,18 @@ def build_costing_apis( policyXml=aoai_operation_policy_xml, templateParameters=[{'name': 'deploymentId', 'type': 'string', 'required': True}], ) + # Responses API operation. Uses the modern stateless/stateful + # /responses surface. Pinned to api-version 2025-03-01-preview via a + # per-operation set-query-parameter so chat-completion stays on + # 2024-10-21 unaffected. + aoai_responses_post = APIOperation( + 'responses-create', + 'Responses Create', + '/responses', + HTTP_VERB.POST, + 'Azure OpenAI Responses API create (streaming, non-streaming, and stateless via store=false)', + policyXml=aoai_responses_operation_policy_xml, + ) apis.append( API( f'{api_prefix}aoai-gateway', @@ -568,7 +629,7 @@ def build_costing_apis( paths['aoai_api_path'], 'Azure OpenAI gateway for demonstrating real token tracking with Foundry', policyXml=token_metric_policy_xml, - operations=[aoai_chat_post], + operations=[aoai_chat_post, aoai_responses_post], tags=['costing', 'emit-metric', 'ai-gateway', 'aoai', 'foundry'], subscriptionRequired=True, serviceUrl='https://placeholder.openai.azure.com/openai', @@ -816,30 +877,59 @@ def print_aoai_traffic_summary( model_request_counts: dict[str, dict[str, int]], bu_model_counts: dict[tuple[str, str], dict[str, int]], ) -> tuple[int, int, int]: - """Print per-model and per-BU×per-model AOAI request tables. 
+ """Print per-model and per-BU x per-model AOAI request tables. + + Each per-(model) and per-(BU, model) value is a dict carrying delivered + counts for the six AOAI traffic modes: + + chat_non_streaming, chat_stream_with_usage, chat_stream_without_usage, + responses_non_streaming, responses_stream, responses_non_streaming_stateless + + The summary tables collapse those into Chat-Sync, Chat-Stream, Resp-Sync, + Resp-Stream so the per-row width stays readable while still surfacing the + Chat vs Responses split. The streaming-with vs without-usage detail and + the stateful vs stateless Responses split are visible in the workbook. Returns: - `(grand_non_streaming, grand_streaming, total)` — used by the caller - for the trailing summary line and persistence step. + `(grand_chat, grand_responses, total)` where `grand_chat` is the sum + of all chat modes and `grand_responses` is the sum of all responses + modes across every (model) row. Used by the caller for the trailing + summary line. """ + + def _agg(counts: dict[str, int]) -> tuple[int, int, int, int]: + chat_sync = counts.get('chat_non_streaming', 0) + chat_stream = counts.get('chat_stream_with_usage', 0) + counts.get('chat_stream_without_usage', 0) + resp_sync = counts.get('responses_non_streaming', 0) + counts.get('responses_non_streaming_stateless', 0) + resp_stream = counts.get('responses_stream', 0) + return chat_sync, chat_stream, resp_sync, resp_stream + print() print_info('Requests per model') summary_table = TableLogger() summary_table.header( Column('Model'), - Column('Non-streaming', align='>'), - Column('Streaming', align='>'), + Column('Chat-Sync', align='>'), + Column('Chat-Stream', align='>'), + Column('Resp-Sync', align='>'), + Column('Resp-Stream', align='>'), Column('Total', align='>'), ) summary_rows = [] - grand_ns = grand_s = 0 + grand_chat = grand_resp = 0 + g_cs = g_cstream = g_rs = g_rstream = 0 for m, counts in model_request_counts.items(): - total = counts['non_streaming'] + counts['streaming'] - summary_rows.append([m, counts['non_streaming'], counts['streaming'], total]) - grand_ns += counts['non_streaming'] - grand_s += counts['streaming'] + cs, cstream, rs, rstream = _agg(counts) + total = cs + cstream + rs + rstream + summary_rows.append([m, cs, cstream, rs, rstream, total]) + g_cs += cs + g_cstream += cstream + g_rs += rs + g_rstream += rstream + grand_chat += cs + cstream + grand_resp += rs + rstream summary_table.populate(summary_rows) - summary_table.total('GRAND TOTAL', grand_ns, grand_s, grand_ns + grand_s) + summary_table.total('GRAND TOTAL', g_cs, g_cstream, g_rs, g_rstream, g_cs + g_cstream + g_rs + g_rstream) summary_table.print() print() @@ -848,23 +938,28 @@ def print_aoai_traffic_summary( bu_model_table.header( Column('Business Unit'), Column('Model'), - Column('Non-streaming', align='>'), - Column('Streaming', align='>'), + Column('Chat-Sync', align='>'), + Column('Chat-Stream', align='>'), + Column('Resp-Sync', align='>'), + Column('Resp-Stream', align='>'), Column('Total', align='>'), ) bu_rows = [] - bu_grand_ns = bu_grand_s = 0 + bu_cs = bu_cstream = bu_rs = bu_rstream = 0 for bu, m in sorted(bu_model_counts.keys()): counts = bu_model_counts[(bu, m)] - total = counts['non_streaming'] + counts['streaming'] - bu_rows.append([bu, m, counts['non_streaming'], counts['streaming'], total]) - bu_grand_ns += counts['non_streaming'] - bu_grand_s += counts['streaming'] + cs, cstream, rs, rstream = _agg(counts) + total = cs + cstream + rs + rstream + bu_rows.append([bu, m, cs, cstream, rs, rstream, 
total]) + bu_cs += cs + bu_cstream += cstream + bu_rs += rs + bu_rstream += rstream bu_model_table.populate(bu_rows) - bu_model_table.total('GRAND TOTAL', '', bu_grand_ns, bu_grand_s, bu_grand_ns + bu_grand_s) + bu_model_table.total('GRAND TOTAL', '', bu_cs, bu_cstream, bu_rs, bu_rstream, bu_cs + bu_cstream + bu_rs + bu_rstream) bu_model_table.print() - return grand_ns, grand_s, grand_ns + grand_s + return grand_chat, grand_resp, grand_chat + grand_resp def persist_aoai_traffic( @@ -880,15 +975,36 @@ def persist_aoai_traffic( ) -> int: """Roll up per-(BU,model) AOAI counts into a single trafficSources entry. + Each `bu_model_counts[(bu, m)]` and `bu_model_planned[(bu, m)]` entry is a + six-key dict matching the dispatcher's mode keys. Persisted shape per + (BU, model) under `byModel[].chat` / `byModel[].responses` mirrors the + dispatcher modes so the workbook cross-reference and tests can identify + which AI surface was exercised. + Returns the total planned request count across all BU/model pairs (used by the caller for a trailing print line). The total delivered count is derived inside the function and stored as `totalRequests` in the JSON. """ + + def _shape(counts: dict[str, int]) -> dict: + return { + 'chat': { + 'nonStreaming': counts.get('chat_non_streaming', 0), + 'streamingWithUsage': counts.get('chat_stream_with_usage', 0), + 'streamingWithoutUsage': counts.get('chat_stream_without_usage', 0), + }, + 'responses': { + 'nonStreaming': counts.get('responses_non_streaming', 0), + 'streaming': counts.get('responses_stream', 0), + 'nonStreamingStateless': counts.get('responses_non_streaming_stateless', 0), + }, + } + ai_bu_rollup: dict[str, dict] = {} total_delivered = 0 for (bu, m), counts in bu_model_counts.items(): bu_info_local = subscriptions.get(bu, {}) - planned = bu_model_planned.get((bu, m), {'non_streaming': 0, 'streaming': 0}) + planned = bu_model_planned.get((bu, m), {}) entry = ai_bu_rollup.setdefault( bu, { @@ -901,23 +1017,21 @@ def persist_aoai_traffic( 'byModel': [], }, ) - model_total = counts['non_streaming'] + counts['streaming'] - planned_total = planned['non_streaming'] + planned['streaming'] + model_total = sum(counts.values()) + planned_total = sum(planned.values()) entry['planned'] += planned_total entry['requests'] += model_total total_delivered += model_total entry['byModel'].append( { 'model': m, - 'plannedNonStreaming': planned['non_streaming'], - 'plannedStreaming': planned['streaming'], - 'nonStreaming': counts['non_streaming'], - 'streaming': counts['streaming'], + 'planned': _shape(planned), + 'delivered': _shape(counts), 'total': model_total, } ) - total_planned = sum(p['non_streaming'] + p['streaming'] for p in bu_model_planned.values()) + total_planned = sum(sum(p.values()) for p in bu_model_planned.values()) persist_traffic_source( local_data_path, sample_folder=sample_folder, diff --git a/samples/costing/aoai-gateway-responses-operation.xml b/samples/costing/aoai-gateway-responses-operation.xml new file mode 100644 index 0000000..d2ddd0e --- /dev/null +++ b/samples/costing/aoai-gateway-responses-operation.xml @@ -0,0 +1,32 @@ + + + + + + + + 2025-03-01-preview + + + + + + + + + + + + diff --git a/samples/costing/bu-token-usage.kql b/samples/costing/bu-token-usage.kql index e673bcc..e48dfdd 100644 --- a/samples/costing/bu-token-usage.kql +++ b/samples/costing/bu-token-usage.kql @@ -20,6 +20,11 @@ ApiManagementGatewayLlmLog TotalTokens, ModelName, IsStreamCompletion +// Collapse dated AOAI base-model variants (e.g. 
'gpt-4o-mini-2024-07-18') +// into the deployment alias (e.g. 'gpt-4o-mini'). The trailing date suffix +// is informational only and would otherwise split a single deployment +// across multiple rows. +| extend ModelName = replace_regex(ModelName, @'-\d{4}-\d{2}-\d{2}$', '') | join kind=inner ( ApiManagementGatewayLogs | where TimeGenerated > ago(timeWindow) @@ -29,11 +34,14 @@ ApiManagementGatewayLlmLog ApiId, OperationId ) on CorrelationId +| extend + ApiSurface = iff(OperationId contains 'responses-create', 'Responses', 'Chat'), + StreamMode = iff(IsStreamCompletion == true, 'Stream', 'Sync') | summarize TotalPromptTokens = sum(PromptTokens), TotalCompletionTokens = sum(CompletionTokens), TotalTokens = sum(TotalTokens), Requests = count(), StreamingRequests = countif(IsStreamCompletion == true) - by BusinessUnit, ModelName + by BusinessUnit, ModelName, ApiSurface, StreamMode | order by TotalTokens desc diff --git a/samples/costing/costing.workbook.json b/samples/costing/costing.workbook.json index 505bc6a..43bb777 100644 --- a/samples/costing/costing.workbook.json +++ b/samples/costing/costing.workbook.json @@ -1542,7 +1542,7 @@ }, { "content": { - "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, ModelName\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, ApimSubscriptionId\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nlet grandTotalTokens = toscalar(bucketedLlmLogs | summarize sum(TotalTokens));\nbucketedLlmLogs\n| summarize\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n Requests = count(),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 4)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 4)\n| extend TotalCost = round(PromptCost + CompletionCost, 4)\n| extend TotalTokensPct = iif(grandTotalTokens > 0, round(TotalTokens * 100.0 / grandTotalTokens, 2), 0.0)\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Total Tokens Pct'] = TotalTokensPct,\n Requests,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost", + "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize 
RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\\d{4}-\\d{2}-\\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, ApimSubscriptionId\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nlet grandTotalTokens = toscalar(bucketedLlmLogs | summarize sum(TotalTokens));\nbucketedLlmLogs\n| summarize\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n Requests = count(),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 4)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 4)\n| extend TotalCost = round(PromptCost + CompletionCost, 4)\n| extend TotalTokensPct = iif(grandTotalTokens > 0, round(TotalTokens * 100.0 / grandTotalTokens, 2), 0.0)\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Total Tokens Pct'] = TotalTokensPct,\n Requests,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 0, @@ -1629,7 +1629,7 @@ "items": [ { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, ModelName\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, BusinessUnit = substring(ApimSubscriptionId, 3)\n) on CorrelationId\n| extend Model = iif(isempty(ModelName), '(unknown)', ModelName)\n| summarize Requests = count() by Model\n| order by Model asc", + "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\\d{4}-\\d{2}-\\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, BusinessUnit = substring(ApimSubscriptionId, 3)\n) on CorrelationId\n| extend Model = iif(isempty(ModelName), '(unknown)', ModelName)\n| summarize Requests = count() by Model\n| order by Model asc", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 1, @@ -1653,7 +1653,7 @@ }, { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, TotalTokens, ModelName\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and 
ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, BusinessUnit = substring(ApimSubscriptionId, 3)\n) on CorrelationId\n| extend Model = iif(isempty(ModelName), '(unknown)', ModelName)\n| summarize TotalTokens = sum(TotalTokens) by Model\n| order by Model asc", + "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, TotalTokens, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\\d{4}-\\d{2}-\\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, BusinessUnit = substring(ApimSubscriptionId, 3)\n) on CorrelationId\n| extend Model = iif(isempty(ModelName), '(unknown)', ModelName)\n| summarize TotalTokens = sum(TotalTokens) by Model\n| order by Model asc", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 1, @@ -1677,7 +1677,7 @@ }, { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, IsStreamCompletion\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, TraceRecords\n) on CorrelationId\n| extend DeliveryMode = case(\n IsStreamCompletion == true and tostring(TraceRecords) has 'IncludeUsageInjected', 'Streaming (policy-injected usage)',\n IsStreamCompletion == true, 'Streaming (client-supplied usage)',\n 'Non-Streaming'\n)\n| summarize Requests = count() by DeliveryMode", + "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, IsStreamCompletion\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, OperationId, TraceRecords\n) on CorrelationId\n| extend ApiSurface = iff(OperationId contains 'responses-create', 'Responses', 'Chat')\n| extend DeliveryMode = case(\n ApiSurface == 'Responses' and IsStreamCompletion == true, 'Responses (streaming)',\n ApiSurface == 'Responses', 'Responses (non-streaming)',\n IsStreamCompletion == true and tostring(TraceRecords) has 'IncludeUsageInjected', 'Chat (policy-injected usage)',\n IsStreamCompletion == true, 'Chat (client-supplied usage)',\n 'Chat (non-streaming)'\n)\n| summarize Requests = count() by DeliveryMode", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 1, @@ -1718,7 +1718,7 @@ "items": [ { "content": { - "query": "let rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, IsStreamCompletion, ModelName\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, 
ApimSubscriptionId, TraceRecords\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nbucketedLlmLogs\n| extend DeliveryMode = iif(IsStreamCompletion == true, 'Streaming', 'Non-Streaming')\n| extend UsageProvenance = case(\n IsStreamCompletion != true, 'N/A',\n tostring(TraceRecords) has 'IncludeUsageInjected', 'Policy injected include_usage',\n 'Client sent include_usage'\n)\n| summarize\n Requests = count(),\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName, DeliveryMode, UsageProvenance\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc, DeliveryMode asc, UsageProvenance asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['Delivery Mode'] = DeliveryMode,\n ['Usage Provenance'] = UsageProvenance,\n Requests,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens", + "query": "let rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, IsStreamCompletion, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\\d{4}-\\d{2}-\\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, ApimSubscriptionId, OperationId, TraceRecords\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nbucketedLlmLogs\n| extend ApiSurface = iff(OperationId contains 'responses-create', 'Responses', 'Chat')\n| extend DeliveryMode = iif(IsStreamCompletion == true, 'Streaming', 'Non-Streaming')\n| extend UsageProvenance = case(\n ApiSurface == 'Responses', 'N/A (Responses API)',\n IsStreamCompletion != true, 'N/A',\n tostring(TraceRecords) has 'IncludeUsageInjected', 'Policy injected include_usage',\n 'Client sent include_usage'\n)\n| summarize\n Requests = count(),\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName, ApiSurface, DeliveryMode, UsageProvenance\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc, ApiSurface asc, DeliveryMode asc, UsageProvenance asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['API Surface'] = ApiSurface,\n ['Delivery Mode'] = DeliveryMode,\n ['Usage Provenance'] = UsageProvenance,\n Requests,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 0, @@ -1986,7 +1986,7 @@ "items": [ { "content": { - "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rowLimit = toint('{RequestLimit}');\nlet llmPerRequest = 
ApiManagementGatewayLlmLog\n | where TimeGenerated {TimeRange}\n | summarize\n ModelName = take_anyif(ModelName, isnotempty(ModelName)),\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n IsStreamCompletion = max(tobool(IsStreamCompletion))\n by CorrelationId;\nApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| project CorrelationId, GatewayTime = TimeGenerated, BusinessUnit = substring(ApimSubscriptionId, 3), ApiId, OperationId, TotalTime, BackendTime, ResponseCode, TraceRecords\n| join kind=leftouter llmPerRequest on CorrelationId\n| extend PromptTokens = coalesce(PromptTokens, 0)\n| extend CompletionTokens = coalesce(CompletionTokens, 0)\n| extend TotalTokens = coalesce(TotalTokens, 0)\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 6)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 6)\n| extend TotalCost = round(PromptCost + CompletionCost, 6)\n| extend AiDeliveryMode = case(isnull(IsStreamCompletion), '', IsStreamCompletion == true, 'Streaming', 'Non-Streaming')\n| extend UsageProvenance = case(\n isnull(IsStreamCompletion), '',\n IsStreamCompletion != true, 'N/A',\n tostring(TraceRecords) has 'IncludeUsageInjected', 'Policy injected include_usage',\n 'Client sent include_usage'\n)\n| order by GatewayTime desc\n| take rowLimit\n| project\n ['Timestamp (UTC)'] = format_datetime(GatewayTime, '{DateTimeFormat}'),\n ['Business Unit'] = BusinessUnit,\n ['Response Code'] = toint(ResponseCode),\n Model = coalesce(ModelName, 'N/A'),\n ['AI Delivery Mode'] = AiDeliveryMode,\n ['Usage Provenance'] = UsageProvenance,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Gateway Total (ms)'] = TotalTime,\n ['Backend (ms)'] = BackendTime,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost,\n API = ApiId,\n Operation = OperationId,\n CorrelationId", + "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rowLimit = toint('{RequestLimit}');\nlet llmPerRequest = ApiManagementGatewayLlmLog\n | where TimeGenerated {TimeRange}\n | summarize\n ModelName = take_anyif(ModelName, isnotempty(ModelName)),\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n IsStreamCompletion = max(tobool(IsStreamCompletion))\n by CorrelationId\n | extend ModelName = replace_regex(coalesce(ModelName, ''), @'-\\d{4}-\\d{2}-\\d{2}$', '');\nApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| project CorrelationId, GatewayTime = TimeGenerated, BusinessUnit = substring(ApimSubscriptionId, 3), ApiId, OperationId, TotalTime, BackendTime, ResponseCode, TraceRecords\n| join kind=leftouter llmPerRequest on CorrelationId\n| extend PromptTokens = coalesce(PromptTokens, 0)\n| extend CompletionTokens = coalesce(CompletionTokens, 0)\n| extend TotalTokens = coalesce(TotalTokens, 0)\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 6)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 6)\n| extend TotalCost = round(PromptCost + CompletionCost, 6)\n| extend ApiSurface = iff(OperationId contains 'responses-create', 'Responses', 'Chat')\n| extend AiDeliveryMode = case(isnull(IsStreamCompletion), '', 
IsStreamCompletion == true, 'Streaming', 'Non-Streaming')\n| extend UsageProvenance = case(\n isnull(IsStreamCompletion), '',\n ApiSurface == 'Responses', 'N/A (Responses API)',\n IsStreamCompletion != true, 'N/A',\n tostring(TraceRecords) has 'IncludeUsageInjected', 'Policy injected include_usage',\n 'Client sent include_usage'\n)\n| order by GatewayTime desc\n| take rowLimit\n| project\n ['Timestamp (UTC)'] = format_datetime(GatewayTime, '{DateTimeFormat}'),\n ['Business Unit'] = BusinessUnit,\n ['Response Code'] = toint(ResponseCode),\n Model = coalesce(ModelName, 'N/A'),\n ['API Surface'] = ApiSurface,\n ['AI Delivery Mode'] = AiDeliveryMode,\n ['Usage Provenance'] = UsageProvenance,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Gateway Total (ms)'] = TotalTime,\n ['Backend (ms)'] = BackendTime,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost,\n API = ApiId,\n Operation = OperationId,\n CorrelationId", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 0, diff --git a/samples/costing/create.ipynb b/samples/costing/create.ipynb index 5292ef0..8133b56 100644 --- a/samples/costing/create.ipynb +++ b/samples/costing/create.ipynb @@ -142,9 +142,18 @@ "# AI model test matrix - drives which models are deployed to Azure AI Services and\n", "# how many requests per simulated caller are generated against each.\n", "# Each entry: model name -> { version, capacity (K TPM), requests_per_caller }\n", + "#\n", + "# Models chosen to exercise three cost tiers AND ensure full emit-token-metric\n", + "# coverage on the Responses API. Per Microsoft Learn (azure-openai-emit-token-\n", + "# metric-policy), the Responses-API allow-list for the policy is limited to a\n", + "# specific set of model snapshots. gpt-4o-mini (2024-07-18) and gpt-4.1-nano\n", + "# (2025-04-14) are on that list; gpt-5-mini (2025-08-07) is NOT, so its\n", + "# Responses-API rows reach the workbook only via ApiManagementGatewayLlmLog\n", + "# (the diagnostic-log path), not via emit-token-metric custom metrics.\n", "model_test_matrix = {\n", - " 'gpt-5-mini': {'version': '2025-08-07', 'capacity': 10, 'requests_per_caller': 5},\n", - " 'gpt-4o-mini': {'version': '2024-07-18', 'capacity': 10, 'requests_per_caller': 3},\n", + " 'gpt-4o-mini': {'version': '2024-07-18', 'capacity': 10, 'requests_per_caller': 6},\n", + " 'gpt-4.1-nano': {'version': '2025-04-14', 'capacity': 10, 'requests_per_caller': 8},\n", + " 'gpt-5-mini': {'version': '2025-08-07', 'capacity': 10, 'requests_per_caller': 10},\n", "}\n", "\n", "# Derived: list form used for Bicep deployment and iteration\n", @@ -201,7 +210,7 @@ "\n", "if not subscription_id:\n", " print_error('Could not determine Azure subscription ID. Run: az login')\n", - " raise SystemExit(1)" + " raise SystemExit(1)\n" ] }, { @@ -328,20 +337,35 @@ "print_info(f'APIM pricing as of {APIM_PRICING_AS_OF}: {APIM_PRICING_URL}', True)\n", "print()\n", "\n", - "# Per-model pricing table\n", + "# Per-model pricing table. 
Models without local pricing data are skipped\n", + "# (with a warning) so the table still renders for the remaining models.\n", + "# Cost calculations downstream (workbook, KQL) handle missing rates the\n", + "# same way: no rate => no $ figure for that model.\n", "print_info('AI model pricing (per 1K tokens)')\n", + "pricing_rows = []\n", + "missing_pricing = []\n", + "for m in model_test_matrix:\n", + " try:\n", + " mp = get_model_pricing(m)\n", + " except ValueError:\n", + " missing_pricing.append(m)\n", + " continue\n", + " pricing_rows.append([m, f'${mp.prompt_rate_per_k:.5f}', f'${mp.completion_rate_per_k:.5f}'])\n", + "\n", "pricing_table = TableLogger()\n", "pricing_table.header(\n", " Column('Model'),\n", " Column('Prompt $/1K', align='>'),\n", " Column('Completion $/1K', align='>'),\n", ")\n", - "pricing_table.populate([\n", - " [m, f'${(mp := get_model_pricing(m)).prompt_rate_per_k:.5f}', f'${mp.completion_rate_per_k:.5f}']\n", - " for m in model_test_matrix\n", - "])\n", + "pricing_table.populate(pricing_rows)\n", "pricing_table.print()\n", "\n", + "if missing_pricing:\n", + " print()\n", + " print_warning(f'No local pricing data for: {\", \".join(missing_pricing)}. These models will run, but cost figures will be omitted.')\n", + " print_info(f'Add rates to shared/python/azure_cost.py (see {AOAI_PRICING_URL}).', True)\n", + "\n", "print_info(f'Azure OpenAI pricing as of {AOAI_PRICING_AS_OF}: {AOAI_PRICING_URL}', True)\n" ] }, @@ -744,15 +768,33 @@ "source": [ "### 🤖 D1 — [Traffic · AI Foundry] Real AOAI Interactions\n", "\n", - "When `enable_foundry = True` (default), the deployment provisions an Azure AI Services account with a model deployment plus an APIM backend with managed-identity auth. This cell sends **real** Azure OpenAI chat completions through the APIM gateway across all BUs and models so you see end-to-end token tracking. The mock simulation in D2 is **skipped** when this cell runs so AI traffic numbers reflect only real Foundry calls.\n", + "When `enable_foundry = True` (default), the deployment provisions an Azure AI Services account with a model deployment plus an APIM backend with managed-identity auth. This cell sends **real** Azure OpenAI traffic through the APIM gateway across all BUs and models so you see end-to-end token tracking on both the **Chat Completions** and **Responses** APIs. The mock simulation in D2 is **skipped** when this cell runs so AI traffic numbers reflect only real Foundry calls.\n", + "\n", + "Six delivery modes are exercised on a `j % 6` rotation per BU per model so a single cell run covers the full surface APIM diagnostics must handle:\n", + "\n", + "| `j % 6` | API | Mode |\n", + "|---------|---------------|---------------------------------------------------------------------|\n", + "| 0 | Chat | Non-streaming |\n", + "| 1 | Chat | Streaming **with** `stream_options.include_usage = true` (client) |\n", + "| 2 | Chat | Streaming **without** `stream_options` (APIM injects + traces it) |\n", + "| 3 | Responses | Non-streaming, stateful (default `store: true`) |\n", + "| 4 | Responses | Streaming |\n", + "| 5 | Responses | Non-streaming, **stateless** (`store: false`) |\n", + "\n", + "Mode 2 is the only path where APIM's `Ensure-Stream-Include-Usage` fragment mutates the request body and writes a `TraceRecords` proof entry - the workbook's *Streaming usage source* tile surfaces this. 
Mode 5 demonstrates the stateless Responses pattern (no chaining via `previous_response_id`); per-request token counts match mode 3.\n", + "\n", + "#### Zero-impact streaming token capture\n", "\n", - "Two delivery modes are exercised per BU per model:\n", - "1. **Non-streaming** — standard JSON response with a `usage` object.\n", - "2. **Streaming (SSE)** — half the requests send `stream_options.include_usage = true` themselves; the other half omit it so the APIM policy can add it and log proof in `TraceRecords`.\n", + "This sample is intentionally **zero-impact on the streaming response path**: APIM never buffers, parses, or rewrites the response body. Token counts come from two zero-impact sources:\n", "\n", - "> **Note:** The outbound policy buffers streaming responses to extract usage. In production, prefer the built-in `azure-openai-emit-token-metric` policy for zero-impact streaming.\n", + "1. **`ApiManagementGatewayLlmLog`** (diagnostic log, used by every workbook tile here). APIM's built-in AI gateway diagnostic reads `usage` from the final SSE chunk on the fly and writes one row to Log Analytics per request, with `ModelName`, `PromptTokens`, `CompletionTokens`, `IsStreamCompletion`, and `CorrelationId`. No policy code runs against the response body.\n", + "2. **`azure-openai-emit-token-metric`** ([built-in policy](https://learn.microsoft.com/azure/api-management/azure-openai-emit-token-metric-policy)). Emits prompt/completion/total token counts as Application Insights custom metrics with arbitrary dimensions (CallerId, ModelName, etc.). Also reads the SSE stream without buffering. Use this when you need real-time, per-caller token metrics in App Insights instead of (or alongside) Log Analytics.\n", "\n", - "> **Double-counting warning:** Do NOT enable both the custom `emit-metric` token tracking and the built-in `azure-openai-emit-token-metric` policy simultaneously — they would emit duplicate metrics.\n" + "The custom inbound `emit-metric` in this sample emits `caller-requests` with `value=\"1\"` only - a request **counter**, not a token parser - so it never touches the response body and never duplicates token counts.\n", + "\n", + "> **Production guidance:** Use the built-in `azure-openai-emit-token-metric` for per-caller token metrics in App Insights. **Never** parse the response body in an outbound policy to extract tokens - that buffers the SSE stream, breaks streaming UX, and adds latency.\n", + "\n", + "> **Double-counting warning:** If you add `azure-openai-emit-token-metric` to this sample, do **not** also add a separate outbound `emit-metric` that parses tokens from the response body - the two would emit duplicate `prompt-tokens` / `completion-tokens` metrics. The existing `caller-requests` counter is safe to keep alongside either.\n" ] }, { @@ -820,9 +862,10 @@ " extra_headers={'Content-Type': 'application/json'},\n", " )\n", "\n", - " # Request bodies reused across the (BU, model) loop. Streaming requests\n", - " # intentionally alternate between client-supplied include_usage=true and\n", - " # omitted include_usage so the APIM policy can prove when it injected it.\n", + " # Request bodies reused across the (BU, model) loop. The dispatcher in\n", + " # send_aoai_traffic cycles j%6 across these to exercise both Chat and\n", + " # Responses APIs in non-streaming + streaming + (Responses-only) stateless\n", + " # variants.\n", " chat_body = {\n", " 'messages': [\n", " {'role': 'system', 'content': 'You are a helpful assistant. 
Keep responses brief.'},\n", @@ -846,12 +889,33 @@ " 'stream': True,\n", " }\n", "\n", + " # Responses API bodies. The Responses surface uses `input` (string or list)\n", + " # and `max_output_tokens` instead of Chat Completions' `messages` and\n", + " # `max_completion_tokens`. Mode 5 sets `store: false` for the stateless\n", + " # variant - per-request token counts are identical to mode 3.\n", + " responses_body = {\n", + " 'model': '', # filled per-iteration below; 'model' must match the deployment name\n", + " 'input': 'Summarize APIM AI Gateway capabilities in one sentence.',\n", + " 'max_output_tokens': 100,\n", + " }\n", + " responses_stream_body = {**responses_body, 'stream': True}\n", + " responses_stateless_body = {**responses_body, 'store': False}\n", + "\n", " # --- Multi-BU traffic generation (sequential per BU per model) ---\n", " print_info('Generating multi-BU AOAI traffic for cost tracking (per model)...')\n", "\n", " # Per-model and per-(BU, model) request counters drive the summary tables\n", " # printed below and the JSON persisted for the workbook cross-reference (E3).\n", - " model_request_counts = {m: {'non_streaming': 0, 'streaming': 0} for m in model_test_matrix}\n", + " # Each value is a six-key dict matching the dispatcher's mode keys.\n", + " _empty_counts = {\n", + " 'chat_non_streaming': 0,\n", + " 'chat_stream_with_usage': 0,\n", + " 'chat_stream_without_usage': 0,\n", + " 'responses_non_streaming': 0,\n", + " 'responses_stream': 0,\n", + " 'responses_non_streaming_stateless': 0,\n", + " }\n", + " model_request_counts: dict[str, dict[str, int]] = {m: dict(_empty_counts) for m in model_test_matrix}\n", " bu_model_counts: dict[tuple[str, str], dict[str, int]] = {}\n", " bu_model_planned: dict[tuple[str, str], dict[str, int]] = {}\n", "\n", @@ -859,6 +923,11 @@ " for model_name, cfg in model_test_matrix.items():\n", " base_requests = cfg['requests_per_caller']\n", " model_chat_url = f'{endpoint_url}/{aoai_api_path}/deployments/{model_name}/chat/completions'\n", + " model_responses_url = f'{endpoint_url}/{aoai_api_path}/responses'\n", + " # Responses API requires `model` in the body (deployment name).\n", + " model_responses_body = {**responses_body, 'model': model_name}\n", + " model_responses_stream_body = {**responses_stream_body, 'model': model_name}\n", + " model_responses_stateless_body = {**responses_stateless_body, 'model': model_name}\n", " print()\n", " print_info(f'-> Model: {model_name} (base {base_requests} requests per BU, scaled by request_weight)')\n", "\n", @@ -876,20 +945,36 @@ " # proportionally more tokens than lighter ones.\n", " bu_request_count = max(1, int(base_requests * bu_info.get('request_weight', 1.0)))\n", "\n", - " ns, s, planned_ns, planned_s, _ = send_aoai_traffic(\n", + " delivered, planned, _ = send_aoai_traffic(\n", " session, model_chat_url, caller_headers, bu_request_count,\n", " chat_body=chat_body,\n", " stream_body=stream_body_with_usage,\n", " stream_body_without_usage=stream_body_without_usage,\n", + " responses_url=model_responses_url,\n", + " responses_body=model_responses_body,\n", + " responses_stream_body=model_responses_stream_body,\n", + " responses_stateless_body=model_responses_stateless_body,\n", " )\n", "\n", - " model_request_counts[model_name]['streaming'] += s\n", - " model_request_counts[model_name]['non_streaming'] += ns\n", - " bu_model_counts[(bu_name, model_name)] = {'non_streaming': ns, 'streaming': s}\n", - " bu_model_planned[(bu_name, model_name)] = {'non_streaming': planned_ns, 'streaming': 
planned_s}\n", + " # Aggregate per-mode delivered counts into the per-model table.\n", + " for k, v in delivered.items():\n", + " model_request_counts[model_name][k] += v\n", + " bu_model_counts[(bu_name, model_name)] = delivered\n", + " bu_model_planned[(bu_name, model_name)] = planned\n", + "\n", + " chat_total = (\n", + " delivered['chat_non_streaming']\n", + " + delivered['chat_stream_with_usage']\n", + " + delivered['chat_stream_without_usage']\n", + " )\n", + " resp_total = (\n", + " delivered['responses_non_streaming']\n", + " + delivered['responses_stream']\n", + " + delivered['responses_non_streaming_stateless']\n", + " )\n", " print_ok(\n", " f' Sent {bu_request_count} requests for {bu_name} as {caller[\"name\"]}'\n", - " f' ({caller[\"appid\"][:12]}...) [{ns} non-streaming, {s} streaming]'\n", + " f' ({caller[\"appid\"][:12]}...) [chat: {chat_total}, responses: {resp_total}]'\n", " )\n", " finally:\n", " session.close()\n", @@ -902,8 +987,8 @@ " print_info('Each request emits a caller-requests metric entry')\n", " print_info('Token counts are captured via the APIM diagnostic setting (ApiManagementGatewayLlmLog)')\n", " print_info(\n", - " 'Streaming requests alternate between client-supplied include_usage and '\n", - " 'policy-injected include_usage; the workbook surfaces this via TraceRecords'\n", + " 'The dispatcher cycles six modes: 3 Chat (sync, stream w/usage, stream w/o usage) + '\n", + " '3 Responses (sync, stream, sync stateless). Mode 2 is the only path APIM rewrites.'\n", " )\n", " print_info('Note: Custom metrics typically take 5-10 minutes to appear in Application Insights')\n", "\n", @@ -929,15 +1014,15 @@ "source": [ "### 🤖 D2 — [Traffic · AI Mock] Mock Token Tracking (Skipped When Foundry Is On)\n", "\n", - "When `enable_foundry = False`, this cell exercises the AI-gateway pattern against a mock backend (`httpbin`) so you can see per-caller `caller-tokens` custom metrics without provisioning Foundry. When D1 sent real Foundry traffic, this cell is **skipped** to avoid muddling the AI request counts shown in the workbook.\n", + "When `enable_foundry = False`, this cell exercises the AI-gateway pattern against a mock backend (`httpbin`) so you can see per-caller request attribution without provisioning Foundry. When D1 sent real Foundry traffic, this cell is **skipped** to avoid muddling the AI request counts shown in the workbook.\n", "\n", - "The mock policy returns a hard-coded usage payload like:\n", + "The mock operation policy returns a hard-coded Azure OpenAI-shaped payload like:\n", "\n", "```json\n", "{ \"usage\": { \"prompt_tokens\": 100, \"completion_tokens\": 200, \"total_tokens\": 300 } }\n", "```\n", "\n", - "The `emit-metric` policy reads those fields and emits separate `prompt-tokens`, `completion-tokens`, and `total-tokens` metric entries — same shape as the real Foundry traffic from D1.\n" + "so you can inspect a realistic response shape end-to-end. The same `emit-metric` policy used everywhere else emits a single `caller-requests` (count) metric per call with `CallerId`, `API`, and `Operation` dimensions - this is what drives the workbook's caller-attribution tile. 
Token-level data is **not** captured for this mock API (`enableLlmLogging` is intentionally off on `costing-token-tracking-api`); for real per-request token counts, run D1 against Foundry instead.\n"
   ]
  },
  {
@@ -1142,7 +1227,9 @@
    "source": [
     "### 🔍 E2 — [Verify] Metric Ingestion\n",
     "\n",
-    "Poll Application Insights for the `caller-requests`, `prompt-tokens`, `completion-tokens`, and `total-tokens` custom metrics emitted by the `emit-metric` policies in C2/D1/D2. Custom metric ingestion can take 5-10 minutes after first emission.\n"
+    "Poll Application Insights for the `caller-requests` custom metric emitted by the `emit-metric` policy on the C2 / D1 / D2 APIs, then print a per-caller breakdown. Custom metric ingestion can take 5-10 minutes after first emission.\n",
+    "\n",
+    "Token counts (`PromptTokens`, `CompletionTokens`, `TotalTokens`) are **not** emitted as App Insights custom metrics in this sample - they are captured by the APIM AI gateway diagnostic into `ApiManagementGatewayLlmLog` in Log Analytics and verified in **E1**. To emit token counts as App Insights metrics in a production setup, add the built-in [`azure-openai-emit-token-metric`](https://learn.microsoft.com/azure/api-management/azure-openai-emit-token-metric-policy) policy.\n"
    ]
  },
  {
diff --git a/samples/costing/main.bicep b/samples/costing/main.bicep
index f33fa69..0784f08 100644
--- a/samples/costing/main.bicep
+++ b/samples/costing/main.bicep
@@ -85,6 +85,16 @@ param aiModels array = [
     version: '2025-08-07'
     capacity: 10
   }
+  {
+    name: 'gpt-4o-mini'
+    version: '2024-07-18'
+    capacity: 10
+  }
+  {
+    name: 'gpt-4.1-nano'
+    version: '2025-04-14'
+    capacity: 10
+  }
 ]
diff --git a/shared/apim-policies/fragments/pf-ensure-stream-include-usage.xml b/shared/apim-policies/fragments/pf-ensure-stream-include-usage.xml
index 09bc32d..dc7a2da 100644
--- a/shared/apim-policies/fragments/pf-ensure-stream-include-usage.xml
+++ b/shared/apim-policies/fragments/pf-ensure-stream-include-usage.xml
@@ -28,6 +28,13 @@
 (preserveContent: true);
+
+        // Skip rewriting for Responses-API-shaped bodies (no messages array).
+        if (body["messages"] == null) {
+            return body.ToString();
+        }
+
         if (body["stream"]?.Value<bool>() == true) {
             // Ensure stream_options object exists.
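+            // Note: only Chat Completions-shaped bodies reach this point - the
+            // guard above returned Responses-API bodies unmodified.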
if (body["stream_options"] == null) { diff --git a/shared/python/azure_cost.py b/shared/python/azure_cost.py index d7e0bc2..43556be 100644 --- a/shared/python/azure_cost.py +++ b/shared/python/azure_cost.py @@ -151,6 +151,12 @@ class ModelPricing: prompt_rate_per_k=0.00015, # $0.15 / 1M input tokens completion_rate_per_k=0.0006, # $0.60 / 1M output tokens ), + ('gpt-4.1-nano', 'globalstandard'): ModelPricing( + model='gpt-4.1-nano', + sku='GlobalStandard', + prompt_rate_per_k=0.0001, # $0.10 / 1M input tokens + completion_rate_per_k=0.0004, # $0.40 / 1M output tokens + ), } diff --git a/tests/python/test_costing_helpers.py b/tests/python/test_costing_helpers.py new file mode 100644 index 0000000..ce54f6d --- /dev/null +++ b/tests/python/test_costing_helpers.py @@ -0,0 +1,168 @@ +"""Tests for `samples/costing/_helpers.py` 6-mode AOAI traffic dispatcher.""" + +import sys +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +import requests as http_requests + +# APIM Samples imports +COSTING_DIR = Path(__file__).resolve().parents[2] / 'samples' / 'costing' +sys.path.insert(0, str(COSTING_DIR)) + +from _helpers import send_aoai_traffic # noqa: E402 + +CHAT_URL = 'https://apim.example.com/aoai/deployments/gpt/chat/completions' +RESPONSES_URL = 'https://apim.example.com/aoai/responses' +CALLER_HEADERS = {'Ocp-Apim-Subscription-Key': 'k', 'Authorization': 'Bearer t'} + +CHAT_BODY = {'messages': [{'role': 'user', 'content': 'hi'}], 'max_completion_tokens': 50} +STREAM_BODY = {**CHAT_BODY, 'stream': True, 'stream_options': {'include_usage': True}} +STREAM_BODY_NO_USAGE = {**CHAT_BODY, 'stream': True} +RESPONSES_BODY = {'model': 'gpt', 'input': 'hi', 'max_output_tokens': 50} +RESPONSES_STREAM_BODY = {**RESPONSES_BODY, 'stream': True} +RESPONSES_STATELESS_BODY = {**RESPONSES_BODY, 'store': False} + +ALL_KEYS = ( + 'chat_non_streaming', + 'chat_stream_with_usage', + 'chat_stream_without_usage', + 'responses_non_streaming', + 'responses_stream', + 'responses_non_streaming_stateless', +) + + +def _make_session() -> MagicMock: + session = MagicMock() + response = MagicMock() + response.status_code = 200 + response.iter_lines.return_value = iter([]) + session.post.return_value = response + return session + + +def _full_kwargs() -> dict: + return { + 'chat_body': CHAT_BODY, + 'stream_body': STREAM_BODY, + 'stream_body_without_usage': STREAM_BODY_NO_USAGE, + 'responses_url': RESPONSES_URL, + 'responses_body': RESPONSES_BODY, + 'responses_stream_body': RESPONSES_STREAM_BODY, + 'responses_stateless_body': RESPONSES_STATELESS_BODY, + } + + +def test_six_requests_cycle_all_six_modes_exactly_once(): + session = _make_session() + + delivered, planned, bailed = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs()) + + assert bailed is False + for key in ALL_KEYS: + assert delivered[key] == 1, f'{key} should have exactly one delivered request' + assert planned[key] == 1, f'{key} should have exactly one planned request' + assert session.post.call_count == 6 + + +def test_dispatcher_routes_each_mode_to_correct_url_and_body(): + session = _make_session() + + send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs()) + + calls = session.post.call_args_list + expected = [ + (CHAT_URL, CHAT_BODY), + (CHAT_URL, STREAM_BODY), + (CHAT_URL, STREAM_BODY_NO_USAGE), + (RESPONSES_URL, RESPONSES_BODY), + (RESPONSES_URL, RESPONSES_STREAM_BODY), + (RESPONSES_URL, RESPONSES_STATELESS_BODY), + ] + + for j, (url, body) in enumerate(expected): + args, kwargs = calls[j] + 
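+        # call_args_list preserves invocation order, so index j corresponds to dispatch mode j.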
+        assert args[0] == url, f'mode {j} url mismatch'
+        assert kwargs['json'] == body, f'mode {j} body mismatch'
+
+
+def test_responses_stateless_body_carries_store_false():
+    session = _make_session()
+
+    send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs())
+
+    mode_5_call = session.post.call_args_list[5]
+    assert mode_5_call.kwargs['json'].get('store') is False
+
+
+def test_streaming_modes_drain_response_lines():
+    session = _make_session()
+
+    send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs())
+
+    # Modes 1, 2, 4 are streaming; iter_lines must be called for each.
+    response = session.post.return_value
+    assert response.iter_lines.call_count == 3
+
+
+def test_falls_back_to_chat_when_responses_inputs_missing():
+    session = _make_session()
+
+    kwargs = _full_kwargs()
+    kwargs['responses_url'] = None
+    kwargs['responses_body'] = None
+    kwargs['responses_stream_body'] = None
+    kwargs['responses_stateless_body'] = None
+
+    delivered, planned, _ = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **kwargs)
+
+    # Modes 3 and 5 should fall back to mode 0 (chat non-streaming);
+    # mode 4 should fall back to mode 1 (chat streaming with usage).
+    assert delivered['responses_non_streaming'] == 0
+    assert delivered['responses_stream'] == 0
+    assert delivered['responses_non_streaming_stateless'] == 0
+    assert delivered['chat_non_streaming'] == 3  # j=0, plus fallbacks from j=3 and j=5
+    assert delivered['chat_stream_with_usage'] == 2  # j=1, plus fallback from j=4
+    assert delivered['chat_stream_without_usage'] == 1  # j=2
+    assert sum(planned.values()) == 6
+
+
+def test_falls_back_when_stream_body_without_usage_missing():
+    session = _make_session()
+
+    kwargs = _full_kwargs()
+    kwargs['stream_body_without_usage'] = None
+
+    delivered, _planned, _ = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **kwargs)
+
+    # Mode 2 should fall back to mode 1 (stream_body with usage).
+    assert delivered['chat_stream_without_usage'] == 0
+    assert delivered['chat_stream_with_usage'] == 2  # j=1 + j=2 (fallback)
+
+
+def test_timeout_bails_remaining_requests():
+    session = MagicMock()
+    response = MagicMock()
+    response.status_code = 200
+    response.iter_lines.return_value = iter([])
+
+    # First call succeeds, second times out, remainder should be skipped.
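+    # A bail is expected: after the Timeout, send_aoai_traffic should stop
+    # dispatching, so only the first two modes are ever attempted (asserted below).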
+ session.post.side_effect = [response, http_requests.Timeout()] + + delivered, planned, bailed = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs()) + + assert bailed is True + assert sum(delivered.values()) == 1 + assert sum(planned.values()) == 2 # planned is incremented before the post call + assert session.post.call_count == 2 + + +@pytest.mark.parametrize('count', [0, 1, 7, 13]) +def test_planned_count_always_equals_request_count(count): + session = _make_session() + + _delivered, planned, _ = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, count, **_full_kwargs()) + + assert sum(planned.values()) == count From 97c764f2b845d1b96d82f81fd773a8a774defcee Mon Sep 17 00:00:00 2001 From: Simon Kurtz Date: Fri, 1 May 2026 13:21:45 -0400 Subject: [PATCH 2/4] Refine workbook data and tiles --- samples/costing/README.md | 2 +- samples/costing/costing.workbook.json | 185 +++++++++++++++++++++++--- samples/costing/create.ipynb | 6 +- 3 files changed, 172 insertions(+), 21 deletions(-) diff --git a/samples/costing/README.md b/samples/costing/README.md index b23628a..328d0fa 100644 --- a/samples/costing/README.md +++ b/samples/costing/README.md @@ -131,7 +131,7 @@ This lab deploys and configures: - **Azure Monitor Workbook** - Pre-built tabbed dashboard with: - **Subscription-Based Costing tab**: Cost allocation table (base + variable cost per BU), base vs variable cost stacked bar chart, cost breakdown by API, request count and distribution charts, success/error rate analysis, response code distribution, business unit drill-down - **Entra ID Application Costing tab**: Usage by caller ID (bar chart + table), cost allocation by caller (table + pie chart), hourly request trend by caller - - **AI Gateway Token/PTU tab**: Three rows of summary tiles grouped under **APIM Inbound** (total APIM requests, AI APIM requests, inbound), **AI Backend** (backend requests, successful, throttled, failed), and **Tokens** (total tokens), followed by a request-funnel table, scope-reconciliation explainer + table, token cost allocation table with configurable per-1K-token rates, model and streaming pie charts, streaming vs non-streaming breakdown table, token-share pie, and hourly token-type trend chart + - **AI Gateway Token/PTU tab**: Summary tiles grouped under **APIM Inbound** (AI Requests across all subs, AI Requests per BU) and **AI Backend** (a Successful row with `Successful (all 2xx)`, `Successful (2xx, with tokens)`, `Successful (no tokens)`, and an Errors row with `Throttled (429)`, `Client Errors (4xx)`, `Server Errors (5xx)`), then a **Tokens** row (total tokens), followed by a request-funnel table, a Token Coverage Investigation drill-in for `Successful (no tokens)`, scope-reconciliation explainer + table, token cost allocation table with configurable per-1K-token rates, model and streaming pie charts, streaming vs non-streaming breakdown table, token-share pie, and hourly token-type trend chart - **SKU-Based Pricing** - Automatically derives base monthly cost, overage rate, and included request allowance from the deployed APIM SKU using built-in pricing data (sourced from the [Azure API Management pricing page](https://azure.microsoft.com/pricing/details/api-management/), March 2026) - **Budget Alerts** (optional) - Per-BU scheduled query alerts when request thresholds are exceeded diff --git a/samples/costing/costing.workbook.json b/samples/costing/costing.workbook.json index 43bb777..c24ce7d 100644 --- a/samples/costing/costing.workbook.json +++ 
b/samples/costing/costing.workbook.json @@ -212,7 +212,7 @@ "items": [ { "content": { - "json": "When APIM is used as an **AI Gateway**, the built-in model-level metrics (PTU utilization, token counts) do not break down by caller. The **AI Gateway Token/PTU** tab fills that gap using `ApiManagementGatewayLlmLog` diagnostic data, joined with `ApiManagementGatewayLogs` to **attribute token consumption to APIM subscriptions (business units)**.\n\n> Requires the `GatewayLlmLogs` diagnostic category to be enabled. For per-model pricing and what these cost estimates do **not** include (PTU, Batch, cached-input, taxes, EA pricing), see the **AI Gateway - Parameters & Pricing** section below.\n\n#### How an AI request flows through APIM\n\n1. **Inbound** - APIM accepts the HTTPS request and matches it to an API + operation. The `Ocp-Apim-Subscription-Key` header maps the call to a `bu-*` product subscription, which is the dimension every query in this workbook attributes cost to.\n2. **Inbound policy chain** - Sample policies enforce per-subscription rate limits and emit telemetry. Requests that exceed the configured limit are rejected with **HTTP 429** at this stage; they never reach the AI backend, so no `ApiManagementGatewayLlmLog` row is written and no tokens are consumed.\n3. **Backend dispatch** - APIM forwards the surviving request to the configured AI backend (Azure OpenAI / AI Foundry). For load-balanced backends, APIM picks a pool member and can fail over on transient errors.\n4. **LLM diagnostic capture** - On a successful (`2xx`) response, APIM writes one row to `ApiManagementGatewayLlmLog` with `PromptTokens`, `CompletionTokens`, `TotalTokens`, `ModelName`, and `IsStreamCompletion`. Streaming responses only carry token counts when `stream_options.include_usage = true` reaches the backend. In this sample, some streaming requests intentionally omit that flag so the policy fragment can add it; when it does, APIM also writes proof into `ApiManagementGatewayLogs.TraceRecords`. Backend `5xx` responses produce no LLM-log row.\n5. **Outbound** - APIM returns the response to the client and writes one row to `ApiManagementGatewayLogs` regardless of outcome. That gateway-log row is what the **AI Requests Received (bu-*)** tile counts.\n\n#### Reading the AI Gateway tiles\n\nThe blue tiles form a **monotonically non-increasing funnel**: every stage is a strict subset of the previous one.\n\n```\nTotal APIM Requests >= AI APIM Requests (all subs) >= AI Requests Received (bu-*)\n >= AI Not Throttled >= AI Successful (2xx)\n```\n\n**Why two tiles can be equal:** `AI APIM Requests (all subs)` and `AI Requests Received (bu-*)` differ only on the subscription filter. They are equal whenever every AI call was made with a `bu-*` subscription key, which is the default for this sample. They diverge in production environments that mix BU subscriptions with starter/unlimited products or master-key calls.\n\nThe red tiles - **AI Throttled (429)**, **AI Client Errors (4xx, non-429)**, and **AI Failed (5xx)** - sit *outside* the funnel and surface the gaps:\n\n- **AI Throttled (429)** = `AI Requests Received (bu-*)` - `AI Not Throttled`\n- **AI Client Errors (4xx, non-429)** counts requests rejected with status codes such as 400 (bad prompt), 401/403 (auth or content-safety failure), 404 (wrong deployment/model), 408/413/422 (timeout, payload, validation). 
These are included in `AI Not Throttled` because they are not 429s, but most never reach the AI backend.\n- **AI Failed (5xx)** = backend or APIM 5xx responses; included in `AI Not Throttled` but excluded from `AI Successful (2xx)`.\n\n**AI Successful (2xx)** and **Total Tokens Used** are the `2xx` subset - `429` throttles, non-429 4xx client errors, and `5xx` failures all contribute zero tokens.\n\n**Note on 3xx:** AI chat-completion endpoints are POSTs and do not produce redirect responses in normal operation, so there is no dedicated 3xx tile. The **HTTP Response Code Distribution** chart on the *Usage Analytics* tab will surface any 3xx responses if they ever occur.\n\n#### Why AI APIs are listed manually\n\nAPIM diagnostic logs only carry identifiers like `ApiId`, `OperationId`, and `BackendId` - they do **not** include APIM tags. Azure Resource Graph also does not index APIM API tag associations, so the workbook cannot auto-discover AI APIs from inside Log Analytics. The **AI API names** parameter on the AI Gateway tab is therefore a plain comma-separated list. See the parameter description for an `az rest` snippet that lists APIs by tag.\n\n#### Streaming, model breakdown, PTU\n\n- **Streaming vs Non-Streaming** - confirms APIM captures token counts for **both** delivery modes. The streaming visuals split requests into `Streaming (client-supplied usage)` and `Streaming (policy-injected usage)` using `ApiManagementGatewayLogs.TraceRecords`, so you can prove when APIM altered the request before forwarding it.\n- **Model breakdown** - useful when multiple models (gpt-5-mini, gpt-4o-mini, etc.) are served through the same APIM gateway.\n- **PTU utilization** - the dashed line on the trend chart represents the configurable PTU capacity threshold. If a business unit's hourly token rate approaches the PTU capacity, consider provisioning more throughput or applying rate limiting." + "json": "When APIM is used as an **AI Gateway**, the built-in model-level metrics (PTU utilization, token counts) do not break down by caller. The **AI Gateway Token/PTU** tab fills that gap using `ApiManagementGatewayLlmLog` diagnostic data, joined with `ApiManagementGatewayLogs` to **attribute token consumption to APIM subscriptions (business units)**.\n\n> Requires the `GatewayLlmLogs` diagnostic category to be enabled. For per-model pricing and what these cost estimates do **not** include (PTU, Batch, cached-input, taxes, EA pricing), see the **AI Gateway - Parameters & Pricing** section below.\n\n#### How an AI request flows through APIM\n\n1. **Inbound** - APIM accepts the HTTPS request and matches it to an API + operation. The `Ocp-Apim-Subscription-Key` header maps the call to a `bu-*` product subscription, which is the dimension every query in this workbook attributes cost to.\n2. **Inbound policy chain** - Sample policies enforce per-subscription rate limits and emit telemetry. Requests that exceed the configured limit are rejected with **HTTP 429** at this stage; they never reach the AI backend, so no `ApiManagementGatewayLlmLog` row is written and no tokens are consumed.\n3. **Backend dispatch** - APIM forwards the surviving request to the configured AI backend (Azure OpenAI / AI Foundry). For load-balanced backends, APIM picks a pool member and can fail over on transient errors.\n4. **LLM diagnostic capture** - On a successful (`2xx`) response, APIM writes one row to `ApiManagementGatewayLlmLog` with `PromptTokens`, `CompletionTokens`, `TotalTokens`, `ModelName`, and `IsStreamCompletion`. 
Streaming responses only carry token counts when `stream_options.include_usage = true` reaches the backend. In this sample, some streaming requests intentionally omit that flag so the policy fragment can add it; when it does, APIM also writes proof into `ApiManagementGatewayLogs.TraceRecords`. Backend `5xx` responses produce no LLM-log row.\n5. **Outbound** - APIM returns the response to the client and writes one row to `ApiManagementGatewayLogs` regardless of outcome. That gateway-log row is what the **AI Requests per BU** tile counts.\n\n#### Reading the AI Gateway tiles\n\nThe **APIM Inbound** row tiles form a **monotonically non-increasing funnel** down to the **AI Backend** row, where the *Successful* tiles continue the funnel and the error tiles surface the gaps:\n\n```\nTotal APIM Requests >= AI Requests (all subs) >= AI Requests per BU\n >= Successful (all 2xx) >= Successful (2xx, with tokens)\n```\n\n**Why two tiles can be equal:** `AI Requests (all subs)` and `AI Requests per BU` differ only on the subscription filter. They are equal whenever every AI call was made with a `bu-*` subscription key, which is the default for this sample. They diverge in production environments that mix BU subscriptions with starter/unlimited products or master-key calls.\n\n**AI Backend - row 1 (Successful):**\n\n- **Successful (all 2xx)** = every `2xx` response from a `bu-*` subscription on an AI API. This is the total of token-bearing and no-token successes.\n- **Successful (2xx, with tokens)** = the **token-bearing subset** - rows in `ApiManagementGatewayLlmLog` with `TotalTokens > 0`, `CompletionTokens > 0`, and a non-empty `ModelName`. **Total Tokens Used** sums tokens for this same subset. This is the count that drives showback.\n- **Successful (no tokens)** = `Successful (all 2xx)` - `Successful (2xx, with tokens)`. These are real, billable AI calls that returned `2xx` but landed without measurable token data - typically because a streaming response lost its final `usage` chunk, the backend returned an empty completion or content-safety refusal, or the LLM-log row carrying `ModelName` was dropped. Drill in below via the **Token Coverage Investigation** section.\n\n**AI Backend - row 2 (Errors):** these tiles sit *outside* the success funnel:\n\n- **Throttled (429)** = `AI Requests per BU` minus all non-429 outcomes. Rejected by APIM rate-limit policy before reaching the backend; no tokens consumed.\n- **Client Errors (4xx)** counts non-429 requests rejected with status codes such as 400 (bad prompt), 401/403 (auth or content-safety failure), 404 (wrong deployment/model), 408/413/422 (timeout, payload, validation). Most never reach the AI backend.\n- **Server Errors (5xx)** = backend or APIM 5xx responses. The number of 2xx requests *without* matching token data is surfaced in the **Token Coverage Investigation** section directly below the AI Gateway tiles, with a drill-in by API, operation, and likely cause. In production, you should expect this gap to be small but non-zero - track it as a quality KPI for showback accuracy.\n\n**Note on 3xx:** AI chat-completion endpoints are POSTs and do not produce redirect responses in normal operation, so there is no dedicated 3xx tile. The **HTTP Response Code Distribution** chart on the *Usage Analytics* tab will surface any 3xx responses if they ever occur.\n\n#### Why AI APIs are listed manually\n\nAPIM diagnostic logs only carry identifiers like `ApiId`, `OperationId`, and `BackendId` - they do **not** include APIM tags. 
Azure Resource Graph also does not index APIM API tag associations, so the workbook cannot auto-discover AI APIs from inside Log Analytics. The **AI API names** parameter on the AI Gateway tab is therefore a plain comma-separated list. See the parameter description for an `az rest` snippet that lists APIs by tag.\n\n#### Streaming, model breakdown, PTU\n\n- **Streaming vs Non-Streaming** - confirms APIM captures token counts for **both** delivery modes. The streaming visuals split requests into `Streaming (client-supplied usage)` and `Streaming (policy-injected usage)` using `ApiManagementGatewayLogs.TraceRecords`, so you can prove when APIM altered the request before forwarding it.\n- **Model breakdown** - useful when multiple models (gpt-5-mini, gpt-4o-mini, etc.) are served through the same APIM gateway.\n- **PTU utilization** - the dashed line on the trend chart represents the configurable PTU capacity threshold. If a business unit's hourly token rate approaches the PTU capacity, consider provisioning more throughput or applying rate limiting." }, "name": "text - instructions-aigateway-body", "type": 1 @@ -1194,14 +1194,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode != 429\n| summarize Requests = count()\n| extend Label = 'Not throttled', Status = ''\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode between (200 .. 299)\n| summarize Requests = count()\n| extend Label = 'Successful', Status = 'all 2xx'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No gateway requests found for the selected time range.", + "noDataMessage": "No 2xx responses in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1231,9 +1231,9 @@ "showBorder": true } }, - "name": "query - ai-backend-requests-tile", + "name": "query - ai-successful-total-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1241,7 +1241,7 @@ }, { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx)', Status = ''\n| project Label, RequestCount, Status", + "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)', Status = 'billable'\n| project Label, RequestCount, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1280,7 +1280,7 @@ }, "name": "query - 
request-count-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1288,7 +1288,62 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = ''\n| project Label, Requests, Status", + "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 299)\n| project CorrelationId;\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = 'Successful (no tokens)', Status = '⚠ investigate'\n| project Label, NoTokenRequests, Status", + "queryType": 0, + "resourceType": "microsoft.operationalinsights/workspaces", + "size": 4, + "timeContext": { + "durationMs": 2592000000 + }, + "noDataMessage": "All 2xx responses have matching token data.", + "version": "KqlItem/1.0", + "visualization": "tiles", + "tileSettings": { + "titleContent": { + "columnMatch": "Label", + "formatter": 1 + }, + "leftContent": { + "columnMatch": "NoTokenRequests", + "formatter": 12, + "formatOptions": { + "min": 0, + "palette": "yellow" + }, + "numberFormat": { + "unit": 17, + "options": { + "style": "decimal", + "useGrouping": true + } + } + }, + "subtitleContent": { + "columnMatch": "Status", + "formatter": 1 + }, + "showBorder": true + } + }, + "name": "query - ai-2xx-no-tokens-row-tile", + "type": 3, + "customWidth": "33", + "styleSettings": { + "maxWidth": "320px", + "showBorder": false + } + }, + { + "type": 1, + "content": { + "json": " " + }, + "name": "text - ai-backend-row-break", + "customWidth": "100" + }, + { + "content": { + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = 'rate limited'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1327,7 +1382,7 @@ }, "name": "query - ai-throttled-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1335,14 +1390,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 500\n| summarize Requests = count()\n| extend Label = 'Failed (5xx)', Status = ''\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 400 and ResponseCode < 500 and ResponseCode != 429\n| summarize Requests = count()\n| extend Label = 'Client Errors (4xx)', Status = 'non-429 rejects'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", 
"size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No backend failures in the selected time range.", + "noDataMessage": "No non-429 client errors in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1372,9 +1427,9 @@ "showBorder": true } }, - "name": "query - ai-failed-tile", + "name": "query - ai-client-errors-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1382,14 +1437,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 400 and ResponseCode < 500 and ResponseCode != 429\n| summarize Requests = count()\n| extend Label = 'Client Errors (4xx, non-429)', Status = ''\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 500\n| summarize Requests = count()\n| extend Label = 'Server Errors (5xx)', Status = 'backend / APIM'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No non-429 client errors in the selected time range.", + "noDataMessage": "No server errors in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1419,9 +1474,9 @@ "showBorder": true } }, - "name": "query - ai-client-errors-tile", + "name": "query - ai-failed-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1540,6 +1595,102 @@ "showBorder": true } }, + { + "type": 1, + "content": { + "json": "---\n\n## ⚠️ Token Coverage Investigation\n\nIf **`AI Successful (2xx, with tokens)`** is lower than the total `2xx` responses for the AI APIs, some completed calls returned no usable token data. Those requests are real, billable AI traffic - they just won't appear in token-based showback.\n\n**Why this matters in production:** if you bill business units by tokens, *no-token* requests are unbilled work. For streaming-heavy workloads this can be 10-15% of traffic. Track the *2xx without tokens* count below as a quality KPI; spikes warrant investigation.\n\n**Common causes (most likely first):**\n\n1. **Streaming response missing the final `usage` chunk** - the SSE stream ended before `data: {... \"usage\": {...}}` arrived. Causes: client disconnect, upstream truncation, or the model deployment ignored `stream_options.include_usage`. The `Ensure-Stream-Include-Usage` policy fragment in this sample injects the flag inbound to mitigate this.\n2. **Empty completion / content-filter** - backend returned 200 with `completion_tokens = 0` (refusal, content-safety filter, tool-only response).\n3. **Multi-row LLM-log race** - `ApiManagementGatewayLlmLog` emits multiple rows per call; a transient ingestion drop of the `ModelName`-bearing row leaves the request looking token-less.\n4. 
**Non-chat-completion operations** - models listing, embeddings without diagnostic instrumentation, or operations the LLM diagnostic does not cover.\n\n**How to act on the table below:**\n\n- High counts on a single API + Operation row tagged *Streaming - usage chunk missing* indicate a deployment that needs `stream_options.include_usage` enforced (or a client SDK that's stripping it). Verify the `Ensure-Stream-Include-Usage` fragment is attached.\n- Counts under *Empty completion / content filter* are usually expected baseline noise; spikes may indicate prompt-injection attempts or content-safety policy changes.\n- Counts under *No LLM-log row* are the most concerning - investigate APIM diagnostic settings and confirm `enableLlmLogging` is on for the affected API." + }, + "name": "text - token-coverage-investigation-header", + "customWidth": "100" + }, + { + "content": { + "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 299)\n| project CorrelationId;\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = '2xx without tokens', Status = 'investigate'\n| project Label, NoTokenRequests, Status", + "queryType": 0, + "resourceType": "microsoft.operationalinsights/workspaces", + "size": 4, + "timeContext": { + "durationMs": 2592000000 + }, + "noDataMessage": "All 2xx responses have matching token data.", + "version": "KqlItem/1.0", + "visualization": "tiles", + "tileSettings": { + "titleContent": { + "columnMatch": "Label", + "formatter": 1 + }, + "leftContent": { + "columnMatch": "NoTokenRequests", + "formatter": 12, + "formatOptions": { + "min": 0, + "palette": "orange" + }, + "numberFormat": { + "unit": 17, + "options": { + "style": "decimal", + "useGrouping": true + } + } + }, + "subtitleContent": { + "columnMatch": "Status", + "formatter": 1 + }, + "showBorder": true + } + }, + "name": "query - 2xx-no-tokens-tile", + "type": 3, + "customWidth": "25", + "styleSettings": { + "maxWidth": "320px", + "showBorder": false + } + }, + { + "content": { + "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 
299)\n| project CorrelationId, ApiId, OperationId, ApimSubscriptionId;\nlet llmRollup = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| summarize\n HasAnyRow = countif(true) > 0,\n HasTokens = countif(TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)) > 0,\n HasStream = countif(IsStreamCompletion == true) > 0\n by CorrelationId;\ntwoXx\n| join kind=leftouter llmRollup on CorrelationId\n| where coalesce(HasTokens, false) == false\n| extend ['Likely Cause'] = case(\n HasStream == true, 'Streaming - usage chunk missing',\n HasAnyRow == true, 'Empty completion / content filter',\n 'No LLM-log row (diagnostic gap)')\n| summarize ['No-Token Requests'] = dcount(CorrelationId) by ['Business Unit'] = substring(ApimSubscriptionId, 3), API = ApiId, Operation = OperationId, ['Likely Cause']\n| order by ['No-Token Requests'] desc",
+        "queryType": 0,
+        "resourceType": "microsoft.operationalinsights/workspaces",
+        "size": 0,
+        "timeContext": {
+          "durationMs": 2592000000
+        },
+        "title": "2xx Requests Without Token Data - Drill-in by API, Operation & Likely Cause",
+        "noDataMessage": "All 2xx responses have matching token data.",
+        "version": "KqlItem/1.0",
+        "visualization": "table",
+        "gridSettings": {
+          "formatters": [
+            {
+              "columnMatch": "No-Token Requests",
+              "formatter": 8,
+              "formatOptions": {
+                "min": 0,
+                "palette": "orange"
+              },
+              "numberFormat": {
+                "unit": 0,
+                "options": {
+                  "style": "decimal",
+                  "useGrouping": true,
+                  "maximumFractionDigits": 0
+                }
+              }
+            }
+          ],
+          "filter": true
+        }
+      },
+      "name": "query - 2xx-no-tokens-breakdown",
+      "type": 3,
+      "styleSettings": {
+        "showBorder": true
+      }
+    },
     {
       "content": {
         "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\d{4}-\d{2}-\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, ApimSubscriptionId\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nlet grandTotalTokens = toscalar(bucketedLlmLogs | summarize sum(TotalTokens));\nbucketedLlmLogs\n| summarize\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n Requests = count(),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 4)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 4)\n| extend TotalCost = round(PromptCost + CompletionCost, 4)\n| extend TotalTokensPct = iif(grandTotalTokens > 0, round(TotalTokens * 100.0 / grandTotalTokens, 2), 0.0)\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['Prompt Tokens'] = 
PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Total Tokens Pct'] = TotalTokensPct,\n Requests,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost", diff --git a/samples/costing/create.ipynb b/samples/costing/create.ipynb index 8133b56..0d52a23 100644 --- a/samples/costing/create.ipynb +++ b/samples/costing/create.ipynb @@ -151,9 +151,9 @@ "# Responses-API rows reach the workbook only via ApiManagementGatewayLlmLog\n", "# (the diagnostic-log path), not via emit-token-metric custom metrics.\n", "model_test_matrix = {\n", - " 'gpt-4o-mini': {'version': '2024-07-18', 'capacity': 10, 'requests_per_caller': 6},\n", - " 'gpt-4.1-nano': {'version': '2025-04-14', 'capacity': 10, 'requests_per_caller': 8},\n", - " 'gpt-5-mini': {'version': '2025-08-07', 'capacity': 10, 'requests_per_caller': 10},\n", + " 'gpt-4o-mini': {'version': '2024-07-18', 'capacity': 10, 'requests_per_caller': 2},\n", + " 'gpt-4.1-nano': {'version': '2025-04-14', 'capacity': 10, 'requests_per_caller': 4},\n", + " 'gpt-5-mini': {'version': '2025-08-07', 'capacity': 10, 'requests_per_caller': 6},\n", "}\n", "\n", "# Derived: list form used for Bicep deployment and iteration\n", From 6fb4bd606c2d4902a8ee9a7aae83d1775d1078ed Mon Sep 17 00:00:00 2001 From: Simon Kurtz Date: Fri, 1 May 2026 13:47:09 -0400 Subject: [PATCH 3/4] Add cost disclaimer, adjust AI tiles --- samples/costing/costing.workbook.json | 52 ++++++++------------------- 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/samples/costing/costing.workbook.json b/samples/costing/costing.workbook.json index c24ce7d..155f908 100644 --- a/samples/costing/costing.workbook.json +++ b/samples/costing/costing.workbook.json @@ -136,7 +136,7 @@ "items": [ { "content": { - "json": "# APIM Costing & Showback Workbook\n\nThis workbook turns raw API Management telemetry into **per-business-unit cost and usage views** so platform teams can chargeback, forecast, and right-size APIM and Azure OpenAI consumption. Use these **Instructions / Overview** as your reading guide - it explains what each tab is for, what data it relies on, and the nuances to be aware of when interpreting the numbers.\n\n## At a glance\n\n| # | Tab | What it answers | Primary data source |\n|---|---|---|---|\n| 1 | **Overview** | What is this workbook and how do I read it? | (this page) |\n| 2 | **Subscription-Based Costing** | How do I split APIM platform + per-request cost across business units (BUs)? | `ApiManagementGatewayLogs` |\n| 3 | **Entra ID Application Costing** | How do I split cost across Entra ID applications calling APIM? | App Insights `customMetrics` (`caller-requests`) |\n| 4 | **AI Gateway Token/PTU** | How many tokens did each BU consume through the AI Gateway, and what does that cost? | `ApiManagementGatewayLlmLog` joined with `ApiManagementGatewayLogs` |\n| 5 | **Per-Request Detail** | Show me every single AI Gateway request with tokens, model, latency, cost. | `ApiManagementGatewayLlmLog` + `ApiManagementGatewayLogs` |\n\n## Shared assumptions and conventions\n\n- **Business units** are identified by APIM **subscription IDs that start with `bu-`** (e.g. `bu-hr`, `bu-finance`). The `bu-` prefix is stripped for display.\n- **Time Range** at the top of the workbook applies to every tab. 
Each tab also keeps its own `timeContext` so server-side filtering still works.\n- **All costs are estimates.** They are computed from request and token counts and the per-unit rates you configure. They do **not** include caching discounts, Batch API discounts, PTU reservations, regional pricing variations, taxes, or EA pricing. Always validate against your official [Azure Cost Management](https://portal.azure.com/#view/Microsoft_Azure_CostManagement/Menu/~/overview) invoice.\n- **Numbers across tabs are intentionally different.** They measure different scopes:\n - *Subscription-Based* counts every `bu-*` request at the gateway (AI + non-AI, success + failure).\n - *AI Gateway* counts only successful AI calls that produced token usage.\n - *Per-Request Detail* lists every individual gateway record. The **Reconciliation table** on the AI Gateway tab shows exactly how the totals line up.\n\n## Data freshness and gotchas\n\n- **Log Analytics ingestion**: typically 1-3 minutes for gateway logs, 5-10 minutes for App Insights custom metrics. If a tab looks empty right after generating traffic, give it a few minutes and refresh.\n- **AI Gateway streaming**: Server-Sent Events (SSE) responses normally omit the final `usage` object. The `emit_metric_caller_tokens.xml` policy injects `stream_options.include_usage = true` so the last chunk carries token totals. If streaming rows show zero tokens, confirm `force_stream_include_usage = True` in the notebook.\n- **Multiple LLM log rows per request**: `ApiManagementGatewayLlmLog` emits multiple events per call (one summary + per-backend events), and only one carries `ModelName` and tokens. Queries pre-summarize on `CorrelationId` to avoid double-counting; if you copy a query out of the workbook, keep that step.\n- **Throttled (429) and failed AI calls** never show up in the AI Gateway tile counts - they have no token usage. They *do* show up in the Per-Request and Subscription-Based views.\n- **Tenant mismatch**: the Entra ID tab requires the workbook viewer to be signed in to the same tenant that owns the subscription. Otherwise tiles will be empty with an `access token issuer` warning.\n\n## Sections below\n\nExpand the collapsible sections below for usage guidance and tab-by-tab notes covering parameters, formulas, and pitfalls specific to each view." + "json": "# APIM Costing & Showback Workbook\n\n> ## ⚠️ For informational use only - **not a system of record for billing**\n>\n> All token counts, request counts, and cost figures shown across every tab of this workbook are **estimates** derived from APIM gateway telemetry and Application Insights metrics. 
They are intended to support **trend analysis, capacity planning, and showback conversations** - not to serve as the source of truth for chargeback or invoicing.\n>\n> **Telemetry can be incomplete** by the nature of how it is captured:\n> - Streaming (SSE) responses may omit the final `usage` chunk if `stream_options.include_usage` is not set, producing zero-token rows.\n> - Throttled (429), failed, and cancelled AI calls produce no token usage and are excluded from token totals.\n> - `ApiManagementGatewayLlmLog` emits multiple events per request; only one carries `ModelName` and tokens, so partial ingestion or schema drift can drop attribution.\n> - Log Analytics and Application Insights have ingestion delays (1-3 min for gateway logs, 5-10 min for custom metrics) and per-workspace daily caps that can silently drop data.\n> - Per-unit rates are configured manually on each tab and do not reflect caching discounts, Batch API discounts, PTU reservations, regional pricing, taxes, or EA / MCA negotiated pricing.\n>\n> **If these numbers will drive a real chargeback or invoice, always cross-reference them against your authoritative Azure billing data** in [Azure Cost Management](https://portal.azure.com/#view/Microsoft_Azure_CostManagement/Menu/~/overview) and the Azure OpenAI / Azure AI Foundry usage exports for the same time range. Reconcile any material variances before billing a business unit.\n\nThis workbook turns raw API Management telemetry into **per-business-unit cost and usage views** so platform teams can chargeback, forecast, and right-size APIM and Azure OpenAI consumption. Use these **Instructions / Overview** as your reading guide - it explains what each tab is for, what data it relies on, and the nuances to be aware of when interpreting the numbers.\n\n## At a glance\n\n| # | Tab | What it answers | Primary data source |\n|---|---|---|---|\n| 1 | **Overview** | What is this workbook and how do I read it? | (this page) |\n| 2 | **Subscription-Based Costing** | How do I split APIM platform + per-request cost across business units (BUs)? | `ApiManagementGatewayLogs` |\n| 3 | **Entra ID Application Costing** | How do I split cost across Entra ID applications calling APIM? | App Insights `customMetrics` (`caller-requests`) |\n| 4 | **AI Gateway Token/PTU** | How many tokens did each BU consume through the AI Gateway, and what does that cost? | `ApiManagementGatewayLlmLog` joined with `ApiManagementGatewayLogs` |\n| 5 | **Per-Request Detail** | Show me every single AI Gateway request with tokens, model, latency, cost. | `ApiManagementGatewayLlmLog` + `ApiManagementGatewayLogs` |\n\n## Shared assumptions and conventions\n\n- **Business units** are identified by APIM **subscription IDs that start with `bu-`** (e.g. `bu-hr`, `bu-finance`). The `bu-` prefix is stripped for display.\n- **Time Range** at the top of the workbook applies to every tab. Each tab also keeps its own `timeContext` so server-side filtering still works.\n- **All costs are estimates.** They are computed from request and token counts and the per-unit rates you configure. They do **not** include caching discounts, Batch API discounts, PTU reservations, regional pricing variations, taxes, or EA pricing. 
Always validate against your official [Azure Cost Management](https://portal.azure.com/#view/Microsoft_Azure_CostManagement/Menu/~/overview) invoice.\n- **Numbers across tabs are intentionally different.** They measure different scopes:\n - *Subscription-Based* counts every `bu-*` request at the gateway (AI + non-AI, success + failure).\n - *AI Gateway* counts only successful AI calls that produced token usage.\n - *Per-Request Detail* lists every individual gateway record. The **Reconciliation table** on the AI Gateway tab shows exactly how the totals line up.\n\n## Data freshness and gotchas\n\n- **Log Analytics ingestion**: typically 1-3 minutes for gateway logs, 5-10 minutes for App Insights custom metrics. If a tab looks empty right after generating traffic, give it a few minutes and refresh.\n- **AI Gateway streaming**: Server-Sent Events (SSE) responses normally omit the final `usage` object. The `emit_metric_caller_tokens.xml` policy injects `stream_options.include_usage = true` so the last chunk carries token totals. If streaming rows show zero tokens, confirm `force_stream_include_usage = True` in the notebook.\n- **Multiple LLM log rows per request**: `ApiManagementGatewayLlmLog` emits multiple events per call (one summary + per-backend events), and only one carries `ModelName` and tokens. Queries pre-summarize on `CorrelationId` to avoid double-counting; if you copy a query out of the workbook, keep that step.\n- **Throttled (429) and failed AI calls** never show up in the AI Gateway tile counts - they have no token usage. They *do* show up in the Per-Request and Subscription-Based views.\n- **Tenant mismatch**: the Entra ID tab requires the workbook viewer to be signed in to the same tenant that owns the subscription. Otherwise tiles will be empty with an `access token issuer` warning.\n\n## Sections below\n\nExpand the collapsible sections below for usage guidance and tab-by-tab notes covering parameters, formulas, and pitfalls specific to each view." }, "name": "text - instructions-overview", "type": 1 @@ -212,7 +212,7 @@ "items": [ { "content": { - "json": "When APIM is used as an **AI Gateway**, the built-in model-level metrics (PTU utilization, token counts) do not break down by caller. The **AI Gateway Token/PTU** tab fills that gap using `ApiManagementGatewayLlmLog` diagnostic data, joined with `ApiManagementGatewayLogs` to **attribute token consumption to APIM subscriptions (business units)**.\n\n> Requires the `GatewayLlmLogs` diagnostic category to be enabled. For per-model pricing and what these cost estimates do **not** include (PTU, Batch, cached-input, taxes, EA pricing), see the **AI Gateway - Parameters & Pricing** section below.\n\n#### How an AI request flows through APIM\n\n1. **Inbound** - APIM accepts the HTTPS request and matches it to an API + operation. The `Ocp-Apim-Subscription-Key` header maps the call to a `bu-*` product subscription, which is the dimension every query in this workbook attributes cost to.\n2. **Inbound policy chain** - Sample policies enforce per-subscription rate limits and emit telemetry. Requests that exceed the configured limit are rejected with **HTTP 429** at this stage; they never reach the AI backend, so no `ApiManagementGatewayLlmLog` row is written and no tokens are consumed.\n3. **Backend dispatch** - APIM forwards the surviving request to the configured AI backend (Azure OpenAI / AI Foundry). For load-balanced backends, APIM picks a pool member and can fail over on transient errors.\n4. 
**LLM diagnostic capture** - On a successful (`2xx`) response, APIM writes one row to `ApiManagementGatewayLlmLog` with `PromptTokens`, `CompletionTokens`, `TotalTokens`, `ModelName`, and `IsStreamCompletion`. Streaming responses only carry token counts when `stream_options.include_usage = true` reaches the backend. In this sample, some streaming requests intentionally omit that flag so the policy fragment can add it; when it does, APIM also writes proof into `ApiManagementGatewayLogs.TraceRecords`. Backend `5xx` responses produce no LLM-log row.\n5. **Outbound** - APIM returns the response to the client and writes one row to `ApiManagementGatewayLogs` regardless of outcome. That gateway-log row is what the **AI Requests per BU** tile counts.\n\n#### Reading the AI Gateway tiles\n\nThe **APIM Inbound** row tiles form a **monotonically non-increasing funnel** down to the **AI Backend** row, where the *Successful* tiles continue the funnel and the error tiles surface the gaps:\n\n```\nTotal APIM Requests >= AI Requests (all subs) >= AI Requests per BU\n >= Successful (all 2xx) >= Successful (2xx, with tokens)\n```\n\n**Why two tiles can be equal:** `AI Requests (all subs)` and `AI Requests per BU` differ only on the subscription filter. They are equal whenever every AI call was made with a `bu-*` subscription key, which is the default for this sample. They diverge in production environments that mix BU subscriptions with starter/unlimited products or master-key calls.\n\n**AI Backend - row 1 (Successful):**\n\n- **Successful (all 2xx)** = every `2xx` response from a `bu-*` subscription on an AI API. This is the total of token-bearing and no-token successes.\n- **Successful (2xx, with tokens)** = the **token-bearing subset** - rows in `ApiManagementGatewayLlmLog` with `TotalTokens > 0`, `CompletionTokens > 0`, and a non-empty `ModelName`. **Total Tokens Used** sums tokens for this same subset. This is the count that drives showback.\n- **Successful (no tokens)** = `Successful (all 2xx)` - `Successful (2xx, with tokens)`. These are real, billable AI calls that returned `2xx` but landed without measurable token data - typically because a streaming response lost its final `usage` chunk, the backend returned an empty completion or content-safety refusal, or the LLM-log row carrying `ModelName` was dropped. Drill in below via the **Token Coverage Investigation** section.\n\n**AI Backend - row 2 (Errors):** these tiles sit *outside* the success funnel:\n\n- **Throttled (429)** = `AI Requests per BU` minus all non-429 outcomes. Rejected by APIM rate-limit policy before reaching the backend; no tokens consumed.\n- **Client Errors (4xx)** counts non-429 requests rejected with status codes such as 400 (bad prompt), 401/403 (auth or content-safety failure), 404 (wrong deployment/model), 408/413/422 (timeout, payload, validation). Most never reach the AI backend.\n- **Server Errors (5xx)** = backend or APIM 5xx responses. The number of 2xx requests *without* matching token data is surfaced in the **Token Coverage Investigation** section directly below the AI Gateway tiles, with a drill-in by API, operation, and likely cause. In production, you should expect this gap to be small but non-zero - track it as a quality KPI for showback accuracy.\n\n**Note on 3xx:** AI chat-completion endpoints are POSTs and do not produce redirect responses in normal operation, so there is no dedicated 3xx tile. 
The **HTTP Response Code Distribution** chart on the *Usage Analytics* tab will surface any 3xx responses if they ever occur.\n\n#### Why AI APIs are listed manually\n\nAPIM diagnostic logs only carry identifiers like `ApiId`, `OperationId`, and `BackendId` - they do **not** include APIM tags. Azure Resource Graph also does not index APIM API tag associations, so the workbook cannot auto-discover AI APIs from inside Log Analytics. The **AI API names** parameter on the AI Gateway tab is therefore a plain comma-separated list. See the parameter description for an `az rest` snippet that lists APIs by tag.\n\n#### Streaming, model breakdown, PTU\n\n- **Streaming vs Non-Streaming** - confirms APIM captures token counts for **both** delivery modes. The streaming visuals split requests into `Streaming (client-supplied usage)` and `Streaming (policy-injected usage)` using `ApiManagementGatewayLogs.TraceRecords`, so you can prove when APIM altered the request before forwarding it.\n- **Model breakdown** - useful when multiple models (gpt-5-mini, gpt-4o-mini, etc.) are served through the same APIM gateway.\n- **PTU utilization** - the dashed line on the trend chart represents the configurable PTU capacity threshold. If a business unit's hourly token rate approaches the PTU capacity, consider provisioning more throughput or applying rate limiting." + "json": "When APIM is used as an **AI Gateway**, the built-in model-level metrics (PTU utilization, token counts) do not break down by caller. The **AI Gateway Token/PTU** tab fills that gap using `ApiManagementGatewayLlmLog` diagnostic data, joined with `ApiManagementGatewayLogs` to **attribute token consumption to APIM subscriptions (business units)**.\n\n> Requires the `GatewayLlmLogs` diagnostic category to be enabled. For per-model pricing and what these cost estimates do **not** include (PTU, Batch, cached-input, taxes, EA pricing), see the **AI Gateway - Parameters & Pricing** section below.\n\n#### How an AI request flows through APIM\n\n1. **Inbound** - APIM accepts the HTTPS request and matches it to an API + operation. The `Ocp-Apim-Subscription-Key` header maps the call to a `bu-*` product subscription, which is the dimension every query in this workbook attributes cost to.\n2. **Inbound policy chain** - Sample policies enforce per-subscription rate limits and emit telemetry. Requests that exceed the configured limit are rejected with **HTTP 429** at this stage; they never reach the AI backend, so no `ApiManagementGatewayLlmLog` row is written and no tokens are consumed.\n3. **Backend dispatch** - APIM forwards the surviving request to the configured AI backend (Azure OpenAI / AI Foundry). For load-balanced backends, APIM picks a pool member and can fail over on transient errors.\n4. **LLM diagnostic capture** - On a successful (`2xx`) response, APIM writes one row to `ApiManagementGatewayLlmLog` with `PromptTokens`, `CompletionTokens`, `TotalTokens`, `ModelName`, and `IsStreamCompletion`. Streaming responses only carry token counts when `stream_options.include_usage = true` reaches the backend. In this sample, some streaming requests intentionally omit that flag so the policy fragment can add it; when it does, APIM also writes proof into `ApiManagementGatewayLogs.TraceRecords`. Backend `5xx` responses produce no LLM-log row.\n5. **Outbound** - APIM returns the response to the client and writes one row to `ApiManagementGatewayLogs` regardless of outcome. 
That gateway-log row is what the **AI Requests per BU** tile counts.\n\n#### Reading the AI Gateway tiles\n\nThe **APIM Inbound** row tiles form a **monotonically non-increasing funnel** down to the **AI Backend** row, where the *Successful* tiles continue the funnel and the error tiles surface the gaps:\n\n```\nTotal APIM Requests >= AI Requests (all subs) >= AI Requests per BU\n >= Successful (all 2xx) >= Successful (2xx, with tokens)\n```\n\n**Why two tiles can be equal:** `AI Requests (all subs)` and `AI Requests per BU` differ only on the subscription filter. They are equal whenever every AI call was made with a `bu-*` subscription key, which is the default for this sample. They diverge in production environments that mix BU subscriptions with starter/unlimited products or master-key calls.\n\n**AI Backend - row 1 (Successful):**\n\n- **Successful (all 2xx)** = every `2xx` response from a `bu-*` subscription on an AI API. This is the total of token-bearing and no-token successes.\n- **Successful (2xx, with tokens)** = the **token-bearing subset** - rows in `ApiManagementGatewayLlmLog` with `TotalTokens > 0`, `CompletionTokens > 0`, and a non-empty `ModelName`. **Total Tokens Used** sums tokens for this same subset. This is the count that drives showback.\n- **Successful (no tokens)** = `Successful (all 2xx)` - `Successful (2xx, with tokens)`. These are real, billable AI calls that returned `2xx` but produced no measurable token data - typically because a streaming response lost its final `usage` chunk, the backend returned an empty completion or content-safety refusal, or the LLM-log row carrying `ModelName` was dropped. Drill in below via the **Token Coverage Investigation** section.\n\n**AI Backend - row 2 (Errors):** these tiles sit *outside* the success funnel:\n\n- **Client Errors (4xx)** counts **all** 4xx responses, with status codes such as 400 (bad prompt), 401/403 (auth or content-safety failure), 404 (wrong deployment/model), 408/413/422 (timeout, payload, validation), and 429 (throttled). Most never reach the AI backend. The **Throttled (429)** tile breaks out the 429 subset for visibility.\n- **Throttled (429)** = the 429 subset of `Client Errors (4xx)`. Rejected by APIM rate-limit policy before reaching the backend; no tokens consumed.\n- **Server Errors (5xx)** = backend or APIM 5xx responses; no `ApiManagementGatewayLlmLog` row is written for these, so no tokens are attributed.\n\n**Token coverage gap:** the number of `2xx` requests *without* matching token data is surfaced in the **Token Coverage Investigation** section directly below the AI Gateway tiles, with a drill-in by API, operation, and likely cause. In production, expect this gap to be small but non-zero - track it as a quality KPI for showback accuracy.\n\n**Note on 3xx:** AI Chat Completions and Responses endpoints are POSTs and do not produce redirect responses in normal operation, so there is no dedicated 3xx tile. The **HTTP Response Code Distribution** chart on the *Usage Analytics* tab will surface any 3xx responses if they ever occur.\n\n#### Why AI APIs are listed manually\n\nAPIM diagnostic logs only carry identifiers like `ApiId`, `OperationId`, and `BackendId` - they do **not** include APIM tags. Azure Resource Graph also does not index APIM API tag associations, so the workbook cannot auto-discover AI APIs from inside Log Analytics. The **AI API names** parameter on the AI Gateway tab is therefore a plain comma-separated list. 
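You can still look the list up by tag outside the workbook - a minimal sketch, assuming the AI APIs carry an `ai` tag and that the `{placeholders}` and `api-version` are adjusted to your environment:\n\n```\n# hypothetical tag lookup - replace the {placeholders} with your subscription, resource group, and APIM service name\naz rest --method get --url 'https://management.azure.com/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/Microsoft.ApiManagement/service/{serviceName}/apis?tags=ai&api-version=2022-08-01' --query 'value[].name' --output tsv\n```\n\n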
See the parameter description for an `az rest` snippet that lists APIs by tag.\n\n#### Streaming, model breakdown, PTU\n\n- **Streaming vs Non-Streaming** - confirms APIM captures token counts for **both** delivery modes. The streaming visuals split requests into `Streaming (client-supplied usage)` and `Streaming (policy-injected usage)` using `ApiManagementGatewayLogs.TraceRecords`, so you can prove when APIM altered the request before forwarding it.\n- **Model breakdown** - useful when multiple models (gpt-5-mini, gpt-4o-mini, etc.) are served through the same APIM gateway.\n- **PTU utilization** - the dashed line on the trend chart represents the configurable PTU capacity threshold. If a business unit's hourly token rate approaches the PTU capacity, consider provisioning more throughput or applying rate limiting." }, "name": "text - instructions-aigateway-body", "type": 1 @@ -1019,7 +1019,7 @@ }, { "content": { - "json": "Token & PTU consumption per business unit, joining LLM diagnostic logs with APIM gateway logs. See the **Overview** tab for the full cost model and per-model pricing." + "json": "Token & PTU consumption per business unit, joining LLM diagnostic logs with APIM gateway logs. See the **Overview** tab for the full cost model and per-model pricing.\n\n> ⚠️ **Informational only - not a billing system of record.** Token counts can be incomplete (missing `usage` on streaming responses, throttled / failed calls excluded, ingestion delays, partial LLM log events). **Always cross-reference against Azure Cost Management and the Azure OpenAI / Azure AI Foundry usage exports before using these figures for chargeback.**" }, "name": "text - header-aigateway", "type": 1 @@ -1241,7 +1241,7 @@ }, { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)', Status = 'billable'\n| project Label, RequestCount, Status", + "query": "let total2xx = toscalar(\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | where ResponseCode between (200 .. 
299)\n | summarize count()\n);\nApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)', Status = 'billable'\n| extend Display = strcat(tostring(RequestCount), ' / ', tostring(round(todouble(RequestCount) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, Display, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1257,19 +1257,8 @@ "formatter": 1 }, "leftContent": { - "columnMatch": "RequestCount", - "formatter": 12, - "formatOptions": { - "min": 0, - "palette": "blue" - }, - "numberFormat": { - "unit": 17, - "options": { - "style": "decimal", - "useGrouping": true - } - } + "columnMatch": "Display", + "formatter": 1 }, "subtitleContent": { "columnMatch": "Status", @@ -1288,7 +1277,7 @@ }, { "content": { - "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 299)\n| project CorrelationId;\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = 'Successful (no tokens)', Status = '⚠ investigate'\n| project Label, NoTokenRequests, Status", + "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 
299)\n| project CorrelationId;\nlet total2xx = toscalar(twoXx | summarize count());\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = 'Successful (no tokens)', Status = '⚠ investigate'\n| extend Display = strcat(tostring(NoTokenRequests), ' / ', tostring(round(todouble(NoTokenRequests) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, Display, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1304,19 +1293,8 @@ "formatter": 1 }, "leftContent": { - "columnMatch": "NoTokenRequests", - "formatter": 12, - "formatOptions": { - "min": 0, - "palette": "yellow" - }, - "numberFormat": { - "unit": 17, - "options": { - "style": "decimal", - "useGrouping": true - } - } + "columnMatch": "Display", + "formatter": 1 }, "subtitleContent": { "columnMatch": "Status", @@ -1343,14 +1321,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = 'rate limited'\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 400 and ResponseCode < 500\n| summarize Requests = count()\n| extend Label = 'Client Errors (4xx)', Status = 'all 4xx incl. 
429'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No throttled requests in the selected time range.", + "noDataMessage": "No 4xx client errors in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1380,7 +1358,7 @@ "showBorder": true } }, - "name": "query - ai-throttled-tile", + "name": "query - ai-client-errors-tile", "type": 3, "customWidth": "33", "styleSettings": { @@ -1390,14 +1368,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 400 and ResponseCode < 500 and ResponseCode != 429\n| summarize Requests = count()\n| extend Label = 'Client Errors (4xx)', Status = 'non-429 rejects'\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = 'rate limited'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No non-429 client errors in the selected time range.", + "noDataMessage": "No throttled requests in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1427,7 +1405,7 @@ "showBorder": true } }, - "name": "query - ai-client-errors-tile", + "name": "query - ai-throttled-tile", "type": 3, "customWidth": "33", "styleSettings": { From 479846a91106b55aee4213096936e037b7761e01 Mon Sep 17 00:00:00 2001 From: Simon Kurtz Date: Fri, 1 May 2026 14:43:54 -0400 Subject: [PATCH 4/4] Tile refinements --- samples/costing/costing.workbook.json | 38 +++++++++++++++++++++------ 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/samples/costing/costing.workbook.json b/samples/costing/costing.workbook.json index 155f908..4d7f6ee 100644 --- a/samples/costing/costing.workbook.json +++ b/samples/costing/costing.workbook.json @@ -1241,7 +1241,7 @@ }, { "content": { - "query": "let total2xx = toscalar(\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | where ResponseCode between (200 .. 299)\n | summarize count()\n);\nApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)', Status = 'billable'\n| extend Display = strcat(tostring(RequestCount), ' / ', tostring(round(todouble(RequestCount) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, Display, Status", + "query": "let total2xx = toscalar(\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | where ResponseCode between (200 .. 
299)\n | summarize count()\n);\nApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)'\n| extend Status = strcat('billable - ', tostring(round(todouble(RequestCount) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, RequestCount, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1257,8 +1257,19 @@ "formatter": 1 }, "leftContent": { - "columnMatch": "Display", - "formatter": 1 + "columnMatch": "RequestCount", + "formatter": 12, + "formatOptions": { + "min": 0, + "palette": "blue" + }, + "numberFormat": { + "unit": 17, + "options": { + "style": "decimal", + "useGrouping": true + } + } }, "subtitleContent": { "columnMatch": "Status", @@ -1277,7 +1288,7 @@ }, { "content": { - "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 
299)\n| project CorrelationId;\nlet total2xx = toscalar(twoXx | summarize count());\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = 'Successful (no tokens)'\n| extend Status = strcat('⚠ investigate - ', tostring(round(todouble(NoTokenRequests) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, NoTokenRequests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1293,8 +1304,19 @@ "formatter": 1 }, "leftContent": { - "columnMatch": "Display", - "formatter": 1 + "columnMatch": "NoTokenRequests", + "formatter": 12, + "formatOptions": { + "min": 0, + "palette": "yellow" + }, + "numberFormat": { + "unit": 17, + "options": { + "style": "decimal", + "useGrouping": true + } + } }, "subtitleContent": { "columnMatch": "Status", @@ -1368,7 +1390,7 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = 'rate limited'\n| project Label, Requests, Status", + "query": "let total4xx = toscalar(\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | where ResponseCode >= 400 and ResponseCode < 500\n | summarize count()\n);\nApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)'\n| extend Status = strcat('rate limited - ', tostring(round(todouble(Requests) * 100.0 / iif(total4xx == 0, 1, total4xx), 1)), '%')\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1603,7 +1625,7 @@ "formatter": 12, "formatOptions": { "min": 0, - "palette": "orange" + "palette": "yellow" }, "numberFormat": { "unit": 17,