From 870de584a08c2150b479115fa60bc2a6aa3fcbd6 Mon Sep 17 00:00:00 2001 From: Simon Kurtz Date: Thu, 30 Apr 2026 20:59:42 -0400 Subject: [PATCH 1/4] Add support for Responses API --- README.md | 2 +- assets/APIM-Samples-Slide-Deck.html | 2 +- docs/index.html | 2 +- samples/costing/README.md | 19 +- samples/costing/_helpers.py | 258 +++++++++++++----- .../aoai-gateway-responses-operation.xml | 32 +++ samples/costing/bu-token-usage.kql | 10 +- samples/costing/costing.workbook.json | 12 +- samples/costing/create.ipynb | 147 ++++++++-- samples/costing/main.bicep | 10 + .../pf-ensure-stream-include-usage.xml | 22 +- shared/python/azure_cost.py | 6 + tests/python/test_costing_helpers.py | 168 ++++++++++++ 13 files changed, 576 insertions(+), 114 deletions(-) create mode 100644 samples/costing/aoai-gateway-responses-operation.xml create mode 100644 tests/python/test_costing_helpers.py diff --git a/README.md b/README.md index 9afe8a3..85706a6 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ It's quick and easy to get started! | [AuthX][sample-authx] | Authentication and role-based authorization in a mock HR API. | All infrastructures | | [AuthX Pro][sample-authx-pro] | Authentication and role-based authorization in a mock product with multiple APIs and policy fragments. | All infrastructures | | [Azure Maps][sample-azure-maps] | Proxying calls to Azure Maps with APIM policies. | All infrastructures | -| [Costing][sample-costing] | Track and allocate API costs per business unit using APIM subscriptions, Entra ID application tracking, and AI Gateway token/PTU tracking including streaming (SSE) token usage, which is not simple to capture correctly in APIM. | All infrastructures | +| [Costing][sample-costing] | Track and allocate API costs per business unit using APIM subscriptions, Entra ID application tracking, and AI Gateway token/PTU tracking across **both** Azure OpenAI Chat Completions and Responses APIs, including streaming (SSE) token usage which is not simple to capture correctly in APIM. | All infrastructures | | [Dynamic CORS][sample-dynamic-cors] | Dynamic per-API CORS origin validation using custom policy fragments and a maintainable origin mapping. | All infrastructures | | [Egress Control][sample-egress-control] | Control APIM outbound internet traffic by routing it through a Network Virtual Appliance (NVA) in a hub/spoke topology. | appgw-apim, appgw-apim-pe | | [General][sample-general] | Basic demo of APIM sample setup and policy usage. | All infrastructures | diff --git a/assets/APIM-Samples-Slide-Deck.html b/assets/APIM-Samples-Slide-Deck.html index 2ff64bb..2d5c2ef 100644 --- a/assets/APIM-Samples-Slide-Deck.html +++ b/assets/APIM-Samples-Slide-Deck.html @@ -1118,7 +1118,7 @@

Azure Maps

Costing

-

Track API costs per business unit via subscriptions, Entra ID apps, and AI Gateway tokens, including streaming (SSE) token usage (not simple to capture correctly in APIM).

+

Track API costs per business unit via subscriptions, Entra ID apps, and AI Gateway tokens across both Azure OpenAI Chat Completions and Responses APIs, including streaming (SSE) token usage (not simple to capture correctly in APIM).

Dynamic CORS

diff --git a/docs/index.html b/docs/index.html
index 15272f2..a5273f4 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -447,7 +447,7 @@

Azure Maps

Costing

-

Track and allocate API costs per business unit using subscriptions, Entra ID application tracking, and AI Gateway token/PTU tracking including streaming (SSE) token usage, which is not simple to capture correctly in APIM.

+

Track and allocate API costs per business unit using subscriptions, Entra ID application tracking, and AI Gateway token/PTU tracking across both Azure OpenAI Chat Completions and Responses APIs, including streaming (SSE) token usage, which is not simple to capture correctly in APIM.

All infrastructures
diff --git a/samples/costing/README.md b/samples/costing/README.md index f7e947b..b23628a 100644 --- a/samples/costing/README.md +++ b/samples/costing/README.md @@ -16,7 +16,7 @@ This sample demonstrates how to track and allocate API costs using Azure API Man 6. **Enable cost governance** - Establish patterns for consistent tagging and naming conventions 7. **Enable budget alerts** - Create scheduled query alerts when callers exceed configurable thresholds 8. **Track AI token consumption per client** - When APIM is used as an AI Gateway, capture prompt, completion, and total token usage per calling application, enabling per-client cost attribution for PTU or pay-as-you-go OpenAI deployments -9. **Real AOAI interactions via Foundry** (optional) - Deploy a full Microsoft Foundry environment (Hub + Project + Azure AI Services) and route real Azure OpenAI chat completions through APIM, demonstrating accurate token tracking for both non-streaming and streaming (SSE) responses +9. **Real AOAI interactions via Foundry** (optional) - Deploy a full Microsoft Foundry environment (Hub + Project + Azure AI Services) and route real Azure OpenAI traffic through APIM across **both the Chat Completions and Responses APIs**, demonstrating accurate token tracking for non-streaming, streaming (SSE), and stateless (`store: false`) requests > **Note on non-OpenAI models**: This sample deploys an Azure OpenAI model only (default: `gpt-5-mini`). Other model families on Azure AI Services - such as Anthropic Claude via the Azure Marketplace - are gated by separate quota that is granted through a manual approval process, which puts them beyond the scope of a self-service sample. If you have approved quota for another provider, you can extend the sample by adding a second deployment in `main.bicep`; the token-tracking policy and workbook queries are model-agnostic. @@ -86,6 +86,23 @@ The workbook surfaces **both** streaming variants side-by-side so you can see ex The **AI Gateway** tab's *Streaming vs Non-Streaming Breakdown* and the **Per-Request Detail** tab's `AI Delivery Mode` + `Usage Provenance` columns both render this distinction, so you can confirm token capture works regardless of whether the client or APIM supplied the usage option. +### AI Surface Coverage (Chat Completions + Responses API) + +The notebook exercises **six** AI request modes per business unit per model so you can see APIM token tracking work across both Azure OpenAI surfaces and every streaming variant. Mode is chosen by `j % 6` for the `j`-th request within a business unit, giving a deterministic, even mix: + +| Mode | API surface | Streaming | Notes | +| --- | --- | --- | --- | +| 0 | Chat Completions | No | Baseline non-streaming chat. | +| 1 | Chat Completions | Yes | Client sends `stream_options.include_usage = true`; APIM forwards unchanged. | +| 2 | Chat Completions | Yes | Client omits `stream_options`; the `pf-ensure-stream-include-usage.xml` fragment injects it and emits an `IncludeUsageInjected` trace. | +| 3 | Responses API | No | Stateful (`store` defaults to `true`); uses `input` + `max_output_tokens`. | +| 4 | Responses API | Yes | Streaming Responses; the policy fragment is a no-op for this surface. | +| 5 | Responses API | No | Stateless variant with `store: false` to demonstrate ephemeral usage. 
| + +The Chat Completions and Responses APIs use different api-versions (`2024-10-21` vs `2025-03-01-preview`), different routes (`/deployments/{id}/chat/completions` vs `/responses`), and different request shapes (`messages` + `max_completion_tokens` vs `input` + `max_output_tokens`). They share the same `aoai-backend` and the same APIM AI logger, so `ApiManagementGatewayLlmLog` rows from both surfaces flow into the same workspace and are split by `OperationId` (`chat-completions-create` vs `responses-create`) in the workbook. + +The `pf-ensure-stream-include-usage.xml` fragment short-circuits for the Responses API: it only inspects the body when `messages` is present, so Responses requests pass through untouched. The workbook's *Streaming vs Non-Streaming Breakdown*, *Token Counts by Business Unit & Delivery Mode* table, and *Per-Request Detail* tab all surface an `API Surface` column / slice (`Chat` vs `Responses`) so you can verify each mode produced its expected rows. + > **Business unit attribution**: Join `ApiManagementGatewayLlmLog` with `ApiManagementGatewayLogs` on `CorrelationId` to map token counts to `ApimSubscriptionId` (business unit). See `bu-token-usage.kql` for a ready-to-use query. ### Context Propagation diff --git a/samples/costing/_helpers.py b/samples/costing/_helpers.py index d081350..92daeb6 100644 --- a/samples/costing/_helpers.py +++ b/samples/costing/_helpers.py @@ -297,66 +297,115 @@ def send_aoai_traffic( chat_body: dict, stream_body: dict, stream_body_without_usage: dict | None = None, -) -> tuple[int, int, int, int, bool]: - """Send `count` AOAI requests alternating non-streaming / streaming. - - Encapsulates the inner request loop used by cell D1's per-(BU, model) loop: - even iterations send non-streaming chat completions, odd iterations send - streaming chat completions. When `stream_body_without_usage` is supplied, - streaming iterations alternate between a client body that already sets - `stream_options.include_usage = true` and one that omits it entirely so - the APIM policy fragment can prove when it injected the flag. On the first - timeout the function bails out for the rest of `count` to avoid stacking - cold-start delays into multi-minute hangs. + responses_url: str | None = None, + responses_body: dict | None = None, + responses_stream_body: dict | None = None, + responses_stateless_body: dict | None = None, +) -> tuple[dict[str, int], dict[str, int], bool]: + """Send `count` AOAI requests cycling through up to six modes. + + The dispatcher cycles `j % 6` across these modes: + + | j%6 | API | Mode | + |-----|-----------|------------------------------------------------------------| + | 0 | chat | non-streaming | + | 1 | chat | streaming WITH stream_options.include_usage | + | 2 | chat | streaming WITHOUT stream_options (APIM injects + traces) | + | 3 | responses | non-streaming, stateful (default store=true) | + | 4 | responses | streaming | + | 5 | responses | non-streaming, stateless (store=false) | + + Mode 2 is intentionally preserved - it is the only case where APIM's + `Ensure-Stream-Include-Usage` fragment mutates the request body and writes + a `TraceRecords` entry, which the workbook surfaces as proof of injection. + + Mode 5 sends `{store: false}`. Per-request token counts are identical to + mode 3; the educational point is the **stateless** behavior (no chaining + via `previous_response_id`, no server-side retrieval). 
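+
+    Illustrative mode-5 body (a sketch only; the shape is borrowed from the
+    notebook's D1 cell, and the deployment name is filled in per model):
+
+        {'model': '<deployment>', 'input': '...',
+         'max_output_tokens': 100, 'store': False}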
+ + On the first timeout the function bails out for the rest of `count` to + avoid stacking cold-start delays into multi-minute hangs. Args: session: Pre-configured `requests.Session` (built via `build_session`). chat_url: Full chat-completions URL for the target deployment. caller_headers: Per-call headers (api-key for the BU + Authorization JWT). count: Total number of requests to send for this (BU, model) cell. - chat_body: JSON body for non-streaming requests. - stream_body: JSON body for streaming requests where the client already - sets `stream_options.include_usage: True`. - stream_body_without_usage: Optional JSON body for streaming requests - that intentionally omits `stream_options.include_usage` so APIM can - inject it and emit a trace record proving the mutation. + chat_body: Non-streaming chat completions body. + stream_body: Streaming chat completions body with + `stream_options.include_usage = true` set by the client. + stream_body_without_usage: Streaming chat completions body that omits + `stream_options` so APIM can inject it and emit a trace record. + When None, mode 2 falls back to mode 1. + responses_url: Full /responses URL. When None, modes 3/4/5 are skipped + and replaced by mode-0/1/0 respectively (Chat fallback). + responses_body: Responses API non-streaming body (used for mode 3). + responses_stream_body: Responses API streaming body (used for mode 4). + responses_stateless_body: Responses API non-streaming body with + `store: false` (used for mode 5). Returns: - `(non_streaming_delivered, streaming_delivered, planned_ns, planned_s, bailed)`. - `*_delivered` counts only requests that returned an HTTP response. + `(delivered, planned, bailed)` where `delivered` and `planned` are + dicts with these keys: + chat_non_streaming + chat_stream_with_usage + chat_stream_without_usage + responses_non_streaming + responses_stream + responses_non_streaming_stateless `bailed` is True if a timeout caused the loop to exit early. """ - non_streaming_count = 0 - streaming_count = 0 - planned_non_streaming = 0 - planned_streaming = 0 + keys = ( + 'chat_non_streaming', + 'chat_stream_with_usage', + 'chat_stream_without_usage', + 'responses_non_streaming', + 'responses_stream', + 'responses_non_streaming_stateless', + ) + delivered = dict.fromkeys(keys, 0) + planned = dict.fromkeys(keys, 0) bailed = False + responses_available = ( + responses_url is not None and responses_body is not None and responses_stream_body is not None and responses_stateless_body is not None + ) + for j in range(count): if bailed: break - use_streaming = j % 2 == 1 - if use_streaming: - planned_streaming += 1 - else: - planned_non_streaming += 1 - - if use_streaming: - streaming_iteration = planned_streaming - 1 - body = stream_body_without_usage if stream_body_without_usage is not None and streaming_iteration % 2 == 0 else stream_body - else: - body = chat_body + # Resolve mode from j % 6, with safe fallbacks when optional bodies/URLs missing. 
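+        # e.g. with count=12 and all kwargs supplied, every mode runs exactly
+        # twice; without the Responses kwargs, modes 3/5 fall back to chat
+        # mode 0 and mode 4 to chat mode 1, so `count` is always honored.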
+ mode = j % 6 + if mode == 2 and stream_body_without_usage is None: + mode = 1 + if mode in (3, 4, 5) and not responses_available: + mode = 0 if mode in (3, 5) else 1 + + if mode == 0: + url, body, key, is_stream = chat_url, chat_body, 'chat_non_streaming', False + elif mode == 1: + url, body, key, is_stream = chat_url, stream_body, 'chat_stream_with_usage', True + elif mode == 2: + url, body, key, is_stream = chat_url, stream_body_without_usage, 'chat_stream_without_usage', True + elif mode == 3: + url, body, key, is_stream = responses_url, responses_body, 'responses_non_streaming', False + elif mode == 4: + url, body, key, is_stream = responses_url, responses_stream_body, 'responses_stream', True + else: # mode == 5 + url, body, key, is_stream = responses_url, responses_stateless_body, 'responses_non_streaming_stateless', False + + planned[key] += 1 try: r = session.post( - chat_url, + url, json=body, headers=caller_headers, - timeout=45 if use_streaming else 30, - stream=use_streaming, + timeout=45 if is_stream else 30, + stream=is_stream, ) - if use_streaming and r.status_code == 200: + if is_stream and r.status_code == 200: # Drain SSE stream so APIM logs the final chunk (with usage). for _ in r.iter_lines(decode_unicode=True): pass @@ -368,12 +417,9 @@ def send_aoai_traffic( continue # 4xx/5xx still count: they appear in ApiManagementGatewayLogs. - if use_streaming: - streaming_count += 1 - else: - non_streaming_count += 1 + delivered[key] += 1 - return non_streaming_count, streaming_count, planned_non_streaming, planned_streaming, bailed + return delivered, planned, bailed def print_portal_links(items: list[tuple[str, str | None]]) -> None: @@ -550,6 +596,9 @@ def build_costing_apis( if enable_foundry and enable_token_tracking and token_metric_policy_xml is not None: aoai_operation_policy_xml = Path(utils.determine_policy_path('aoai-gateway-operation.xml', sample_folder)).read_text(encoding='utf-8') + aoai_responses_operation_policy_xml = Path(utils.determine_policy_path('aoai-gateway-responses-operation.xml', sample_folder)).read_text( + encoding='utf-8' + ) paths['aoai_api_path'] = 'aoai-gateway' aoai_chat_post = APIOperation( @@ -561,6 +610,18 @@ def build_costing_apis( policyXml=aoai_operation_policy_xml, templateParameters=[{'name': 'deploymentId', 'type': 'string', 'required': True}], ) + # Responses API operation. Uses the modern stateless/stateful + # /responses surface. Pinned to api-version 2025-03-01-preview via a + # per-operation set-query-parameter so chat-completion stays on + # 2024-10-21 unaffected. + aoai_responses_post = APIOperation( + 'responses-create', + 'Responses Create', + '/responses', + HTTP_VERB.POST, + 'Azure OpenAI Responses API create (streaming, non-streaming, and stateless via store=false)', + policyXml=aoai_responses_operation_policy_xml, + ) apis.append( API( f'{api_prefix}aoai-gateway', @@ -568,7 +629,7 @@ def build_costing_apis( paths['aoai_api_path'], 'Azure OpenAI gateway for demonstrating real token tracking with Foundry', policyXml=token_metric_policy_xml, - operations=[aoai_chat_post], + operations=[aoai_chat_post, aoai_responses_post], tags=['costing', 'emit-metric', 'ai-gateway', 'aoai', 'foundry'], subscriptionRequired=True, serviceUrl='https://placeholder.openai.azure.com/openai', @@ -816,30 +877,59 @@ def print_aoai_traffic_summary( model_request_counts: dict[str, dict[str, int]], bu_model_counts: dict[tuple[str, str], dict[str, int]], ) -> tuple[int, int, int]: - """Print per-model and per-BU×per-model AOAI request tables. 
+ """Print per-model and per-BU x per-model AOAI request tables. + + Each per-(model) and per-(BU, model) value is a dict carrying delivered + counts for the six AOAI traffic modes: + + chat_non_streaming, chat_stream_with_usage, chat_stream_without_usage, + responses_non_streaming, responses_stream, responses_non_streaming_stateless + + The summary tables collapse those into Chat-Sync, Chat-Stream, Resp-Sync, + Resp-Stream so the per-row width stays readable while still surfacing the + Chat vs Responses split. The streaming-with vs without-usage detail and + the stateful vs stateless Responses split are visible in the workbook. Returns: - `(grand_non_streaming, grand_streaming, total)` — used by the caller - for the trailing summary line and persistence step. + `(grand_chat, grand_responses, total)` where `grand_chat` is the sum + of all chat modes and `grand_responses` is the sum of all responses + modes across every (model) row. Used by the caller for the trailing + summary line. """ + + def _agg(counts: dict[str, int]) -> tuple[int, int, int, int]: + chat_sync = counts.get('chat_non_streaming', 0) + chat_stream = counts.get('chat_stream_with_usage', 0) + counts.get('chat_stream_without_usage', 0) + resp_sync = counts.get('responses_non_streaming', 0) + counts.get('responses_non_streaming_stateless', 0) + resp_stream = counts.get('responses_stream', 0) + return chat_sync, chat_stream, resp_sync, resp_stream + print() print_info('Requests per model') summary_table = TableLogger() summary_table.header( Column('Model'), - Column('Non-streaming', align='>'), - Column('Streaming', align='>'), + Column('Chat-Sync', align='>'), + Column('Chat-Stream', align='>'), + Column('Resp-Sync', align='>'), + Column('Resp-Stream', align='>'), Column('Total', align='>'), ) summary_rows = [] - grand_ns = grand_s = 0 + grand_chat = grand_resp = 0 + g_cs = g_cstream = g_rs = g_rstream = 0 for m, counts in model_request_counts.items(): - total = counts['non_streaming'] + counts['streaming'] - summary_rows.append([m, counts['non_streaming'], counts['streaming'], total]) - grand_ns += counts['non_streaming'] - grand_s += counts['streaming'] + cs, cstream, rs, rstream = _agg(counts) + total = cs + cstream + rs + rstream + summary_rows.append([m, cs, cstream, rs, rstream, total]) + g_cs += cs + g_cstream += cstream + g_rs += rs + g_rstream += rstream + grand_chat += cs + cstream + grand_resp += rs + rstream summary_table.populate(summary_rows) - summary_table.total('GRAND TOTAL', grand_ns, grand_s, grand_ns + grand_s) + summary_table.total('GRAND TOTAL', g_cs, g_cstream, g_rs, g_rstream, g_cs + g_cstream + g_rs + g_rstream) summary_table.print() print() @@ -848,23 +938,28 @@ def print_aoai_traffic_summary( bu_model_table.header( Column('Business Unit'), Column('Model'), - Column('Non-streaming', align='>'), - Column('Streaming', align='>'), + Column('Chat-Sync', align='>'), + Column('Chat-Stream', align='>'), + Column('Resp-Sync', align='>'), + Column('Resp-Stream', align='>'), Column('Total', align='>'), ) bu_rows = [] - bu_grand_ns = bu_grand_s = 0 + bu_cs = bu_cstream = bu_rs = bu_rstream = 0 for bu, m in sorted(bu_model_counts.keys()): counts = bu_model_counts[(bu, m)] - total = counts['non_streaming'] + counts['streaming'] - bu_rows.append([bu, m, counts['non_streaming'], counts['streaming'], total]) - bu_grand_ns += counts['non_streaming'] - bu_grand_s += counts['streaming'] + cs, cstream, rs, rstream = _agg(counts) + total = cs + cstream + rs + rstream + bu_rows.append([bu, m, cs, cstream, rs, rstream, 
total]) + bu_cs += cs + bu_cstream += cstream + bu_rs += rs + bu_rstream += rstream bu_model_table.populate(bu_rows) - bu_model_table.total('GRAND TOTAL', '', bu_grand_ns, bu_grand_s, bu_grand_ns + bu_grand_s) + bu_model_table.total('GRAND TOTAL', '', bu_cs, bu_cstream, bu_rs, bu_rstream, bu_cs + bu_cstream + bu_rs + bu_rstream) bu_model_table.print() - return grand_ns, grand_s, grand_ns + grand_s + return grand_chat, grand_resp, grand_chat + grand_resp def persist_aoai_traffic( @@ -880,15 +975,36 @@ def persist_aoai_traffic( ) -> int: """Roll up per-(BU,model) AOAI counts into a single trafficSources entry. + Each `bu_model_counts[(bu, m)]` and `bu_model_planned[(bu, m)]` entry is a + six-key dict matching the dispatcher's mode keys. Persisted shape per + (BU, model) under `byModel[].chat` / `byModel[].responses` mirrors the + dispatcher modes so the workbook cross-reference and tests can identify + which AI surface was exercised. + Returns the total planned request count across all BU/model pairs (used by the caller for a trailing print line). The total delivered count is derived inside the function and stored as `totalRequests` in the JSON. """ + + def _shape(counts: dict[str, int]) -> dict: + return { + 'chat': { + 'nonStreaming': counts.get('chat_non_streaming', 0), + 'streamingWithUsage': counts.get('chat_stream_with_usage', 0), + 'streamingWithoutUsage': counts.get('chat_stream_without_usage', 0), + }, + 'responses': { + 'nonStreaming': counts.get('responses_non_streaming', 0), + 'streaming': counts.get('responses_stream', 0), + 'nonStreamingStateless': counts.get('responses_non_streaming_stateless', 0), + }, + } + ai_bu_rollup: dict[str, dict] = {} total_delivered = 0 for (bu, m), counts in bu_model_counts.items(): bu_info_local = subscriptions.get(bu, {}) - planned = bu_model_planned.get((bu, m), {'non_streaming': 0, 'streaming': 0}) + planned = bu_model_planned.get((bu, m), {}) entry = ai_bu_rollup.setdefault( bu, { @@ -901,23 +1017,21 @@ def persist_aoai_traffic( 'byModel': [], }, ) - model_total = counts['non_streaming'] + counts['streaming'] - planned_total = planned['non_streaming'] + planned['streaming'] + model_total = sum(counts.values()) + planned_total = sum(planned.values()) entry['planned'] += planned_total entry['requests'] += model_total total_delivered += model_total entry['byModel'].append( { 'model': m, - 'plannedNonStreaming': planned['non_streaming'], - 'plannedStreaming': planned['streaming'], - 'nonStreaming': counts['non_streaming'], - 'streaming': counts['streaming'], + 'planned': _shape(planned), + 'delivered': _shape(counts), 'total': model_total, } ) - total_planned = sum(p['non_streaming'] + p['streaming'] for p in bu_model_planned.values()) + total_planned = sum(sum(p.values()) for p in bu_model_planned.values()) persist_traffic_source( local_data_path, sample_folder=sample_folder, diff --git a/samples/costing/aoai-gateway-responses-operation.xml b/samples/costing/aoai-gateway-responses-operation.xml new file mode 100644 index 0000000..d2ddd0e --- /dev/null +++ b/samples/costing/aoai-gateway-responses-operation.xml @@ -0,0 +1,32 @@ + + + + + + + + 2025-03-01-preview + + + + + + + + + + + + diff --git a/samples/costing/bu-token-usage.kql b/samples/costing/bu-token-usage.kql index e673bcc..e48dfdd 100644 --- a/samples/costing/bu-token-usage.kql +++ b/samples/costing/bu-token-usage.kql @@ -20,6 +20,11 @@ ApiManagementGatewayLlmLog TotalTokens, ModelName, IsStreamCompletion +// Collapse dated AOAI base-model variants (e.g. 
'gpt-4o-mini-2024-07-18') +// into the deployment alias (e.g. 'gpt-4o-mini'). The trailing date suffix +// is informational only and would otherwise split a single deployment +// across multiple rows. +| extend ModelName = replace_regex(ModelName, @'-\d{4}-\d{2}-\d{2}$', '') | join kind=inner ( ApiManagementGatewayLogs | where TimeGenerated > ago(timeWindow) @@ -29,11 +34,14 @@ ApiManagementGatewayLlmLog ApiId, OperationId ) on CorrelationId +| extend + ApiSurface = iff(OperationId contains 'responses-create', 'Responses', 'Chat'), + StreamMode = iff(IsStreamCompletion == true, 'Stream', 'Sync') | summarize TotalPromptTokens = sum(PromptTokens), TotalCompletionTokens = sum(CompletionTokens), TotalTokens = sum(TotalTokens), Requests = count(), StreamingRequests = countif(IsStreamCompletion == true) - by BusinessUnit, ModelName + by BusinessUnit, ModelName, ApiSurface, StreamMode | order by TotalTokens desc diff --git a/samples/costing/costing.workbook.json b/samples/costing/costing.workbook.json index 505bc6a..43bb777 100644 --- a/samples/costing/costing.workbook.json +++ b/samples/costing/costing.workbook.json @@ -1542,7 +1542,7 @@ }, { "content": { - "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, ModelName\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, ApimSubscriptionId\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nlet grandTotalTokens = toscalar(bucketedLlmLogs | summarize sum(TotalTokens));\nbucketedLlmLogs\n| summarize\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n Requests = count(),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 4)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 4)\n| extend TotalCost = round(PromptCost + CompletionCost, 4)\n| extend TotalTokensPct = iif(grandTotalTokens > 0, round(TotalTokens * 100.0 / grandTotalTokens, 2), 0.0)\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Total Tokens Pct'] = TotalTokensPct,\n Requests,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost", + "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize 
RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\\d{4}-\\d{2}-\\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, ApimSubscriptionId\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nlet grandTotalTokens = toscalar(bucketedLlmLogs | summarize sum(TotalTokens));\nbucketedLlmLogs\n| summarize\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n Requests = count(),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 4)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 4)\n| extend TotalCost = round(PromptCost + CompletionCost, 4)\n| extend TotalTokensPct = iif(grandTotalTokens > 0, round(TotalTokens * 100.0 / grandTotalTokens, 2), 0.0)\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Total Tokens Pct'] = TotalTokensPct,\n Requests,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 0, @@ -1629,7 +1629,7 @@ "items": [ { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, ModelName\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, BusinessUnit = substring(ApimSubscriptionId, 3)\n) on CorrelationId\n| extend Model = iif(isempty(ModelName), '(unknown)', ModelName)\n| summarize Requests = count() by Model\n| order by Model asc", + "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\\d{4}-\\d{2}-\\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, BusinessUnit = substring(ApimSubscriptionId, 3)\n) on CorrelationId\n| extend Model = iif(isempty(ModelName), '(unknown)', ModelName)\n| summarize Requests = count() by Model\n| order by Model asc", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 1, @@ -1653,7 +1653,7 @@ }, { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, TotalTokens, ModelName\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and 
ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, BusinessUnit = substring(ApimSubscriptionId, 3)\n) on CorrelationId\n| extend Model = iif(isempty(ModelName), '(unknown)', ModelName)\n| summarize TotalTokens = sum(TotalTokens) by Model\n| order by Model asc", + "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, TotalTokens, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\\d{4}-\\d{2}-\\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, BusinessUnit = substring(ApimSubscriptionId, 3)\n) on CorrelationId\n| extend Model = iif(isempty(ModelName), '(unknown)', ModelName)\n| summarize TotalTokens = sum(TotalTokens) by Model\n| order by Model asc", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 1, @@ -1677,7 +1677,7 @@ }, { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, IsStreamCompletion\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, TraceRecords\n) on CorrelationId\n| extend DeliveryMode = case(\n IsStreamCompletion == true and tostring(TraceRecords) has 'IncludeUsageInjected', 'Streaming (policy-injected usage)',\n IsStreamCompletion == true, 'Streaming (client-supplied usage)',\n 'Non-Streaming'\n)\n| summarize Requests = count() by DeliveryMode", + "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, IsStreamCompletion\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, OperationId, TraceRecords\n) on CorrelationId\n| extend ApiSurface = iff(OperationId contains 'responses-create', 'Responses', 'Chat')\n| extend DeliveryMode = case(\n ApiSurface == 'Responses' and IsStreamCompletion == true, 'Responses (streaming)',\n ApiSurface == 'Responses', 'Responses (non-streaming)',\n IsStreamCompletion == true and tostring(TraceRecords) has 'IncludeUsageInjected', 'Chat (policy-injected usage)',\n IsStreamCompletion == true, 'Chat (client-supplied usage)',\n 'Chat (non-streaming)'\n)\n| summarize Requests = count() by DeliveryMode", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 1, @@ -1718,7 +1718,7 @@ "items": [ { "content": { - "query": "let rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, IsStreamCompletion, ModelName\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, 
ApimSubscriptionId, TraceRecords\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nbucketedLlmLogs\n| extend DeliveryMode = iif(IsStreamCompletion == true, 'Streaming', 'Non-Streaming')\n| extend UsageProvenance = case(\n IsStreamCompletion != true, 'N/A',\n tostring(TraceRecords) has 'IncludeUsageInjected', 'Policy injected include_usage',\n 'Client sent include_usage'\n)\n| summarize\n Requests = count(),\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName, DeliveryMode, UsageProvenance\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc, DeliveryMode asc, UsageProvenance asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['Delivery Mode'] = DeliveryMode,\n ['Usage Provenance'] = UsageProvenance,\n Requests,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens", + "query": "let rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, IsStreamCompletion, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\\d{4}-\\d{2}-\\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, ApimSubscriptionId, OperationId, TraceRecords\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nbucketedLlmLogs\n| extend ApiSurface = iff(OperationId contains 'responses-create', 'Responses', 'Chat')\n| extend DeliveryMode = iif(IsStreamCompletion == true, 'Streaming', 'Non-Streaming')\n| extend UsageProvenance = case(\n ApiSurface == 'Responses', 'N/A (Responses API)',\n IsStreamCompletion != true, 'N/A',\n tostring(TraceRecords) has 'IncludeUsageInjected', 'Policy injected include_usage',\n 'Client sent include_usage'\n)\n| summarize\n Requests = count(),\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName, ApiSurface, DeliveryMode, UsageProvenance\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc, ApiSurface asc, DeliveryMode asc, UsageProvenance asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['API Surface'] = ApiSurface,\n ['Delivery Mode'] = DeliveryMode,\n ['Usage Provenance'] = UsageProvenance,\n Requests,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 0, @@ -1986,7 +1986,7 @@ "items": [ { "content": { - "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rowLimit = toint('{RequestLimit}');\nlet llmPerRequest = 
ApiManagementGatewayLlmLog\n | where TimeGenerated {TimeRange}\n | summarize\n ModelName = take_anyif(ModelName, isnotempty(ModelName)),\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n IsStreamCompletion = max(tobool(IsStreamCompletion))\n by CorrelationId;\nApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| project CorrelationId, GatewayTime = TimeGenerated, BusinessUnit = substring(ApimSubscriptionId, 3), ApiId, OperationId, TotalTime, BackendTime, ResponseCode, TraceRecords\n| join kind=leftouter llmPerRequest on CorrelationId\n| extend PromptTokens = coalesce(PromptTokens, 0)\n| extend CompletionTokens = coalesce(CompletionTokens, 0)\n| extend TotalTokens = coalesce(TotalTokens, 0)\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 6)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 6)\n| extend TotalCost = round(PromptCost + CompletionCost, 6)\n| extend AiDeliveryMode = case(isnull(IsStreamCompletion), '', IsStreamCompletion == true, 'Streaming', 'Non-Streaming')\n| extend UsageProvenance = case(\n isnull(IsStreamCompletion), '',\n IsStreamCompletion != true, 'N/A',\n tostring(TraceRecords) has 'IncludeUsageInjected', 'Policy injected include_usage',\n 'Client sent include_usage'\n)\n| order by GatewayTime desc\n| take rowLimit\n| project\n ['Timestamp (UTC)'] = format_datetime(GatewayTime, '{DateTimeFormat}'),\n ['Business Unit'] = BusinessUnit,\n ['Response Code'] = toint(ResponseCode),\n Model = coalesce(ModelName, 'N/A'),\n ['AI Delivery Mode'] = AiDeliveryMode,\n ['Usage Provenance'] = UsageProvenance,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Gateway Total (ms)'] = TotalTime,\n ['Backend (ms)'] = BackendTime,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost,\n API = ApiId,\n Operation = OperationId,\n CorrelationId", + "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rowLimit = toint('{RequestLimit}');\nlet llmPerRequest = ApiManagementGatewayLlmLog\n | where TimeGenerated {TimeRange}\n | summarize\n ModelName = take_anyif(ModelName, isnotempty(ModelName)),\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n IsStreamCompletion = max(tobool(IsStreamCompletion))\n by CorrelationId\n | extend ModelName = replace_regex(coalesce(ModelName, ''), @'-\\d{4}-\\d{2}-\\d{2}$', '');\nApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| project CorrelationId, GatewayTime = TimeGenerated, BusinessUnit = substring(ApimSubscriptionId, 3), ApiId, OperationId, TotalTime, BackendTime, ResponseCode, TraceRecords\n| join kind=leftouter llmPerRequest on CorrelationId\n| extend PromptTokens = coalesce(PromptTokens, 0)\n| extend CompletionTokens = coalesce(CompletionTokens, 0)\n| extend TotalTokens = coalesce(TotalTokens, 0)\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 6)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 6)\n| extend TotalCost = round(PromptCost + CompletionCost, 6)\n| extend ApiSurface = iff(OperationId contains 'responses-create', 'Responses', 'Chat')\n| extend AiDeliveryMode = case(isnull(IsStreamCompletion), '', 
IsStreamCompletion == true, 'Streaming', 'Non-Streaming')\n| extend UsageProvenance = case(\n isnull(IsStreamCompletion), '',\n ApiSurface == 'Responses', 'N/A (Responses API)',\n IsStreamCompletion != true, 'N/A',\n tostring(TraceRecords) has 'IncludeUsageInjected', 'Policy injected include_usage',\n 'Client sent include_usage'\n)\n| order by GatewayTime desc\n| take rowLimit\n| project\n ['Timestamp (UTC)'] = format_datetime(GatewayTime, '{DateTimeFormat}'),\n ['Business Unit'] = BusinessUnit,\n ['Response Code'] = toint(ResponseCode),\n Model = coalesce(ModelName, 'N/A'),\n ['API Surface'] = ApiSurface,\n ['AI Delivery Mode'] = AiDeliveryMode,\n ['Usage Provenance'] = UsageProvenance,\n ['Prompt Tokens'] = PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Gateway Total (ms)'] = TotalTime,\n ['Backend (ms)'] = BackendTime,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost,\n API = ApiId,\n Operation = OperationId,\n CorrelationId", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 0, diff --git a/samples/costing/create.ipynb b/samples/costing/create.ipynb index 5292ef0..8133b56 100644 --- a/samples/costing/create.ipynb +++ b/samples/costing/create.ipynb @@ -142,9 +142,18 @@ "# AI model test matrix - drives which models are deployed to Azure AI Services and\n", "# how many requests per simulated caller are generated against each.\n", "# Each entry: model name -> { version, capacity (K TPM), requests_per_caller }\n", + "#\n", + "# Models chosen to exercise three cost tiers AND ensure full emit-token-metric\n", + "# coverage on the Responses API. Per Microsoft Learn (azure-openai-emit-token-\n", + "# metric-policy), the Responses-API allow-list for the policy is limited to a\n", + "# specific set of model snapshots. gpt-4o-mini (2024-07-18) and gpt-4.1-nano\n", + "# (2025-04-14) are on that list; gpt-5-mini (2025-08-07) is NOT, so its\n", + "# Responses-API rows reach the workbook only via ApiManagementGatewayLlmLog\n", + "# (the diagnostic-log path), not via emit-token-metric custom metrics.\n", "model_test_matrix = {\n", - " 'gpt-5-mini': {'version': '2025-08-07', 'capacity': 10, 'requests_per_caller': 5},\n", - " 'gpt-4o-mini': {'version': '2024-07-18', 'capacity': 10, 'requests_per_caller': 3},\n", + " 'gpt-4o-mini': {'version': '2024-07-18', 'capacity': 10, 'requests_per_caller': 6},\n", + " 'gpt-4.1-nano': {'version': '2025-04-14', 'capacity': 10, 'requests_per_caller': 8},\n", + " 'gpt-5-mini': {'version': '2025-08-07', 'capacity': 10, 'requests_per_caller': 10},\n", "}\n", "\n", "# Derived: list form used for Bicep deployment and iteration\n", @@ -201,7 +210,7 @@ "\n", "if not subscription_id:\n", " print_error('Could not determine Azure subscription ID. Run: az login')\n", - " raise SystemExit(1)" + " raise SystemExit(1)\n" ] }, { @@ -328,20 +337,35 @@ "print_info(f'APIM pricing as of {APIM_PRICING_AS_OF}: {APIM_PRICING_URL}', True)\n", "print()\n", "\n", - "# Per-model pricing table\n", + "# Per-model pricing table. 
Models without local pricing data are skipped\n", + "# (with a warning) so the table still renders for the remaining models.\n", + "# Cost calculations downstream (workbook, KQL) handle missing rates the\n", + "# same way: no rate => no $ figure for that model.\n", "print_info('AI model pricing (per 1K tokens)')\n", + "pricing_rows = []\n", + "missing_pricing = []\n", + "for m in model_test_matrix:\n", + " try:\n", + " mp = get_model_pricing(m)\n", + " except ValueError:\n", + " missing_pricing.append(m)\n", + " continue\n", + " pricing_rows.append([m, f'${mp.prompt_rate_per_k:.5f}', f'${mp.completion_rate_per_k:.5f}'])\n", + "\n", "pricing_table = TableLogger()\n", "pricing_table.header(\n", " Column('Model'),\n", " Column('Prompt $/1K', align='>'),\n", " Column('Completion $/1K', align='>'),\n", ")\n", - "pricing_table.populate([\n", - " [m, f'${(mp := get_model_pricing(m)).prompt_rate_per_k:.5f}', f'${mp.completion_rate_per_k:.5f}']\n", - " for m in model_test_matrix\n", - "])\n", + "pricing_table.populate(pricing_rows)\n", "pricing_table.print()\n", "\n", + "if missing_pricing:\n", + " print()\n", + " print_warning(f'No local pricing data for: {\", \".join(missing_pricing)}. These models will run, but cost figures will be omitted.')\n", + " print_info(f'Add rates to shared/python/azure_cost.py (see {AOAI_PRICING_URL}).', True)\n", + "\n", "print_info(f'Azure OpenAI pricing as of {AOAI_PRICING_AS_OF}: {AOAI_PRICING_URL}', True)\n" ] }, @@ -744,15 +768,33 @@ "source": [ "### 🤖 D1 — [Traffic · AI Foundry] Real AOAI Interactions\n", "\n", - "When `enable_foundry = True` (default), the deployment provisions an Azure AI Services account with a model deployment plus an APIM backend with managed-identity auth. This cell sends **real** Azure OpenAI chat completions through the APIM gateway across all BUs and models so you see end-to-end token tracking. The mock simulation in D2 is **skipped** when this cell runs so AI traffic numbers reflect only real Foundry calls.\n", + "When `enable_foundry = True` (default), the deployment provisions an Azure AI Services account with a model deployment plus an APIM backend with managed-identity auth. This cell sends **real** Azure OpenAI traffic through the APIM gateway across all BUs and models so you see end-to-end token tracking on both the **Chat Completions** and **Responses** APIs. The mock simulation in D2 is **skipped** when this cell runs so AI traffic numbers reflect only real Foundry calls.\n", + "\n", + "Six delivery modes are exercised on a `j % 6` rotation per BU per model so a single cell run covers the full surface APIM diagnostics must handle:\n", + "\n", + "| `j % 6` | API | Mode |\n", + "|---------|---------------|---------------------------------------------------------------------|\n", + "| 0 | Chat | Non-streaming |\n", + "| 1 | Chat | Streaming **with** `stream_options.include_usage = true` (client) |\n", + "| 2 | Chat | Streaming **without** `stream_options` (APIM injects + traces it) |\n", + "| 3 | Responses | Non-streaming, stateful (default `store: true`) |\n", + "| 4 | Responses | Streaming |\n", + "| 5 | Responses | Non-streaming, **stateless** (`store: false`) |\n", + "\n", + "Mode 2 is the only path where APIM's `Ensure-Stream-Include-Usage` fragment mutates the request body and writes a `TraceRecords` proof entry - the workbook's *Streaming usage source* tile surfaces this. 
Mode 5 demonstrates the stateless Responses pattern (no chaining via `previous_response_id`); per-request token counts match mode 3.\n", + "\n", + "#### Zero-impact streaming token capture\n", "\n", - "Two delivery modes are exercised per BU per model:\n", - "1. **Non-streaming** — standard JSON response with a `usage` object.\n", - "2. **Streaming (SSE)** — half the requests send `stream_options.include_usage = true` themselves; the other half omit it so the APIM policy can add it and log proof in `TraceRecords`.\n", + "This sample is intentionally **zero-impact on the streaming response path**: APIM never buffers, parses, or rewrites the response body. Token counts come from two zero-impact sources:\n", "\n", - "> **Note:** The outbound policy buffers streaming responses to extract usage. In production, prefer the built-in `azure-openai-emit-token-metric` policy for zero-impact streaming.\n", + "1. **`ApiManagementGatewayLlmLog`** (diagnostic log, used by every workbook tile here). APIM's built-in AI gateway diagnostic reads `usage` from the final SSE chunk on the fly and writes one row to Log Analytics per request, with `ModelName`, `PromptTokens`, `CompletionTokens`, `IsStreamCompletion`, and `CorrelationId`. No policy code runs against the response body.\n", + "2. **`azure-openai-emit-token-metric`** ([built-in policy](https://learn.microsoft.com/azure/api-management/azure-openai-emit-token-metric-policy)). Emits prompt/completion/total token counts as Application Insights custom metrics with arbitrary dimensions (CallerId, ModelName, etc.). Also reads the SSE stream without buffering. Use this when you need real-time, per-caller token metrics in App Insights instead of (or alongside) Log Analytics.\n", "\n", - "> **Double-counting warning:** Do NOT enable both the custom `emit-metric` token tracking and the built-in `azure-openai-emit-token-metric` policy simultaneously — they would emit duplicate metrics.\n" + "The custom inbound `emit-metric` in this sample emits `caller-requests` with `value=\"1\"` only - a request **counter**, not a token parser - so it never touches the response body and never duplicates token counts.\n", + "\n", + "> **Production guidance:** Use the built-in `azure-openai-emit-token-metric` for per-caller token metrics in App Insights. **Never** parse the response body in an outbound policy to extract tokens - that buffers the SSE stream, breaks streaming UX, and adds latency.\n", + "\n", + "> **Double-counting warning:** If you add `azure-openai-emit-token-metric` to this sample, do **not** also add a separate outbound `emit-metric` that parses tokens from the response body - the two would emit duplicate `prompt-tokens` / `completion-tokens` metrics. The existing `caller-requests` counter is safe to keep alongside either.\n" ] }, { @@ -820,9 +862,10 @@ " extra_headers={'Content-Type': 'application/json'},\n", " )\n", "\n", - " # Request bodies reused across the (BU, model) loop. Streaming requests\n", - " # intentionally alternate between client-supplied include_usage=true and\n", - " # omitted include_usage so the APIM policy can prove when it injected it.\n", + " # Request bodies reused across the (BU, model) loop. The dispatcher in\n", + " # send_aoai_traffic cycles j%6 across these to exercise both Chat and\n", + " # Responses APIs in non-streaming + streaming + (Responses-only) stateless\n", + " # variants.\n", " chat_body = {\n", " 'messages': [\n", " {'role': 'system', 'content': 'You are a helpful assistant. 
Keep responses brief.'},\n", @@ -846,12 +889,33 @@ " 'stream': True,\n", " }\n", "\n", + " # Responses API bodies. The Responses surface uses `input` (string or list)\n", + " # and `max_output_tokens` instead of Chat Completions' `messages` and\n", + " # `max_completion_tokens`. Mode 5 sets `store: false` for the stateless\n", + " # variant - per-request token counts are identical to mode 3.\n", + " responses_body = {\n", + " 'model': '', # filled per-iteration below; 'model' must match the deployment name\n", + " 'input': 'Summarize APIM AI Gateway capabilities in one sentence.',\n", + " 'max_output_tokens': 100,\n", + " }\n", + " responses_stream_body = {**responses_body, 'stream': True}\n", + " responses_stateless_body = {**responses_body, 'store': False}\n", + "\n", " # --- Multi-BU traffic generation (sequential per BU per model) ---\n", " print_info('Generating multi-BU AOAI traffic for cost tracking (per model)...')\n", "\n", " # Per-model and per-(BU, model) request counters drive the summary tables\n", " # printed below and the JSON persisted for the workbook cross-reference (E3).\n", - " model_request_counts = {m: {'non_streaming': 0, 'streaming': 0} for m in model_test_matrix}\n", + " # Each value is a six-key dict matching the dispatcher's mode keys.\n", + " _empty_counts = {\n", + " 'chat_non_streaming': 0,\n", + " 'chat_stream_with_usage': 0,\n", + " 'chat_stream_without_usage': 0,\n", + " 'responses_non_streaming': 0,\n", + " 'responses_stream': 0,\n", + " 'responses_non_streaming_stateless': 0,\n", + " }\n", + " model_request_counts: dict[str, dict[str, int]] = {m: dict(_empty_counts) for m in model_test_matrix}\n", " bu_model_counts: dict[tuple[str, str], dict[str, int]] = {}\n", " bu_model_planned: dict[tuple[str, str], dict[str, int]] = {}\n", "\n", @@ -859,6 +923,11 @@ " for model_name, cfg in model_test_matrix.items():\n", " base_requests = cfg['requests_per_caller']\n", " model_chat_url = f'{endpoint_url}/{aoai_api_path}/deployments/{model_name}/chat/completions'\n", + " model_responses_url = f'{endpoint_url}/{aoai_api_path}/responses'\n", + " # Responses API requires `model` in the body (deployment name).\n", + " model_responses_body = {**responses_body, 'model': model_name}\n", + " model_responses_stream_body = {**responses_stream_body, 'model': model_name}\n", + " model_responses_stateless_body = {**responses_stateless_body, 'model': model_name}\n", " print()\n", " print_info(f'-> Model: {model_name} (base {base_requests} requests per BU, scaled by request_weight)')\n", "\n", @@ -876,20 +945,36 @@ " # proportionally more tokens than lighter ones.\n", " bu_request_count = max(1, int(base_requests * bu_info.get('request_weight', 1.0)))\n", "\n", - " ns, s, planned_ns, planned_s, _ = send_aoai_traffic(\n", + " delivered, planned, _ = send_aoai_traffic(\n", " session, model_chat_url, caller_headers, bu_request_count,\n", " chat_body=chat_body,\n", " stream_body=stream_body_with_usage,\n", " stream_body_without_usage=stream_body_without_usage,\n", + " responses_url=model_responses_url,\n", + " responses_body=model_responses_body,\n", + " responses_stream_body=model_responses_stream_body,\n", + " responses_stateless_body=model_responses_stateless_body,\n", " )\n", "\n", - " model_request_counts[model_name]['streaming'] += s\n", - " model_request_counts[model_name]['non_streaming'] += ns\n", - " bu_model_counts[(bu_name, model_name)] = {'non_streaming': ns, 'streaming': s}\n", - " bu_model_planned[(bu_name, model_name)] = {'non_streaming': planned_ns, 'streaming': 
planned_s}\n", + " # Aggregate per-mode delivered counts into the per-model table.\n", + " for k, v in delivered.items():\n", + " model_request_counts[model_name][k] += v\n", + " bu_model_counts[(bu_name, model_name)] = delivered\n", + " bu_model_planned[(bu_name, model_name)] = planned\n", + "\n", + " chat_total = (\n", + " delivered['chat_non_streaming']\n", + " + delivered['chat_stream_with_usage']\n", + " + delivered['chat_stream_without_usage']\n", + " )\n", + " resp_total = (\n", + " delivered['responses_non_streaming']\n", + " + delivered['responses_stream']\n", + " + delivered['responses_non_streaming_stateless']\n", + " )\n", " print_ok(\n", " f' Sent {bu_request_count} requests for {bu_name} as {caller[\"name\"]}'\n", - " f' ({caller[\"appid\"][:12]}...) [{ns} non-streaming, {s} streaming]'\n", + " f' ({caller[\"appid\"][:12]}...) [chat: {chat_total}, responses: {resp_total}]'\n", " )\n", " finally:\n", " session.close()\n", @@ -902,8 +987,8 @@ " print_info('Each request emits a caller-requests metric entry')\n", " print_info('Token counts are captured via the APIM diagnostic setting (ApiManagementGatewayLlmLog)')\n", " print_info(\n", - " 'Streaming requests alternate between client-supplied include_usage and '\n", - " 'policy-injected include_usage; the workbook surfaces this via TraceRecords'\n", + " 'The dispatcher cycles six modes: 3 Chat (sync, stream w/usage, stream w/o usage) + '\n", + " '3 Responses (sync, stream, sync stateless). Mode 2 is the only path APIM rewrites.'\n", " )\n", " print_info('Note: Custom metrics typically take 5-10 minutes to appear in Application Insights')\n", "\n", @@ -929,15 +1014,15 @@ "source": [ "### 🤖 D2 — [Traffic · AI Mock] Mock Token Tracking (Skipped When Foundry Is On)\n", "\n", - "When `enable_foundry = False`, this cell exercises the AI-gateway pattern against a mock backend (`httpbin`) so you can see per-caller `caller-tokens` custom metrics without provisioning Foundry. When D1 sent real Foundry traffic, this cell is **skipped** to avoid muddling the AI request counts shown in the workbook.\n", + "When `enable_foundry = False`, this cell exercises the AI-gateway pattern against a mock backend (`httpbin`) so you can see per-caller request attribution without provisioning Foundry. When D1 sent real Foundry traffic, this cell is **skipped** to avoid muddling the AI request counts shown in the workbook.\n", "\n", - "The mock policy returns a hard-coded usage payload like:\n", + "The mock operation policy returns a hard-coded Azure OpenAI-shaped payload like:\n", "\n", "```json\n", "{ \"usage\": { \"prompt_tokens\": 100, \"completion_tokens\": 200, \"total_tokens\": 300 } }\n", "```\n", "\n", - "The `emit-metric` policy reads those fields and emits separate `prompt-tokens`, `completion-tokens`, and `total-tokens` metric entries — same shape as the real Foundry traffic from D1.\n" + "so you can inspect a realistic response shape end-to-end. The same `emit-metric` policy used everywhere else emits a single `caller-requests` (count) metric per call with `CallerId`, `API`, and `Operation` dimensions - this is what drives the workbook's caller-attribution tile. 
Token-level data is **not** captured for this mock API (`enableLlmLogging` is intentionally off on `costing-token-tracking-api`); for real per-request token counts, run D1 against Foundry instead.\n"
   ]
  },
  {
@@ -1142,7 +1227,9 @@
    "source": [
     "### 🔍 E2 — [Verify] Metric Ingestion\n",
     "\n",
-    "Poll Application Insights for the `caller-requests`, `prompt-tokens`, `completion-tokens`, and `total-tokens` custom metrics emitted by the `emit-metric` policies in C2/D1/D2. Custom metric ingestion can take 5-10 minutes after first emission.\n"
+    "Poll Application Insights for the `caller-requests` custom metric emitted by the `emit-metric` policy on the C2 / D1 / D2 APIs, then print a per-caller breakdown. Custom metric ingestion can take 5-10 minutes after first emission.\n",
+    "\n",
+    "Token counts (`PromptTokens`, `CompletionTokens`, `TotalTokens`) are **not** emitted as App Insights custom metrics in this sample - they are captured by the APIM AI gateway diagnostic into `ApiManagementGatewayLlmLog` in Log Analytics and verified in **E1**. To emit token counts as App Insights metrics in a production setup, add the built-in [`azure-openai-emit-token-metric`](https://learn.microsoft.com/azure/api-management/azure-openai-emit-token-metric-policy) policy.\n"
    ]
  },
  {
diff --git a/samples/costing/main.bicep b/samples/costing/main.bicep
index f33fa69..0784f08 100644
--- a/samples/costing/main.bicep
+++ b/samples/costing/main.bicep
@@ -85,6 +85,16 @@ param aiModels array = [
     version: '2025-08-07'
     capacity: 10
   }
+  {
+    name: 'gpt-4o-mini'
+    version: '2024-07-18'
+    capacity: 10
+  }
+  {
+    name: 'gpt-4.1-nano'
+    version: '2025-04-14'
+    capacity: 10
+  }
 ]
diff --git a/shared/apim-policies/fragments/pf-ensure-stream-include-usage.xml b/shared/apim-policies/fragments/pf-ensure-stream-include-usage.xml
index 09bc32d..dc7a2da 100644
--- a/shared/apim-policies/fragments/pf-ensure-stream-include-usage.xml
+++ b/shared/apim-policies/fragments/pf-ensure-stream-include-usage.xml
@@ -28,6 +28,13 @@
 (preserveContent: true);
+
+        // Skip rewriting for Responses-API-shaped bodies (no messages array).
+        if (body["messages"] == null) {
+            return body.ToString();
+        }
+
         if (body["stream"]?.Value<bool>() == true) {
             // Ensure stream_options object exists.
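+            // Note: only Chat Completions-shaped bodies reach this point - the
+            // guard above returned Responses-API bodies unmodified.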
if (body["stream_options"] == null) { diff --git a/shared/python/azure_cost.py b/shared/python/azure_cost.py index d7e0bc2..43556be 100644 --- a/shared/python/azure_cost.py +++ b/shared/python/azure_cost.py @@ -151,6 +151,12 @@ class ModelPricing: prompt_rate_per_k=0.00015, # $0.15 / 1M input tokens completion_rate_per_k=0.0006, # $0.60 / 1M output tokens ), + ('gpt-4.1-nano', 'globalstandard'): ModelPricing( + model='gpt-4.1-nano', + sku='GlobalStandard', + prompt_rate_per_k=0.0001, # $0.10 / 1M input tokens + completion_rate_per_k=0.0004, # $0.40 / 1M output tokens + ), } diff --git a/tests/python/test_costing_helpers.py b/tests/python/test_costing_helpers.py new file mode 100644 index 0000000..ce54f6d --- /dev/null +++ b/tests/python/test_costing_helpers.py @@ -0,0 +1,168 @@ +"""Tests for `samples/costing/_helpers.py` 6-mode AOAI traffic dispatcher.""" + +import sys +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +import requests as http_requests + +# APIM Samples imports +COSTING_DIR = Path(__file__).resolve().parents[2] / 'samples' / 'costing' +sys.path.insert(0, str(COSTING_DIR)) + +from _helpers import send_aoai_traffic # noqa: E402 + +CHAT_URL = 'https://apim.example.com/aoai/deployments/gpt/chat/completions' +RESPONSES_URL = 'https://apim.example.com/aoai/responses' +CALLER_HEADERS = {'Ocp-Apim-Subscription-Key': 'k', 'Authorization': 'Bearer t'} + +CHAT_BODY = {'messages': [{'role': 'user', 'content': 'hi'}], 'max_completion_tokens': 50} +STREAM_BODY = {**CHAT_BODY, 'stream': True, 'stream_options': {'include_usage': True}} +STREAM_BODY_NO_USAGE = {**CHAT_BODY, 'stream': True} +RESPONSES_BODY = {'model': 'gpt', 'input': 'hi', 'max_output_tokens': 50} +RESPONSES_STREAM_BODY = {**RESPONSES_BODY, 'stream': True} +RESPONSES_STATELESS_BODY = {**RESPONSES_BODY, 'store': False} + +ALL_KEYS = ( + 'chat_non_streaming', + 'chat_stream_with_usage', + 'chat_stream_without_usage', + 'responses_non_streaming', + 'responses_stream', + 'responses_non_streaming_stateless', +) + + +def _make_session() -> MagicMock: + session = MagicMock() + response = MagicMock() + response.status_code = 200 + response.iter_lines.return_value = iter([]) + session.post.return_value = response + return session + + +def _full_kwargs() -> dict: + return { + 'chat_body': CHAT_BODY, + 'stream_body': STREAM_BODY, + 'stream_body_without_usage': STREAM_BODY_NO_USAGE, + 'responses_url': RESPONSES_URL, + 'responses_body': RESPONSES_BODY, + 'responses_stream_body': RESPONSES_STREAM_BODY, + 'responses_stateless_body': RESPONSES_STATELESS_BODY, + } + + +def test_six_requests_cycle_all_six_modes_exactly_once(): + session = _make_session() + + delivered, planned, bailed = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs()) + + assert bailed is False + for key in ALL_KEYS: + assert delivered[key] == 1, f'{key} should have exactly one delivered request' + assert planned[key] == 1, f'{key} should have exactly one planned request' + assert session.post.call_count == 6 + + +def test_dispatcher_routes_each_mode_to_correct_url_and_body(): + session = _make_session() + + send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs()) + + calls = session.post.call_args_list + expected = [ + (CHAT_URL, CHAT_BODY), + (CHAT_URL, STREAM_BODY), + (CHAT_URL, STREAM_BODY_NO_USAGE), + (RESPONSES_URL, RESPONSES_BODY), + (RESPONSES_URL, RESPONSES_STREAM_BODY), + (RESPONSES_URL, RESPONSES_STATELESS_BODY), + ] + + for j, (url, body) in enumerate(expected): + args, kwargs = calls[j] + 
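+        # call_args_list preserves invocation order, so index j corresponds to dispatch mode j.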
+        assert args[0] == url, f'mode {j} url mismatch'
+        assert kwargs['json'] == body, f'mode {j} body mismatch'
+
+
+def test_responses_stateless_body_carries_store_false():
+    session = _make_session()
+
+    send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs())
+
+    mode_5_call = session.post.call_args_list[5]
+    assert mode_5_call.kwargs['json'].get('store') is False
+
+
+def test_streaming_modes_drain_response_lines():
+    session = _make_session()
+
+    send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs())
+
+    # Modes 1, 2, 4 are streaming; iter_lines must be called for each.
+    response = session.post.return_value
+    assert response.iter_lines.call_count == 3
+
+
+def test_falls_back_to_chat_when_responses_inputs_missing():
+    session = _make_session()
+
+    kwargs = _full_kwargs()
+    kwargs['responses_url'] = None
+    kwargs['responses_body'] = None
+    kwargs['responses_stream_body'] = None
+    kwargs['responses_stateless_body'] = None
+
+    delivered, planned, _ = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **kwargs)
+
+    # Modes 3 and 5 should fall back to mode 0 (chat non-streaming);
+    # mode 4 should fall back to mode 1 (chat streaming with usage).
+    assert delivered['responses_non_streaming'] == 0
+    assert delivered['responses_stream'] == 0
+    assert delivered['responses_non_streaming_stateless'] == 0
+    assert delivered['chat_non_streaming'] == 3  # j=0, plus fallbacks from j=3 and j=5
+    assert delivered['chat_stream_with_usage'] == 2  # j=1, plus fallback from j=4
+    assert delivered['chat_stream_without_usage'] == 1  # j=2
+    assert sum(planned.values()) == 6
+
+
+def test_falls_back_when_stream_body_without_usage_missing():
+    session = _make_session()
+
+    kwargs = _full_kwargs()
+    kwargs['stream_body_without_usage'] = None
+
+    delivered, _planned, _ = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **kwargs)
+
+    # Mode 2 should fall back to mode 1 (stream_body with usage).
+    assert delivered['chat_stream_without_usage'] == 0
+    assert delivered['chat_stream_with_usage'] == 2  # j=1 + j=2 (fallback)
+
+
+def test_timeout_bails_remaining_requests():
+    session = MagicMock()
+    response = MagicMock()
+    response.status_code = 200
+    response.iter_lines.return_value = iter([])
+
+    # First call succeeds, second times out, remainder should be skipped.
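+    # A bail is expected: after the Timeout, send_aoai_traffic should stop
+    # dispatching, so only the first two modes are ever attempted (asserted below).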
+ session.post.side_effect = [response, http_requests.Timeout()] + + delivered, planned, bailed = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, 6, **_full_kwargs()) + + assert bailed is True + assert sum(delivered.values()) == 1 + assert sum(planned.values()) == 2 # planned is incremented before the post call + assert session.post.call_count == 2 + + +@pytest.mark.parametrize('count', [0, 1, 7, 13]) +def test_planned_count_always_equals_request_count(count): + session = _make_session() + + _delivered, planned, _ = send_aoai_traffic(session, CHAT_URL, CALLER_HEADERS, count, **_full_kwargs()) + + assert sum(planned.values()) == count From 97c764f2b845d1b96d82f81fd773a8a774defcee Mon Sep 17 00:00:00 2001 From: Simon Kurtz Date: Fri, 1 May 2026 13:21:45 -0400 Subject: [PATCH 2/4] Refine workbook data and tiles --- samples/costing/README.md | 2 +- samples/costing/costing.workbook.json | 185 +++++++++++++++++++++++--- samples/costing/create.ipynb | 6 +- 3 files changed, 172 insertions(+), 21 deletions(-) diff --git a/samples/costing/README.md b/samples/costing/README.md index b23628a..328d0fa 100644 --- a/samples/costing/README.md +++ b/samples/costing/README.md @@ -131,7 +131,7 @@ This lab deploys and configures: - **Azure Monitor Workbook** - Pre-built tabbed dashboard with: - **Subscription-Based Costing tab**: Cost allocation table (base + variable cost per BU), base vs variable cost stacked bar chart, cost breakdown by API, request count and distribution charts, success/error rate analysis, response code distribution, business unit drill-down - **Entra ID Application Costing tab**: Usage by caller ID (bar chart + table), cost allocation by caller (table + pie chart), hourly request trend by caller - - **AI Gateway Token/PTU tab**: Three rows of summary tiles grouped under **APIM Inbound** (total APIM requests, AI APIM requests, inbound), **AI Backend** (backend requests, successful, throttled, failed), and **Tokens** (total tokens), followed by a request-funnel table, scope-reconciliation explainer + table, token cost allocation table with configurable per-1K-token rates, model and streaming pie charts, streaming vs non-streaming breakdown table, token-share pie, and hourly token-type trend chart + - **AI Gateway Token/PTU tab**: Summary tiles grouped under **APIM Inbound** (AI Requests across all subs, AI Requests per BU) and **AI Backend** (a Successful row with `Successful (all 2xx)`, `Successful (2xx, with tokens)`, `Successful (no tokens)`, and an Errors row with `Throttled (429)`, `Client Errors (4xx)`, `Server Errors (5xx)`), then a **Tokens** row (total tokens), followed by a request-funnel table, a Token Coverage Investigation drill-in for `Successful (no tokens)`, scope-reconciliation explainer + table, token cost allocation table with configurable per-1K-token rates, model and streaming pie charts, streaming vs non-streaming breakdown table, token-share pie, and hourly token-type trend chart - **SKU-Based Pricing** - Automatically derives base monthly cost, overage rate, and included request allowance from the deployed APIM SKU using built-in pricing data (sourced from the [Azure API Management pricing page](https://azure.microsoft.com/pricing/details/api-management/), March 2026) - **Budget Alerts** (optional) - Per-BU scheduled query alerts when request thresholds are exceeded diff --git a/samples/costing/costing.workbook.json b/samples/costing/costing.workbook.json index 43bb777..c24ce7d 100644 --- a/samples/costing/costing.workbook.json +++ 
b/samples/costing/costing.workbook.json @@ -212,7 +212,7 @@ "items": [ { "content": { - "json": "When APIM is used as an **AI Gateway**, the built-in model-level metrics (PTU utilization, token counts) do not break down by caller. The **AI Gateway Token/PTU** tab fills that gap using `ApiManagementGatewayLlmLog` diagnostic data, joined with `ApiManagementGatewayLogs` to **attribute token consumption to APIM subscriptions (business units)**.\n\n> Requires the `GatewayLlmLogs` diagnostic category to be enabled. For per-model pricing and what these cost estimates do **not** include (PTU, Batch, cached-input, taxes, EA pricing), see the **AI Gateway - Parameters & Pricing** section below.\n\n#### How an AI request flows through APIM\n\n1. **Inbound** - APIM accepts the HTTPS request and matches it to an API + operation. The `Ocp-Apim-Subscription-Key` header maps the call to a `bu-*` product subscription, which is the dimension every query in this workbook attributes cost to.\n2. **Inbound policy chain** - Sample policies enforce per-subscription rate limits and emit telemetry. Requests that exceed the configured limit are rejected with **HTTP 429** at this stage; they never reach the AI backend, so no `ApiManagementGatewayLlmLog` row is written and no tokens are consumed.\n3. **Backend dispatch** - APIM forwards the surviving request to the configured AI backend (Azure OpenAI / AI Foundry). For load-balanced backends, APIM picks a pool member and can fail over on transient errors.\n4. **LLM diagnostic capture** - On a successful (`2xx`) response, APIM writes one row to `ApiManagementGatewayLlmLog` with `PromptTokens`, `CompletionTokens`, `TotalTokens`, `ModelName`, and `IsStreamCompletion`. Streaming responses only carry token counts when `stream_options.include_usage = true` reaches the backend. In this sample, some streaming requests intentionally omit that flag so the policy fragment can add it; when it does, APIM also writes proof into `ApiManagementGatewayLogs.TraceRecords`. Backend `5xx` responses produce no LLM-log row.\n5. **Outbound** - APIM returns the response to the client and writes one row to `ApiManagementGatewayLogs` regardless of outcome. That gateway-log row is what the **AI Requests Received (bu-*)** tile counts.\n\n#### Reading the AI Gateway tiles\n\nThe blue tiles form a **monotonically non-increasing funnel**: every stage is a strict subset of the previous one.\n\n```\nTotal APIM Requests >= AI APIM Requests (all subs) >= AI Requests Received (bu-*)\n >= AI Not Throttled >= AI Successful (2xx)\n```\n\n**Why two tiles can be equal:** `AI APIM Requests (all subs)` and `AI Requests Received (bu-*)` differ only on the subscription filter. They are equal whenever every AI call was made with a `bu-*` subscription key, which is the default for this sample. They diverge in production environments that mix BU subscriptions with starter/unlimited products or master-key calls.\n\nThe red tiles - **AI Throttled (429)**, **AI Client Errors (4xx, non-429)**, and **AI Failed (5xx)** - sit *outside* the funnel and surface the gaps:\n\n- **AI Throttled (429)** = `AI Requests Received (bu-*)` - `AI Not Throttled`\n- **AI Client Errors (4xx, non-429)** counts requests rejected with status codes such as 400 (bad prompt), 401/403 (auth or content-safety failure), 404 (wrong deployment/model), 408/413/422 (timeout, payload, validation). 
These are included in `AI Not Throttled` because they are not 429s, but most never reach the AI backend.\n- **AI Failed (5xx)** = backend or APIM 5xx responses; included in `AI Not Throttled` but excluded from `AI Successful (2xx)`.\n\n**AI Successful (2xx)** and **Total Tokens Used** are the `2xx` subset - `429` throttles, non-429 4xx client errors, and `5xx` failures all contribute zero tokens.\n\n**Note on 3xx:** AI chat-completion endpoints are POSTs and do not produce redirect responses in normal operation, so there is no dedicated 3xx tile. The **HTTP Response Code Distribution** chart on the *Usage Analytics* tab will surface any 3xx responses if they ever occur.\n\n#### Why AI APIs are listed manually\n\nAPIM diagnostic logs only carry identifiers like `ApiId`, `OperationId`, and `BackendId` - they do **not** include APIM tags. Azure Resource Graph also does not index APIM API tag associations, so the workbook cannot auto-discover AI APIs from inside Log Analytics. The **AI API names** parameter on the AI Gateway tab is therefore a plain comma-separated list. See the parameter description for an `az rest` snippet that lists APIs by tag.\n\n#### Streaming, model breakdown, PTU\n\n- **Streaming vs Non-Streaming** - confirms APIM captures token counts for **both** delivery modes. The streaming visuals split requests into `Streaming (client-supplied usage)` and `Streaming (policy-injected usage)` using `ApiManagementGatewayLogs.TraceRecords`, so you can prove when APIM altered the request before forwarding it.\n- **Model breakdown** - useful when multiple models (gpt-5-mini, gpt-4o-mini, etc.) are served through the same APIM gateway.\n- **PTU utilization** - the dashed line on the trend chart represents the configurable PTU capacity threshold. If a business unit's hourly token rate approaches the PTU capacity, consider provisioning more throughput or applying rate limiting." + "json": "When APIM is used as an **AI Gateway**, the built-in model-level metrics (PTU utilization, token counts) do not break down by caller. The **AI Gateway Token/PTU** tab fills that gap using `ApiManagementGatewayLlmLog` diagnostic data, joined with `ApiManagementGatewayLogs` to **attribute token consumption to APIM subscriptions (business units)**.\n\n> Requires the `GatewayLlmLogs` diagnostic category to be enabled. For per-model pricing and what these cost estimates do **not** include (PTU, Batch, cached-input, taxes, EA pricing), see the **AI Gateway - Parameters & Pricing** section below.\n\n#### How an AI request flows through APIM\n\n1. **Inbound** - APIM accepts the HTTPS request and matches it to an API + operation. The `Ocp-Apim-Subscription-Key` header maps the call to a `bu-*` product subscription, which is the dimension every query in this workbook attributes cost to.\n2. **Inbound policy chain** - Sample policies enforce per-subscription rate limits and emit telemetry. Requests that exceed the configured limit are rejected with **HTTP 429** at this stage; they never reach the AI backend, so no `ApiManagementGatewayLlmLog` row is written and no tokens are consumed.\n3. **Backend dispatch** - APIM forwards the surviving request to the configured AI backend (Azure OpenAI / AI Foundry). For load-balanced backends, APIM picks a pool member and can fail over on transient errors.\n4. **LLM diagnostic capture** - On a successful (`2xx`) response, APIM writes one row to `ApiManagementGatewayLlmLog` with `PromptTokens`, `CompletionTokens`, `TotalTokens`, `ModelName`, and `IsStreamCompletion`. 
Streaming responses only carry token counts when `stream_options.include_usage = true` reaches the backend. In this sample, some streaming requests intentionally omit that flag so the policy fragment can add it; when it does, APIM also writes proof into `ApiManagementGatewayLogs.TraceRecords`. Backend `5xx` responses produce no LLM-log row.\n5. **Outbound** - APIM returns the response to the client and writes one row to `ApiManagementGatewayLogs` regardless of outcome. That gateway-log row is what the **AI Requests per BU** tile counts.\n\n#### Reading the AI Gateway tiles\n\nThe **APIM Inbound** row tiles form a **monotonically non-increasing funnel** down to the **AI Backend** row, where the *Successful* tiles continue the funnel and the error tiles surface the gaps:\n\n```\nTotal APIM Requests >= AI Requests (all subs) >= AI Requests per BU\n >= Successful (all 2xx) >= Successful (2xx, with tokens)\n```\n\n**Why two tiles can be equal:** `AI Requests (all subs)` and `AI Requests per BU` differ only on the subscription filter. They are equal whenever every AI call was made with a `bu-*` subscription key, which is the default for this sample. They diverge in production environments that mix BU subscriptions with starter/unlimited products or master-key calls.\n\n**AI Backend - row 1 (Successful):**\n\n- **Successful (all 2xx)** = every `2xx` response from a `bu-*` subscription on an AI API. This is the total of token-bearing and no-token successes.\n- **Successful (2xx, with tokens)** = the **token-bearing subset** - rows in `ApiManagementGatewayLlmLog` with `TotalTokens > 0`, `CompletionTokens > 0`, and a non-empty `ModelName`. **Total Tokens Used** sums tokens for this same subset. This is the count that drives showback.\n- **Successful (no tokens)** = `Successful (all 2xx)` - `Successful (2xx, with tokens)`. These are real, billable AI calls that returned `2xx` but landed without measurable token data - typically because a streaming response lost its final `usage` chunk, the backend returned an empty completion or content-safety refusal, or the LLM-log row carrying `ModelName` was dropped. Drill in below via the **Token Coverage Investigation** section.\n\n**AI Backend - row 2 (Errors):** these tiles sit *outside* the success funnel:\n\n- **Throttled (429)** = `AI Requests per BU` minus all non-429 outcomes. Rejected by APIM rate-limit policy before reaching the backend; no tokens consumed.\n- **Client Errors (4xx)** counts non-429 requests rejected with status codes such as 400 (bad prompt), 401/403 (auth or content-safety failure), 404 (wrong deployment/model), 408/413/422 (timeout, payload, validation). Most never reach the AI backend.\n- **Server Errors (5xx)** = backend or APIM 5xx responses. The number of 2xx requests *without* matching token data is surfaced in the **Token Coverage Investigation** section directly below the AI Gateway tiles, with a drill-in by API, operation, and likely cause. In production, you should expect this gap to be small but non-zero - track it as a quality KPI for showback accuracy.\n\n**Note on 3xx:** AI chat-completion endpoints are POSTs and do not produce redirect responses in normal operation, so there is no dedicated 3xx tile. The **HTTP Response Code Distribution** chart on the *Usage Analytics* tab will surface any 3xx responses if they ever occur.\n\n#### Why AI APIs are listed manually\n\nAPIM diagnostic logs only carry identifiers like `ApiId`, `OperationId`, and `BackendId` - they do **not** include APIM tags. 
Azure Resource Graph also does not index APIM API tag associations, so the workbook cannot auto-discover AI APIs from inside Log Analytics. The **AI API names** parameter on the AI Gateway tab is therefore a plain comma-separated list. See the parameter description for an `az rest` snippet that lists APIs by tag.\n\n#### Streaming, model breakdown, PTU\n\n- **Streaming vs Non-Streaming** - confirms APIM captures token counts for **both** delivery modes. The streaming visuals split requests into `Streaming (client-supplied usage)` and `Streaming (policy-injected usage)` using `ApiManagementGatewayLogs.TraceRecords`, so you can prove when APIM altered the request before forwarding it.\n- **Model breakdown** - useful when multiple models (gpt-5-mini, gpt-4o-mini, etc.) are served through the same APIM gateway.\n- **PTU utilization** - the dashed line on the trend chart represents the configurable PTU capacity threshold. If a business unit's hourly token rate approaches the PTU capacity, consider provisioning more throughput or applying rate limiting." }, "name": "text - instructions-aigateway-body", "type": 1 @@ -1194,14 +1194,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode != 429\n| summarize Requests = count()\n| extend Label = 'Not throttled', Status = ''\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode between (200 .. 299)\n| summarize Requests = count()\n| extend Label = 'Successful', Status = 'all 2xx'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No gateway requests found for the selected time range.", + "noDataMessage": "No 2xx responses in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1231,9 +1231,9 @@ "showBorder": true } }, - "name": "query - ai-backend-requests-tile", + "name": "query - ai-successful-total-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1241,7 +1241,7 @@ }, { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx)', Status = ''\n| project Label, RequestCount, Status", + "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)', Status = 'billable'\n| project Label, RequestCount, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1280,7 +1280,7 @@ }, "name": "query - 
request-count-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1288,7 +1288,62 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = ''\n| project Label, Requests, Status", + "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 299)\n| project CorrelationId;\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = 'Successful (no tokens)', Status = '⚠ investigate'\n| project Label, NoTokenRequests, Status", + "queryType": 0, + "resourceType": "microsoft.operationalinsights/workspaces", + "size": 4, + "timeContext": { + "durationMs": 2592000000 + }, + "noDataMessage": "All 2xx responses have matching token data.", + "version": "KqlItem/1.0", + "visualization": "tiles", + "tileSettings": { + "titleContent": { + "columnMatch": "Label", + "formatter": 1 + }, + "leftContent": { + "columnMatch": "NoTokenRequests", + "formatter": 12, + "formatOptions": { + "min": 0, + "palette": "yellow" + }, + "numberFormat": { + "unit": 17, + "options": { + "style": "decimal", + "useGrouping": true + } + } + }, + "subtitleContent": { + "columnMatch": "Status", + "formatter": 1 + }, + "showBorder": true + } + }, + "name": "query - ai-2xx-no-tokens-row-tile", + "type": 3, + "customWidth": "33", + "styleSettings": { + "maxWidth": "320px", + "showBorder": false + } + }, + { + "type": 1, + "content": { + "json": " " + }, + "name": "text - ai-backend-row-break", + "customWidth": "100" + }, + { + "content": { + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = 'rate limited'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1327,7 +1382,7 @@ }, "name": "query - ai-throttled-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1335,14 +1390,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 500\n| summarize Requests = count()\n| extend Label = 'Failed (5xx)', Status = ''\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 400 and ResponseCode < 500 and ResponseCode != 429\n| summarize Requests = count()\n| extend Label = 'Client Errors (4xx)', Status = 'non-429 rejects'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", 
"size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No backend failures in the selected time range.", + "noDataMessage": "No non-429 client errors in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1372,9 +1427,9 @@ "showBorder": true } }, - "name": "query - ai-failed-tile", + "name": "query - ai-client-errors-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1382,14 +1437,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 400 and ResponseCode < 500 and ResponseCode != 429\n| summarize Requests = count()\n| extend Label = 'Client Errors (4xx, non-429)', Status = ''\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 500\n| summarize Requests = count()\n| extend Label = 'Server Errors (5xx)', Status = 'backend / APIM'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No non-429 client errors in the selected time range.", + "noDataMessage": "No server errors in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1419,9 +1474,9 @@ "showBorder": true } }, - "name": "query - ai-client-errors-tile", + "name": "query - ai-failed-tile", "type": 3, - "customWidth": "20", + "customWidth": "33", "styleSettings": { "maxWidth": "320px", "showBorder": false @@ -1540,6 +1595,102 @@ "showBorder": true } }, + { + "type": 1, + "content": { + "json": "---\n\n## ⚠️ Token Coverage Investigation\n\nIf **`AI Successful (2xx, with tokens)`** is lower than the total `2xx` responses for the AI APIs, some completed calls returned no usable token data. Those requests are real, billable AI traffic - they just won't appear in token-based showback.\n\n**Why this matters in production:** if you bill business units by tokens, *no-token* requests are unbilled work. For streaming-heavy workloads this can be 10-15% of traffic. Track the *2xx without tokens* count below as a quality KPI; spikes warrant investigation.\n\n**Common causes (most likely first):**\n\n1. **Streaming response missing the final `usage` chunk** - the SSE stream ended before `data: {... \"usage\": {...}}` arrived. Causes: client disconnect, upstream truncation, or the model deployment ignored `stream_options.include_usage`. The `Ensure-Stream-Include-Usage` policy fragment in this sample injects the flag inbound to mitigate this.\n2. **Empty completion / content-filter** - backend returned 200 with `completion_tokens = 0` (refusal, content-safety filter, tool-only response).\n3. **Multi-row LLM-log race** - `ApiManagementGatewayLlmLog` emits multiple rows per call; a transient ingestion drop of the `ModelName`-bearing row leaves the request looking token-less.\n4. 
**Non-chat-completion operations** - models listing, embeddings without diagnostic instrumentation, or operations the LLM diagnostic does not cover.\n\n**How to act on the table below:**\n\n- High counts on a single API + Operation row tagged *Streaming - usage chunk missing* indicate a deployment that needs `stream_options.include_usage` enforced (or a client SDK that's stripping it). Verify the `Ensure-Stream-Include-Usage` fragment is attached.\n- Counts under *Empty completion / content filter* are usually expected baseline noise; spikes may indicate prompt-injection attempts or content-safety policy changes.\n- Counts under *No LLM-log row* are the most concerning - investigate APIM diagnostic settings and confirm `enableLlmLogging` is on for the affected API." + }, + "name": "text - token-coverage-investigation-header", + "customWidth": "100" + }, + { + "content": { + "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 299)\n| project CorrelationId;\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = '2xx without tokens', Status = 'investigate'\n| project Label, NoTokenRequests, Status", + "queryType": 0, + "resourceType": "microsoft.operationalinsights/workspaces", + "size": 4, + "timeContext": { + "durationMs": 2592000000 + }, + "noDataMessage": "All 2xx responses have matching token data.", + "version": "KqlItem/1.0", + "visualization": "tiles", + "tileSettings": { + "titleContent": { + "columnMatch": "Label", + "formatter": 1 + }, + "leftContent": { + "columnMatch": "NoTokenRequests", + "formatter": 12, + "formatOptions": { + "min": 0, + "palette": "orange" + }, + "numberFormat": { + "unit": 17, + "options": { + "style": "decimal", + "useGrouping": true + } + } + }, + "subtitleContent": { + "columnMatch": "Status", + "formatter": 1 + }, + "showBorder": true + } + }, + "name": "query - 2xx-no-tokens-tile", + "type": 3, + "customWidth": "25", + "styleSettings": { + "maxWidth": "320px", + "showBorder": false + } + }, + { + "content": { + "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 
299)\n| project CorrelationId, ApiId, OperationId, ApimSubscriptionId;\nlet llmRollup = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| summarize\n HasAnyRow = countif(true) > 0,\n HasTokens = countif(TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)) > 0,\n HasStream = countif(IsStreamCompletion == true) > 0\n by CorrelationId;\ntwoXx\n| join kind=leftouter llmRollup on CorrelationId\n| where coalesce(HasTokens, false) == false\n| extend ['Likely Cause'] = case(\n HasStream == true, 'Streaming - usage chunk missing',\n HasAnyRow == true, 'Empty completion / content filter',\n 'No LLM-log row (diagnostic gap)')\n| summarize ['No-Token Requests'] = dcount(CorrelationId) by ['Business Unit'] = substring(ApimSubscriptionId, 3), API = ApiId, Operation = OperationId, ['Likely Cause']\n| order by ['No-Token Requests'] desc",
+        "queryType": 0,
+        "resourceType": "microsoft.operationalinsights/workspaces",
+        "size": 0,
+        "timeContext": {
+          "durationMs": 2592000000
+        },
+        "title": "2xx Requests Without Token Data - Drill-in by API, Operation & Likely Cause",
+        "noDataMessage": "All 2xx responses have matching token data.",
+        "version": "KqlItem/1.0",
+        "visualization": "table",
+        "gridSettings": {
+          "formatters": [
+            {
+              "columnMatch": "No-Token Requests",
+              "formatter": 8,
+              "formatOptions": {
+                "min": 0,
+                "palette": "orange"
+              },
+              "numberFormat": {
+                "unit": 0,
+                "options": {
+                  "style": "decimal",
+                  "useGrouping": true,
+                  "maximumFractionDigits": 0
+                }
+              }
+            }
+          ],
+          "filter": true
+        }
+      },
+      "name": "query - 2xx-no-tokens-breakdown",
+      "type": 3,
+      "styleSettings": {
+        "showBorder": true
+      }
+    },
     {
       "content": {
         "query": "let promptRate = coalesce(todouble('{PromptTokenRate}'), 0.00025);\nlet completionRate = coalesce(todouble('{CompletionTokenRate}'), 0.002);\nlet rankedBusinessUnits = materialize(ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| summarize RankMetric = count() by ApimSubscriptionId\n| order by RankMetric desc, ApimSubscriptionId asc\n| serialize\n| extend BucketRank = row_number()\n| extend BusinessUnitBucket = substring(ApimSubscriptionId, 3));\nlet bucketedLlmLogs = materialize(ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId, PromptTokens, CompletionTokens, TotalTokens, ModelName\n| extend ModelName = replace_regex(ModelName, @'-\d{4}-\d{2}-\d{2}$', '')\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId, ApimSubscriptionId\n) on CorrelationId\n| lookup kind=leftouter rankedBusinessUnits on ApimSubscriptionId\n);\nlet grandTotalTokens = toscalar(bucketedLlmLogs | summarize sum(TotalTokens));\nbucketedLlmLogs\n| summarize\n PromptTokens = sum(PromptTokens),\n CompletionTokens = sum(CompletionTokens),\n TotalTokens = sum(TotalTokens),\n Requests = count(),\n BucketRank = min(BucketRank)\n by BusinessUnitBucket, ModelName\n| extend PromptCost = round(PromptTokens * promptRate / 1000.0, 4)\n| extend CompletionCost = round(CompletionTokens * completionRate / 1000.0, 4)\n| extend TotalCost = round(PromptCost + CompletionCost, 4)\n| extend TotalTokensPct = iif(grandTotalTokens > 0, round(TotalTokens * 100.0 / grandTotalTokens, 2), 0.0)\n| order by BucketRank asc, BusinessUnitBucket asc, ModelName asc\n| project\n ['Business Unit'] = BusinessUnitBucket,\n Model = ModelName,\n ['Prompt Tokens'] = 
PromptTokens,\n ['Completion Tokens'] = CompletionTokens,\n ['Total Tokens'] = TotalTokens,\n ['Total Tokens Pct'] = TotalTokensPct,\n Requests,\n ['Prompt Cost ($)'] = PromptCost,\n ['Completion Cost ($)'] = CompletionCost,\n ['Total Cost ($)'] = TotalCost", diff --git a/samples/costing/create.ipynb b/samples/costing/create.ipynb index 8133b56..0d52a23 100644 --- a/samples/costing/create.ipynb +++ b/samples/costing/create.ipynb @@ -151,9 +151,9 @@ "# Responses-API rows reach the workbook only via ApiManagementGatewayLlmLog\n", "# (the diagnostic-log path), not via emit-token-metric custom metrics.\n", "model_test_matrix = {\n", - " 'gpt-4o-mini': {'version': '2024-07-18', 'capacity': 10, 'requests_per_caller': 6},\n", - " 'gpt-4.1-nano': {'version': '2025-04-14', 'capacity': 10, 'requests_per_caller': 8},\n", - " 'gpt-5-mini': {'version': '2025-08-07', 'capacity': 10, 'requests_per_caller': 10},\n", + " 'gpt-4o-mini': {'version': '2024-07-18', 'capacity': 10, 'requests_per_caller': 2},\n", + " 'gpt-4.1-nano': {'version': '2025-04-14', 'capacity': 10, 'requests_per_caller': 4},\n", + " 'gpt-5-mini': {'version': '2025-08-07', 'capacity': 10, 'requests_per_caller': 6},\n", "}\n", "\n", "# Derived: list form used for Bicep deployment and iteration\n", From 6fb4bd606c2d4902a8ee9a7aae83d1775d1078ed Mon Sep 17 00:00:00 2001 From: Simon Kurtz Date: Fri, 1 May 2026 13:47:09 -0400 Subject: [PATCH 3/4] Add cost disclaimer, adjust AI tiles --- samples/costing/costing.workbook.json | 52 ++++++++------------------- 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/samples/costing/costing.workbook.json b/samples/costing/costing.workbook.json index c24ce7d..155f908 100644 --- a/samples/costing/costing.workbook.json +++ b/samples/costing/costing.workbook.json @@ -136,7 +136,7 @@ "items": [ { "content": { - "json": "# APIM Costing & Showback Workbook\n\nThis workbook turns raw API Management telemetry into **per-business-unit cost and usage views** so platform teams can chargeback, forecast, and right-size APIM and Azure OpenAI consumption. Use these **Instructions / Overview** as your reading guide - it explains what each tab is for, what data it relies on, and the nuances to be aware of when interpreting the numbers.\n\n## At a glance\n\n| # | Tab | What it answers | Primary data source |\n|---|---|---|---|\n| 1 | **Overview** | What is this workbook and how do I read it? | (this page) |\n| 2 | **Subscription-Based Costing** | How do I split APIM platform + per-request cost across business units (BUs)? | `ApiManagementGatewayLogs` |\n| 3 | **Entra ID Application Costing** | How do I split cost across Entra ID applications calling APIM? | App Insights `customMetrics` (`caller-requests`) |\n| 4 | **AI Gateway Token/PTU** | How many tokens did each BU consume through the AI Gateway, and what does that cost? | `ApiManagementGatewayLlmLog` joined with `ApiManagementGatewayLogs` |\n| 5 | **Per-Request Detail** | Show me every single AI Gateway request with tokens, model, latency, cost. | `ApiManagementGatewayLlmLog` + `ApiManagementGatewayLogs` |\n\n## Shared assumptions and conventions\n\n- **Business units** are identified by APIM **subscription IDs that start with `bu-`** (e.g. `bu-hr`, `bu-finance`). The `bu-` prefix is stripped for display.\n- **Time Range** at the top of the workbook applies to every tab. 
Each tab also keeps its own `timeContext` so server-side filtering still works.\n- **All costs are estimates.** They are computed from request and token counts and the per-unit rates you configure. They do **not** include caching discounts, Batch API discounts, PTU reservations, regional pricing variations, taxes, or EA pricing. Always validate against your official [Azure Cost Management](https://portal.azure.com/#view/Microsoft_Azure_CostManagement/Menu/~/overview) invoice.\n- **Numbers across tabs are intentionally different.** They measure different scopes:\n - *Subscription-Based* counts every `bu-*` request at the gateway (AI + non-AI, success + failure).\n - *AI Gateway* counts only successful AI calls that produced token usage.\n - *Per-Request Detail* lists every individual gateway record. The **Reconciliation table** on the AI Gateway tab shows exactly how the totals line up.\n\n## Data freshness and gotchas\n\n- **Log Analytics ingestion**: typically 1-3 minutes for gateway logs, 5-10 minutes for App Insights custom metrics. If a tab looks empty right after generating traffic, give it a few minutes and refresh.\n- **AI Gateway streaming**: Server-Sent Events (SSE) responses normally omit the final `usage` object. The `emit_metric_caller_tokens.xml` policy injects `stream_options.include_usage = true` so the last chunk carries token totals. If streaming rows show zero tokens, confirm `force_stream_include_usage = True` in the notebook.\n- **Multiple LLM log rows per request**: `ApiManagementGatewayLlmLog` emits multiple events per call (one summary + per-backend events), and only one carries `ModelName` and tokens. Queries pre-summarize on `CorrelationId` to avoid double-counting; if you copy a query out of the workbook, keep that step.\n- **Throttled (429) and failed AI calls** never show up in the AI Gateway tile counts - they have no token usage. They *do* show up in the Per-Request and Subscription-Based views.\n- **Tenant mismatch**: the Entra ID tab requires the workbook viewer to be signed in to the same tenant that owns the subscription. Otherwise tiles will be empty with an `access token issuer` warning.\n\n## Sections below\n\nExpand the collapsible sections below for usage guidance and tab-by-tab notes covering parameters, formulas, and pitfalls specific to each view." + "json": "# APIM Costing & Showback Workbook\n\n> ## ⚠️ For informational use only - **not a system of record for billing**\n>\n> All token counts, request counts, and cost figures shown across every tab of this workbook are **estimates** derived from APIM gateway telemetry and Application Insights metrics. 
They are intended to support **trend analysis, capacity planning, and showback conversations** - not to serve as the source of truth for chargeback or invoicing.\n>\n> **Telemetry can be incomplete** by the nature of how it is captured:\n> - Streaming (SSE) responses may omit the final `usage` chunk if `stream_options.include_usage` is not set, producing zero-token rows.\n> - Throttled (429), failed, and cancelled AI calls produce no token usage and are excluded from token totals.\n> - `ApiManagementGatewayLlmLog` emits multiple events per request; only one carries `ModelName` and tokens, so partial ingestion or schema drift can drop attribution.\n> - Log Analytics and Application Insights have ingestion delays (1-3 min for gateway logs, 5-10 min for custom metrics) and per-workspace daily caps that can silently drop data.\n> - Per-unit rates are configured manually on each tab and do not reflect caching discounts, Batch API discounts, PTU reservations, regional pricing, taxes, or EA / MCA negotiated pricing.\n>\n> **If these numbers will drive a real chargeback or invoice, always cross-reference them against your authoritative Azure billing data** in [Azure Cost Management](https://portal.azure.com/#view/Microsoft_Azure_CostManagement/Menu/~/overview) and the Azure OpenAI / Azure AI Foundry usage exports for the same time range. Reconcile any material variances before billing a business unit.\n\nThis workbook turns raw API Management telemetry into **per-business-unit cost and usage views** so platform teams can chargeback, forecast, and right-size APIM and Azure OpenAI consumption. Use these **Instructions / Overview** as your reading guide - it explains what each tab is for, what data it relies on, and the nuances to be aware of when interpreting the numbers.\n\n## At a glance\n\n| # | Tab | What it answers | Primary data source |\n|---|---|---|---|\n| 1 | **Overview** | What is this workbook and how do I read it? | (this page) |\n| 2 | **Subscription-Based Costing** | How do I split APIM platform + per-request cost across business units (BUs)? | `ApiManagementGatewayLogs` |\n| 3 | **Entra ID Application Costing** | How do I split cost across Entra ID applications calling APIM? | App Insights `customMetrics` (`caller-requests`) |\n| 4 | **AI Gateway Token/PTU** | How many tokens did each BU consume through the AI Gateway, and what does that cost? | `ApiManagementGatewayLlmLog` joined with `ApiManagementGatewayLogs` |\n| 5 | **Per-Request Detail** | Show me every single AI Gateway request with tokens, model, latency, cost. | `ApiManagementGatewayLlmLog` + `ApiManagementGatewayLogs` |\n\n## Shared assumptions and conventions\n\n- **Business units** are identified by APIM **subscription IDs that start with `bu-`** (e.g. `bu-hr`, `bu-finance`). The `bu-` prefix is stripped for display.\n- **Time Range** at the top of the workbook applies to every tab. Each tab also keeps its own `timeContext` so server-side filtering still works.\n- **All costs are estimates.** They are computed from request and token counts and the per-unit rates you configure. They do **not** include caching discounts, Batch API discounts, PTU reservations, regional pricing variations, taxes, or EA pricing. 
Always validate against your official [Azure Cost Management](https://portal.azure.com/#view/Microsoft_Azure_CostManagement/Menu/~/overview) invoice.\n- **Numbers across tabs are intentionally different.** They measure different scopes:\n - *Subscription-Based* counts every `bu-*` request at the gateway (AI + non-AI, success + failure).\n - *AI Gateway* counts only successful AI calls that produced token usage.\n - *Per-Request Detail* lists every individual gateway record. The **Reconciliation table** on the AI Gateway tab shows exactly how the totals line up.\n\n## Data freshness and gotchas\n\n- **Log Analytics ingestion**: typically 1-3 minutes for gateway logs, 5-10 minutes for App Insights custom metrics. If a tab looks empty right after generating traffic, give it a few minutes and refresh.\n- **AI Gateway streaming**: Server-Sent Events (SSE) responses normally omit the final `usage` object. The `emit_metric_caller_tokens.xml` policy injects `stream_options.include_usage = true` so the last chunk carries token totals. If streaming rows show zero tokens, confirm `force_stream_include_usage = True` in the notebook.\n- **Multiple LLM log rows per request**: `ApiManagementGatewayLlmLog` emits multiple events per call (one summary + per-backend events), and only one carries `ModelName` and tokens. Queries pre-summarize on `CorrelationId` to avoid double-counting; if you copy a query out of the workbook, keep that step.\n- **Throttled (429) and failed AI calls** never show up in the AI Gateway tile counts - they have no token usage. They *do* show up in the Per-Request and Subscription-Based views.\n- **Tenant mismatch**: the Entra ID tab requires the workbook viewer to be signed in to the same tenant that owns the subscription. Otherwise tiles will be empty with an `access token issuer` warning.\n\n## Sections below\n\nExpand the collapsible sections below for usage guidance and tab-by-tab notes covering parameters, formulas, and pitfalls specific to each view." }, "name": "text - instructions-overview", "type": 1 @@ -212,7 +212,7 @@ "items": [ { "content": { - "json": "When APIM is used as an **AI Gateway**, the built-in model-level metrics (PTU utilization, token counts) do not break down by caller. The **AI Gateway Token/PTU** tab fills that gap using `ApiManagementGatewayLlmLog` diagnostic data, joined with `ApiManagementGatewayLogs` to **attribute token consumption to APIM subscriptions (business units)**.\n\n> Requires the `GatewayLlmLogs` diagnostic category to be enabled. For per-model pricing and what these cost estimates do **not** include (PTU, Batch, cached-input, taxes, EA pricing), see the **AI Gateway - Parameters & Pricing** section below.\n\n#### How an AI request flows through APIM\n\n1. **Inbound** - APIM accepts the HTTPS request and matches it to an API + operation. The `Ocp-Apim-Subscription-Key` header maps the call to a `bu-*` product subscription, which is the dimension every query in this workbook attributes cost to.\n2. **Inbound policy chain** - Sample policies enforce per-subscription rate limits and emit telemetry. Requests that exceed the configured limit are rejected with **HTTP 429** at this stage; they never reach the AI backend, so no `ApiManagementGatewayLlmLog` row is written and no tokens are consumed.\n3. **Backend dispatch** - APIM forwards the surviving request to the configured AI backend (Azure OpenAI / AI Foundry). For load-balanced backends, APIM picks a pool member and can fail over on transient errors.\n4. 
**LLM diagnostic capture** - On a successful (`2xx`) response, APIM writes one row to `ApiManagementGatewayLlmLog` with `PromptTokens`, `CompletionTokens`, `TotalTokens`, `ModelName`, and `IsStreamCompletion`. Streaming responses only carry token counts when `stream_options.include_usage = true` reaches the backend. In this sample, some streaming requests intentionally omit that flag so the policy fragment can add it; when it does, APIM also writes proof into `ApiManagementGatewayLogs.TraceRecords`. Backend `5xx` responses produce no LLM-log row.\n5. **Outbound** - APIM returns the response to the client and writes one row to `ApiManagementGatewayLogs` regardless of outcome. That gateway-log row is what the **AI Requests per BU** tile counts.\n\n#### Reading the AI Gateway tiles\n\nThe **APIM Inbound** row tiles form a **monotonically non-increasing funnel** down to the **AI Backend** row, where the *Successful* tiles continue the funnel and the error tiles surface the gaps:\n\n```\nTotal APIM Requests >= AI Requests (all subs) >= AI Requests per BU\n >= Successful (all 2xx) >= Successful (2xx, with tokens)\n```\n\n**Why two tiles can be equal:** `AI Requests (all subs)` and `AI Requests per BU` differ only on the subscription filter. They are equal whenever every AI call was made with a `bu-*` subscription key, which is the default for this sample. They diverge in production environments that mix BU subscriptions with starter/unlimited products or master-key calls.\n\n**AI Backend - row 1 (Successful):**\n\n- **Successful (all 2xx)** = every `2xx` response from a `bu-*` subscription on an AI API. This is the total of token-bearing and no-token successes.\n- **Successful (2xx, with tokens)** = the **token-bearing subset** - rows in `ApiManagementGatewayLlmLog` with `TotalTokens > 0`, `CompletionTokens > 0`, and a non-empty `ModelName`. **Total Tokens Used** sums tokens for this same subset. This is the count that drives showback.\n- **Successful (no tokens)** = `Successful (all 2xx)` - `Successful (2xx, with tokens)`. These are real, billable AI calls that returned `2xx` but landed without measurable token data - typically because a streaming response lost its final `usage` chunk, the backend returned an empty completion or content-safety refusal, or the LLM-log row carrying `ModelName` was dropped. Drill in below via the **Token Coverage Investigation** section.\n\n**AI Backend - row 2 (Errors):** these tiles sit *outside* the success funnel:\n\n- **Throttled (429)** = `AI Requests per BU` minus all non-429 outcomes. Rejected by APIM rate-limit policy before reaching the backend; no tokens consumed.\n- **Client Errors (4xx)** counts non-429 requests rejected with status codes such as 400 (bad prompt), 401/403 (auth or content-safety failure), 404 (wrong deployment/model), 408/413/422 (timeout, payload, validation). Most never reach the AI backend.\n- **Server Errors (5xx)** = backend or APIM 5xx responses. The number of 2xx requests *without* matching token data is surfaced in the **Token Coverage Investigation** section directly below the AI Gateway tiles, with a drill-in by API, operation, and likely cause. In production, you should expect this gap to be small but non-zero - track it as a quality KPI for showback accuracy.\n\n**Note on 3xx:** AI chat-completion endpoints are POSTs and do not produce redirect responses in normal operation, so there is no dedicated 3xx tile. 
The **HTTP Response Code Distribution** chart on the *Usage Analytics* tab will surface any 3xx responses if they ever occur.\n\n#### Why AI APIs are listed manually\n\nAPIM diagnostic logs only carry identifiers like `ApiId`, `OperationId`, and `BackendId` - they do **not** include APIM tags. Azure Resource Graph also does not index APIM API tag associations, so the workbook cannot auto-discover AI APIs from inside Log Analytics. The **AI API names** parameter on the AI Gateway tab is therefore a plain comma-separated list. See the parameter description for an `az rest` snippet that lists APIs by tag.\n\n#### Streaming, model breakdown, PTU\n\n- **Streaming vs Non-Streaming** - confirms APIM captures token counts for **both** delivery modes. The streaming visuals split requests into `Streaming (client-supplied usage)` and `Streaming (policy-injected usage)` using `ApiManagementGatewayLogs.TraceRecords`, so you can prove when APIM altered the request before forwarding it.\n- **Model breakdown** - useful when multiple models (gpt-5-mini, gpt-4o-mini, etc.) are served through the same APIM gateway.\n- **PTU utilization** - the dashed line on the trend chart represents the configurable PTU capacity threshold. If a business unit's hourly token rate approaches the PTU capacity, consider provisioning more throughput or applying rate limiting." + "json": "When APIM is used as an **AI Gateway**, the built-in model-level metrics (PTU utilization, token counts) do not break down by caller. The **AI Gateway Token/PTU** tab fills that gap using `ApiManagementGatewayLlmLog` diagnostic data, joined with `ApiManagementGatewayLogs` to **attribute token consumption to APIM subscriptions (business units)**.\n\n> Requires the `GatewayLlmLogs` diagnostic category to be enabled. For per-model pricing and what these cost estimates do **not** include (PTU, Batch, cached-input, taxes, EA pricing), see the **AI Gateway - Parameters & Pricing** section below.\n\n#### How an AI request flows through APIM\n\n1. **Inbound** - APIM accepts the HTTPS request and matches it to an API + operation. The `Ocp-Apim-Subscription-Key` header maps the call to a `bu-*` product subscription, which is the dimension every query in this workbook attributes cost to.\n2. **Inbound policy chain** - Sample policies enforce per-subscription rate limits and emit telemetry. Requests that exceed the configured limit are rejected with **HTTP 429** at this stage; they never reach the AI backend, so no `ApiManagementGatewayLlmLog` row is written and no tokens are consumed.\n3. **Backend dispatch** - APIM forwards the surviving request to the configured AI backend (Azure OpenAI / AI Foundry). For load-balanced backends, APIM picks a pool member and can fail over on transient errors.\n4. **LLM diagnostic capture** - On a successful (`2xx`) response, APIM writes one row to `ApiManagementGatewayLlmLog` with `PromptTokens`, `CompletionTokens`, `TotalTokens`, `ModelName`, and `IsStreamCompletion`. Streaming responses only carry token counts when `stream_options.include_usage = true` reaches the backend. In this sample, some streaming requests intentionally omit that flag so the policy fragment can add it; when it does, APIM also writes proof into `ApiManagementGatewayLogs.TraceRecords`. Backend `5xx` responses produce no LLM-log row.\n5. **Outbound** - APIM returns the response to the client and writes one row to `ApiManagementGatewayLogs` regardless of outcome. 
That gateway-log row is what the **AI Requests per BU** tile counts.\n\n#### Reading the AI Gateway tiles\n\nThe **APIM Inbound** row tiles form a **monotonically non-increasing funnel** down to the **AI Backend** row, where the *Successful* tiles continue the funnel and the error tiles surface the gaps:\n\n```\nTotal APIM Requests >= AI Requests (all subs) >= AI Requests per BU\n >= Successful (all 2xx) >= Successful (2xx, with tokens)\n```\n\n**Why two tiles can be equal:** `AI Requests (all subs)` and `AI Requests per BU` differ only on the subscription filter. They are equal whenever every AI call was made with a `bu-*` subscription key, which is the default for this sample. They diverge in production environments that mix BU subscriptions with starter/unlimited products or master-key calls.\n\n**AI Backend - row 1 (Successful):**\n\n- **Successful (all 2xx)** = every `2xx` response from a `bu-*` subscription on an AI API. This is the total of token-bearing and no-token successes.\n- **Successful (2xx, with tokens)** = the **token-bearing subset** - rows in `ApiManagementGatewayLlmLog` with `TotalTokens > 0`, `CompletionTokens > 0`, and a non-empty `ModelName`. **Total Tokens Used** sums tokens for this same subset. This is the count that drives showback.\n- **Successful (no tokens)** = `Successful (all 2xx)` - `Successful (2xx, with tokens)`. These are real, billable AI calls that returned `2xx` but produced no measurable token data - typically because a streaming response lost its final `usage` chunk, the backend returned an empty completion or content-safety refusal, or the LLM-log row carrying `ModelName` was dropped. Drill in below via the **Token Coverage Investigation** section.\n\n**AI Backend - row 2 (Errors):** these tiles sit *outside* the success funnel:\n\n- **Client Errors (4xx)** counts **all** 4xx responses, with status codes such as 400 (bad prompt), 401/403 (auth or content-safety failure), 404 (wrong deployment/model), 408/413/422 (timeout, payload, validation), and 429 (throttled). Most never reach the AI backend. The **Throttled (429)** tile breaks out the 429 subset for visibility.\n- **Throttled (429)** = the 429 subset of `Client Errors (4xx)`. Rejected by APIM rate-limit policy before reaching the backend; no tokens consumed.\n- **Server Errors (5xx)** = backend or APIM 5xx responses; no `ApiManagementGatewayLlmLog` row is written for these, so no tokens are attributed.\n\n**Token coverage gap:** the number of `2xx` requests *without* matching token data is surfaced in the **Token Coverage Investigation** section directly below the AI Gateway tiles, with a drill-in by API, operation, and likely cause. In production, expect this gap to be small but non-zero - track it as a quality KPI for showback accuracy.\n\n**Note on 3xx:** AI Chat Completions and Responses endpoints are POSTs and do not produce redirect responses in normal operation, so there is no dedicated 3xx tile. The **HTTP Response Code Distribution** chart on the *Usage Analytics* tab will surface any 3xx responses if they ever occur.\n\n#### Why AI APIs are listed manually\n\nAPIM diagnostic logs only carry identifiers like `ApiId`, `OperationId`, and `BackendId` - they do **not** include APIM tags. Azure Resource Graph also does not index APIM API tag associations, so the workbook cannot auto-discover AI APIs from inside Log Analytics. The **AI API names** parameter on the AI Gateway tab is therefore a plain comma-separated list. 
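You can still look the list up by tag outside the workbook - a minimal sketch, assuming the AI APIs carry an `ai` tag and that the `{placeholders}` and `api-version` are adjusted to your environment:\n\n```\n# hypothetical tag lookup - replace the {placeholders} with your subscription, resource group, and APIM service name\naz rest --method get --url 'https://management.azure.com/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/Microsoft.ApiManagement/service/{serviceName}/apis?tags=ai&api-version=2022-08-01' --query 'value[].name' --output tsv\n```\n\n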
See the parameter description for an `az rest` snippet that lists APIs by tag.\n\n#### Streaming, model breakdown, PTU\n\n- **Streaming vs Non-Streaming** - confirms APIM captures token counts for **both** delivery modes. The streaming visuals split requests into `Streaming (client-supplied usage)` and `Streaming (policy-injected usage)` using `ApiManagementGatewayLogs.TraceRecords`, so you can prove when APIM altered the request before forwarding it.\n- **Model breakdown** - useful when multiple models (gpt-5-mini, gpt-4o-mini, etc.) are served through the same APIM gateway.\n- **PTU utilization** - the dashed line on the trend chart represents the configurable PTU capacity threshold. If a business unit's hourly token rate approaches the PTU capacity, consider provisioning more throughput or applying rate limiting." }, "name": "text - instructions-aigateway-body", "type": 1 @@ -1019,7 +1019,7 @@ }, { "content": { - "json": "Token & PTU consumption per business unit, joining LLM diagnostic logs with APIM gateway logs. See the **Overview** tab for the full cost model and per-model pricing." + "json": "Token & PTU consumption per business unit, joining LLM diagnostic logs with APIM gateway logs. See the **Overview** tab for the full cost model and per-model pricing.\n\n> ⚠️ **Informational only - not a billing system of record.** Token counts can be incomplete (missing `usage` on streaming responses, throttled / failed calls excluded, ingestion delays, partial LLM log events). **Always cross-reference against Azure Cost Management and the Azure OpenAI / Azure AI Foundry usage exports before using these figures for chargeback.**" }, "name": "text - header-aigateway", "type": 1 @@ -1241,7 +1241,7 @@ }, { "content": { - "query": "ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)', Status = 'billable'\n| project Label, RequestCount, Status", + "query": "let total2xx = toscalar(\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | where ResponseCode between (200 .. 
299)\n | summarize count()\n);\nApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)', Status = 'billable'\n| extend Display = strcat(tostring(RequestCount), ' / ', tostring(round(todouble(RequestCount) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, Display, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1257,19 +1257,8 @@ "formatter": 1 }, "leftContent": { - "columnMatch": "RequestCount", - "formatter": 12, - "formatOptions": { - "min": 0, - "palette": "blue" - }, - "numberFormat": { - "unit": 17, - "options": { - "style": "decimal", - "useGrouping": true - } - } + "columnMatch": "Display", + "formatter": 1 }, "subtitleContent": { "columnMatch": "Status", @@ -1288,7 +1277,7 @@ }, { "content": { - "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 299)\n| project CorrelationId;\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = 'Successful (no tokens)', Status = '⚠ investigate'\n| project Label, NoTokenRequests, Status", + "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 
299)\n| project CorrelationId;\nlet total2xx = toscalar(twoXx | summarize count());\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = 'Successful (no tokens)', Status = '⚠ investigate'\n| extend Display = strcat(tostring(NoTokenRequests), ' / ', tostring(round(todouble(NoTokenRequests) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, Display, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1304,19 +1293,8 @@ "formatter": 1 }, "leftContent": { - "columnMatch": "NoTokenRequests", - "formatter": 12, - "formatOptions": { - "min": 0, - "palette": "yellow" - }, - "numberFormat": { - "unit": 17, - "options": { - "style": "decimal", - "useGrouping": true - } - } + "columnMatch": "Display", + "formatter": 1 }, "subtitleContent": { "columnMatch": "Status", @@ -1343,14 +1321,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = 'rate limited'\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 400 and ResponseCode < 500\n| summarize Requests = count()\n| extend Label = 'Client Errors (4xx)', Status = 'all 4xx incl. 
429'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No throttled requests in the selected time range.", + "noDataMessage": "No 4xx client errors in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1380,7 +1358,7 @@ "showBorder": true } }, - "name": "query - ai-throttled-tile", + "name": "query - ai-client-errors-tile", "type": 3, "customWidth": "33", "styleSettings": { @@ -1390,14 +1368,14 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode >= 400 and ResponseCode < 500 and ResponseCode != 429\n| summarize Requests = count()\n| extend Label = 'Client Errors (4xx)', Status = 'non-429 rejects'\n| project Label, Requests, Status", + "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = 'rate limited'\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, "timeContext": { "durationMs": 2592000000 }, - "noDataMessage": "No non-429 client errors in the selected time range.", + "noDataMessage": "No throttled requests in the selected time range.", "version": "KqlItem/1.0", "visualization": "tiles", "tileSettings": { @@ -1427,7 +1405,7 @@ "showBorder": true } }, - "name": "query - ai-client-errors-tile", + "name": "query - ai-throttled-tile", "type": 3, "customWidth": "33", "styleSettings": { From 479846a91106b55aee4213096936e037b7761e01 Mon Sep 17 00:00:00 2001 From: Simon Kurtz Date: Fri, 1 May 2026 14:43:54 -0400 Subject: [PATCH 4/4] Tile refinements --- samples/costing/costing.workbook.json | 38 +++++++++++++++++++++------ 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/samples/costing/costing.workbook.json b/samples/costing/costing.workbook.json index 155f908..4d7f6ee 100644 --- a/samples/costing/costing.workbook.json +++ b/samples/costing/costing.workbook.json @@ -1241,7 +1241,7 @@ }, { "content": { - "query": "let total2xx = toscalar(\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | where ResponseCode between (200 .. 299)\n | summarize count()\n);\nApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)', Status = 'billable'\n| extend Display = strcat(tostring(RequestCount), ' / ', tostring(round(todouble(RequestCount) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, Display, Status", + "query": "let total2xx = toscalar(\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | where ResponseCode between (200 .. 
299)\n | summarize count()\n);\nApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| project CorrelationId\n| join kind=inner (\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | project CorrelationId\n) on CorrelationId\n| summarize RequestCount = dcount(CorrelationId)\n| extend Label = 'Successful (2xx, with tokens)'\n| extend Status = strcat('billable - ', tostring(round(todouble(RequestCount) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, RequestCount, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1257,8 +1257,19 @@ "formatter": 1 }, "leftContent": { - "columnMatch": "Display", - "formatter": 1 + "columnMatch": "RequestCount", + "formatter": 12, + "formatOptions": { + "min": 0, + "palette": "blue" + }, + "numberFormat": { + "unit": 17, + "options": { + "style": "decimal", + "useGrouping": true + } + } }, "subtitleContent": { "columnMatch": "Status", @@ -1277,7 +1288,7 @@ }, { "content": { - "query": "let aiApis = split('{AiApiIds}', ',');\nlet twoXx = ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (aiApis)\n| where ResponseCode between (200 .. 
299)\n| project CorrelationId;\nlet total2xx = toscalar(twoXx | summarize count());\nlet withTokens = ApiManagementGatewayLlmLog\n| where TimeGenerated {TimeRange}\n| where TotalTokens > 0 and CompletionTokens > 0 and isnotempty(ModelName)\n| distinct CorrelationId;\ntwoXx\n| join kind=leftouter withTokens on CorrelationId\n| where isempty(CorrelationId1)\n| summarize NoTokenRequests = dcount(CorrelationId)\n| extend Label = 'Successful (no tokens)'\n| extend Status = strcat('⚠ investigate - ', tostring(round(todouble(NoTokenRequests) * 100.0 / iif(total2xx == 0, 1, total2xx), 1)), '%')\n| project Label, NoTokenRequests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1293,8 +1304,19 @@ "formatter": 1 }, "leftContent": { - "columnMatch": "Display", - "formatter": 1 + "columnMatch": "NoTokenRequests", + "formatter": 12, + "formatOptions": { + "min": 0, + "palette": "yellow" + }, + "numberFormat": { + "unit": 17, + "options": { + "style": "decimal", + "useGrouping": true + } + } }, "subtitleContent": { "columnMatch": "Status", @@ -1368,7 +1390,7 @@ }, { "content": { - "query": "ApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)', Status = 'rate limited'\n| project Label, Requests, Status", + "query": "let total4xx = toscalar(\n ApiManagementGatewayLogs\n | where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n | where ApiId in~ (split('{AiApiIds}', ','))\n | where ResponseCode >= 400 and ResponseCode < 500\n | summarize count()\n);\nApiManagementGatewayLogs\n| where TimeGenerated {TimeRange} and ApimSubscriptionId startswith 'bu-'\n| where ApiId in~ (split('{AiApiIds}', ','))\n| where ResponseCode == 429\n| summarize Requests = count()\n| extend Label = 'Throttled (429)'\n| extend Status = strcat('rate limited - ', tostring(round(todouble(Requests) * 100.0 / iif(total4xx == 0, 1, total4xx), 1)), '%')\n| project Label, Requests, Status", "queryType": 0, "resourceType": "microsoft.operationalinsights/workspaces", "size": 4, @@ -1603,7 +1625,7 @@ "formatter": 12, "formatOptions": { "min": 0, - "palette": "orange" + "palette": "yellow" }, "numberFormat": { "unit": 17,