From 5411a9ab731608c924b82ac0cd5143c1454831f9 Mon Sep 17 00:00:00 2001 From: Vitor Hugo Date: Tue, 30 Jun 2026 22:55:27 -0300 Subject: [PATCH 1/5] feat(observability): ESL channel lifecycle traces + sniffer correlation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add OpenTelemetry spans for the semantic FreeSWITCH channel lifecycle and CUSTOM subclasses so Genesis traces can be correlated with the passive sniffer (Otoru/sniffer) at the observability backend. Correlation is attribute-based: every freeswitch.channel.* span carries sip.call_id (= variable_sip_call_id), matching the sniffer's voip.call_id. The join happens in Grafana/Tempo, not in code — no sniffer changes required. Bridge spans carry bridge.a_uuid/bridge.b_uuid for cross-leg grouping. W3C traceparent propagation is intentionally out of scope. Centralize all metric instruments in genesis/protocol/metrics.py (removes duplicate instrument definitions); add calls.active, channel.bridge.events, channel.transfers, channel.codec.changes, dialplan.applications, hangup.causes.q850, event.processing.duration, events.without_sip_call_id, session.commands, consumer.handlers, loadbalancer.selections/errors, and observable gauges for queue depth. New processors (channel_lifecycle_processor, custom_subclass_processor) emit freeswitch.channel.{create,progress,progress_media,answer,bridge,unbridge, hangup,hangup_complete,destroy,execute,execute_complete,codec} and freeswitch.{sofia.transfer,sofia.register,callcenter.info,conference.*, valet.info} spans. Instrument session.sendmsg/await_complete, consumer.start/stop, queue.wait_and_acquire, and load balancer selection. Cardinality rule: UUIDs on spans only; metric attributes use low-cardinality enums. All tests pass on 3.10/3.11/3.12 (tox); black + mypy clean. 
--- .gitignore | 3 + AGENTS.md | 62 + docs/content/docs/Observability/metrics.md | 28 + docs/content/docs/Observability/tracing.md | 48 + docs/esl-sniffer-traces-mapping.json | 1874 ++++++++++++++++++++ docs/esl-sniffer-traces-plan.md | 456 +++++ genesis/channel.py | 102 +- genesis/consumer.py | 35 +- genesis/group/ring.py | 56 +- genesis/inbound.py | 39 +- genesis/outbound.py | 43 +- genesis/protocol/base.py | 60 +- genesis/protocol/lifecycle.py | 470 +++++ genesis/protocol/metrics.py | 221 ++- genesis/protocol/processors.py | 13 +- genesis/protocol/routing/dispatcher.py | 9 + genesis/protocol/telemetry.py | 28 + genesis/queue/backends.py | 10 + genesis/queue/core.py | 22 +- genesis/session.py | 81 +- tests/payloads.py | 198 +++ tests/test_channel_lifecycle.py | 186 ++ tests/test_consumer_tracing.py | 67 + tests/test_inbound.py | 4 +- tests/test_session_tracing.py | 124 ++ 25 files changed, 4088 insertions(+), 151 deletions(-) create mode 100644 docs/esl-sniffer-traces-mapping.json create mode 100644 docs/esl-sniffer-traces-plan.md create mode 100644 genesis/protocol/lifecycle.py create mode 100644 tests/test_channel_lifecycle.py create mode 100644 tests/test_consumer_tracing.py create mode 100644 tests/test_session_tracing.py diff --git a/.gitignore b/.gitignore index a347481..536c06a 100644 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,6 @@ docker/freeswitch/conf/**/.fsxml docker/freeswitch/conf/**/*.fsxml docker/freeswitch-test/config/logs/ docker/freeswitch-test/config/recordings/freeswitch/ + +# Local FreeSWITCH source checkout (research reference, not part of the project) +/freeswitch/ diff --git a/AGENTS.md b/AGENTS.md index 0963668..cbf9892 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -287,6 +287,68 @@ async def my_feature(): raise ``` +### Centralized metrics + +All OTel metric instruments live in `genesis/protocol/metrics.py`. 
**Do not +re-declare an instrument that already exists there** — duplicate instrument +creation for the same metric name trips static analysis and produces OTel SDK +warnings. Import the instrument (and the `safe_add` / `safe_record` helpers) +from `genesis.protocol.metrics` instead: + +```python +from genesis.protocol.metrics import ( + calls_active_counter, + safe_add, + safe_record, + event_processing_duration, +) +``` + +`safe_add(counter, *args, **kwargs)` and `safe_record(histogram, *args, **kwargs)` +swallow OTel/no-provider errors so a missing exporter never crashes the protocol. + +### ESL channel lifecycle spans + +`genesis/protocol/lifecycle.py` registers two event processors +(`channel_lifecycle_processor`, `custom_subclass_processor`) that emit +`freeswitch.channel.*` and `freeswitch.sofia.*` / `freeswitch.callcenter.*` / +`freeswitch.conference.*` / `freeswitch.valet.*` spans for the semantic +FreeSWITCH channel lifecycle. They run after the core protocol processors +(auth, command reply, disconnect) and only enrich telemetry — they never +consume events that route to user handlers. They are on by default; opt out +with `GENESIS_TRACE_ESL_LIFECYCLE=0` / `GENESIS_TRACE_CUSTOM_SUBCLASSES=0`. + +Emitted spans (non-exhaustive): `freeswitch.channel.create`, `.progress`, +`.progress_media`, `.answer`, `.bridge`, `.unbridge`, `.hangup`, +`.hangup_complete`, `.destroy`, `.execute`, `.execute_complete`, `.codec`, +`freeswitch.call.update`, `freeswitch.sofia.transfer`, `freeswitch.sofia.register`, +`freeswitch.callcenter.info`, `freeswitch.conference.maintenance`, +`freeswitch.conference.cdr`, `freeswitch.valet.info`. + +### Sniffer correlation (sip.call_id join) + +Correlation with the passive sniffer (Otoru/sniffer) is **attribute-based and +happens at the observability backend (Grafana/Tempo), not in code**. Every +channel lifecycle span carries `sip.call_id` (= the ESL `variable_sip_call_id` +header), which matches the sniffer's `voip.call_id`. 
The two traces are joined +in Grafana/Tempo by filtering/grouping on that attribute. + +Cross-leg grouping: bridge spans carry `bridge.a_uuid` and `bridge.b_uuid` +(from `Bridge-A-Unique-ID` / `Bridge-B-Unique-ID`), so the a-leg and b-leg of a +call can be tied together at the backend. + +The `genesis.events.without_sip_call_id` counter tracks channel events that +lack the correlation key (a correlation-gap signal). W3C `traceparent` / +`X-Tracespan` propagation to the sniffer is intentionally **out of scope**. + +### Cardinality rule + +UUIDs go **on spans only**, never as metric attributes. Metric attributes use +low-cardinality enums/labels (`channel.state`, `direction`, `hangup.cause`, +`application.name`, `bridge.result`, `transfer.type`, `loadbalancer.backend`, +...). `queue.depth` is a span attribute (not a metric label) for the same +reason. + ## Pre-PR Checklist **CRITICAL: Always run the full CI stack locally before opening a PR.** diff --git a/docs/content/docs/Observability/metrics.md b/docs/content/docs/Observability/metrics.md index 0b1edf4..995b5f1 100644 --- a/docs/content/docs/Observability/metrics.md +++ b/docs/content/docs/Observability/metrics.md @@ -90,3 +90,31 @@ For programmatic access to load counts per destination, use the load balancer's - **`genesis_timeouts_total`** (Counter) - Description: Number of timeouts - Attributes: `timeout.type` (wait, command, connection), `timeout.operation`, `timeout.duration` + +**ESL Lifecycle Metrics (sniffer correlation):** + +These metrics are emitted by the lifecycle/CUSTOM processors. Cardinality rule: +attributes carry low-cardinality enums only — UUIDs go on spans, never as +metric labels. + +- **`genesis.calls.active`** (UpDownCounter) — Active calls by state and direction; +1 on `CHANNEL_CREATE`, -1 on `CHANNEL_DESTROY`. Attributes: `channel.state`, `direction` +- **`genesis.channel.bridge.events`** (Counter) — Authoritative bridge state from `CHANNEL_BRIDGE`/`UNBRIDGE`. 
Attributes: `bridge.result` (`established`/`unbridged`), `hangup.cause` +- **`genesis.channel.transfers`** (Counter) — Transfers via `sofia::transferor`/`transferee`. Attributes: `transfer.type` (`blind`/`attended`), `transfer.role` +- **`genesis.channel.codec.changes`** (Counter) — Codec renegotiations from `CODEC` events. Attributes: `channel.read_codec`, `channel.write_codec` +- **`genesis.dialplan.applications`** (Counter) — Dialplan apps from `CHANNEL_EXECUTE`/`_COMPLETE`. Attributes: `application.name`, `application.result` (`started`/`success`/`fail`) +- **`genesis.channel.hangup.causes.q850`** (Counter) — Hangup causes by Q.850 code. Attributes: `hangup.cause.q850` +- **`genesis.event.processing.duration`** (Histogram) — Duration of event dispatch (processors + routing). Attributes: `event.name` +- **`genesis.events.without_sip_call_id`** (Counter) — Channel events lacking `variable_sip_call_id` (a correlation-gap signal vs the passive sniffer). Attributes: (none) + +**Session / Consumer / Load Balancer / Queue Metrics:** +- **`genesis.session.commands`** (Counter) — Session `sendmsg` commands. Attributes: `application.name` +- **`genesis.session.command.duration`** (Histogram) — Duration of session `sendmsg` commands. Attributes: `application.name` +- **`genesis.consumer.handlers`** (Counter) — Consumer handler invocations. Attributes: `event.name` +- **`genesis.loadbalancer.selections`** (Counter) — Load balancer selections. Attributes: `loadbalancer.backend`, `loadbalancer.result` (`selected`/`fallback`) +- **`genesis.loadbalancer.errors`** (Counter) — Load balancer errors. Attributes: `loadbalancer.backend`, `error` +- **`genesis.commands.queue.depth`** (ObservableGauge) — Depth of the pending command-reply queue. Attributes: (none) +- **`genesis.events.queue.depth`** (ObservableGauge) — Depth of the pending event queue. Attributes: (none) + +> All metric instruments are centralized in `genesis/protocol/metrics.py`. 
+> Import them from there rather than re-declaring, and use the `safe_add` / +> `safe_record` helpers so a missing exporter never crashes the protocol. diff --git a/docs/content/docs/Observability/tracing.md b/docs/content/docs/Observability/tracing.md index 1f94659..1a874c3 100644 --- a/docs/content/docs/Observability/tracing.md +++ b/docs/content/docs/Observability/tracing.md @@ -73,6 +73,54 @@ Genesis automatically creates spans for the following operations: - Description: Ringing a group of destinations - Attributes: `ring_group.mode`, `ring_group.size`, `ring_group.timeout`, `ring_group.has_balancer`, `ring_group.has_variables`, `ring_group.balanced`, `ring_group.result`, `ring_group.duration`, `ring_group.answered_uuid`, `ring_group.answered_dial_path`, `ring_group.error` (if error) +**ESL Channel Lifecycle Spans (`freeswitch.channel.*`):** +- Emitted by the `channel_lifecycle_processor` for the semantic FreeSWITCH channel lifecycle. They carry the channel UUIDs and the sniffer correlation key on the span (see [Sniffer correlation](#sniffer-correlation-sipcall_id-join)). 
+- **`freeswitch.channel.create`** — `channel.uuid`, `channel.call_uuid`, `channel.direction`, `sip.call_id`, `channel.destination_number`, `channel.context` +- **`freeswitch.channel.progress`** / **`.progress_media`** — `channel.state`, `answer.state`, codec names +- **`freeswitch.channel.answer`** — `channel.state`, `answer.state`, codec names +- **`freeswitch.channel.bridge`** — `bridge.a_uuid`, `bridge.b_uuid`, `other_leg.*`, span event `bridge.established` +- **`freeswitch.channel.unbridge`** — `bridge.a_uuid`, `bridge.b_uuid`, `hangup.cause`, span event `bridge.torn_down` +- **`freeswitch.channel.hangup`** — `hangup.cause`, `channel.state`, span event `hangup.cause.<cause>` +- **`freeswitch.channel.hangup_complete`** — `hangup.cause`, `hangup.cause.q850`, span event `call.finalized` +- **`freeswitch.channel.destroy`** — `channel.uuid`, `sip.call_id` +- **`freeswitch.channel.execute`** / **`.execute_complete`** — `application.name`, `application.uuid`, `application.data`/`application.response`, span event `app.<name>.done` +- **`freeswitch.channel.codec`** — `channel.read_codec.*`, `channel.write_codec.*` +- **`freeswitch.call.update`** — `bridged.to`, `caller.transfer_source`, span event `caller_id.mutated` + +**CUSTOM Subclass Spans:** +- Emitted by the `custom_subclass_processor` for `CUSTOM` events. 
+- **`freeswitch.sofia.transfer`** — `transfer.role` (`transferor`/`transferee`), `transfer.type` (`blind`/`attended`), span event `transfer.initiated` +- **`freeswitch.sofia.register`** / **`.reinvite`** / **`.replaced`** — `register.aor`, `register.action`, `gateway.name`/`gateway.state`, `sofia.profile` +- **`freeswitch.callcenter.info`** — `cc.queue`, `cc.action`, `cc.agent`, `cc.member_uuid`, `cc.count`, `cc.selection` +- **`freeswitch.conference.maintenance`** / **`.cdr`** — `conference.name`, `conference.profile`, `conference.action`, `conference.member_id` +- **`freeswitch.valet.info`** — `valet.lot`, `valet.extension`, `valet.action`, `bridge.to_uuid` + +**Session / Consumer / Queue Spans:** +- **`session.sendmsg`** (`Session` module) — `channel.uuid`, `application.name`, `application.uuid`, `application.block` +- **`session.await_complete`** (`Session` module) — child span of `session.sendmsg` when `block=True`; `channel.uuid`, `application.uuid` +- **`consumer.start`** / **`consumer.stop`** (`Consumer` module) — `consumer.host`, `consumer.port` +- **`queue.wait_and_acquire`** (`Queue` module) — `queue.id`, `queue.item_id`, `queue.depth` (span attribute, not a metric label) + +## Sniffer correlation (sip.call_id join) + +Correlation with the passive sniffer (Otoru/sniffer) is **attribute-based and +happens at the observability backend (Grafana/Tempo), not in code**: + +- Every `freeswitch.channel.*` span carries **`sip.call_id`** (= the ESL + `variable_sip_call_id` header), which matches the sniffer's **`voip.call_id`**. +- Join the two traces in Grafana/Tempo by filtering/grouping on that attribute. +- Cross-leg grouping: bridge spans carry **`bridge.a_uuid`** and + **`bridge.b_uuid`**, so the a-leg and b-leg of a call can be tied together. +- The `genesis.events.without_sip_call_id` metric counts channel events that + lack the correlation key (a correlation-gap signal). 
+ +W3C `traceparent` / `X-Tracespan` propagation to the sniffer is intentionally +**out of scope**; the attribute join is sufficient and requires no sniffer +changes. + +The lifecycle/CUSTOM processors are on by default. Opt out with +`GENESIS_TRACE_ESL_LIFECYCLE=0` or `GENESIS_TRACE_CUSTOM_SUBCLASSES=0`. + ## Configuration Install the OpenTelemetry SDK: diff --git a/docs/esl-sniffer-traces-mapping.json b/docs/esl-sniffer-traces-mapping.json new file mode 100644 index 0000000..2e65500 --- /dev/null +++ b/docs/esl-sniffer-traces-mapping.json @@ -0,0 +1,1874 @@ +{ + "genesisSpans": { + "spans": [ + { + "attributes": [ + { + "name": "event.name", + "source": "Event-Name (ESL header)" + }, + { + "name": "event.uuid", + "source": "Unique-ID (ESL header)" + }, + { + "name": "event.content_type", + "source": "Content-Type (ESL header)" + }, + { + "name": "event.header.", + "source": "Every other ESL header key, lowercased with '-' replaced by '_' (built by build_event_attributes in genesis/protocol/telemetry.py:15)" + } + ], + "location": "genesis/protocol/base.py:201", + "name": "process_event", + "spanEvents": [], + "wraps": "Processing one inbound ESL event in the consumer loop: records event metrics (events_received_counter) and logs the event. Dispatch to handlers and event-processor execution happen OUTSIDE this span." + }, + { + "attributes": [ + { + "name": "command.name", + "source": "Full raw cmd string passed to send() (NOT just the command verb; set at base.py:291)" + }, + { + "name": "command.reply", + "source": "Reply-Text field of the resulting ESLEvent (set at base.py:337, only when non-empty)" + } + ], + "location": "genesis/protocol/base.py:290", + "name": "send_command", + "spanEvents": [], + "wraps": "Sending an ESL command over the socket and awaiting the command/reply ESLEvent (Protocol.send)." + }, + { + "attributes": [ + { + "name": "channel.dial_path", + "source": "dial_path constructor arg (e.g. 
'user/1000')" + }, + { + "name": "channel.has_variables", + "source": "str(variables is not None) — whether originate variables were supplied" + }, + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID; value = self.uuid, which comes from response.body of 'api create_uuid' (stripped). Set at channel.py:159" + }, + { + "name": "channel.create.duration", + "source": "time.time() - start_time (wall-clock seconds of the create flow). Set at channel.py:179" + }, + { + "name": "(status)", + "source": "On exception: span.set_status(StatusCode.ERROR, str(e)) at channel.py:188" + } + ], + "location": "genesis/channel.py:144", + "name": "channel.create", + "spanEvents": [], + "wraps": "Channel.create() factory: requests events plain ALL, generates UUID via 'api create_uuid', registers CHANNEL_STATE handler, applies filter, and issues 'api originate' to place the call." + }, + { + "attributes": [ + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" + }, + { + "name": "channel.state", + "source": "ATTR_CHANNEL_STATE = self.state.name (current ChannelState enum name)" + }, + { + "name": "wait.target", + "source": "str(target) — either ChannelState name or event-name string" + }, + { + "name": "wait.timeout", + "source": "timeout parameter (seconds, default 30.0)" + }, + { + "name": "wait.type", + "source": "ATTR_WAIT_TYPE = 'event' if target is str else 'state'" + }, + { + "name": "operation", + "source": "literal 'wait'" + }, + { + "name": "wait.result", + "source": "ATTR_WAIT_RESULT; 'success' (channel.py:340), 'timeout' (channel.py:360), or 'already_reached' (channel.py:441 when state already satisfies target)" + }, + { + "name": "wait.duration", + "source": "ATTR_WAIT_DURATION = time.time() - start_time (set on success and timeout paths, not on already_reached)" + } + ], + "location": "genesis/channel.py:415", + "name": "channel.wait", + "spanEvents": [], + "wraps": "Channel.wait(): waiting for a target ChannelState (via 
_wait_for_state) or a named event (via _wait_for_event), with timeout." + }, + { + "attributes": [ + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" + }, + { + "name": "channel.state", + "source": "ATTR_CHANNEL_STATE = self.state.name" + }, + { + "name": "operation", + "source": "literal 'answer'" + }, + { + "name": "channel.answer.success", + "source": "result.get('Reply-Text','').startswith('+OK') — set by _execute_operation at channel.py:512" + }, + { + "name": "channel.answer.duration", + "source": "time.time() - start_time (channel.py:513)" + }, + { + "name": "(status)", + "source": "On exception: set_status(StatusCode.ERROR, str(e)) at channel.py:526" + } + ], + "location": "genesis/channel.py:505 (via answer() call at channel.py:541-550)", + "name": "channel.answer", + "spanEvents": [], + "wraps": "Answering the call (execute 'answer' via sendmsg or 'api uuid_execute'). Created by _execute_operation with span_name='channel.answer'." + }, + { + "attributes": [ + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" + }, + { + "name": "channel.state", + "source": "ATTR_CHANNEL_STATE = self.state.name" + }, + { + "name": "operation", + "source": "literal 'park'" + }, + { + "name": "channel.park.success", + "source": "Reply-Text startswith '+OK' (channel.py:512)" + }, + { + "name": "channel.park.duration", + "source": "time.time() - start_time (channel.py:513)" + }, + { + "name": "(status)", + "source": "On exception: set_status ERROR at channel.py:526" + } + ], + "location": "genesis/channel.py:505 (via park() call at channel.py:554-563)", + "name": "channel.park", + "spanEvents": [], + "wraps": "Parking the channel (execute 'park'). Created by _execute_operation with span_name='channel.park'." 
+ }, + { + "attributes": [ + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" + }, + { + "name": "channel.state", + "source": "ATTR_CHANNEL_STATE = self.state.name" + }, + { + "name": "hangup.cause", + "source": "ATTR_HANGUP_CAUSE = cause param (HangupCause, default 'NORMAL_CLEARING')" + }, + { + "name": "operation", + "source": "literal 'hangup'" + }, + { + "name": "channel.hangup.success", + "source": "Reply-Text startswith '+OK' (channel.py:512)" + }, + { + "name": "channel.hangup.duration", + "source": "time.time() - start_time (channel.py:513)" + }, + { + "name": "call.duration", + "source": "time.time() - self._created_at (set in on_success at channel.py:572, only when _created_at is set)" + }, + { + "name": "(status)", + "source": "On exception: set_status ERROR at channel.py:526" + } + ], + "location": "genesis/channel.py:505 (via hangup() call at channel.py:588-600)", + "name": "channel.hangup", + "spanEvents": [], + "wraps": "Hanging up the call with a cause (sendmsg 'hangup' or 'api uuid_kill'). Created by _execute_operation with span_name='channel.hangup'; on_success callback records hangup cause + call duration." 
+ }, + { + "attributes": [ + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" + }, + { + "name": "channel.other_uuid", + "source": "other_uuid from peer Channel/Session (extracted by _get_peer_uuid; falls back to context['Unique-ID'])" + }, + { + "name": "channel.state", + "source": "ATTR_CHANNEL_STATE = self.state.name" + }, + { + "name": "operation", + "source": "literal 'bridge'" + }, + { + "name": "channel.bridge.success", + "source": "Reply-Text startswith '+OK' (channel.py:512)" + }, + { + "name": "channel.bridge.duration", + "source": "time.time() - start_time (channel.py:513)" + }, + { + "name": "(status)", + "source": "On exception: set_status ERROR at channel.py:526" + } + ], + "location": "genesis/channel.py:505 (via bridge() call at channel.py:632-644)", + "name": "channel.bridge", + "spanEvents": [], + "wraps": "Bridging this channel with another Channel/Session (sendmsg 'bridge uuid:' or 'api uuid_bridge'). Created by _execute_operation with span_name='channel.bridge'." + }, + { + "attributes": [ + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" + }, + { + "name": "channel.state", + "source": "ATTR_CHANNEL_STATE = self.state.name" + }, + { + "name": "playback.path", + "source": "path parameter (audio file path)" + }, + { + "name": "playback.block", + "source": "str(block) — whether execution blocks" + }, + { + "name": "operation", + "source": "literal 'playback'" + }, + { + "name": "channel.playback.success", + "source": "Reply-Text startswith '+OK' (channel.py:512)" + }, + { + "name": "channel.playback.duration", + "source": "time.time() - start_time (channel.py:513)" + }, + { + "name": "(status)", + "source": "On exception: set_status ERROR at channel.py:526" + } + ], + "location": "genesis/channel.py:505 (via playback() call at channel.py:653-666)", + "name": "channel.playback", + "spanEvents": [], + "wraps": "Playing an audio file (execute 'playback '). 
Created by _execute_operation with span_name='channel.playback'." + }, + { + "attributes": [ + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" + }, + { + "name": "channel.state", + "source": "ATTR_CHANNEL_STATE = self.state.name" + }, + { + "name": "say.module", + "source": "module param (e.g. 'en' or 'en:en')" + }, + { + "name": "say.kind", + "source": "kind param (e.g. 'NUMBER')" + }, + { + "name": "say.method", + "source": "method param (e.g. 'pronounced')" + }, + { + "name": "say.gender", + "source": "gender param (e.g. 'FEMININE')" + }, + { + "name": "operation", + "source": "literal 'say'" + }, + { + "name": "channel.say.success", + "source": "Reply-Text startswith '+OK' (channel.py:512)" + }, + { + "name": "channel.say.duration", + "source": "time.time() - start_time (channel.py:513)" + }, + { + "name": "(status)", + "source": "On exception: set_status ERROR at channel.py:526" + } + ], + "location": "genesis/channel.py:505 (via say() call at channel.py:685-700)", + "name": "channel.say", + "spanEvents": [], + "wraps": "Saying text via pre-recorded files (execute 'say '). Created by _execute_operation with span_name='channel.say'." 
+ }, + { + "attributes": [ + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" + }, + { + "name": "channel.state", + "source": "ATTR_CHANNEL_STATE = self.state.name" + }, + { + "name": "play_and_get_digits.file", + "source": "file parameter" + }, + { + "name": "play_and_get_digits.tries", + "source": "str(tries)" + }, + { + "name": "play_and_get_digits.timeout", + "source": "str(timeout)" + }, + { + "name": "play_and_get_digits.minimal", + "source": "str(minimal)" + }, + { + "name": "play_and_get_digits.maximum", + "source": "str(maximum)" + }, + { + "name": "operation", + "source": "literal 'play_and_get_digits'" + }, + { + "name": "channel.play_and_get_digits.success", + "source": "Reply-Text startswith '+OK' (channel.py:512)" + }, + { + "name": "channel.play_and_get_digits.duration", + "source": "time.time() - start_time (channel.py:513)" + }, + { + "name": "(status)", + "source": "On exception: set_status ERROR at channel.py:526" + } + ], + "location": "genesis/channel.py:505 (via play_and_get_digits() call at channel.py:736-756)", + "name": "channel.play_and_get_digits", + "spanEvents": [], + "wraps": "Playing a file and collecting digits from the caller (execute 'play_and_get_digits ...'). Created by _execute_operation with span_name='channel.play_and_get_digits'." 
+ }, + { + "attributes": [ + { + "name": "channel.uuid", + "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" + }, + { + "name": "dtmf.digit", + "source": "event.get('DTMF-Digit') — DTMF-Digit ESL field" + }, + { + "name": "dtmf.handled", + "source": "literal True, set at channel.py:808 after the user handler returns without raising" + }, + { + "name": "(status)", + "source": "On exception: span.record_exception(e) at channel.py:810 and set_status(StatusCode.ERROR, str(e)) at channel.py:811-813" + } + ], + "location": "genesis/channel.py:796", + "name": "channel.dtmf.received", + "spanEvents": [], + "wraps": "Invoking a user-registered DTMF handler (via @channel.on_dtmf decorator) for a received DTMF event." + }, + { + "attributes": [ + { + "name": "net.peer.name", + "source": "self.host (FreeSWITCH host)" + }, + { + "name": "net.peer.port", + "source": "self.port (FreeSWITCH port)" + } + ], + "location": "genesis/inbound.py:97", + "name": "inbound_connect", + "spanEvents": [], + "wraps": "Inbound.start(): establishing the TCP connection to FreeSWITCH (wraps self._connect() only; authentication and super().start() run outside the span)." + }, + { + "attributes": [ + { + "name": "net.peer.name", + "source": "server.host" + }, + { + "name": "net.peer.port", + "source": "server.port" + } + ], + "location": "genesis/outbound.py:156", + "name": "outbound_handle_connection", + "spanEvents": [], + "wraps": "Outbound.handler() static method: processing a single incoming FreeSWITCH outbound connection — opens a Session, runs _setup_session and server.app(session)." 
+ }, + { + "attributes": [ + { + "name": "ring_group.mode", + "source": "mode.value (RingMode enum: 'parallel'/'sequential'/'balancing')" + }, + { + "name": "ring_group.size", + "source": "len(group) — number of destinations" + }, + { + "name": "ring_group.timeout", + "source": "timeout parameter (seconds)" + }, + { + "name": "ring_group.has_balancer", + "source": "str(balancer is not None and mode == RingMode.BALANCING)" + }, + { + "name": "ring_group.has_variables", + "source": "str(variables is not None)" + }, + { + "name": "ring_group.result", + "source": "'answered' if a Channel answered else 'no_answer' (ring.py:156-158); 'error' on exception (ring.py:202)" + }, + { + "name": "ring_group.duration", + "source": "time.time() - start_time (set on both success and error paths, ring.py:159 / 204)" + }, + { + "name": "ring_group.answered_uuid", + "source": "answered.uuid or 'unknown' (set only when answered, ring.py:161-163)" + }, + { + "name": "ring_group.answered_dial_path", + "source": "answered.dial_path (set only when answered, ring.py:164-166)" + }, + { + "name": "ring_group.error", + "source": "str(e) — exception message (set only on error path, ring.py:203)" + }, + { + "name": "(status)", + "source": "On exception: span.record_exception(e) at ring.py:205 (no explicit set_status)" + } + ], + "location": "genesis/group/ring.py:138", + "name": "ring_group.ring", + "spanEvents": [], + "wraps": "RingGroup.ring() classmethod: dispatching a ring group (PARALLEL / SEQUENTIAL / BALANCING) and waiting for the first answering Channel within timeout." 
+ }, + { + "attributes": [ + { + "name": "queue.id", + "source": "ATTR_QUEUE_ID = self._queue_id" + }, + { + "name": "queue.item_id", + "source": "ATTR_QUEUE_ITEM_ID = self._item_id (defaults to str(uuid4()) if not supplied)" + } + ], + "location": "genesis/queue/core.py:76", + "name": "queue.wait_and_acquire", + "spanEvents": [], + "wraps": "QueueSlot.__aenter__: enqueuing an item and waiting to acquire a concurrency slot from the queue backend (wraps backend.wait_and_acquire)." + } + ], + "gaps": [ + "No spans in genesis/session.py — Session.sendmsg, Session.start/stop, and the session lifecycle are entirely untraced.", + "No spans in genesis/consumer.py — the high-level Consumer class (decorator API wrapping Inbound) has no span instrumentation.", + "No spans in genesis/observability/ — only logging (custom TRACE logger) and the OTel metrics/tracing server setup; no application-level spans there.", + "genesis/protocol/telemetry.py imports `tracer` from metrics but never creates a span itself; it only supplies build_event_attributes / record_event_metrics / log_event helpers consumed by base.py's process_event span.", + "process_event span (base.py:201) only wraps metrics+logging; event-processor execution (the for-loop at base.py:208-211) and routing+dispatch (base.py:213-215, dispatch_to_handlers) run OUTSIDE the span — no span covers handler dispatch or routing-strategy.route().", + "No span for Protocol.consume() loop body or Protocol.handler() socket-read loop (base.py).", + "Channel._wait_for_state and Channel._wait_for_event internal helpers have no nested spans; only the outer channel.wait span exists.", + "No span for Channel.from_session factory or Channel state-transition handler (_state_handler / _on_answer_received).", + "No span.set_attribute for command errors on send_command (errors are recorded only as metrics via _record_command_error; the span has no error/status attribute and no record_exception on the -ERR reply path).", + "send_command's 
command.name attribute is set to the FULL raw command string (base.py:291), not the parsed command verb — high cardinality and not the documented 'name'.", + "Zero use of span.add_event() anywhere in genesis/ — no span events are ever recorded.", + "Only channel.create, the _execute_operation spans, channel.dtmf.received, and ring_group.ring call record_exception/set_status on errors; inbound_connect, outbound_handle_connection, queue.wait_and_acquire, process_event, and send_command do not record exceptions on the span (errors are surfaced only via metrics counters or re-raised).", + "ring_group.ring sets result='error' and record_exception but does NOT call span.set_status(StatusCode.ERROR) — inconsistent with channel.py error handling.", + "No tracing attributes carry Call-Direction, Hangup-Cause, Answer-State, Channel-State, or Event-Subclass on the process_event span (those are only metric attributes via build_metric_attributes in telemetry.py:43)." + ], + "notes": "Module-level tracer definitions (all via `trace.get_tracer(__name__)`): genesis/protocol/metrics.py:10 (shared by genesis/protocol/base.py via import and by genesis/protocol/telemetry.py via import), genesis/channel.py:28, genesis/inbound.py:17, genesis/outbound.py:30, genesis/group/ring.py:23, genesis/queue/core.py:19. No `tracer.start_span` (only `start_as_current_span`), no `use_span`, and no `span.add_event` calls exist anywhere in genesis/. All spans use the context-manager form `with tracer.start_as_current_span(...) as span:`. Seven of the eight channel operation spans (answer, park, hangup, bridge, playback, say, play_and_get_digits) are produced by the single generic helper `_execute_operation` at genesis/channel.py:494-537 (span created at line 505); each caller supplies a distinct span_name and span_attributes dict, and the helper adds the common channel.<operation>.success and channel.<operation>.duration attributes plus record_exception/set_status on failure. 
All eight channel operation spans set channel.uuid and channel.state. The process_event span's attribute set is dynamic: it is built by build_event_attributes() in genesis/protocol/telemetry.py:15-40, which maps Event-Name->event.name, Unique-ID->event.uuid, Content-Type->event.content_type, and every other ESL header to event.header..\"" + }, + "genesisMetrics": { + "metrics": [ + { + "attributes": [ + { + "name": "command", + "source": "command_name parsed from the ESL command string sent via Protocol.send" + } + ], + "description": "Number of ESL commands sent", + "location": "genesis/protocol/metrics.py:14 (created); genesis/protocol/base.py:319 (incremented)", + "name": "genesis.commands.sent", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "event_name", + "source": "ESL event header Event-Name (default UNKNOWN)" + }, + { + "name": "content_type", + "source": "ESL event header Content-Type (default UNKNOWN)" + }, + { + "name": "event_subclass", + "source": "ESL event header Event-Subclass (only when present)" + }, + { + "name": "direction", + "source": "ESL event header Call-Direction (only when present)" + }, + { + "name": "channel_state", + "source": "ESL event header Channel-State (only when present)" + }, + { + "name": "answer_state", + "source": "ESL event header Answer-State (only when present)" + }, + { + "name": "hangup_cause", + "source": "ESL event header Hangup-Cause (only when present)" + } + ], + "description": "Number of ESL events received", + "location": "genesis/protocol/metrics.py:20 (created); genesis/protocol/telemetry.py:83 (incremented via record_event_metrics)", + "name": "genesis.events.received", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "command", + "source": "command_name parsed from the ESL command string" + } + ], + "description": "Duration of ESL commands execution", + "location": "genesis/protocol/metrics.py:26 (created); genesis/protocol/base.py:347 (recorded in 
_execute_send finally block)", + "name": "genesis.commands.duration", + "type": "histogram", + "unit": "s" + }, + { + "attributes": [ + { + "name": "command", + "source": "command_name parsed from the ESL command string" + }, + { + "name": "error", + "source": "literal 'protocol_error' when Reply-Text starts with -ERR, otherwise the exception class name (type(e).__name__)" + } + ], + "description": "Number of failed ESL commands", + "location": "genesis/protocol/metrics.py:32 (created); genesis/protocol/base.py:302 (incremented via _record_command_error)", + "name": "genesis.commands.errors", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "operation", + "source": "literal operation label e.g. 'create', 'wait', or the channel operation name (answer/hangup/bridge/park/playback etc.)" + }, + { + "name": "wait.type", + "source": "wait_type argument passed to _record_wait_success/_record_wait_timeout (e.g. the kind of wait); only on wait operations at channel.py:342,370" + }, + { + "name": "success", + "source": "literal 'true'/'false'; on wait sites a fixed bool, on generic _execute_operation derived from result.get('Reply-Text','').startswith('+OK')" + }, + { + "name": "error", + "source": "exception class name type(e).__name__ (e.g. 
'TimeoutError'); only on failure path at channel.py:370,527" + } + ], + "description": "Number of channel operations", + "location": "genesis/protocol/metrics.py:39 (created); also re-declared at genesis/channel.py:32 to avoid circular imports; incremented at genesis/channel.py:180,342,370,514,527", + "name": "genesis.channel.operations", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "operation", + "source": "literal operation label: 'create' at line 181, or the channel operation name passed to _execute_operation at line 518" + } + ], + "description": "Duration of channel operations", + "location": "genesis/protocol/metrics.py:45 (created); also re-declared at genesis/channel.py:38; recorded at genesis/channel.py:181,518", + "name": "genesis.channel.operation.duration", + "type": "histogram", + "unit": "s" + }, + { + "attributes": [ + { + "name": "hangup.cause", + "source": "cause argument passed to Channel.hangup (HangupCause literal, e.g. NORMAL_CLEARING); FreeSWITCH hangup cause value" + }, + { + "name": "error", + "source": "exception class name type(exc).__name__; only on the on_error callback at channel.py:576" + } + ], + "description": "Hangup causes", + "location": "genesis/protocol/metrics.py:51 (created); also re-declared at genesis/channel.py:44; incremented at genesis/channel.py:570,576 (Channel.hangup on_success/on_error callbacks)", + "name": "genesis.channel.hangup.causes", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "success", + "source": "str(bool) derived from result.get('Reply-Text','').startswith('+OK'); 'false' on error path" + }, + { + "name": "error", + "source": "exception class name type(exc).__name__; only on the on_error callback at channel.py:621" + } + ], + "description": "Bridge operations", + "location": "genesis/protocol/metrics.py:57 (created); also re-declared at genesis/channel.py:50; incremented at genesis/channel.py:618,621 (Channel.bridge on_success/on_error)", + 
"name": "genesis.channel.bridge.operations", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "dtmf.digit", + "source": "dtmf_digit extracted from the DTMF ESL event (DTMF digit value)" + } + ], + "description": "DTMF digits received", + "location": "genesis/protocol/metrics.py:63 (created); also re-declared at genesis/channel.py:56; incremented at genesis/channel.py:804 (DTMF event handler)", + "name": "genesis.channel.dtmf.received", + "type": "counter", + "unit": "1" + }, + { + "attributes": [], + "description": "Total call duration from creation to hangup", + "location": "genesis/protocol/metrics.py:69 (created); also re-declared at genesis/channel.py:62; recorded at genesis/channel.py:573 (Channel.hangup on_success)", + "name": "genesis.call.duration", + "type": "histogram", + "unit": "s" + }, + { + "attributes": [ + { + "name": "timeout.type", + "source": "literal 'wait'" + }, + { + "name": "timeout.operation", + "source": "timeout_operation argument passed to _record_wait_timeout (the operation that timed out)" + }, + { + "name": "timeout.duration", + "source": "computed duration = time.time() - start_time (seconds, float)" + } + ], + "description": "Number of timeouts", + "location": "genesis/protocol/metrics.py:75 (created); also re-declared at genesis/channel.py:68; incremented at genesis/channel.py:362 (_record_wait_timeout)", + "name": "genesis.timeouts", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "event_name", + "source": "ESL event header Event-Name, or Event-Subclass when Event-Name == 'CUSTOM' (via get_event_name)" + } + ], + "description": "Number of O(1) channel routing hits", + "location": "genesis/protocol/metrics.py:82 (created); genesis/protocol/routing/channel.py:55 (incremented in ChannelRoutingStrategy.route)", + "name": "genesis.channel.routing.hits", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "event_name", + "source": "ESL event header Event-Name, 
or Event-Subclass when Event-Name == 'CUSTOM' (via get_event_name)" + } + ], + "description": "Number of fallback to O(N) global routing", + "location": "genesis/protocol/metrics.py:88 (created); genesis/protocol/routing/global_.py:50 (incremented in GlobalRoutingStrategy.route)", + "name": "genesis.channel.routing.fallback", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "type", + "source": "literal 'inbound' (Inbound class) or 'outbound' (Outbound class)" + } + ], + "description": "Number of active connections", + "location": "genesis/inbound.py:20 and genesis/outbound.py:33 (both create an UpDownCounter of the same name); incremented/decremented at genesis/inbound.py:117,128 and genesis/outbound.py:163,173", + "name": "genesis.connections.active", + "type": "up_down_counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "error", + "source": "literal 'authentication_failed' (when Reply-Text != '+OK accepted') or 'timeout' (on asyncio.TimeoutError during connect)" + }, + { + "name": "type", + "source": "literal 'inbound'" + } + ], + "description": "Number of connection errors", + "location": "genesis/inbound.py:25 (created); incremented at genesis/inbound.py:87,107", + "name": "genesis.connections.errors", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "mode", + "source": "RingMode enum value (parallel/sequential/balancing)" + }, + { + "name": "has_balancer", + "source": "str(bool): whether a LoadBalancerBackend is configured and mode == RingMode.BALANCING" + } + ], + "description": "Number of ring group operations", + "location": "genesis/group/ring.py:27 (created); genesis/group/ring.py:169 (incremented in RingGroup._ring)", + "name": "genesis.ring_group.operations", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "mode", + "source": "RingMode enum value (parallel/sequential/balancing)" + }, + { + "name": "has_balancer", + "source": "str(bool): whether a 
LoadBalancerBackend is configured and mode == RingMode.BALANCING" + } + ], + "description": "Duration of ring group operations", + "location": "genesis/group/ring.py:33 (created); genesis/group/ring.py:178 (recorded in RingGroup._ring)", + "name": "genesis.ring_group.operation.duration", + "type": "histogram", + "unit": "s" + }, + { + "attributes": [ + { + "name": "mode", + "source": "RingMode enum value (parallel/sequential/balancing)" + }, + { + "name": "result", + "source": "literal 'answered' or 'no_answer' on success path; literal 'error' on exception path" + }, + { + "name": "has_balancer", + "source": "str(bool): whether a LoadBalancerBackend is configured and mode == RingMode.BALANCING" + }, + { + "name": "error", + "source": "exception class name type(e).__name__; only on error path at ring.py:207" + } + ], + "description": "Ring group operation results", + "location": "genesis/group/ring.py:39 (created); genesis/group/ring.py:187,207 (incremented in RingGroup._ring success and error paths)", + "name": "genesis.ring_group.results", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "queue.id", + "source": "queue_id argument passed to Queue.slot()" + }, + { + "name": "op", + "source": "literal 'acquire' (on enter) or 'release' (on exit)" + } + ], + "description": "Queue slot acquire/release operations", + "location": "genesis/queue/core.py:22 (created); genesis/queue/core.py:92,101 (incremented in QueueSlot.__aenter__/__aexit__)", + "name": "genesis.queue.operations", + "type": "counter", + "unit": "1" + }, + { + "attributes": [ + { + "name": "queue.id", + "source": "queue_id argument passed to Queue.slot()" + } + ], + "description": "Time spent waiting for a slot", + "location": "genesis/queue/core.py:27 (created); genesis/queue/core.py:91 (recorded in QueueSlot.__aenter__ after wait_and_acquire)", + "name": "genesis.queue.wait_duration", + "type": "histogram", + "unit": "s" + } + ], + "gaps": [ + "No ObservableCounter / 
ObservableGauge instruments anywhere in genesis/ — only synchronous Counter, Histogram, and UpDownCounter are used. There is no async gauge for in-flight values (e.g. current queue depth, current active calls by state).", + "genesis.connections.errors is only created and incremented in genesis/inbound.py. The Outbound class (genesis/outbound.py) defines no connection error counter — outbound connect/handle failures are not recorded as a metric, only the active_connections UpDownCounter is decremented.", + "No metrics instrumentation in genesis/session.py — Session.send/sendmsg operations are not independently metered (they rely on the inherited Protocol.send counters).", + "No metrics instrumentation in genesis/consumer.py — the high-level Consumer class adds no metrics of its own; it only inherits Inbound counters.", + "No metrics instrumentation in genesis/group/load_balancer.py or the Redis load balancer backend — backend eviction/rotation/redis errors are not metered.", + "genesis.channel.operations, genesis.channel.operation.duration, genesis.channel.hangup.causes, genesis.channel.bridge.operations, genesis.channel.dtmf.received, genesis.call.duration, and genesis.timeouts are each defined twice with identical names: once in genesis/protocol/metrics.py and again in genesis/channel.py (comment: 'to avoid circular imports'). 
Both call meter.create_counter on the same meter/name; OTel deduplicates by identity but the duplicate definitions are a maintenance hazard.", + "call_duration_histogram.record at genesis/channel.py:573 records with NO attributes — the channel UUID / hangup cause are not attached, so call duration cannot be broken down by channel or cause.", + "command_duration_histogram at genesis/protocol/base.py:347 records on every send including commands without a command_name (command_name falsy guard skips both the counter add and the histogram record), so no-op/empty command names are unmeasured.", + "No metric covers ESL event processing/dispatch latency (time from event parse to handler completion) — only events_received (count) and routing hits/fallback (count) exist.", + "No metric covers command queue depth (self.commands queue length in Protocol) or the events queue length — backpressure is not observable via metrics." + ], + "notes": "All instruments are created via the OpenTelemetry API meter (opentelemetry.metrics.get_meter) at module load. Six modules declare their own meter via metrics.get_meter(__name__): genesis/protocol/metrics.py, genesis/channel.py, genesis/inbound.py, genesis/outbound.py, genesis/group/ring.py, genesis/queue/core.py. The CLI (genesis/cli/__init__.py:78) installs a metrics meter provider. Meters are lazily registered with a provider, so instruments are no-ops until a provider is configured. Total: 20 distinct metric instruments (counting the duplicated channel metrics as a single instrument each): 14 counters, 5 histograms, 1 up_down_counter, 0 observable/gauge instruments. All add()/record() call sites are wrapped in try/except (either directly or via the _safe_connection_metric helper in inbound/outbound and the best-effort pattern in base.py and routing modules), so metrics failures never break ESL processing. Attribute naming mixes conventions: dotted OTel-style (e.g. 
'channel.uuid', 'hangup.cause', 'dtmf.digit', 'queue.id') for span/attribute constants defined in channel.py and queue/core.py, snake_case (e.g. 'event_name', 'content_type', 'has_balancer', 'op') for telemetry.py, routing, ring.py and queue op attributes, and bare labels ('command', 'operation', 'success', 'mode', 'result', 'type', 'error') in several places." + }, + "genesisEvents": { + "events": [ + { + "event_name": "HEARTBEAT", + "fields_used": [ + "Event-Name", + "Content-Type", + "Event-Info", + "Up-Time", + "Session-Count", + "Max-Sessions", + "FreeSWITCH-Version" + ], + "handled_in": "genesis/consumer.py: Consumer.start() registers `protocol.on('HEARTBEAT', observability.record_heartbeat)` (global handler). observability/server.py:record_heartbeat is the sink. No channel-specific handler.", + "routing_info_attached": [ + "Event-Name (routing key, global only — no Unique-ID)", + "event.name span attribute", + "event_name metric attribute" + ] + }, + { + "event_name": "CHANNEL_CREATE", + "fields_used": [ + "Event-Name", + "Unique-ID", + "Channel-State", + "Channel-Call-UUID", + "Channel-Name", + "Call-Direction", + "Answer-State", + "Caller-Caller-ID-Number", + "Caller-Destination-Number", + "Caller-Context", + "Caller-Dialplan", + "Caller-Unique-ID", + "variable_uuid", + "variable_call_uuid" + ], + "handled_in": "No explicit handler in genesis/. Emitted by FreeSWITCH on originate (Channel.create sends `api originate ... &park()`). Routed only via GlobalRoutingStrategy (test_payloads.py:channel_create is the canonical fixture). 
Consumer handlers can subscribe via `@consumer.handle('CHANNEL_CREATE')` or `filtrate(...)`.", + "routing_info_attached": [ + "Event-Name + Unique-ID (ChannelRoutingStrategy key, if a channel handler is registered)", + "event.name/event.uuid span attrs", + "direction (Call-Direction), channel_state (Channel-State), answer_state (Answer-State) metric attrs", + "Caller-Destination-Number/Caller-Context/Caller-Dialplan/Caller-Caller-ID-Number propagated as event.header.* span attrs and via context.update(event) when a Channel adopts the event" + ] + }, + { + "event_name": "CHANNEL_STATE", + "fields_used": [ + "Event-Name", + "Unique-ID", + "Channel-State", + "Channel-State-Number", + "Channel-Call-State", + "variable_*" + ], + "handled_in": "genesis/channel.py: Channel._state_handler filters on `event.get('Unique-ID') == self.uuid`, reads `Channel-State`, maps via ChannelState.from_freeswitch (strips 'CS_' prefix), updates self._state and self._state_changes timestamps, then `self.context.update(event)`. Registered globally in Channel.create via `protocol.on('CHANNEL_STATE', self._state_handler)` and per-channel (O(1)) in Channel.from_session via `protocol.register_channel_handler(uuid, 'CHANNEL_STATE', ...)`. 
Also registered transiently in _wait_for_state and _wait_for_event.", + "routing_info_attached": [ + "Unique-ID (O(1) channel_registry key '{uuid}:CHANNEL_STATE')", + "Channel-State → ChannelState enum (NEW/INIT/ROUTING/SOFT_EXECUTE/EXECUTE/EXCHANGE_MEDIA/PARK/CONSUME_MEDIA/HIBERNATE/RESET/HANGUP/REPORTING/DESTROY/NONE)", + "channel_state metric attribute", + "channel.state span attribute on channel.wait spans", + "full event dict merged into Channel.context (ContextType) for downstream use" + ] + }, + { + "event_name": "CHANNEL_ANSWER", + "fields_used": [ + "Event-Name", + "Unique-ID", + "Answer-State" + ], + "handled_in": "genesis/channel.py: _wait_for_state registers an answer_handler via `protocol.on('CHANNEL_ANSWER', ...)` when target_state == EXECUTE; _on_answer_received filters on Unique-ID match and sets the answer_received Event so EXECUTE is considered reached. _wait_for_event lists CHANNEL_ANSWER in channel_specific_events (Unique-ID filtered). No permanent handler.", + "routing_info_attached": [ + "Unique-ID (filter inside handler to match self.uuid)", + "Answer-State metric attribute", + "event.name/event.uuid span attrs", + "transient global registration via protocol.on, removed in finally block" + ] + }, + { + "event_name": "CHANNEL_HANGUP", + "fields_used": [ + "Event-Name", + "Unique-ID", + "Hangup-Cause" + ], + "handled_in": "No dedicated handler. Referenced as a waitable event name in Channel.wait() docstring (e.g. `await channel.wait('CHANNEL_HANGUP')`) and used as a routing fixture in tests/test_routing.py. 
Routed via GlobalRoutingStrategy / _wait_for_event path.", + "routing_info_attached": [ + "Event-Name (global routing key)", + "Hangup-Cause metric attribute (when present)", + "event.name/event.uuid span attrs" + ] + }, + { + "event_name": "CHANNEL_HANGUP_COMPLETE", + "fields_used": [ + "Event-Name", + "Unique-ID", + "Hangup-Cause", + "Channel-Unique-ID" + ], + "handled_in": "genesis/session.py: hangup_complete_handler compares `session.context.get('Channel-Unique-ID') == event.get('Unique-ID')`, then pushes to session.fifo and signals completion. Registered via register_channel_handler (O(1)) when session.uuid is known, else via protocol.on. genesis/channel.py: listed in channel_specific_events in _wait_for_event (Unique-ID filtered).", + "routing_info_attached": [ + "Unique-ID + Channel-Unique-ID (match session/channel leg)", + "O(1) channel_registry key '{uuid}:CHANNEL_HANGUP_COMPLETE' when session.uuid present", + "Hangup-Cause metric attribute → hangup_causes_counter", + "event.name/event.uuid span attrs" + ] + }, + { + "event_name": "CHANNEL_EXECUTE_COMPLETE", + "fields_used": [ + "Event-Name", + "Unique-ID", + "Application-UUID", + "Application", + "Application-Response" + ], + "handled_in": "genesis/session.py: execute_complete_handler matches `event.get('Application-UUID') == event_uuid` (the UUID assigned in _build_sendmsg_cmd via Event-UUID header), pushes to session.fifo, signals the block-completion Event. Registered via register_channel_handler or protocol.on in _awaitable_complete_command. 
genesis/protocol/telemetry.py: _log_channel_event logs Application and Application-Response when name == 'CHANNEL_EXECUTE_COMPLETE'.", + "routing_info_attached": [ + "Application-UUID (correlates the response to the sendmsg execute command)", + "Unique-ID (channel_registry key for O(1) routing)", + "Application/Application-Response → span/log attributes", + "event.name/event.uuid span attrs" + ] + }, + { + "event_name": "CHANNEL_EXECUTE", + "fields_used": [ + "Event-Name", + "Unique-ID", + "Application-UUID", + "Application" + ], + "handled_in": "Referenced in genesis/session.py docstring as the companion to CHANNEL_EXECUTE_COMPLETE (the Application-UUID header appears in both). Covered by tests/test_reader_fsm.py (event-lock splitting). Routed generically; no dedicated handler.", + "routing_info_attached": [ + "Application-UUID (correlation with sendmsg execute)", + "Unique-ID (channel_registry key if registered)", + "event.name span attr" + ] + }, + { + "event_name": "CUSTOM", + "fields_used": [ + "Event-Name", + "Event-Subclass", + "Unique-ID", + "Caller-*", + "Channel-*", + "variable_*" + ], + "handled_in": "genesis/protocol/routing/base.py (and channel.py, global_.py, composite.py): get_event_name returns `event.get('Event-Subclass')` when Event-Name == 'CUSTOM', so routing keys off the subclass (e.g. 'sofia::register', 'mod_audio_stream::play'). Tests/payloads.py provides `register` (sofia::register) and `mod_audio_stream_play` (mod_audio_stream::play) fixtures. 
Consumer._filter_command emits `filter Event-Subclass {event}` for non-uppercase names.", + "routing_info_attached": [ + "Event-Subclass (routing key replaces Event-Name for CUSTOM)", + "Unique-ID (channel_registry key when present)", + "event_subclass metric attribute", + "event.name='CUSTOM' span attr + event.header.event_subclass" + ] + }, + { + "event_name": "DTMF", + "fields_used": [ + "Event-Name", + "Unique-ID", + "DTMF-Digit" + ], + "handled_in": "genesis/channel.py: on_dtmf decorator builds an async dtmf_handler that reads `event.get('DTMF-Digit')`, optionally filters by a specific digit, records dtmf_received_counter, and invokes the user callback. Registered via `protocol.on('DTMF', dtmf_handler)` (global).", + "routing_info_attached": [ + "DTMF-Digit (digit value + dtmf.digit span/metric attribute)", + "Unique-ID (event present but handler does not filter by it — global routing)", + "dtmf_received_counter incremented with dtmf.digit attr" + ] + }, + { + "event_name": "BACKGROUND_JOB", + "fields_used": [ + "Event-Name", + "Job-UUID", + "Job-Command", + "Job-Command-Arg", + "Content-Length", + "Content-Type" + ], + "handled_in": "No dedicated handler. tests/payloads.py:background_job is the fixture (text/event-plain body with +OK result). Routed via GlobalRoutingStrategy. Used to correlate api/bgapi command responses (Job-UUID).", + "routing_info_attached": [ + "Event-Name (global routing key)", + "Job-UUID (correlation with bgapi command, not explicitly consumed in genesis core)", + "event.name span attr" + ] + }, + { + "event_name": "RELOADXML", + "fields_used": [ + "Event-Name", + "Core-UUID", + "FreeSWITCH-Hostname", + "Content-Length" + ], + "handled_in": "No dedicated handler. tests/payloads.py:custom fixture has `Event-Name: RELOADXML` (a standalone event name, NOT a CUSTOM event). 
Routed via GlobalRoutingStrategy under the key 'RELOADXML'.", + "routing_info_attached": [ + "Event-Name (global routing key)", + "event.name span attr" + ] + }, + { + "event_name": "auth/request (Content-Type)", + "fields_used": [ + "Content-Type" + ], + "handled_in": "genesis/protocol/processors.py: auth_request_processor — when `event.get('Content-Type') == 'auth/request'`, sets protocol.authentication_event so Inbound.authenticate() can proceed.", + "routing_info_attached": [ + "Content-Type (processor matching, not routed to user handlers)", + "event.content_type span attr" + ] + }, + { + "event_name": "command/reply (Content-Type)", + "fields_used": [ + "Content-Type", + "Reply-Text" + ], + "handled_in": "genesis/protocol/processors.py: command_reply_processor enqueues the event into protocol.commands queue so Protocol.send() can return the response. Reply-Text is inspected in send() for '-ERR' prefix to record command_errors_counter.", + "routing_info_attached": [ + "Content-Type (processor matching)", + "Reply-Text (command.reply span attr, error detection)", + "routed into commands queue, not to user handlers" + ] + }, + { + "event_name": "api/response (Content-Type)", + "fields_used": [ + "Content-Type", + "body" + ], + "handled_in": "genesis/protocol/processors.py: api_response_processor enqueues into protocol.commands queue for Protocol.send(). FSM treats api/response as a body-only content type (event.body holds the result).", + "routing_info_attached": [ + "Content-Type (processor + FSM _API_RESPONSE_TYPES)", + "event.content_type span attr", + "routed into commands queue, not to user handlers" + ] + }, + { + "event_name": "text/rude-rejection / text/disconnect-notice (Content-Type)", + "fields_used": [ + "Content-Type", + "Content-Disposition" + ], + "handled_in": "genesis/protocol/processors.py: disconnect_processor calls protocol.stop() unless `Content-Disposition == 'linger'`. 
FSM lists rude-rejection in _API_RESPONSE_TYPES.", + "routing_info_attached": [ + "Content-Type (processor matching)", + "Content-Disposition (linger check — when 'linger', session.is_lingering stays True and disconnect is suppressed)", + "event.content_type span attr" + ] + }, + { + "event_name": "text/event-plain (Content-Type)", + "fields_used": [ + "Content-Type", + "Content-Length", + "Event-Name", + "Event-Subclass", + "all body headers" + ], + "handled_in": "genesis/protocol/reader_fsm.py: _parse_headeronly_content merges the body headers (parse_headers) into the event when content_type == 'text/event-plain'. _parse_event_content handles event-lock:true by splitting on 'Event-Name:' boundaries to emit multiple ESLEvent instances.", + "routing_info_attached": [ + "Event-Name / Event-Subclass (set on the merged event for downstream routing)", + "event-lock splitting propagates Content-Length/Content-Type to each split event", + "all body headers become routable fields on the resulting ESLEvent" + ] + } + ], + "routing_strategies": [ + "ChannelRoutingStrategy (O(1)): looks up `channel_registry['{Unique-ID}:{event_name}']`. event_name is Event-Name, except for CUSTOM where Event-Subclass is used. Registered via Protocol.register_channel_handler(uuid, event_name, handler). On hit returns (handlers, should_stop=True), terminating the chain. Increments genesis.channel.routing.hits with event_name attribute. Used by Channel.from_session (CHANNEL_STATE) and Session._awaitable_complete_command (CHANNEL_EXECUTE_COMPLETE, CHANNEL_HANGUP_COMPLETE).", + "GlobalRoutingStrategy (O(N)): looks up `handlers[event_name]` plus wildcard `handlers['*']`; returns (specific + generic, should_stop=False). Registered via Protocol.on(key, handler) / Consumer.handle(event). Increments genesis.channel.routing.fallback. 
Never stops the chain (so composite could continue, though it is the last strategy).", + "CompositeRoutingStrategy: chains [ChannelRoutingStrategy, GlobalRoutingStrategy] in order. Tries each in turn; returns handlers from the first strategy that yields any; honors should_stop. Wired in Protocol.__init__ as self.routing_strategy.", + "Dispatch: dispatch_to_handlers schedules each handler as an asyncio task (create_task for coroutines, to_thread wrapper for sync handlers) and tracks them in Protocol.handler_tasks with a done-callback that logs unhandled exceptions.", + "Event-name extraction (get_event_name, shared by all strategies): `identifier = event.get('Event-Name')`; if identifier == 'CUSTOM', returns `event.get('Event-Subclass')`; else returns identifier. None/missing → strategy returns ([], False).", + "Subscription filters (Outbound/Inbound side, not routing per se but shape what reaches the router): `events plain ALL` subscribes to all events; `filter Unique-ID {uuid}` restricts events to a channel; `filter Event-Name {X}` for uppercase event names; `filter Event-Subclass {X}` for CUSTOM subclasses (Consumer._filter_command)." + ], + "channel_lifecycle": "Channel state is modeled by `ChannelState(IntEnum)` in genesis/types.py, ordered to mirror FreeSWITCH's CS_* progression: NEW(0) → INIT(1) → ROUTING(2) → SOFT_EXECUTE(3) → EXECUTE(4) → EXCHANGE_MEDIA(5) → PARK(6) → CONSUME_MEDIA(7) → HIBERNATE(8) → RESET(9) → HANGUP(10) → REPORTING(11) → DESTROY(12) → NONE(13). `ChannelState.from_freeswitch(state_str)` strips the `CS_` prefix and resolves the enum name. Transitions are event-driven: Channel._state_handler (registered for CHANNEL_STATE) compares `event['Unique-ID'] == self.uuid`, parses `Channel-State`, and if the new state differs, records a timestamp in `_state_changes[new_state]` and updates `_state`. The IntEnum ordering is exploited for guards: `Channel.wait` short-circuits when `self.state >= ChannelState.HANGUP` (i.e. 
HANGUP/REPORTING/DESTROY) and `Channel.bridge` refuses when `self.state >= HANGUP`. `_wait_for_state` waits until `event_state >= ChannelState.HANGUP` OR equals the target; for EXECUTE it additionally waits for a CHANNEL_ANSWER event (answer_received Event) because CS_EXECUTE alone does not guarantee the leg is answered. RingGroup relies on `ch.wait(ChannelState.EXECUTE)` as the 'answered' signal in PARALLEL/SEQUENTIAL/BALANCING modes, and cleans up non-answered legs by checking `ch.state >= ChannelState.HANGUP` before issuing hangup. Hangup is invoked via `api uuid_kill {uuid} {cause}` (Inbound) or sendmsg `execute hangup {cause}` (Session); the cause is recorded in hangup_causes_counter and the total call duration in call_duration_histogram. In Outbound mode, Channel.from_session seeds state from the Session context's `Channel-State` and registers an O(1) CHANNEL_STATE handler so subsequent transitions update the same Channel object. The Session itself models command lifecycle via _awaitable_complete_command, which registers CHANNEL_EXECUTE_COMPLETE (matched by Application-UUID) and CHANNEL_HANGUP_COMPLETE (matched by Channel-Unique-ID/Unique-ID) and signals a per-command Event so blocking sendmsg calls can return the corresponding ESLEvent from session.fifo.", + "notes": "Relevant files (absolute paths): /Users/vitorhugo/Projects/Genesis/genesis/protocol/base.py (Protocol, handler/consume loop, _process_one_event, register_channel_handler/on, send), /Users/vitorhugo/Projects/Genesis/genesis/protocol/reader_fsm.py (ESLReaderFSM: READING_HEADERS→READING_BODY, event-lock splitting, text/event-plain merging), /Users/vitorhugo/Projects/Genesis/genesis/protocol/parser.py (parse_headers, ESLEvent UserDict), /Users/vitorhugo/Projects/Genesis/genesis/protocol/processors.py (auth_request, command_reply, api_response, disconnect processors), /Users/vitorhugo/Projects/Genesis/genesis/protocol/routing/{base,channel,composite,global_,dispatcher}.py (Strategy pattern), 
/Users/vitorhugo/Projects/Genesis/genesis/protocol/telemetry.py (build_event_attributes, build_metric_attributes, log_event), /Users/vitorhugo/Projects/Genesis/genesis/protocol/metrics.py (OTel counters/histograms), /Users/vitorhugo/Projects/Genesis/genesis/channel.py (Channel, _state_handler, wait, on_dtmf, bridge/hangup/answer), /Users/vitorhugo/Projects/Genesis/genesis/session.py (Session, sendmsg, _awaitable_complete_command, CHANNEL_EXECUTE_COMPLETE/CHANNEL_HANGUP_COMPLETE correlation), /Users/vitorhugo/Projects/Genesis/genesis/inbound.py, /Users/vitorhugo/Projects/Genesis/genesis/outbound.py (_setup_session, connect reply → session.context, filter Unique-ID, linger), /Users/vitorhugo/Projects/Genesis/genesis/consumer.py (Consumer.handle decorator, filtrate, filter Event-Name/Event-Subclass), /Users/vitorhugo/Projects/Genesis/genesis/types.py (ChannelState enum, HangupCause literal, EventHandler), /Users/vitorhugo/Projects/Genesis/genesis/group/ring.py (RingGroup EXECUTE wait, hangup cleanup), /Users/vitorhugo/Projects/Genesis/tests/payloads.py (canonical ESL fixtures: heartbeat, channel_create, background_job, custom/RELOADXML, register/sofia::register, mod_audio_stream_play, connect, channel_state, dtmf, channel_answer).\n\nKey observations:\n- Genesis does NOT explicitly handle CHANNEL_PROGRESS, CHANNEL_BRIDGE, CHANNEL_UNBRIDGE, CALL_UPDATE, CODEC, or RING_BACK anywhere in genesis/. These names do not appear as string literals in the source. They would still be *routed* if subscribed (events plain ALL) because routing is generic on Event-Name, but no built-in handler consumes them. CHANNEL_BRIDGE/UNBRIDGE legs (Other-Leg-Unique-ID, Bridge-A-Unique-ID, etc.) 
are not parsed; bridging is performed via `api uuid_bridge` / sendmsg `execute bridge` and tracked only through Reply-Text and bridge_operations_counter, not through bridge events.\n- Routing-relevant fields actually consumed by code: Event-Name, Event-Subclass (CUSTOM), Unique-ID, Channel-State, Channel-Name, Channel-Unique-ID, Application-UUID, Application, Application-Response, DTMF-Digit, Reply-Text, Content-Type, Content-Disposition, Content-Length. Caller-Destination-Number, Caller-Context, Caller-Dialplan, Caller-Caller-ID-Number, Channel-Call-UUID and variable_* are present in payloads and propagated generically (context.update(event) and event.header.* span attributes) but no genesis module explicitly extracts them — they are available to user handlers via the ESLEvent dict and to the `filtrate(key, value)` decorator for arbitrary key/value filtering in Consumer.\n- The `connect` command reply (Outbound mode) is the seed of session.context: it carries Channel-Unique-ID, Channel-Context, Channel-Destination-Number, Channel-Caller-ID-Number, Channel-State, Channel-Name, Unique-ID, plus the full Caller-* and variable_* set. Channel.from_session then derives uuid (session.context['Unique-ID']), dial_path (Channel-Name), and initial state (Channel-State).\n- OTel attribute propagation: build_event_attributes emits EVERY event header as a span attribute (event.name for Event-Name, event.uuid for Unique-ID, event.content_type for Content-Type, event.header.* for everything else). build_metric_attributes attaches event_name, content_type, event_subclass, direction (Call-Direction), channel_state (Channel-State), answer_state (Answer-State), hangup_cause (Hangup-Cause) to the events_received_counter. 
These are the routing-relevant fields that flow downstream into traces and metrics.\n- Wildcard handlers: GlobalRoutingStrategy merges `handlers['*']` with `handlers[event_name]`, so a Consumer/Protocol can register a catch-all handler under the key '*'.\n- The FSM splits event-lock:true payloads on '\\nEvent-Name: ' boundaries, producing multiple ESLEvent objects that share the base Content-Length/Content-Type and body — this is how chained execute-app commands (e.g. multi-app sendmsg) appear as separate routable events." + }, + "snifferSignals": { + "repo_summary": "Otoru/sniffer is a passive VoIP observability platform written in Go. It captures SIP signaling and RTP/RTCP media off the wire (AF_PACKET zero-copy on Linux, libpcap elsewhere), correlates packets into calls, computes call quality (MOS via ITU-T G.107 R-factor, jitter, packet loss, silence ratio), detects fraud (RCC/SCD/HFR/auth-flood), records PCAP/WAV per call (with S3 upload), and exports everything as OpenTelemetry traces + metrics over OTLP/HTTP. It ships as two binaries: `sniffer` (capture/correlation) and `sniffer-mcp` (MCP server for AI agents), plus a Claude Code plugin with analysis skills. 
Supports standalone single-sensor mode or distributed mode (1 sensor-sip + N sensor-rtp) sharing a Redis-protocol datastore.", + "captures": "Capture layer is a two-stage decoupled pipeline per sensor (internal/infra/capture, internal/app/pipeline):\nNIC -> kernel ring (AF_PACKET TPACKET_V3 ring, SNIFFER_CAPTURE_BUFFER_MB, default 256 MiB; pcap backend on non-Linux) -> readLoop (drains ring via ZeroCopyReadPacketData on afpacket, no heap copy for L4 bytes) -> rawCh (8192) -> parseLoop (defrag -> dedup -> extract) -> parsedCh (8192) -> sharded workers (sipLoop sharded by Call-ID, rtpLoop sharded by SSRC).\n\nPacket parsing uses gopacket DecodingLayerParser fast path (>99% of RTP = UDP unfragmented, reuses pre-allocated Ethernet/IPv4/IPv6/UDP/TCP structs, no layer-object allocs); fallback creates gopacket.Packet for IP fragments, GRE/VXLAN/ERSPAN tunnels, ICMP. A size-classed payload buffer pool (internal/infra/capture/datapool.go) gives zero-alloc on the hot path (~50k pkt/s) with idempotent slot release (no double-free, get==put balance tested). Backpressure: when rawCh is full the packet is dropped with stage=capture rather than blocking (keeps kernel ring draining); a watermark-pause/backpressure circuit breaker can also shed load (sniffer.pipeline.watermark_drop_total).\n\nWhat it captures: SIP messages (request/response, SDP, REGISTER, OPTIONS keepalives) on SNIFFER_SIP_PORTS (default 5060,5061); RTP streams (per-SSRC seq-gap loss, RFC 3550 jitter, MOS/R-factor, jitter-buffer simulation at 50/200/500 ms, G.711 silence detection); RTCP (peer-reported loss/jitter, LSR/DLSR RTT); SIP registrations; fraud events. Sensors are roles: sensor-sip runs a BPF like \"port 5060\" and emits CDR spans; sensor-rtp runs \"udp portrange 10000-20000\" and only publishes RTP stats to the datastore. PCAP written as LINKTYPE_RAW(101) with synthetic IP+UDP headers; WAV only for decodable G.711. 
Sources: live interface, pcap file, pcap dir (batch), or stdin (`-`).", + "call_identification": "A call is identified by the SIP Call-ID header (voip.call_id). The sensor-sip extracts Call-ID, caller/callee URIs (From/To), SDP media IP:port endpoints and writes them to the shared Redis-protocol datastore under `voip:ep:{ip}|{port}` (EndpointData: call_id, is_caller, codec_pt, connect_time_us) and `voip:call:{call_id}` (state, caller/callee, timestamps). RTP sensors correlate by batch-GETting endpoints for each packet's Src+Dst IP:port; when a packet matches by destination side the sensor inverts the rtp.side marking (so caller-vs-callee is correct even when RTP races the 200 OK). RTP stats are aggregated per call_id+ssrc under `voip:rtp:{call_id}:{ssrc}`. On BYE/CANCEL/4xx-6xx/timeout the sensor-sip PUBLISHes `voip:rtp:flush_req` so RTP sensors flush final samples, then GETs and merges cross-sensor stats before emitting the complete `voip.call` span tree. Orphan RTP (no correlated signaling) is classified as rtp_flow vs noise (sniffer.orphan.classified). Registrations are keyed by AOR (user@domain); keepalives by Call-ID of the OPTIONS dialog.", + "otel_signals": { + "logs": "No OpenTelemetry Logs SDK signals. internal/platform/otel/provider.go only initializes a TracerProvider and MeterProvider (no LoggerProvider). Application logging is plain Go slog (logfmt or json via SNIFFER_LOG_FORMAT), not exported via OTel logs. 
Go runtime metrics are emitted automatically by the OTel contrib runtime instrumentation (process.runtime.go.* goroutines/memory/schedule), and continuous profiling (CPU/alloc/inuse/goroutines) is sent to Grafana Pyroscope when SNIFFER_PYROSCOPE_URL is set.", + "metrics": [ + { + "name": "sniffer_info", + "type": "gauge" + }, + { + "name": "voip.calls.total", + "type": "counter" + }, + { + "name": "voip.calls.answered", + "type": "counter" + }, + { + "name": "voip.calls.failed", + "type": "counter" + }, + { + "name": "voip.calls.timeout", + "type": "counter" + }, + { + "name": "voip.calls.muted", + "type": "counter" + }, + { + "name": "voip.calls.one_way_audio", + "type": "counter" + }, + { + "name": "voip.calls.active", + "type": "gauge" + }, + { + "name": "voip.call.duration_s", + "type": "histogram" + }, + { + "name": "voip.call.mos", + "type": "histogram" + }, + { + "name": "voip.call.jitter_ms", + "type": "histogram" + }, + { + "name": "voip.call.loss_pct", + "type": "histogram" + }, + { + "name": "voip.call.silence_ratio", + "type": "histogram" + }, + { + "name": "voip.rtp.streams.active", + "type": "gauge" + }, + { + "name": "voip.keepalives.total", + "type": "counter" + }, + { + "name": "voip.keepalive.rtt_ms", + "type": "histogram" + }, + { + "name": "voip.registrations.active", + "type": "gauge" + }, + { + "name": "sniffer.packets.dropped", + "type": "counter" + }, + { + "name": "sniffer.orphan.classified", + "type": "counter" + }, + { + "name": "sniffer.pcap.kernel_drops", + "type": "gauge" + }, + { + "name": "sniffer.pcap.if_drops", + "type": "gauge" + }, + { + "name": "sniffer.redis.write_dropped", + "type": "counter" + }, + { + "name": "sniffer.pipeline.watermark_pause_total", + "type": "counter" + }, + { + "name": "sniffer.pipeline.watermark_drop_total", + "type": "counter" + }, + { + "name": "sniffer.sip.filtered", + "type": "counter" + }, + { + "name": "sniffer.spool.usage_gb", + "type": "gauge" + }, + { + "name": "sniffer.spool.free_pct", + 
"type": "gauge" + }, + { + "name": "sniffer.fraud.rcc.threshold", + "type": "gauge" + }, + { + "name": "sniffer.fraud.rcc.window_s", + "type": "gauge" + }, + { + "name": "sniffer.fraud.scd.min_duration_s", + "type": "gauge" + }, + { + "name": "sniffer.fraud.scd.window_s", + "type": "gauge" + }, + { + "name": "sniffer.fraud.hfr.pct", + "type": "gauge" + }, + { + "name": "sniffer.fraud.hfr.window_s", + "type": "gauge" + }, + { + "name": "sniffer.fraud.auth_flood.threshold", + "type": "gauge" + }, + { + "name": "sniffer.fraud.auth_flood.window_s", + "type": "gauge" + }, + { + "name": "sniffer.overload.state", + "type": "gauge" + }, + { + "name": "process.fd.open", + "type": "gauge" + }, + { + "name": "process.fd.limit", + "type": "gauge" + }, + { + "name": "process.fd.ratio", + "type": "gauge" + }, + { + "name": "process.memory.rss_bytes", + "type": "gauge" + }, + { + "name": "process.memory.vsize_bytes", + "type": "gauge" + }, + { + "name": "process.cpu.user_seconds_total", + "type": "counter" + }, + { + "name": "process.cpu.system_seconds_total", + "type": "counter" + }, + { + "name": "process.context_switches.voluntary_total", + "type": "counter" + }, + { + "name": "process.context_switches.nonvoluntary_total", + "type": "counter" + }, + { + "name": "process.page_faults.major_total", + "type": "counter" + }, + { + "name": "process.page_faults.minor_total", + "type": "counter" + }, + { + "name": "process.io.read_bytes_total", + "type": "counter" + }, + { + "name": "process.io.write_bytes_total", + "type": "counter" + }, + { + "name": "process.threads", + "type": "gauge" + }, + { + "name": "process.uptime_seconds", + "type": "gauge" + } + ], + "spans": [ + { + "attributes": [ + "voip.call_id", + "voip.call.state (completed|failed|cancelled|interrupted|timeout)", + "sensor.id", + "sensor.ip", + "voip.caller.number", + "voip.caller.domain", + "voip.caller.ip", + "voip.caller.user_agent", + "voip.callee.number", + "voip.callee.domain", + "voip.callee.ip", + 
"voip.callee.user_agent", + "client.address", + "server.address", + "voip.call.duration_s", + "voip.call.connect_duration_s", + "voip.call.ring_time_s", + "voip.call.post_dial_delay_s", + "voip.call.first_rtp_delay_s", + "voip.sip.final_response", + "voip.sip.final_response_text", + "voip.sip.termination_cause (normal_bye|cancel|timeout|rtp_timeout|failed|redirect|interrupted|unknown)", + "voip.sip.who_hung_up (caller|callee|unknown)", + "voip.recording.pcap", + "voip.recording.audio", + "voip.flags.nat_detected", + "voip.flags.rtp_reordered" + ], + "name": "voip.call" + }, + { + "attributes": [ + "sip.method (INVITE|BYE|CANCEL|ACK|...)", + "sip.from", + "sip.to", + "sip.contact", + "sip.user_agent", + "sip.sdp.media.ip", + "sip.sdp.media.port", + "sip.sdp.media.codec" + ], + "name": "voip.call.sip.request" + }, + { + "attributes": [ + "sip.response_code", + "sip.cseq_method", + "sip.reason", + "sip.from", + "sip.to", + "sip.contact", + "sip.user_agent", + "sip.sdp.media.ip", + "sip.sdp.media.port", + "sip.sdp.media.codec" + ], + "name": "voip.call.sip.response" + }, + { + "attributes": [ + "rtp.side (a=caller,b=callee)", + "rtp.ssrc", + "rtp.packets_received", + "rtp.packets_lost", + "rtp.loss_pct", + "rtp.jitter_avg_ms", + "rtp.jitter_max_ms", + "rtp.mos_avg", + "rtp.r_factor", + "rtp.mos_jb_50ms", + "rtp.mos_jb_200ms", + "rtp.mos_jb_500ms", + "rtp.src_ip", + "rtp.dst_ip", + "rtp.codec", + "rtp.payload_type", + "sensor.ip (local only)", + "rtp.rtcp.loss_pct", + "rtp.rtcp.jitter_avg_ms", + "rtp.rtcp.rtt_ms", + "rtp.silence_ratio" + ], + "name": "voip.rtp.stream" + }, + { + "attributes": [ + "voip.register.aor", + "voip.register.contact_ip", + "voip.register.user_agent", + "voip.register.expires_s", + "voip.register.auth_failed", + "voip.register.response_code", + "voip.register.reason (registered|renewed|failed|deregister|expired)", + "sensor.id", + "sensor.ip" + ], + "name": "voip.register" + }, + { + "attributes": [ + "sip.response_code", + "sip.reason" + ], + 
"name": "voip.register.transaction" + }, + { + "attributes": [ + "keepalive.from", + "keepalive.to", + "keepalive.result (success|timeout|failed)", + "keepalive.rtt_ms", + "sip.response_code", + "sip.reason", + "sensor.id", + "sensor.ip" + ], + "name": "voip.keepalive" + }, + { + "attributes": [ + "voip.fraud.rule (rcc|scd|hfr|auth_flood)", + "voip.fraud.key", + "voip.fraud.value", + "voip.fraud.threshold", + "voip.fraud.window_s" + ], + "name": "voip.fraud.alert" + } + ] + }, + "trace_propagation": "No SIP header injection and no W3C trace-context propagation. The sniffer is strictly passive (read-only on the wire); it never injects X-Tracespan, traceparent, tracestate, or any channel var. The OTel trace_id is generated by the OTel SDK (go.opentelemetry.io/otel/sdk/trace) when EmitCDRSpan / EmitRegistrationSpan / EmitKeepaliveSpan / EmitFraudAlert call tracer.Start() — sampling is head-based via sdktrace.AlwaysSample (default) or TraceIDRatioBased (SNIFFER_OTEL_SAMPLE_RATIO). The SIP Call-ID is NOT used as the trace_id; it is only recorded as the `voip.call_id` span attribute and used as the Redis correlation key. Cross-sensor correlation is done entirely through the Redis-protocol datastore (call_id keys, endpoint pub/sub, flush_req pub/sub), not via trace context propagation. Only the sensor-sip emits the full span tree (voip.call + children); sensor-rtp units emit no CDR spans at all — they publish stats that the sensor-sip merges at span-close. So a call's trace is a single-service tree produced at one node, not a distributed trace spanning sensors. All spans are SpanKind INTERNAL.", + "tech_stack": "Language: Go (module vitoru.fun/sniffer). Capture: gopacket + AF_PACKET TPACKET_V3 zero-copy on Linux (github.com/google/gopacket, internal/infra/capture/live_afpacket.go), libpcap fallback. SIP/SDP parsing: a custom minimal fastParse (~10 allocs vs ~78 for sipgo) in internal/protocol/sip, plus pion/sdp/v3 for SDP. 
RTP/RTCP: custom parsers in internal/protocol/rtp and internal/protocol/rtcp. Datastore: any Redis-protocol (RESP) compatible server — Redis, Dragonfly (recommended for high-volume multi-core), or Valkey — via go-redis (REDIS_URL). Pipeline: in-process channels (rawCh 8192, parsedCh 8192) + sharded workers, no external streaming broker. Observability export: OpenTelemetry SDK (otlptracehttp + otlpmetrichttp to OTEL_EXPORTER_OTLP_ENDPOINT, default http://localhost:4317) -> OTLP collector -> Jaeger/Tempo (traces) + Prometheus (metrics); optional Grafana Pyroscope for continuous profiling. Recording spool: local disk -> optional S3-compatible upload (AWS/OCI/MinIO/Ceph). Packaging: .deb/.rpm, Docker images (Dockerfile.sniffer, Dockerfile.mcp). MCP server: separate Go binary (cmd/mcp) exposing tools/resources/prompts over HTTP transport. CI: Woodpecker. License: proprietary with offline grace (internal/platform/license).", + "notes": "Read real source: internal/platform/otel/{span,metrics,emitter,register,keepalive,fraud,provider}.go, internal/platform/catalog/{spans,metrics}.go, internal/protocol/{sip/parse,register/register,keepalive/keepalive}.go, plus docs/architecture.md, docs/observability.md, docs/config.md, README.md. GitHub code-search API returned empty results (likely unauthenticated quota), so the absence of traceparent/X-Trace/TextMapPropagator was confirmed by reading provider.go (no propagator setup, no SetTextMapPropagator) and the SIP parser (no header extraction/injection of trace context). Span/metric names and attribute lists are authoritative from catalog/spans.go and catalog/metrics.go (the single source of truth used by both instrumentation and the MCP catalog:// resources). 
The docs/observability.md table lists a slightly older metric set; the catalog file is the complete superset (adds sniffer.orphan.classified, sniffer.pcap.kernel_drops, sniffer.pcap.if_drops, sniffer.redis.write_dropped, sniffer.pipeline.watermark_*, sniffer.sip.filtered, sniffer.overload.state, and all sniffer.fraud.* config gauges). Span status: Ok for answered+2xx, Error with description \"SIP {code}\" (presumably the final response code) when final >= 400 or termination=failed. SIP message child spans form a consecutive waterfall [msg ts -> next msg ts]; ACK capped to 1 ms. RTP stream span timestamps are independent of SIP (expose early-media overlap). Service name default is \"sauron\" (OTEL_SERVICE_NAME)." + }, + "snifferCorr": { + "trace_id_source": "The OTel trace_id is NOT derived from SIP. It is a random 128-bit ID auto-generated by the OpenTelemetry SDK tracer (internal/platform/otel/provider.go builds a stock sdktrace.TracerProvider; no custom Sampler, no trace.SetSpanContext, no ID-generator override). In internal/platform/otel/span.go EmitCDRSpan calls tracer.Start(ctx, \"voip.call\", ...) with the plain context passed from RunEmitter, so the SDK mints a fresh trace_id per call. The SIP Call-ID is only attached as the span attribute `voip.call_id` (set in setIdentityAttrs). The catalog (internal/platform/catalog/spans.go) labels `voip.call_id` as \"Unique call identifier (trace_id)\", i.e. the sniffer treats the SIP Call-ID as the *logical* trace identifier a human uses to find the trace, but the wire OTel trace_id is unrelated to it. Cross-sensor correlation of the same call is done by SIP Call-ID + From/To/Via tags (calltable keys calls by string(msg.CallID); Redis cross-sensor mode in internal/infra/redis merges ExternalRTPStats by Call-ID), not by propagating an OTel trace context.", + "span_hierarchy": "Single root span per call: `voip.call` (catalog.SpanOpCall), timestamped [InviteTimeUS → EndTimeUS], SpanKindInternal, set in EmitCDRSpan (internal/platform/otel/span.go). 
It carries identity (voip.call_id, sensor.id/ip), participants (caller/callee number/domain/IP/UA + semconv client.address/server.address), timing (duration_s, connect_duration_s, ring_time_s, post_dial_delay_s, first_rtp_delay_s), SIP outcome (final_response, termination_cause, who_hung_up), flags (nat_detected, rtp_reordered), recording flags, and span status (Error on >=400/ReasonFailed). Under that root there are two sibling families of child spans, both created by passing the root span's ctx to tracer.Start (standard parent/child via SpanContext parentID — NOT span links):\n\n1) SIP message spans — emitSIPMessageSpans walks call.SIPMsgs in arrival order and emits one child per message: `voip.call.sip.request` (RespCode==0) or `voip.call.sip.response` (RespCode>0). Each span covers [msg.TimestampUS → next msg TimestampUS] (waterfall of SIP phases); the last message's span ends at call.EndTimeUS. ACK spans are capped to 1ms. Attributes: sip.method / sip.response_code + sip.cseq_method, sip.from, sip.to, sip.contact, sip.user_agent, sip.reason, and SDP audio ip/port/codec.\n\n2) RTP stream spans — emitRTPStreamSpans emits one `voip.rtp.stream` (catalog.SpanOpRTPStream) child per RTPStreamEntry (local) plus one per ExternalRTPStats (cross-sensor, merged via Redis). Each spans [FirstPacketUS → LastPacketUS], independently of SIP timing so early-media/183 overlap is visible. Attributes: rtp.side (\"a\"=caller / \"b\"=callee — derived from MediaEndpoints[epIdx].IsCaller with src/dst-match inversion in findOrCreateStream), rtp.ssrc, packets_received/lost, loss_pct, jitter avg/max, MOS avg + R-factor + MOS at JB 50/200/500ms, codec, payload_type, src_ip/dst_ip, sensor.ip, optional RTCP loss/jitter/rtt, optional rtp.silence_ratio.\n\nThere is no separate \"SIP transaction\" span layer keyed by Via/Branch; transactions are implicit in the per-message waterfall. 
Registration has its own root `voip.register` with `voip.register.transaction` children, and keepalive/fraud-alert are separate top-level ops — none of these are children of voip.call.", + "sip_rtp_relation": "SIP and RTP are correlated into the same trace purely through the SDP negotiated on the SIP dialog. Flow (internal/domain/calltable): ProcessSIP → handleINVITE → buildCall populates Call.MediaEndpoints from the SDP c=/m= audio lines (sdpFirstAudioMedia in state.go). addEndpointsToIndex registers each endpoint IPPort into a per-shard O(1) endpointIndex and a global sync.Map (endpointGlobalIndex) for cross-shard O(1) lookup, including NAT aliases (NATAliases maps private SDP IP → public packet-source IP seen on the wire, learned by observing that the packet source differs from the SDP c= IP). For every RTP packet, UpdateRTP → resolveRTPEndpoint looks up pkt.Src and pkt.Dst in the global index, locks only the owning shard, and findOrCreateStream keys the stream by SSRC inside that Call. The matched endpoint index + srcMatched bool determines rtp.side (IsCaller), with dst-match inverting the side. RTCP reception reports (UpdateRTCP) are matched the same way (by packet endpoint → call, then by SSRC inside the call) and feed frac-lost/jitter/RTT back into the matching RTPStreamEntry. So inside a trace the SIP message spans and RTP stream spans are siblings under the shared `voip.call` root; the SIP dialog provides the identity + endpoint map, and the RTP streams provide the quality telemetry. RTP timestamps are independent of SIP (span [FirstPacketUS → LastPacketUS]), so a 183 Session Progress with early media shows RTP-stream spans that start before the 200 OK SIP-response span. RTP that never matches a known endpoint becomes an \"orphan\" (orphanBuffer) and is NOT traced.", + "esl_integration": "No. There is zero FreeSWITCH ESL integration in the repo. GitHub code search over the repo for \"freeswitch\", \"ESL\", and \"event_socket\" returned 0 matches each. 
No file path contains esl/freeswitch/fscli. The sniffer is strictly a passive packet processor: internal/infra/capture uses afpacket/raw-socket live capture (live_afpacket.go) plus pcap read, feeds a pipeline (internal/app/pipeline) that runs protocol detection (internal/protocol/detect) and parses SIP (internal/protocol/sip), RTP (internal/protocol/rtp), RTCP (internal/protocol/rtcp). State is held in the in-memory calltable (internal/domain/calltable) and persisted to Redis (internal/infra/redis) and ClickHouse (docker/clickhouse) for CDRs, with OTel spans exported via OTLP (internal/platform/otel). It never opens the FreeSWITCH Event Socket, never subscribes to CHANNEL_* / CALL_* events, never issues `api` commands, and never reads the spool. The local Genesis/CLAUDE.md describes an ESL library (the `genesis` package) but that is a sibling project, not the sniffer source.", + "missing_routing_info": [ + "Dialplan: which context/extension/condition block matched, and the ordered list of applications FreeSWITCH executed (set, bridge, playback, transfer, park, voicemail, IVR menu choices, queue routing, follow-me, ring-group selection).", + "Bridge a-leg / b-leg linking: FreeSWITCH creates two channels with distinct Channel-UUIDs for a bridge; sniffer keys on SIP Call-ID and sees two separate SIP dialogs with no explicit parent/child tie. 
The CHANNEL_BRIDGE event / Bridge-UUID / `Other-Leg` unique-id that links the legs lives only in ESL events, not in packets.", + "Transfers: blind (REFER) and especially attended transfers create new channels/dialogs; the post-transfer dialplan target and the linkage between the original and transferred leg are invisible from packets alone (and REFER may be absorbed inside FS without surfacing as a new SIP dialog on the sniffed interface).", + "FreeSWITCH Channel-UUID and channel variables: sniffer identifies calls by SIP Call-ID + From/To/Via tags, never by FS Channel-UUID; variables like `sip_from_user`, `destination_number`, `transfer_source`, `bridge_to`, `hangup_cause` (Q.850 cause code), `progress_uv`/`progress_time` are not on the wire.", + "Digit manipulation / translation: the called number after regex/translation rules and the actual egress destination may differ from the SIP R-URI/To visible on the sniffed leg.", + "Call direction from the switch's perspective (inbound vs outbound leg, which side is the FS-controlled channel) — sniffer only knows caller/callee IP/number from SIP headers.", + "Authentication outcome: 401/407 challenges are observed (applyChallenge in state.go extends DestroyAt), but whether the digest auth succeeded/failed and which user authenticated is not in the SIP response.", + "Transcoding and codec negotiation decisions on the FS side (announced codecs come from SDP, but whether FS transcoded PCMU↔opus is an internal FS fact).", + "Hangup cause code (Q.850) and who initiated teardown inside FS vs on the wire — sniffer infers WhoHungUp by comparing the BYE source IP to caller/callee IP (inferWhoHungUp), which is an approximation and returns HungUpUnknown when the BYE comes from the switch itself.", + "Call grouping/queue/agent identity: which agent in a call queue answered, ACD/queue wait time, park orbit pickups, intercept/resume, and conference-room membership — all FS application state.", + "Re-INVITE intent 
(hold/resume/codec-change/T.38 fax) is seen as a dialog update but the FS-side reason (channel hold variable, media renegotiation cause) is not on the wire." + ], + "notes": "Key source files read (all absolute repo paths under https://github.com/Otoru/sniffer): internal/platform/otel/span.go (root/child span emission, hierarchy), internal/platform/otel/emitter.go (RunEmitter consumes CDREvent → EmitCDRSpan), internal/platform/otel/provider.go (stock OTel SDK, no trace_id derivation), internal/platform/catalog/spans.go (span op names + attribute catalog, `voip.call_id` described as \"Unique call identifier (trace_id)\"), internal/domain/calltable/associate.go (RTP↔call correlation by SDP endpoint index + NAT aliases + SSRC), internal/domain/calltable/state.go (SIP FSM, SDP endpoint extraction), internal/domain/calltable/types.go (Call/SIPMsg/RTPStreamEntry/MediaEndpoint structs), internal/domain/calltable/table.go (ProcessSIP, sharded CallTable keyed by SIP Call-ID). No ESL/FreeSWITCH code exists: gh code search for freeswitch/ESL/event_socket each returned total_count 0, and no path under the tree contains those terms. The sniffer is purely passive capture + SIP/RTP/RTCP parsing → calltable → CDR → OTel spans (OTLP) + Redis + ClickHouse. Span linkage is standard OTel parent/child via the root span's context (tracer.Start with the root ctx), not span Links; there is exactly one root per call and one trace per call. The `voip.call_id` attribute is the SIP Call-ID used as the human-facing trace lookup key; the actual OTel trace_id is SDK-random and is not propagated across sensors — cross-sensor call correlation is by SIP Call-ID via Redis (ExternalRTPStats merged into the CDREvent before emission), not by shared trace context." 
+ }, + "freeswitch": { + "events": [ + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Channel-Name", + "Channel-State", + "Channel-Call-State", + "Call-Direction", + "Caller-Username", + "Caller-Dialplan", + "Caller-Caller-ID-Name", + "Caller-Caller-ID-Number", + "Caller-Destination-Number", + "Caller-Network-Addr", + "Caller-ANI", + "Caller-Context", + "Caller-Unique-ID", + "Answer-State=ringing" + ], + "name": "CHANNEL_CREATE", + "routing_relevance": "Fired by switch_core_state_machine.c at CS_INIT when a session is created. This is the birth-of-call event for both inbound (new INVITE) and outbound (originate) legs and is the first opportunity to register a channel handler keyed by Unique-ID. Call-Direction tells you a-leg vs b-leg origin; Caller-Destination-Number + Caller-Context drive dialplan routing decisions. Unique-ID here is the per-leg session UUID you must capture to correlate every subsequent event on this leg." + }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Channel-State=CS_RINGING", + "Channel-Call-State=CCS_RINGING", + "Answer-State=ringing", + "Caller-*", + "Other-Leg-* (if originated)" + ], + "name": "CHANNEL_PROGRESS", + "routing_relevance": "Fired by switch_channel.c when a channel enters ringing (SIP 180 Ringing). Marks the start of alerting on a leg. For tracing, this is the 'ringing started' timestamp; for routing it signals that the leg is being alerted and a ringback may be played to the upstream leg. Paired with CHANNEL_PROGRESS_MEDIA for early-media (183) cases." + }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Channel-State=CS_RINGING", + "Channel-Call-State=CCS_EARLY", + "Answer-State=early", + "Channel-Read-Codec-Name", + "Channel-Write-Codec-Name", + "Caller-*", + "Other-Leg-*" + ], + "name": "CHANNEL_PROGRESS_MEDIA", + "routing_relevance": "Fired on early media (SIP 183 Session Progress with SDP). 
Critical for distinguishing 'ringing without media' from 'ringing with early media' (in-band ringback/announcements from carrier). Drives the upstream leg's ringback generation and is essential for accurate call-quality tracing because media flows before ANSWER. Codec headers here are the first real negotiated codec for the leg." + }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Channel-State=CS_EXECUTE", + "Channel-Call-State=CCS_ACTIVE", + "Answer-State=answered", + "Channel-Read-Codec-Name/Rate", + "Channel-Write-Codec-Name/Rate", + "Caller-*", + "Other-Leg-*" + ], + "name": "CHANNEL_ANSWER", + "routing_relevance": "Fired by switch_channel.c when the leg is answered (SIP 200 OK). This is the call-connected moment: media becomes two-way, billing/CDR timers start, and any post-answer dialplan execution begins. For routing it is the gate for executing on_answer hooks; for tracing it is the canonical answer timestamp and the point where the sniffer should expect bidirectional RTP." + }, + { + "key_fields": [ + "Unique-ID (firing leg)", + "Bridge-A-Unique-ID", + "Bridge-B-Unique-ID", + "Channel-Call-UUID", + "Other-Type (originator/originatee)", + "Other-Leg-Unique-ID", + "Other-Leg-Caller-ID-Number", + "Other-Leg-Destination-Number", + "Caller-*" + ], + "name": "CHANNEL_BRIDGE", + "routing_relevance": "Fired by switch_ivr_bridge.c when two legs are bridged (two-party). Bridge-A-Unique-ID is the originating session and Bridge-B-Unique-ID is the peer session UUID. This is THE event for a-leg/b-leg correlation: it links the two Unique-IDs and also sets the peer channel's call_uuid to the originator's UUID, so both legs subsequently share the same Channel-Call-UUID. Other-Leg-* (from the originator/originatee caller profile) gives the partner leg's identity for transfer/intercept logic." 
+ }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Bridge-A-Unique-ID (context dependent)", + "Other-Leg-Unique-ID", + "Other-Type", + "Hangup-Cause (sometimes)" + ], + "name": "CHANNEL_UNBRIDGE", + "routing_relevance": "Fired by switch_ivr_bridge.c when a bridge is torn down (either leg hangs up, transfer, or application break). Marks the end of two-party media flow. Essential for tracing bridge duration and for detecting mid-call transfers (unbridge followed by a new bridge on the surviving leg) vs hangup. Pair with CHANNEL_BRIDGE to compute talk time." + }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Hangup-Cause", + "Answer-State=hangup", + "Channel-State=CS_HANGUP", + "Caller-*", + "Other-Leg-*" + ], + "name": "CHANNEL_HANGUP", + "routing_relevance": "Fired by switch_channel.c when a channel enters hangup. Carries the normalized Hangup-Cause string (e.g. NORMAL_CLEARING, USER_BUSY, NO_ANSWER, ORIGINATOR_CANCEL). This is the primary event for call disposition classification and for tearing down per-channel handlers. Fires per-leg, so for a two-leg call you get two CHANNEL_HANGUP events keyed by each Unique-ID." + }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Hangup-Cause", + "hangup_cause_q850 (variable)", + "Channel-Name", + "Caller-*", + "Other-Leg-*", + "variable_* (full channel variables when verbose)", + "CDR-Attached=xml (optional, body=XML CDR)" + ], + "name": "CHANNEL_HANGUP_COMPLETE", + "routing_relevance": "Fired by switch_core_state_machine.c at CS_HANGUP_COMPLETE after all cleanup/hooks/CDR generation. This is the definitive end-of-call event and the richest one: it includes the Q.850 cause code and (optionally, when hangup_complete_with_xml=true) the full XML CDR in the event body. Use this as the commit point for CDR/trace closure because all channel variables, times, and cause codes are final. The single best event for closing a call trace." 
+ }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Direction=RECV", + "Bridged-To (partner uuid)", + "Caller-Caller-ID-Name", + "Caller-Caller-ID-Number", + "Caller-Orig-Caller-ID-Name/Number", + "Caller-Transfer-Source" + ], + "name": "CALL_UPDATE", + "routing_relevance": "Fired by switch_channel.c when caller ID / connected line is flipped (SIP UPDATE/re-INVITE changing CID, e.g. on transfer or redirect). Carries Bridged-To (the partner UUID from switch_channel_get_partner_uuid) and the pre/post CID. Critical for tracing caller-ID mutations and transfer mid-call: a CALL_UPDATE followed by a CHANNEL_UNBRIDGE+CHANNEL_BRIDGE on new UUIDs indicates an attended/blind transfer." + }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Channel-State=CS_PARK", + "Channel-Call-State", + "Caller-*" + ], + "name": "CHANNEL_PARK", + "routing_relevance": "Fired by switch_ivr.c when the 'park' application parks a channel. Marks a leg as held in the park subsystem awaiting a bridge (e.g. during attended transfer or valet parking). For routing it indicates the leg is no longer in a normal bridge but is waiting; for tracing it explains a gap in media flow." + }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID", + "Channel-State", + "Caller-*" + ], + "name": "CHANNEL_UNPARK", + "routing_relevance": "Fired by switch_ivr.c when a parked channel is unparked (retrieved/bridged). Pairs with CHANNEL_PARK to bound park duration." + }, + { + "key_fields": [ + "Unique-ID", + "Channel-State", + "Channel-State-Number", + "Channel-Call-State" + ], + "name": "CHANNEL_STATE", + "routing_relevance": "Fired on raw FSM state transitions (CS_INIT/CS_ROUTING/CS_EXECUTE/CS_HANGUP etc.). Lower-level than the semantic events (CREATE/ANSWER/HANGUP); useful for tracing exact state-machine progress and detecting stuck channels, but redundant for most routing logic." 
+ }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-State (DOWN/EARLY/ACTIVE/RINGING/HOLD/RESET/HANGUP etc.)" + ], + "name": "CHANNEL_CALLSTATE", + "routing_relevance": "Fired on call-state (CCS_*) transitions, which track the logical call lifecycle independent of the FSM state. Useful for tracing the high-level 'is this leg conceptually ringing/active/hold' view." + }, + { + "key_fields": [ + "Unique-ID", + "Channel-Call-UUID" + ], + "name": "CHANNEL_DESTROY", + "routing_relevance": "Fired by switch_core_session.c at session destruction (after HANGUP_COMPLETE). The final cleanup signal; use it to deregister per-channel handlers and free trace buffers keyed by Unique-ID." + }, + { + "key_fields": [ + "Unique-ID", + "DTMF-Digit", + "DTMF-Duration", + "DTMF-Source (RTP/INBAND_AUDIO/ENDPOINT/APP/UNKNOWN)", + "Channel-Call-UUID", + "Caller-*" + ], + "name": "DTMF", + "routing_relevance": "Fired by switch_channel.c (and switch_ivr_async.c for inband detection) on every received/injected digit. DTMF-Source distinguishes RFC2833 RTP telephone-event, inband audio-detected, endpoint-signaled, and app-injected digits. Drives IVR menu routing and digit-collection apps; for tracing it explains mid-call media interaction and is the basis for transfer/feature digit sequences." + }, + { + "key_fields": [ + "Unique-ID", + "channel-read-codec-name", + "channel-read-codec-rate", + "channel-read-codec-bit-rate", + "channel-reported-read-codec-rate", + "Channel-Write-Codec-Name/Rate/Bit-Rate", + "Channel-Call-UUID" + ], + "name": "CODEC", + "routing_relevance": "Fired by switch_core_codec.c whenever the read or write codec changes (renegotiation, transcoder insertion, codec switch). For tracing this is the authoritative codec timeline per leg and is essential for correlating RTP stream identity (payload type/codec) in the sniffer with the ESL call timeline. Mismatched a-leg/b-leg codec events indicate transcoding." 
+ }, + { + "key_fields": [ + "Unique-ID", + "Playback-File-Path", + "Playback-File-Type (local_stream/tone_stream/file)", + "Channel-Call-UUID", + "Caller-*" + ], + "name": "PLAYBACK_START", + "routing_relevance": "Fired by switch_ivr_play_say.c when playback begins. Identifies the media being played (file path, tone stream, local stream). For routing it marks IVR announcements/ringback; for tracing it explains one-way media periods (e.g. ringback tone is being generated) so the sniffer does not misclassify them as mute." + }, + { + "key_fields": [ + "Unique-ID", + "Playback-File-Path", + "Playback-File-Type", + "Playback-Status (done/break)", + "Channel-Call-UUID" + ], + "name": "PLAYBACK_STOP", + "routing_relevance": "Fired when playback ends; Playback-Status=break vs done distinguishes user-interrupted playback from natural completion. Bounds the playback media period for tracing." + }, + { + "key_fields": [ + "Unique-ID", + "Record-File-Path", + "Channel-Call-UUID", + "Caller-*" + ], + "name": "RECORD_START", + "routing_relevance": "Fired by switch_ivr_async.c / switch_ivr_play_say.c when recording begins. The file path ties the recording artifact to the call leg; for tracing it marks the start of recorded media (and implies a media bug is tapping the stream)." + }, + { + "key_fields": [ + "Unique-ID", + "Record-File-Path", + "Record-Completion-Cause", + "Channel-Call-UUID" + ], + "name": "RECORD_STOP", + "routing_relevance": "Fired when recording stops; Record-Completion-Cause gives the reason (e.g. silence, timeout, manual stop). Bounds the recording window for trace correlation and CDR." + }, + { + "key_fields": [ + "Unique-ID", + "Event-Subclass=sofia::transferor|sofia::transferee", + "Other-Leg-Unique-ID", + "Channel-Call-UUID", + "sofia_profile_name", + "Caller-*" + ], + "name": "CUSTOM (sofia::transferor / sofia::transferee)", + "routing_relevance": "Fired by mod_sofia (sofia.c) on SIP transfer (REFER/Replaces). 
transferor = the leg initiating transfer; transferee = the leg being transferred. These are the SIP-level transfer correlation events and are critical for distinguishing a real transfer from a hangup: they precede the CHANNEL_UNBRIDGE/CHANNEL_BRIDGE pair and carry the partner UUID." + }, + { + "key_fields": [ + "Unique-ID", + "Event-Subclass=sofia::reinvite|sofia::replaced", + "Channel-Call-UUID", + "sofia_profile_name" + ], + "name": "CUSTOM (sofia::reinvite / sofia::replaced)", + "routing_relevance": "Fired on SIP re-INVITE (codec/hold/resume renegotiation) and Replaces (call substitution). For tracing, reinvite explains media IP/codec changes mid-call; replaced explains one call supplanting another (attended transfer target). Correlate to RTP IP/port changes in the sniffer." + }, + { + "key_fields": [ + "Event-Subclass=callcenter::info", + "CC-Queue", + "CC-Action (member-queue-end/agent-offering/bridge-agent-start/bridge-agent-end/bridge-agent-fail/agent-state-change/members-count etc.)", + "CC-Count", + "CC-Selection", + "CC-Agent", + "CC-Member-UUID", + "Unique-ID (member/agent leg)" + ], + "name": "CUSTOM (callcenter::info)", + "routing_relevance": "Fired by mod_callcenter for queue routing lifecycle: member queued, agent offered, agent bridged to member, bridge end/fail. CC-Member-UUID ties the caller leg to the queue record; CC-Agent identifies the agent leg. This is the authoritative event stream for ACD/contact-center routing traces and for correlating which agent leg answered which member leg." 
+ }, + { + "key_fields": [ + "Event-Subclass=conference::maintenance|conference::cdr", + "Conference-Name", + "Conference-Profile", + "Action (add-member/del-member/mute-member/kick-member/transfer/start-talking/stop-talking/play-file etc.)", + "Member-ID", + "Unique-ID (member leg)", + "Old-Member-ID" + ], + "name": "CUSTOM (conference::maintenance / conference::cdr)", + "routing_relevance": "Fired by mod_conference for every member/energy/DTMF/mute/talk event in a conference. Unique-ID + Member-ID tie each ESL leg to its conference slot; Action tracks join/leave/mute/talk/transfer. For routing this is the multi-party bridge correlation (vs two-party CHANNEL_BRIDGE) and is essential for tracing conference calls and conference-originated transfers." + }, + { + "key_fields": [ + "Event-Subclass=valet_parking::info", + "Valet-Lot-Name", + "Valet-Extension", + "Action (bridge/timeout etc.)", + "Bridge-To-UUID", + "Unique-ID" + ], + "name": "CUSTOM (valet_parking::info)", + "routing_relevance": "Fired by mod_valet_parking when a parked caller is bridged to a retrieving party. Bridge-To-UUID links the parked leg to the retriever leg. This is the park/retrieve correlation event for valet-style call parking." + }, + { + "key_fields": [ + "Event-Subclass=sofia::register|sofia::unregister|sofia::expire|sofia::gateway_state", + "from-user", + "from-host", + "to-user", + "contact", + "expires", + "sip_to_host", + "sip-from-host", + "Gateway-Name (for gateway_state)", + "State (for gateway_state)" + ], + "name": "CUSTOM (sofia::register / unregister / expire / gateway_state)", + "routing_relevance": "Registration/trunk-health events, not call-routing per se, but they gate outbound routing: a failed registration or REGED->DOWN gateway_state transition explains why subsequent outbound origination attempts fail. Useful as a pre-condition signal in routing traces." + } + ], + "channel_uuid_fields": "\"Unique-ID: the per-leg session UUID. 
Set on every channel event by switch_channel_event_set_data from switch_core_session_get_uuid(channel->session). This is the primary per-leg key; a two-leg call has two different Unique-IDs. Always present on channel events. Capture this first to key handlers (the Genesis '{uuid}:{event_name}' channel_registry pattern).\\n\\nChannel-Call-UUID: the CALL-level (not leg-level) correlation ID. In switch_channel.c it is taken from the channel variable 'call_uuid' if set, else falls back to the session UUID. At channel creation (switch_core_state_machine.c) call_uuid is initialised to the leg's own session UUID. At bridge time (switch_ivr_bridge.c lines 1446/1555/1684/1877) the PEER channel's call_uuid is overwritten with the ORIGINATING session's UUID. Therefore after a bridge both legs carry the SAME Channel-Call-UUID == the originating (a-leg) session UUID. Use Channel-Call-UUID to group all events of a single logical call across both legs and across transfers; use Unique-ID to address a single leg. Caveat: on a transfer that creates a new b-leg the call_uuid may roll to the new originator, so re-evaluate at each CHANNEL_BRIDGE.\\n\\nvariable_call_uuid: the raw 'call_uuid' channel variable surfaced in the variable_* namespace when verbose/extended event data is enabled. It is the same value as the Channel-Call-UUID header but lives under the channel-variable dump. Useful when you only have the variable payload (e.g. CHANNEL_DATA / CHANNEL_HANGUP_COMPLETE with verbose events).\\n\\nOther-Leg-Unique-ID: produced by switch_caller_profile_event_set_data with the prefix 'Other-Leg' when the channel has an originator_caller_profile (Other-Type=originator) or originatee_caller_profile (Other-Type=originatee). It is the UUID of the bridged partner leg as recorded in the caller profile. Present on bridge-related and hangup-related channel events. 
This is the standard a-leg<->b-leg link on per-leg events (as opposed to Bridge-A/B-Unique-ID which only appears on CHANNEL_BRIDGE).\\n\\nBridge-A-Unique-ID / Bridge-B-Unique-ID: explicit headers added only on CHANNEL_BRIDGE. Bridge-A-Unique-ID = switch_core_session_get_uuid(session) (the firing/originating leg); Bridge-B-Unique-ID = the peer session UUID passed in the bridge message. Use these to authoritatively link the two legs at the bridge instant.\\n\\nBridged-To: header on CALL_UPDATE, value = switch_channel_get_partner_uuid(channel). The current partner leg UUID at the moment of a CID flip / transfer. Use for transfer correlation.\\n\\nCaller-Unique-ID: from the Caller-* caller-profile dump (prefix 'Caller'), equals caller_profile->uuid, which is the channel's own UUID (same value as Unique-ID). Provided for symmetry with Other-Leg-Unique-ID.\\n\\nThere is NO header literally named 'Bridge-Other-Leg' in the upstream source. The b-leg correlation is carried by Other-Leg-Unique-ID (on per-leg events) and Bridge-A/B-Unique-ID (on CHANNEL_BRIDGE). Sniffer correlation: tie RTP streams to ESL legs by matching the SIP Call-ID at CHANNEL_CREATE (mod_sofia puts it in variable_sip_call_id when verbose) to the sniffer's SIP Call-ID, then use Unique-ID for per-leg and Channel-Call-UUID for call-wide grouping.\"", + "notes": "Source of truth: signalwire/freeswitch HEAD. 
Key files read directly: src/switch_event.c (EVENT_NAMES table, lines 138-237 — the canonical event-name list), src/include/switch_types.h (switch_event_types_t enum + doc comments, lines 1985-2090), src/switch_channel.c (switch_channel_event_set_data lines 2659-2755 — the standard channel header set applied to every channel event; CHANNEL_HANGUP 3447, CHANNEL_PROGRESS 3507, CHANNEL_PROGRESS_MEDIA 3562, CHANNEL_ANSWER 3848, CALL_UPDATE 3279; DTMF 678-705), src/switch_caller.c (switch_caller_profile_event_set_data 322-410 — Caller-*/Other-Leg-* field generation), src/switch_ivr_bridge.c (CHANNEL_BRIDGE 1377 with Bridge-A/B-Unique-ID; CHANNEL_UNBRIDGE 1326/1481/1494/1879; call_uuid propagation 1446/1555/1684/1877), src/switch_core_state_machine.c (CHANNEL_CREATE 626; CHANNEL_HANGUP_COMPLETE 943; call_uuid init 180/232/327), src/switch_core_session.c (CHANNEL_DESTROY 1584), src/switch_core_codec.c (CODEC event 189/300/471/531/579), src/switch_ivr_play_say.c (RECORD_START 770, RECORD_STOP 1033, PLAYBACK_START 1649, PLAYBACK_STOP 2023), src/switch_ivr_async.c (inband DTMF 3931, RECORD_* 1241/1482), src/switch_ivr.c (CHANNEL_PARK 1002, CHANNEL_UNPARK 1213), mod_callcenter.c (#define CALLCENTER_EVENT \\\"callcenter::info\\\"; CC-Action values enumerated), mod_conference (CONF_EVENT_MAINT=\\\"conference::maintenance\\\", CONF_EVENT_CDR=\\\"conference::cdr\\\"; Action values enumerated), mod_valet_parking.c (VALET_EVENT=\\\"valet_parking::info\\\"), mod_sofia.h (sofia::* subclass #defines lines 84-110).\\n\\nEvents the prompt asked for that do NOT exist in upstream FreeSWITCH as ESL events (verified by grep of the full src tree):\\n- RING_BACK: there is no SWITCH_EVENT_RING_BACK in the enum, EVENT_NAMES table, or any source file. Ringback is generated by the 'ringback'/'playback' application and surfaces to ESL as CHANNEL_PROGRESS (CS_RINGING) for SIP 180, CHANNEL_PROGRESS_MEDIA for SIP 183 early-media, and PLAYBACK_START/STOP when a ringback file/tone_stream is played. 
Use PLAYBACK_* + CHANNEL_PROGRESS_MEDIA to trace ringback.\\n- 'fire_call': not an ESL event subclass. The only occurrence in the repo is a ChangeLog line ('Fix conference fire-call') referencing the conference auto-outcall feature; it surfaces as conference::maintenance CUSTOM events with Action values (no 'fire-call' Action string exists in mod_conference either). If you meant a specific custom subclass from your sniffer/Genesis layer, it is not part of upstream FreeSWITCH.\\n- 'dial::' and 'transfer::' and 'park::' as CUSTOM subclasses: no source file reserves or fires these. The closest upstream equivalents are: for transfer, the sofia::transferor / sofia::transferee CUSTOM subclasses plus the 'transfer' Action within conference::maintenance; for park, the core CHANNEL_PARK/CHANNEL_UNPARK events plus the valet_parking::info CUSTOM subclass. The 'transfer::intercept' string in your local freeswitch/conf/vanilla/dialplan/default.xml is a bind_meta_app DTMF-meta-app binding argument, not an ESL event subclass.\\n\\nAdditional events worth tracing not in the prompt's minimum list: CHANNEL_ORIGINATE (leg originated via originate command), CHANNEL_EXECUTE / CHANNEL_EXECUTE_COMPLETE (dialplan app execution, carries Application/App-Data and Application-UUID — useful for tracing which app ran on a leg), CHANNEL_UUID (UUID changed mid-call), CHANNEL_HOLD/CHANNEL_UNHOLD, MEDIA_BUG_START/MEDIA_BUG_STOP (recording/tap/RTCP tap installed — correlates to sniffer media-bug taps), CALL_DETAIL (mod_calldetail CDR event when present).\\n\\nStandard channel headers set on EVERY channel event by switch_channel_event_set_data (in addition to event-specific ones): Channel-State, Channel-Call-State, Channel-State-Number, Channel-Name, Unique-ID, Session-External-ID, Call-Direction, Presence-Call-Direction, Channel-HIT-Dialplan, Channel-Presence-ID, Channel-Presence-Data, Presence-Data-Cols (+ PD-* cols), Channel-Call-UUID, Answer-State (ringing|early|answered|hangup), Hangup-Cause 
(when set), Channel-Read/Write-Codec-Name/Rate/Bit-Rate, Caller-* (full caller profile), Other-Type + Other-Leg-* (when an originator/originatee profile exists). When verbose events are enabled globally or per-channel (CF_VERBOSE_EVENTS) or for the event IDs listed in switch_channel_event_set_extended_data (which includes CREATE, ANSWER, BRIDGE, UNBRIDGE, PROGRESS, PROGRESS_MEDIA, HANGUP, HANGUP_COMPLETE, CALL_UPDATE, PLAYBACK_*, RECORD_*, CUSTOM, etc.), the full channel variable dump is also attached as variable_* and scope_variable_* headers — this is where variable_sip_call_id, variable_sip_to_user, variable_hangup_cause_q850, variable_call_uuid etc. come from.\"" + } +} diff --git a/docs/esl-sniffer-traces-plan.md b/docs/esl-sniffer-traces-plan.md new file mode 100644 index 0000000..eff8932 --- /dev/null +++ b/docs/esl-sniffer-traces-plan.md @@ -0,0 +1,456 @@ +# Plano de Ação: Integração Sniffer + Genesis ESL para Traces Completos de Chamada + +## 1. Resumo executivo + +O objetivo é produzir **traces distribuídos completos de chamada** que unem a **camada de controle** (FreeSWITCH ESL, consumida pela biblioteca Genesis) e a **camada de captura** (sinalização SIP e mídia RTP/RTCP, observada passivamente pelo sniffer Otoru/sniffer), com **informação de roteamento** (dialplan, contexto, destino, bridge legs, transferências, ring groups, balanceador). 
+ +Hoje os dois sistemas operam em silos observacionais: +- O **Genesis** emite spans OTel `process_event`, `send_command`, `channel.*` (answer/park/hangup/bridge/playback/say/play_and_get_digits), `channel.create`, `channel.wait`, `channel.dtmf.received`, `inbound_connect`, `outbound_handle_connection`, `ring_group.ring`, `queue.wait_and_acquire` e ~20 métricas, mas **não cobre** o ciclo de vida semântico do canal FreeSWITCH (`CHANNEL_PROGRESS`, `CHANNEL_BRIDGE`, `CHANNEL_UNBRIDGE`, `CALL_UPDATE`, `CODEC`, `PLAYBACK_*`, `RECORD_*`, transferências sofia::transferor/transferee) e **não propaga trace_context** entre ESL e SIP. +- O **sniffer** produz o span raiz `voip.call` com filhos `voip.call.sip.request/response`, `voip.rtp.stream`, `voip.register`, `voip.keepalive`, `voip.fraud.alert`, e ~50 métricas de qualidade/fraude, mas **não tem integração ESL** (0 matches para freeswitch/ESL/event_socket no repositório) e seu `trace_id` é **aleatório**, não derivado do SIP Call-ID nem propagado para o FreeSWITCH. + +A proposta é (todas as mudanças são no Genesis e na configuração do FreeSWITCH — **sem nenhuma alteração no sniffer**): +1. **Adicionar ao Genesis** spans/métricas que cobrem o lifecycle semântico do canal FreeSWITCH e a informação de roteamento (dialplan/contexto/destino/bridge/transfer/ring group/balanceador), anexando `Channel-Call-UUID`, `Other-Leg-Unique-ID`, `Bridge-A/B-Unique-ID`, `Caller-Context`, `Caller-Destination-Number`, `Application`/`Application-Data`, `Hangup-Cause` (Q.850) e, principalmente, **`sip.call_id`** (= `variable_sip_call_id`) como atributo de span em todos os spans de canal. +2. **Correlacionar Genesis ↔ sniffer por atributo compartilhado**, não por propagação de `trace_id`: o sniffer **já** emite `voip.call_id` = SIP Call-ID em seus spans e já correlaciona chamadas por essa chave no Redis. 
Ao colocar o mesmo `sip.call_id` em todos os spans de controle do Genesis, o **join** entre o trace de controle (Genesis) e o trace de captura (sniffer) passa a acontecer **no backend de observabilidade** (Grafana/Tempo) via query por atributo — sem qualquer mudança no sniffer. Métricas correlacionam-se a traces via **exemplars OTel** (o SDK anexa o `trace_id` do span corrente como exemplar ao registrar a métrica). +3. **Correlacionar a-leg/b-leg** dentro do trace do Genesis via `Other-Leg-Unique-ID` / `Bridge-A-Unique-ID` / `Bridge-B-Unique-ID`, agrupando tudo sob `Channel-Call-UUID`. + +## 2. Estado atual + +### 2.1 Genesis — OTel spans (resumo) + +Spans existentes (todos via `tracer.start_as_current_span`): +- `process_event` — `genesis/protocol/base.py:201` — envolve apenas metrics+logging; **dispatch e routing rodam FORA do span**. +- `send_command` — `genesis/protocol/base.py:290` — `command.name` = **string crua do comando** (alta cardinalidade), sem `record_exception` no caminho `-ERR`. +- `channel.create` — `genesis/channel.py:144` — registra `channel.dial_path`, `channel.uuid`, `channel.create.duration`, status ERROR. +- `channel.wait` — `genesis/channel.py:415` — `wait.target`, `wait.timeout`, `wait.type`, `wait.result`, `wait.duration`. +- `channel.answer`, `channel.park`, `channel.hangup`, `channel.bridge`, `channel.playback`, `channel.say`, `channel.play_and_get_digits` — produzidos pelo helper genérico `_execute_operation` em `genesis/channel.py:505`; cada um com `channel.<op>.success`, `channel.<op>.duration`. +- `channel.dtmf.received` — `genesis/channel.py:796` — `dtmf.digit`, `dtmf.handled`. +- `inbound_connect` — `genesis/inbound.py:97` — `net.peer.name/port`. +- `outbound_handle_connection` — `genesis/outbound.py:156`. +- `ring_group.ring` — `genesis/group/ring.py:138` — `ring_group.mode/size/timeout/result/answered_uuid/answered_dial_path`, mas **não chama `set_status(ERROR)`** no caminho de exceção. 
+- `queue.wait_and_acquire` — `genesis/queue/core.py:76`. + +Gaps críticos (do mapeamento): +- **Sem spans em `genesis/session.py`** (sendmsg, lifecycle da Session não instrumentados). +- **Sem spans em `genesis/consumer.py`**. +- **Sem spans para handler dispatch / `routing_strategy.route()` / loop `consume()` / loop `handler()`**. +- **Sem `span.add_event()` em todo o genesis/** — zero span events registrados. +- `process_event` **não carrega** `Call-Direction`, `Hangup-Cause`, `Answer-State`, `Channel-State`, `Event-Subclass` (aparecem só em métricas via `build_metric_attributes`). +- `send_command` não registra erro de span no reply `-ERR`. +- Duplicação de definições de métricas entre `genesis/protocol/metrics.py` e `genesis/channel.py` ("to avoid circular imports"). +- `call_duration_histogram.record` em `channel.py:573` **sem atributos** (sem UUID/cause). + +### 2.2 Genesis — OTel metrics (resumo) + +20 instrumentos: 13 counters, 5 histograms, 2 up_down_counters, 0 observable/gauge. +Relevantes: `genesis.commands.sent/duration/errors`, `genesis.events.received`, `genesis.channel.operations`, `genesis.channel.operation.duration`, `genesis.channel.hangup.causes`, `genesis.channel.bridge.operations`, `genesis.channel.dtmf.received`, `genesis.call.duration`, `genesis.timeouts`, `genesis.channel.routing.hits`, `genesis.channel.routing.fallback`, `genesis.connections.active/errors`, `genesis.ring_group.operations/duration/results`, `genesis.queue.operations/wait_duration`. + +Gaps: +- **Sem gauge de chamadas ativas por estado**, sem gauge de profundidade de fila de comandos/events. +- `genesis.connections.errors` **só existe em `inbound.py`**; Outbound não tem. +- **Sem métricas em `session.py`, `consumer.py`, `group/load_balancer.py`**. +- `call.duration` sem atributos → não particionável por canal/cause. 
+ +### 2.3 Sniffer — sinais atuais + +- Span raiz `voip.call` (catálogo `SpanOpCall`) com filhos `voip.call.sip.request`, `voip.call.sip.response` (waterfall por mensagem SIP), `voip.rtp.stream` (por SSRC, lado a/b), `voip.register` + `voip.register.transaction`, `voip.keepalive`, `voip.fraud.alert`. +- Métricas: `voip.calls.total/answered/failed/timeout/muted/one_way_audio/active`, `voip.call.duration_s/mos/jitter_ms/loss_pct/silence_ratio`, `voip.rtp.streams.active`, `voip.keepalives.total/rtt_ms`, `voip.registrations.active`, `sniffer.packets.dropped`, `sniffer.fraud.*`, `sniffer.pipeline.watermark_*`, `process.*`. +- **`trace_id` é aleatório** (SDK `AlwaysSample` ou `TraceIDRatioBased`); `voip.call_id` (SIP Call-ID) é só atributo de lookup humano, **não é o trace_id**. +- **Sem propagador W3C** (`provider.go` não chama `SetTextMapPropagator`, parser SIP não extrai/injeta `traceparent`/`X-Tracespan`). +- **Sem integração ESL**: 0 matches para freeswitch/ESL/event_socket. +- Correlação cross-sensor por **SIP Call-ID via Redis** (`voip:call:{call_id}`, `voip:ep:{ip}|{port}`), não por trace context. +- Service name default `"sauron"` (`OTEL_SERVICE_NAME`). 
+ +### 2.4 Sobreposição e divergência + +| Dimensão | Genesis (ESL) | Sniffer (SIP/RTP) | Convergência | +|---|---|---|---| +| Identidade de chamada | `Unique-ID` (per-leg), `Channel-Call-UUID` (call-wide) | `voip.call_id` = SIP `Call-ID` | `variable_sip_call_id` no evento ESL liga os dois | +| Answer | `CHANNEL_ANSWER` → `channel.answer` span (Reply-Text) | SIP 200 OK → `voip.call.sip.response` | Mesmo instante, spans separados | +| Hangup | `channel.hangup` + `Hangup-Cause` | `voip.sip.termination_cause` (inferido por IP do BYE) | Genesis é **autoritativo** para cause/who | +| Bridge | `channel.bridge` (via `api uuid_bridge`/sendmsg) — **sem evento CHANNEL_BRIDGE** | Vê dois SIP dialogs separados, **sem tie a-leg/b-leg** | ESL `Bridge-A/B-Unique-ID` + `Other-Leg-Unique-ID` fecha o gap | +| Transfer | **Não tratado** no Genesis | Vê re-INVITE/REFER sem motivo | `sofia::transferor/transferee` + `CALL_UPDATE.Bridged-To` | +| Codec | Apenas em métricas (`build_metric_attributes` não expõe codec) | `rtp.codec`, `rtp.payload_type` | `CODEC` ESL event fecha o gap | +| Routing/dialplan | `Caller-Context`, `Caller-Destination-Number`, `Application` em `CHANNEL_EXECUTE_COMPLETE` — **não extraídos** | Não visível na camada de pacotes | ESL é a única fonte | +| trace_id | SDK aleatório por span | SDK aleatório por span | **Divergem** — proposta: não propagar `trace_id`; correlacionar por atributo compartilhado (`sip.call_id` = `voip.call_id`) no backend de observabilidade | + +## 3. Proposta de mapeamento ESL → spans/metrics + +Convenção: **ADICIONAR** = novo; **RENOMEAR** = mudar nome/atributos; **MANTER** = sem alteração; **REMOVER** = eliminar. 
+ +### 3.1 Spans + +| Evento ESL | Ação | Span (nome, atributos, span events) | Justificativa / fonte ESL | +|---|---|---|---| +| `CHANNEL_CREATE` | **ADICIONAR** | `freeswitch.channel.create` — attrs: `channel.uuid`=Unique-ID, `channel.call_uuid`=Channel-Call-UUID, `channel.direction`=Call-Direction, `channel.name`=Channel-Name, `channel.destination_number`=Caller-Destination-Number, `channel.context`=Caller-Context, `channel.dialplan`=Caller-Dialplan, `channel.caller_id_number`=Caller-Caller-ID-Number, `channel.caller_id_name`=Caller-Caller-ID-Name, `channel.network_addr`=Caller-Network-Addr, `sip.call_id`=variable_sip_call_id (se presente) | `switch_core_state_machine.c:626`; nascimento do leg; único ponto com `Call-Direction` + contexto dialplan; `sip.call_id` é a chave de correlação com o sniffer | +| `CHANNEL_PROGRESS` | **ADICIONAR** | `freeswitch.channel.progress` — attrs: `channel.uuid`, `channel.call_uuid`, `channel.state`=CS_RINGING, `answer.state`=ringing, `other_leg.uuid`=Other-Leg-Unique-ID (se presente); span events: nenhum | `switch_channel.c:3507`; timestamp de alerting | +| `CHANNEL_PROGRESS_MEDIA` | **ADICIONAR** | `freeswitch.channel.progress_media` — attrs: `channel.uuid`, `channel.call_uuid`, `answer.state`=early, `channel.read_codec`=Channel-Read-Codec-Name, `channel.write_codec`=Channel-Write-Codec-Name, `other_leg.uuid` | `switch_channel.c:3562`; early-media (183) — explica RTP antes do ANSWER | +| `CHANNEL_ANSWER` | **ADICIONAR** (span semântico; **MANTER** `channel.answer` que envolve o comando `answer`) | `freeswitch.channel.answer` — attrs: `channel.uuid`, `channel.call_uuid`, `channel.state`=CS_EXECUTE, `answer.state`=answered, `channel.read_codec`, `channel.write_codec`, `other_leg.uuid` | `switch_channel.c:3848`; instante autoritativo de answer | +| `CHANNEL_BRIDGE` | **ADICIONAR** | `freeswitch.channel.bridge` — attrs: `channel.uuid`=Unique-ID (firing leg), `bridge.a_uuid`=Bridge-A-Unique-ID, `bridge.b_uuid`=Bridge-B-Unique-ID, 
`channel.call_uuid`, `other_leg.uuid`=Other-Leg-Unique-ID, `other_leg.type`=Other-Type, `other_leg.destination_number`=Other-Leg-Destination-Number, `other_leg.caller_id_number`=Other-Leg-Caller-ID-Number; span events: `bridge.established` | `switch_ivr_bridge.c:1377`; **correlação autoritativa a-leg/b-leg** | +| `CHANNEL_UNBRIDGE` | **ADICIONAR** | `freeswitch.channel.unbridge` — attrs: `channel.uuid`, `bridge.a_uuid`, `bridge.b_uuid`/`other_leg.uuid`, `channel.call_uuid`, `hangup.cause` (se presente); span events: `bridge.torn_down` | `switch_ivr_bridge.c:1326/1481/1494/1879`; bounds talk time; detecta transfer (unbridge→bridge novo) | +| `CHANNEL_HANGUP` | **ADICIONAR** | `freeswitch.channel.hangup` — attrs: `channel.uuid`, `channel.call_uuid`, `hangup.cause`=Hangup-Cause, `answer.state`=hangup, `channel.state`=CS_HANGUP, `other_leg.uuid`; span events: `hangup.cause.` | `switch_channel.c:3447`; causa normalizada por leg | +| `CHANNEL_HANGUP_COMPLETE` | **ADICIONAR** | `freeswitch.channel.hangup_complete` — attrs: `channel.uuid`, `channel.call_uuid`, `hangup.cause`, `hangup.cause.q850`=variable_hangup_cause_q850, `channel.name`, `sip.call_id`=variable_sip_call_id, `cdr.xml`=`[verificar]` se `CDR-Attached=xml`; span events: `call.finalized` | `switch_core_state_machine.c:943`; commit point do CDR/trace | +| `CHANNEL_DESTROY` | **ADICIONAR** | `freeswitch.channel.destroy` — attrs: `channel.uuid`, `channel.call_uuid` | `switch_core_session.c:1584`; sinal de desregistro de handler | +| `CHANNEL_EXECUTE` | **ADICIONAR** | `freeswitch.channel.execute` — attrs: `channel.uuid`, `channel.call_uuid`, `application.name`=Application, `application.uuid`=Application-UUID; span events: nenhum | Dialplan app start | +| `CHANNEL_EXECUTE_COMPLETE` | **ADICIONAR** | `freeswitch.channel.execute_complete` — attrs: `channel.uuid`, `channel.call_uuid`, `application.name`=Application, `application.uuid`=Application-UUID, `application.response`=Application-Response; span events: 
`app..done` | Correlaciona com `Session._awaitable_complete_command` | +| `CHANNEL_PARK` / `CHANNEL_UNPARK` | **ADICIONAR** | `freeswitch.channel.park` / `freeswitch.channel.unpark` — attrs: `channel.uuid`, `channel.call_uuid`, `channel.state` | `switch_ivr.c:1002/1213` | +| `CALL_UPDATE` | **ADICIONAR** | `freeswitch.call.update` — attrs: `channel.uuid`, `channel.call_uuid`, `bridged.to`=Bridged-To, `caller.transfer_source`=Caller-Transfer-Source, `caller.orig_caller_id_number`=Caller-Orig-Caller-ID-Number; span events: `caller_id.mutated` | `switch_channel.c:3279`; detecta transfer mid-call | +| `CODEC` | **ADICIONAR** | `freeswitch.channel.codec` — attrs: `channel.uuid`, `channel.call_uuid`, `channel.read_codec.name/rate`, `channel.write_codec.name/rate`, `channel.reported_read_codec_rate` | `switch_core_codec.c:189/300/471/531/579`; timeline de codec por leg | +| `PLAYBACK_START` / `PLAYBACK_STOP` | **ADICIONAR** | `freeswitch.channel.playback.start/stop` — attrs: `channel.uuid`, `channel.call_uuid`, `playback.file_path`=Playback-File-Path, `playback.file_type`=Playback-File-Type, `playback.status`=Playback-Status (no stop); span events: nenhum | `switch_ivr_play_say.c:1649/2023`; explica mídia one-way (ringback) | +| `RECORD_START` / `RECORD_STOP` | **ADICIONAR** | `freeswitch.channel.record.start/stop` — attrs: `channel.uuid`, `channel.call_uuid`, `record.file_path`=Record-File-Path, `record.completion_cause`=Record-Completion-Cause (no stop) | `switch_ivr_async.c:1241/1482`, `switch_ivr_play_say.c:770/1033` | +| `CUSTOM sofia::transferor` / `sofia::transferee` | **ADICIONAR** | `freeswitch.sofia.transfer` — attrs: `channel.uuid`, `channel.call_uuid`, `transfer.role`=transferor\|transferee, `other_leg.uuid`=Other-Leg-Unique-ID, `sofia.profile`=sofia_profile_name; span events: `transfer.initiated` | `mod_sofia.h:84-110`; distingue transfer de hangup | +| `CUSTOM sofia::reinvite` / `sofia::replaced` | **ADICIONAR** | `freeswitch.sofia.reinvite` / 
`freeswitch.sofia.replaced` — attrs: `channel.uuid`, `channel.call_uuid`, `sofia.profile`; span events: `media.renegotiated` | Correlaciona com mudança de IP/codec no RTP | +| `CUSTOM callcenter::info` | **ADICIONAR** | `freeswitch.callcenter.info` — attrs: `cc.queue`=CC-Queue, `cc.action`=CC-Action, `cc.agent`=CC-Agent, `cc.member_uuid`=CC-Member-UUID, `cc.count`=CC-Count, `cc.selection`=CC-Selection, `channel.uuid`=Unique-ID | ACD routing | +| `CUSTOM conference::maintenance` / `conference::cdr` | **ADICIONAR** | `freeswitch.conference.maintenance` / `freeswitch.conference.cdr` — attrs: `conference.name`, `conference.profile`, `conference.action`=Action, `conference.member_id`=Member-ID, `channel.uuid`, `old.member_id`=Old-Member-ID | Multi-party bridge | +| `CUSTOM valet_parking::info` | **ADICIONAR** | `freeswitch.valet.info` — attrs: `valet.lot`=Valet-Lot-Name, `valet.extension`=Valet-Extension, `valet.action`=Action, `bridge.to_uuid`=Bridge-To-UUID, `channel.uuid` | Park/retrieve | +| `CUSTOM sofia::register/unregister/expire/gateway_state` | **ADICIONAR** | `freeswitch.sofia.register` — attrs: `register.aor`=from-user@from-host, `register.contact_ip`=contact, `register.expires_s`=expires, `register.response_code`, `register.reason`, `gateway.name`=Gateway-Name, `gateway.state`=State | Pre-condição de outbound routing | +| `process_event` (span existente) | **RENOMEAR/MELHORAR** | **MANTER** nome `process_event`, mas **ADICIONAR** attrs: `event.direction`=Call-Direction, `event.channel_state`=Channel-State, `event.answer_state`=Answer-State, `event.hangup_cause`=Hangup-Cause, `event.subclass`=Event-Subclass, `event.call_uuid`=Channel-Call-UUID, `event.other_leg`=Other-Leg-Unique-ID, `sip.call_id`=variable_sip_call_id; **ADICIONAR** span events para bridge/transfer/hangup_reason quando aplicável | Fecha o gap de atributos de routing no span de processo | +| `send_command` (span existente) | **RENOMEAR** `command.name` de string crua → verbo do comando (parse 
first token); **ADICIONAR** `command.error`=`-ERR` detection + `span.set_status(ERROR)` + `record_exception` no reply `-ERR` | Alta cardinalidade hoje; sem erro de span | +| `channel.bridge` (existente em `channel.py`) | **MANTER**, mas **ADICIONAR** span event `bridge.esl_event` quando `CHANNEL_BRIDGE` chega, linkando `bridge.a_uuid`/`bridge.b_uuid` | Span do comando vs span do evento são complementares | +| `channel.hangup` (existente) | **MANTER**, **ADICIONAR** attr `hangup.cause.q850` (via `variable_hangup_cause_q850`) e span event `hangup.authoritative` quando `CHANNEL_HANGUP_COMPLETE` chega | Fecha gap de Q.850 | +| `ring_group.ring` (existente) | **MANTER** attrs atuais, **ADICIONAR** `ring_group.balancer_backend` (nome do backend, NÃO UUID), `ring_group.selected_dial_path`, `ring_group.context`; **CORRIGIR** chamar `span.set_status(StatusCode.ERROR)` no caminho de exceção (gap do mapeamento) | Routing info de ring group | +| `queue.wait_and_acquire` (existente) | **MANTER**, **ADICIONAR** `queue.depth` como atributo de span (NÃO de métrica) | Profundidade só como span attr evita cardinalidade | + +### 3.2 Metrics + +| Sinal | Ação | Nome / tipo / attrs | Justificativa | +|---|---|---|---| +| Chamadas ativas por estado | **ADICIONAR** | `genesis.calls.active` (UpDownCounter) attrs: `channel.state` (enum ChannelState), `direction` | Hoje só `connections.active` por tipo in/out | +| Eventos ESL processados por nome | **MANTER** `genesis.events.received` | — | Já cobre | +| Bridge por par de legs | **ADICIONAR** | `genesis.channel.bridge.events` (Counter) attrs: `bridge.result` (established/unbridged), `hangup.cause` (no unbridge) | Hoje `bridge.operations` mede só o comando | +| Transferências | **ADICIONAR** | `genesis.channel.transfers` (Counter) attrs: `transfer.type` (blind/attended), `transfer.role` (transferor/transferee) | Inexistente | +| Codec changes | **ADICIONAR** | `genesis.channel.codec.changes` (Counter) attrs: `channel.read_codec`, 
`channel.write_codec` (NÃO UUID) | Inexistente | +| Dialplan apps executados | **ADICIONAR** | `genesis.dialplan.applications` (Counter) attrs: `application.name` (set/bridge/playback/transfer/park/voicemail/ivr/queue), `application.result` (success/fail) | Routing info | +| Hangup causes por Q.850 | **ADICIONAR** | `genesis.channel.hangup.causes.q850` (Counter) attrs: `hangup.cause.q850` (NÃO UUID) | Hoje `hangup.causes` só tem cause textual | +| Duration de processamento de evento | **ADICIONAR** | `genesis.event.processing.duration` (Histogram) attrs: `event.name` | Gap: sem latência de dispatch | +| `genesis.call.duration` | **RENOMEAR/REPARAR** | **MANTER** nome, **ADICIONAR** attrs `hangup.cause` e `direction`; **NÃO** adicionar `channel.uuid` (cardinalidade) | Hoje gravado sem attrs | +| `genesis.connections.errors` | **ADICIONAR** em `genesis/outbound.py` | Reaproveitar mesmo nome com attrs `type`=outbound, `error`=... | Gap: outbound sem error counter | +| Métricas duplicadas em `channel.py` e `metrics.py` | **REMOVER** duplicação | Centralizar definição em `genesis/protocol/metrics.py` e importar em `channel.py` (resolver circular import via module lazy import ou mover constants) | Hazard de manutenção | +| Observable gauge de queue depth | **ADICIONAR** | `genesis.commands.queue.depth` (ObservableGauge), `genesis.events.queue.depth` (ObservableGauge) | Backpressure não observável | + +**Regra de cardinalidade**: atributos de métrica **NUNCA** carregam UUIDs (`channel.uuid`, `bridge.a_uuid`, `other_leg.uuid`); apenas enums/labels low-cardinality (`channel.state`, `direction`, `hangup.cause`, `application.name`, `transfer.type`). UUIDs vão **só em spans**. + +## 4. Estratégia de correlação de traces (sniffer ↔ Genesis) + +### 4.1 Opções avaliadas + +| Opção | Mecanismo | Veredito | +|---|---|---| +| **A. 
Correlação por atributo `sip.call_id` no backend** | Genesis anexa `sip.call_id` (= `variable_sip_call_id`) a todos os spans de canal e ao `process_event`; sniffer já emite `voip.call_id` = SIP Call-ID. Join em Grafana/Tempo por query de atributo. Métricas → traces via exemplars OTel. | **RECOMENDADA / ESCOPO DESTE PR** — zero mudança no sniffer; usa chaves que o sniffer já produz; funciona mesmo com trace_ids independentes | +| B. SIP Call-ID **como** trace_id | Usar `variable_sip_call_id` como `trace_id` OTel | Rejeitado: `trace_id` OTel é 128-bit hex; Call-ID é string arbitrária; quebra semântica OTel e o SDK não aceita | + +### 4.2 Modelo de traces (independentes, correlacionados por atributo) + +Genesis e sniffer continuam emitindo **traces OTel independentes** (cada um com seu próprio `trace_id`). A correlação é **lógica**, por `sip.call_id`: + +``` +Trace Genesis (service=genesis) — root: freeswitch.channel.create + freeswitch.channel.progress attrs: sip.call_id, channel.call_uuid + freeswitch.channel.answer attrs: sip.call_id, channel.call_uuid + freeswitch.channel.bridge attrs: sip.call_id, bridge.a_uuid, bridge.b_uuid + freeswitch.channel.execute attrs: sip.call_id, application.name + freeswitch.channel.unbridge attrs: sip.call_id + freeswitch.channel.hangup attrs: sip.call_id, hangup.cause + freeswitch.channel.hangup_complete attrs: sip.call_id, hangup.cause.q850 + +Trace sniffer (service=sniffer) — root: voip.call attrs: voip.call_id (= mesmo SIP Call-ID) + voip.call.sip.request attrs: voip.call_id + voip.call.sip.response attrs: voip.call_id + voip.rtp.stream (a) attrs: voip.call_id + voip.rtp.stream (b) attrs: voip.call_id +``` + +**Join no Grafana/Tempo**: `trace.span.attrs["sip.call_id"] == trace.span.attrs["voip.call_id"]` — uma query por atributo retorna os dois traces; o usuário navega entre eles. Não há parentesco OTel direto (intencional: o sniffer não conhece o trace_id do Genesis). 
+ +### 4.3 Hierarquia **dentro** do trace Genesis + +- **Trace raiz lógico = chamada**, identificado por `Channel-Call-UUID` (call-wide). O span `freeswitch.channel.create` do leg originador (`Call-Direction=inbound` ou originador do `originate`) é o root. +- **Spans de controle/dialplan** (`execute`, `execute_complete`, `codec`, `playback.*`, `transfer`) são filhos diretos do root via context OTel propagado pelo `Protocol`/`Channel`. +- **`sip.call_id` e `channel.call_uuid`** são atributos em **todos** os spans de canal — garantem o join com o sniffer e o agrupamento a-leg/b-leg. + +### 4.4 Amarrando a-leg/b-leg (dentro do Genesis) + +- No `CHANNEL_BRIDGE`, o Genesis lê `Bridge-A-Unique-ID` (originador) e `Bridge-B-Unique-ID` (peer). Após o bridge, ambos os legs compartilham o mesmo `Channel-Call-UUID` (`switch_ivr_bridge.c:1446/1555/1684/1877`). +- O span `freeswitch.channel.bridge` carrega `bridge.a_uuid` e `bridge.b_uuid` como **atributos de span** e emite span event `bridge.established` com ambos os UUIDs. +- Cada leg é um dialog SIP distinto com SIP Call-ID próprio — o join cross-leg **não** é por `sip.call_id`, e sim por `channel.call_uuid` (comum aos dois legs após bridge) dentro do trace Genesis, e por `bridge.a_uuid`/`bridge.b_uuid` para cruzar com os traces sniffer de cada dialog. Fluxo: do `sip.call_id` de um leg → abre trace Genesis → lê `bridge.b_uuid` → busca o `sip.call_id`/`voip.call_id` do outro leg. +- Em transferências, `CALL_UPDATE.Bridged-To` + `sofia::transferor/transferee` indicam o novo leg; o Genesis inicia novo span root com **span link** (`Links`) para o trace anterior (não parent, pois é outra chamada lógica). +- **Caveat**: em transfer que cria novo b-leg, o `call_uuid` pode rolar para o novo originador — reavaliar `Channel-Call-UUID` a cada `CHANNEL_BRIDGE` e, se mudar, iniciar novo span root com link para o anterior. + +## 5. 
Informação de roteamento a anexar + +| Informação de routing | Campo ESL fonte | Span/atributo destino | +|---|---|---| +| Contexto dialplan | `Caller-Context` | `freeswitch.channel.create` → `channel.context`; `process_event` → `event.context` | +| Destination number | `Caller-Destination-Number` | `freeswitch.channel.create` → `channel.destination_number` | +| Dialplan | `Caller-Dialplan` | `freeswitch.channel.create` → `channel.dialplan` | +| Direção (a/b leg, inbound/outbound) | `Call-Direction` | `freeswitch.channel.create` → `channel.direction`; `process_event` → `event.direction` | +| Aplicação dialplan executada | `Application` (em `CHANNEL_EXECUTE`/`CHANNEL_EXECUTE_COMPLETE`) | `freeswitch.channel.execute` → `application.name` | +| Argumentos da aplicação | `Application-Data` `[verificar se presente no payload ESL do Genesis]` | `freeswitch.channel.execute` → `application.data` | +| Bridge a-leg/b-leg | `Bridge-A-Unique-ID`, `Bridge-B-Unique-ID` (`CHANNEL_BRIDGE`) | `freeswitch.channel.bridge` → `bridge.a_uuid`, `bridge.b_uuid` | +| Other-Leg (correlação per-leg) | `Other-Leg-Unique-ID`, `Other-Type` | todos os spans de evento de canal → `other_leg.uuid`, `other_leg.type` | +| Transfer (role + partner) | `Event-Subclass` sofia::transferor/transferee, `Other-Leg-Unique-ID` | `freeswitch.sofia.transfer` → `transfer.role`, `other_leg.uuid` | +| Transfer source | `Caller-Transfer-Source` | `freeswitch.call.update` → `caller.transfer_source` | +| Ring group mode/destinations | args de `RingGroup.ring` (`mode`, `destinations`, `timeout`) | `ring_group.ring` (já existe) → adicionar `ring_group.context`, `ring_group.selected_dial_path` | +| Load balancer backend escolhido | `LoadBalancerBackend` em `genesis/group/load_balancer.py` `[verificar nome do método select]` | `ring_group.ring` → `ring_group.balancer_backend` (nome/label, NÃO UUID) | +| Queue/ACD | `CC-Queue`, `CC-Agent`, `CC-Action`, `CC-Member-UUID` | `freeswitch.callcenter.info` | +| Conference | 
`Conference-Name`, `Action`, `Member-ID` | `freeswitch.conference.maintenance` | +| Hangup cause (texto) | `Hangup-Cause` | `freeswitch.channel.hangup` → `hangup.cause` | +| Hangup cause Q.850 | `variable_hangup_cause_q850` | `freeswitch.channel.hangup_complete` → `hangup.cause.q850` | +| Codec negociado | `Channel-Read-Codec-Name`, `Channel-Write-Codec-Name`; `CODEC` event | `freeswitch.channel.codec` → `channel.read_codec.name`, `channel.write_codec.name` | +| SIP Call-ID (correlação com sniffer — chave primária) | `variable_sip_call_id` | todos os spans de canal + `process_event` → `sip.call_id` (join com `voip.call_id` do sniffer no backend) | +| traceparent (OPCIONAL/futuro — exige sniffer) | `variable_sip_h_X_Tracespan` | **Fora do escopo deste PR**: exigiria o sniffer consumir o header. A correlação real é por `sip.call_id` (linha acima) | + +## 6. Mudanças concretas no Genesis (por arquivo) + +### `genesis/protocol/metrics.py` +- **ADICIONAR** instrumentos: `genesis.calls.active` (UpDownCounter, attrs `channel.state`, `direction`), `genesis.channel.bridge.events` (Counter, attrs `bridge.result`, `hangup.cause`), `genesis.channel.transfers` (Counter, attrs `transfer.type`, `transfer.role`), `genesis.channel.codec.changes` (Counter, attrs `channel.read_codec`, `channel.write_codec`), `genesis.dialplan.applications` (Counter, attrs `application.name`, `application.result`), `genesis.channel.hangup.causes.q850` (Counter, attrs `hangup.cause.q850`), `genesis.event.processing.duration` (Histogram, attrs `event.name`), `genesis.commands.queue.depth` (ObservableGauge), `genesis.events.queue.depth` (ObservableGauge). +- **REMOVER** definições duplicadas que também existem em `channel.py` (resolver circular import movendo os `meter.create_*` para cá e importando os objetos prontos em `channel.py`). +- **MANTER** `genesis.events.received`, `genesis.commands.*`, `genesis.channel.routing.*`, `genesis.connections.active`. 
+ +### `genesis/protocol/base.py` +- **MANTER** span `process_event` (linha 201); **ADICIONAR** atributos `event.direction`, `event.channel_state`, `event.answer_state`, `event.hangup_cause`, `event.subclass`, `event.call_uuid`, `event.other_leg`, `sip.call_id` via extensão de `build_event_attributes` (`sip.call_id` é a chave de correlação com o sniffer). +- **ESTENDER** `process_event` para envolver **também** o dispatch (`dispatch_to_handlers`) e `routing_strategy.route()` — mover o `with` para fora do bloco metrics+logging. Alternativa: **ADICIONAR** span `dispatch_handlers` aninhado. +- **ADICIONAR** span `route_event` envolvendo `self.routing_strategy.route(event)` em `_process_one_event`. +- **RENOMEAR** atributo `command.name` do span `send_command` (linha 291) de string crua para verbo parseado (primeiro token, ex. `api`, `sendmsg`, `event`, `filter`); **ADICIONAR** `command.args` com o restante (truncado a 200 chars) se necessário para debug. +- **ADICIONAR** no caminho `-ERR` de `_execute_send` (linha 302): `span.set_status(StatusCode.ERROR, Reply-Text)`, `span.record_exception(Exception(Reply-Text))`, atributo `command.error=protocol_error`. +- **ADICIONAR** span `consume_loop`/`handler_loop` (opcional, baixa cardinalidade) envolvendo o corpo de `consume()` e `handler()`. +- **ADICIONAR** ObservableGauge callbacks para `self.commands.qsize()` e `self.events.qsize()` (usar `asyncio` safe snapshot ou pular se non-async-safe). + +### `genesis/protocol/telemetry.py` +- **ESTENDER** `build_event_attributes` (linha 15-40) para incluir: `event.direction` (Call-Direction), `event.channel_state` (Channel-State), `event.answer_state` (Answer-State), `event.hangup_cause` (Hangup-Cause), `event.subclass` (Event-Subclass), `event.call_uuid` (Channel-Call-UUID), `event.other_leg` (Other-Leg-Unique-ID), `sip.call_id` (variable_sip_call_id) — chave de correlação com o sniffer, presente em todos os spans de canal. +- **MANTER** `build_metric_attributes` e `log_event`. 
+ +### `genesis/protocol/processors.py` +- **ADICIONAR** novo event processor `channel_lifecycle_processor` (ou um por evento semântico) que: + - Detecta `CHANNEL_CREATE/PROGRESS/PROGRESS_MEDIA/ANSWER/BRIDGE/UNBRIDGE/HANGUP/HANGUP_COMPLETE/DESTROY/EXECUTE/EXECUTE_COMPLETE/PARK/UNPARK/CALL_UPDATE/CODEC/PLAYBACK_START/PLAYBACK_STOP/RECORD_START/RECORD_STOP`. + - Extrai `Channel-Call-UUID`, `Unique-ID`, `Other-Leg-Unique-ID`, `Bridge-A/B-Unique-ID`, `Application`, `Application-Data` `[verificar]`, `Hangup-Cause`, `variable_hangup_cause_q850`, `variable_sip_call_id` (chave de correlação com o sniffer). + - Dispara a criação do span semântico correspondente (via um novo `EventSpanEmitter` injetado no Protocol) e incrementa as métricas new. +- **MANTER** `auth_request_processor`, `command_reply_processor`, `api_response_processor`, `disconnect_processor`. +- **ADICIONAR** processador `sofia_custom_processor` para subclasses `sofia::transferor/transferee/reinvite/replaced/register/unregister/expire/gateway_state` e `callcenter::info`, `conference::maintenance/cdr`, `valet_parking::info`. + +### `genesis/protocol/routing/{base,channel,composite,global_}.py` +- **MANTER** lógica de routing; **ADICIONAR** span event `routing.hit` no `ChannelRoutingStrategy.route` (linha 55) e `routing.fallback` no `GlobalRoutingStrategy.route` (linha 50), ambos no span `route_event` corrente (se ativo). + +### `genesis/channel.py` +- **MANTER** spans `channel.create/wait/answer/park/hangup/bridge/playback/say/play_and_get_digits/dtmf.received` e o helper `_execute_operation` (linha 494-537). +- **ADICIONAR** em `channel.create` (linha 144): registrar attrs `sip.call_id` (lido do evento/variável) e `channel.call_uuid` no span — chaves de correlação com o sniffer e de agrupamento a-leg/b-leg. +- **ADICIONAR** em `channel.bridge` (linha 632-644): attrs `bridge.a_uuid`, `bridge.b_uuid`, `other_leg.uuid`; span event `bridge.esl_event` quando `CHANNEL_BRIDGE` é recebido e correlacionado. 
+- **ADICIONAR** em `channel.hangup` (linha 588-600): attr `hangup.cause.q850` lendo `variable_hangup_cause_q850` do contexto; span event `hangup.authoritative` em `CHANNEL_HANGUP_COMPLETE`. +- **ADICIONAR** em `_state_handler`: registrar `channel.state` transitions como span events no span `channel.wait` ativo (se houver). +- **REMOVER** as 7 re-definições duplicadas de métricas (linhas 32-68) — importar de `genesis/protocol/metrics.py`. +- **REPARAR** `call_duration_histogram.record` (linha 573) para gravar com attrs `hangup.cause` e `direction` (NÃO `channel.uuid`). + +### `genesis/session.py` +- **ADICIONAR** tracer a nível de módulo (`trace.get_tracer(__name__)`). +- **ADICIONAR** span `session.sendmsg` envolvendo `Session.sendmsg` (attrs: `channel.uuid`, `application.name`, `application.uuid`=Event-UUID, `application.block`). +- **ADICIONAR** span `session.start` / `session.stop` para o lifecycle. +- **ADICIONAR** span `session.await_complete` em `_awaitable_complete_command` (attrs: `channel.uuid`, `application.uuid`, `event.name`=CHANNEL_EXECUTE_COMPLETE/CHANNEL_HANGUP_COMPLETE, `wait.duration`). +- **ADICIONAR** métricas `genesis.session.commands` (Counter, attrs `application.name`), `genesis.session.command.duration` (Histogram). + +### `genesis/consumer.py` +- **ADICIONAR** tracer a nível de módulo. +- **ADICIONAR** span `consumer.start` / `consumer.stop` (attrs: `consumer.host`, `consumer.port`). +- **ADICIONAR** span `consumer.dispatch` envolvendo a invocação de handlers registrados via `@consumer.handle`. +- **ADICIONAR** métrica `genesis.consumer.handlers` (Counter, attrs `event.name`, `handler.matched`). + +### `genesis/inbound.py` +- **MANTER** `inbound_connect` (linha 97) e `genesis.connections.active/errors`. +- **ADICIONAR** `record_exception` + `set_status(ERROR)` no span `inbound_connect` em falha de connect/timeout. + +### `genesis/outbound.py` +- **MANTER** `outbound_handle_connection` (linha 156) e `genesis.connections.active`. 
+- **ADICIONAR** contador `genesis.connections.errors` (attrs `type=outbound`, `error=...`) — gap do mapeamento. + +### `genesis/group/ring.py` +- **MANTER** `ring_group.ring` (linha 138); **CORRIGIR** chamar `span.set_status(StatusCode.ERROR, str(e))` no caminho de exceção (linha 202-205). +- **ADICIONAR** attrs `ring_group.balancer_backend` (label, NÃO UUID), `ring_group.selected_dial_path`, `ring_group.context`. +- **ADICIONAR** span event `ring_group.leg_answered` com `answered_uuid` quando `result=answered`. + +### `genesis/group/load_balancer.py` +- **ADICIONAR** métricas `genesis.loadbalancer.selections` (Counter, attrs `balancer.backend`, `balancer.result`), `genesis.loadbalancer.errors` (Counter, attrs `error`). +- `[verificar]` nome do método de seleção no backend (InMemoryLoadBalancer/RedisLoadBalancer). + +### `genesis/queue/core.py` +- **MANTER** `queue.wait_and_acquire` (linha 76); **ADICIONAR** attr `queue.depth` no span. +- **ADICIONAR** `record_exception`/`set_status(ERROR)` em falha de acquire. + +### `genesis/types.py` +- **MANTER** `ChannelState` IntEnum; **ADICIONAR** helper `HangupCause.q850` mapping `[verificar se já existe]`. + +### `genesis/cli/__init__.py` +- **MANTER** instalação do metrics meter provider (linha 78); **ADICIONAR** instalação de `TracerProvider` com `BatchSpanProcessor` (OTLP) — necessário para emitir os novos spans `freeswitch.channel.*`. `TextMapPropagator(TraceContextPropagator())` só é necessário se/quando a propagação W3C via `X-Tracespan` for implementada (opcional/futuro). + +## 7. Configuração no FreeSWITCH + +### 7.1 Event Socket (ESL inbound) +- Em `freeswitch/conf/autoload_configs/event_socket.conf.xml`: + - `<param name="listen-ip" value="0.0.0.0"/>` (ou IP restrito à rede do Genesis) + - `<param name="listen-port" value="8021"/>` + - `<param name="apply-inbound-acl" value="..."/>` (restringir) +- O Genesis `Inbound` (`genesis/inbound.py`) conecta e autentica via `ClueCon` (padrão). + +### 7.2 Subscrição de eventos +- Genesis já faz `events plain ALL` em `Channel.create` (`genesis/channel.py:144`). **MANTER**. 
+- Para os novos eventos semânticos, garantir que `events plain ALL` cubra: `CHANNEL_PROGRESS`, `CHANNEL_PROGRESS_MEDIA`, `CHANNEL_BRIDGE`, `CHANNEL_UNBRIDGE`, `CALL_UPDATE`, `CODEC`, `PLAYBACK_START`, `PLAYBACK_STOP`, `RECORD_START`, `RECORD_STOP`, `CHANNEL_PARK`, `CHANNEL_UNPARK`, `CHANNEL_EXECUTE`, `CHANNEL_EXECUTE_COMPLETE`, `CHANNEL_DESTROY`. +- Para CUSTOM subclasses, o `Consumer._filter_command` já emite `filter Event-Subclass {X}` para nomes não-uppercase. Garantir subscrição de: `sofia::transferor`, `sofia::transferee`, `sofia::reinvite`, `sofia::replaced`, `sofia::register`, `sofia::unregister`, `sofia::expire`, `sofia::gateway_state`, `callcenter::info`, `conference::maintenance`, `conference::cdr`, `valet_parking::info`. +- Habilitar **verbose events** globais em `freeswitch.conf.xml`: `<param name="verbose-channel-events" value="yes"/>` ou por canal via `verbose_events=true` channel var — necessário para ter `variable_sip_call_id` (chave de correlação com o sniffer) e `variable_hangup_cause_q850`. + +### 7.3 Módulos relevantes +- `mod_sofia` (SIP) — obrigatório. +- `mod_event_socket` — obrigatório (ESL). +- `mod_callcenter` (se ACD), `mod_conference` (se conferência), `mod_valet_parking` (se valet) — opcionais conforme deploy. +- `mod_otel` `[verificar]` — existe um módulo comunitário mod_otel; se presente, pode complementar, mas **não é necessário** para esta proposta (tudo via ESL + Genesis). + +## 8. Correlação no backend de observabilidade (sem mudanças no sniffer) + +**Diretriz**: o sniffer **não é modificado**. Toda a correlação acontece por atributos compartilhados, no backend. 
+ +### 8.1 Chaves de correlação + +| Chave | Genesis (span attr) | Sniffer (span attr, já existe) | Uso | +|---|---|---|---| +| SIP Call-ID | `sip.call_id` (= `variable_sip_call_id`) | `voip.call_id` | **Join principal** trace de controle ↔ trace de captura | +| Channel-Call-UUID | `channel.call_uuid` | — (não visível no SIP) | Agrupar a-leg/b-leg **dentro** do trace Genesis | +| Bridge legs | `bridge.a_uuid`, `bridge.b_uuid` | — | Cross-leg: do `sip.call_id` de um leg, ler `bridge.b_uuid` para achar o outro | +| Network | `channel.network_addr`, `sip.remote_ip` `[verificar ESL field]` | IPs/ports do RTP/SIP | Correlação secundária quando `sip.call_id` ausente | + +### 8.2 Join no Grafana/Tempo + +1. **Traces**: query por atributo — `span.attrs["sip.call_id"] = "<sip-call-id>"` retorna o trace Genesis (service=genesis) e o trace sniffer (service=sniffer) lado a lado. Não há parentesco OTel direto (intencional). +2. **Métricas → traces**: usar **OTel exemplars**. O SDK anexa o `trace_id` do span corrente como exemplar ao registrar cada métrica dentro de um span. No Grafana, painéis de `genesis.*` e `voip.*` passam a ter exemplars que linkam direto para o trace — correlação métrica↔trace sem label de alta cardinalidade. +3. **Métricas↔métricas**: **não** usar `sip.call_id`/UUIDs como label de métrica (cardinalidade). Agregar por labels low-cardinality (`channel.state`, `direction`, `hangup.cause`, `application.name`) e correlacionar via o `trace_id` do exemplar quando precisar cruzar `genesis.call.duration` com `voip.call.duration_s`. + +### 8.3 Fluxo de investigação de chamada (para o painel/dashboard) + +1. Usuário parte do número discado ou caller → busca no sniffer `voip.call_id` (SIP Call-ID). +2. Query Tempo por `sip.call_id` → abre trace Genesis (`freeswitch.channel.*`) e trace sniffer (`voip.call.*`/`voip.rtp.stream`). +3. No span `freeswitch.channel.bridge` lê `bridge.b_uuid` → segunda query por `sip.call_id` do leg B (dialog SIP distinto). +4. 
Em `freeswitch.channel.hangup_complete` lê `hangup.cause` + `hangup.cause.q850` (autoritativo) e cruza com `voip.call.duration_s`/MOS do sniffer para correlacionar causa de controle × qualidade de mídia. + +### 8.4 Resource attributes (Genesis) + +- `service.name=genesis`, `service.namespace=control` `[verificar convenção atual]` para distinguir de `service.name=sniffer`/`service.namespace=voip` no backend. +- Garantir que o `TracerProvider` (item 6, `cli/__init__.py`) exporte com o mesmo endpoint OTLP do sniffer (ou para o mesmo collector) — o join só funciona se ambos chegarem ao mesmo backend. + +## 9. Testes (Genesis) + +Regras do `CLAUDE.md`: **proibido `asyncio.sleep`**; usar `asyncio.Event`/`Condition`/`Future`/`wait_for`; fixtures em `tests/conftest.py`, doubles em `tests/doubles.py`, payloads em `tests/payloads.py`; `asyncio_mode=auto`; timeout 10s. + +### Novos payloads em `tests/payloads.py` +- `channel_progress` (CS_RINGING, Answer-State=ringing) +- `channel_progress_media` (CS_RINGING, CCS_EARLY, Channel-Read/Write-Codec-Name) +- `channel_bridge` (Bridge-A-Unique-ID, Bridge-B-Unique-ID, Other-Leg-Unique-ID, Other-Type) +- `channel_unbridge` +- `call_update` (Bridged-To, Caller-Transfer-Source) +- `codec` (channel-read-codec-name/rate) +- `playback_start`, `playback_stop` +- `record_start`, `record_stop` +- `channel_execute` (Application, Application-UUID) +- `channel_execute_complete` (Application, Application-UUID, Application-Response) +- `channel_destroy` +- `sofia_transferor`, `sofia_transferee` (Event-Subclass, Other-Leg-Unique-ID) +- `sofia_reinvite` +- `callcenter_info` (CC-Queue, CC-Action, CC-Agent, CC-Member-UUID) +- `conference_maintenance` (Conference-Name, Action, Member-ID) +- `valet_info` (Valet-Lot-Name, Bridge-To-UUID) +- `channel_create_verbose` (com `variable_sip_call_id`, `Caller-Context`, `Caller-Destination-Number`) + +### Novos testes (em `tests/test_channel_lifecycle.py` `[novo]`) +- `test_channel_create_span_attrs` — dispara 
`channel_create_verbose` num `FakeProtocol` (doubles.py), verifica span `freeswitch.channel.create` com attrs `channel.context`, `channel.destination_number`, `channel.direction`, `sip.call_id`. +- `test_channel_bridge_span_links_a_b_leg` — dispara `channel_bridge`, verifica span `freeswitch.channel.bridge` com `bridge.a_uuid` e `bridge.b_uuid` e span event `bridge.established`. +- `test_channel_unbridge_span_event` — verifica span event `bridge.torn_down`. +- `test_channel_hangup_complete_q850` — dispara `channel_hangup_complete` com `variable_hangup_cause_q850=16`, verifica attr `hangup.cause.q850=16` e span event `call.finalized`. +- `test_channel_progress_media_early_codec` — verifica attrs `channel.read_codec`, `answer.state=early`. +- `test_call_update_transfer_correlation` — dispara `call_update` + `sofia_transferor`, verifica `transfer.role=transferor` e `bridged.to`. +- `test_process_event_routing_attrs` — dispara evento com `Call-Direction`, `Channel-State`, `Other-Leg-Unique-ID`, verifica attrs no span `process_event`. +- `test_send_command_error_span_status` — duplo que responde `-ERR`, verifica `span.status=ERROR` e `command.error=protocol_error` e `command.name=api` (verbo, não string crua). +- `test_channel_create_sip_call_id_attr` — `Channel.create` com evento contendo `variable_sip_call_id`, verifica attr `sip.call_id` presente no span (chave de correlação com o sniffer). +- `test_ring_group_set_status_on_error` — `RingGroup.ring` com backend que levanta, verifica `span.status=ERROR` (gap do mapeamento). +- `test_call_duration_histogram_has_attrs` — hangup com cause, verifica métrica `genesis.call.duration` gravada com attrs `hangup.cause`, `direction`. +- `test_observable_gauge_queue_depth` — `FakeProtocol` com N eventos na queue, callback do ObservableGauge retorna o tamanho esperado. 
+ +### Novos testes em `tests/test_session_tracing.py` `[novo]` +- `test_session_sendmsg_span` +- `test_session_await_complete_span` + +### Novos testes em `tests/test_consumer_tracing.py` `[novo]` +- `test_consumer_dispatch_span` + +### Doubles em `tests/doubles.py` +- **ADICIONAR** `FakeTracer`/`FakeSpan` que registre attrs, events, status, links em listas inspecionáveis (se já não existir). +- **ADICIONAR** `FakeMeter` que capture `add()`/`record()` calls com attrs. + +## 10. Rollout / migração + +### Ordem de implementação (fases) +1. **Fase 0 — Refactor sem mudança observável**: centralizar métricas em `genesis/protocol/metrics.py`, remover duplicações em `channel.py` (resolver circular import via import lazy ou mover constants para `genesis/protocol/_metrics_constants.py` `[novo]`). +2. **Fase 1 — Correções em spans existentes**: `send_command` (verbo + erro), `process_event` (atributos routing), `ring_group.ring` (`set_status`), `call.duration` (attrs). +3. **Fase 2 — Spans de lifecycle ESL**: novos processors + spans `freeswitch.channel.*` (CREATE/PROGRESS/ANSWER/BRIDGE/UNBRIDGE/HANGUP/HANGUP_COMPLETE/DESTROY/EXECUTE/CODEC/PLAYBACK/RECORD). +4. **Fase 3 — CUSTOM subclasses**: sofia::transfer*, callcenter, conference, valet. +5. **Fase 4 — Session/Consumer instrumentation**: spans em `session.py` e `consumer.py`. +6. **Fase 5 — Métricas novas e ObservableGauges**. + +### Compatibilidade +- Todos os novos spans/métricas são **aditivos**; consumers atuais (`Consumer.handle`, `Channel.on_dtmf`, `protocol.on`) continuam funcionando. +- Novos event processors **não devem consumir** eventos que já roteiam para handlers de usuário — apenas enriquecem telemetria. +- `command.name` rename: spans OTel são opacos para a API pública; nenhum consumidor do Genesis lê spans programaticamente (só o backend OTel). +- `call.duration` com attrs: backends OTel agregam por attrs; sem attr continua funcionando (label vazio). 
+ +### Feature flags +- `GENESIS_TRACE_ESL_LIFECYCLE=1` (default on) — habilita spans `freeswitch.channel.*`. +- `GENESIS_TRACE_SIP_HEADER=0` (default **off**) — injeção do header SIP `X-Tracespan` (propagação W3C); **fora do escopo deste PR**, flag reservada para o futuro. +- `GENESIS_TRACE_CUSTOM_SUBCLASSES=1` (default on) — habilita spans de sofia::/callcenter::/conference::/valet::. +- Implementar via `os.environ.get` no módulo de telemetria, guards nos processors. + +### Checklist de PR (pré-merge) +1. `poetry run black --check genesis/ tests/ examples/` +2. `poetry run mypy` +3. `poetry run pytest tests/` +4. `poetry run tox` (Python 3.10, 3.11, 3.12) +5. Não assinar commits/PR (memória `feedback_pr_signature`). + +## 11. Riscos e trade-offs + +| Risco | Mitigação | +|---|---| +| **Cardinalidade de métricas com UUIDs** | UUIDs (`channel.uuid`, `bridge.a_uuid`, `other_leg.uuid`, `application.uuid`) **só em spans**, nunca em métricas. Métricas usam enums/labels low-cardinality (`channel.state`, `direction`, `hangup.cause`, `application.name`). | +| **Volume de spans por chamada** | Uma chamada simples de 2 legs pode gerar ~20-30 spans (Genesis controle) + ~5-10 (sniffer SIP/RTP). Mitigar com `TraceIDRatioBased` no Genesis (`GENESIS_OTEL_SAMPLE_RATIO`) e mantendo `AlwaysSample` só em dev. Spans `process_event` e `route_event` podem ser desligáveis via flag. | +| **Falha de correlação por `sip.call_id`** | Se o evento ESL não trouxer `variable_sip_call_id` (leg não-SIP, gateway sem `verbose_events`), o span Genesis fica sem a chave. Mitigar: exigir `verbose_events=true` no FS; registrar `sip.call_id=unknown` explícito para não mascarar o gap; métrica `genesis.events.without_sip_call_id` (Counter) para medir adoção. | +| **`Channel-Call-UUID` roll em transfer** | Reavaliar a cada `CHANNEL_BRIDGE`; se mudar, novo span root com span Link para o trace anterior (não quebra o trace anterior, ramifica). 
| +| **Verbosidade de eventos ESL** | Habilitar `verbose_events` só onde necessário (produção pode gerar payload grande em `CHANNEL_HANGUP_COMPLETE` com XML CDR). Feature flag `GENESIS_ESL_VERBOSE_CDR=0` para não ingerir o body XML. | +| **Duplicação `channel.bridge` (comando) vs `freeswitch.channel.bridge` (evento)** | Documentar: o span do comando mede a chamada `api uuid_bridge`; o span do evento mede o instante autoritativo do bridge no FS. Namespaces distintos (`channel.*` vs `freeswitch.channel.*`). | +| **Overhead de `process_event` estendido** | Envolver dispatch no span pode aumentar duração do span (mas não do código). Medir com `genesis.event.processing.duration`. | +| **Circular import ao centralizar métricas** | Resolver movendo constantes de atributo para um módulo sem dependência de `protocol/base.py` (ex. `genesis/protocol/_attr_constants.py` `[novo]`), e importando os instrumentos prontos em `channel.py` via import direto de `metrics.py`. | +| **`sip.call_id` divergente entre a-leg/b-leg** | Cada leg é um dialog SIP distinto com Call-ID próprio; o join cross-leg não é por `sip.call_id` mas por `channel.call_uuid` + `bridge.a/b_uuid` dentro do trace Genesis. Documentar o fluxo de navegação no painel Grafana. | +| **`variable_sip_call_id` ausente em legs originate** | Em `originate` outbound o SIP Call-ID pode não estar disponível no `CHANNEL_CREATE` (só após o primeiro response SIP). Mitigar: anexar `sip.call_id` no `process_event` assim que o campo aparecer, e regravar no span de lifecycle posterior (PROGRESS/ANSWER). | + +## 12. 
Checklist do PR + +- [ ] Métricas duplicadas removidas de `genesis/channel.py` (linhas 32-68) e centralizadas em `genesis/protocol/metrics.py` +- [ ] `send_command` (`base.py:290`): `command.name` = verbo parseado; `command.error` + `set_status(ERROR)` + `record_exception` no `-ERR` +- [ ] `process_event` (`base.py:201`): attrs `event.direction/channel_state/answer_state/hangup_cause/subclass/call_uuid/other_leg` + `sip.call_id` (correlação sniffer) via `build_event_attributes` +- [ ] `build_event_attributes` (`telemetry.py:15`) estendido com os 9 novos attrs +- [ ] Novo `channel_lifecycle_processor` em `genesis/protocol/processors.py` emitindo spans `freeswitch.channel.{create,progress,progress_media,answer,bridge,unbridge,hangup,hangup_complete,destroy,execute,execute_complete,park,unpark}` +- [ ] Novo `sofia_custom_processor` (e callcenter/conference/valet) em `processors.py` +- [ ] `freeswitch.channel.bridge` carrega `bridge.a_uuid`/`bridge.b_uuid` + span event `bridge.established` +- [ ] `freeswitch.channel.hangup_complete` carrega `hangup.cause.q850` + span event `call.finalized` +- [ ] `freeswitch.call.update` + `freeswitch.sofia.transfer` com `transfer.role`/`bridged.to` +- [ ] `channel.create` (`channel.py:144`) registra attrs `sip.call_id` + `channel.call_uuid` no span +- [ ] `genesis/cli/__init__.py:78` instala `TracerProvider` + `BatchSpanProcessor` (OTLP) — `TextMapPropagator` só se propagação W3C futura +- [ ] `ring_group.ring` (`ring.py:138`) chama `set_status(ERROR)` na exceção; novos attrs `ring_group.balancer_backend/selected_dial_path/context` +- [ ] `call_duration_histogram.record` (`channel.py:573`) com attrs `hangup.cause`/`direction` +- [ ] Spans em `genesis/session.py`: `session.sendmsg`, `session.start/stop`, `session.await_complete` +- [ ] Spans em `genesis/consumer.py`: `consumer.start/stop`, `consumer.dispatch` +- [ ] `genesis/outbound.py` incrementa `genesis.connections.errors` (type=outbound) +- [ ] Métricas novas: 
`genesis.calls.active`, `genesis.channel.bridge.events`, `genesis.channel.transfers`, `genesis.channel.codec.changes`, `genesis.dialplan.applications`, `genesis.channel.hangup.causes.q850`, `genesis.event.processing.duration`, `genesis.commands.queue.depth`, `genesis.events.queue.depth` +- [ ] `genesis/group/load_balancer.py` instrumentado com `genesis.loadbalancer.selections/errors` +- [ ] Payloads novos em `tests/payloads.py` (channel_progress/bridge/unbridge/call_update/codec/playback/record/execute/destroy/sofia_transferor/callcenter_info/conference_maintenance/valet_info/channel_create_verbose) +- [ ] Doubles `FakeTracer`/`FakeSpan`/`FakeMeter` em `tests/doubles.py` +- [ ] Testes novos em `tests/test_channel_lifecycle.py`, `tests/test_session_tracing.py`, `tests/test_consumer_tracing.py` (sem `asyncio.sleep`, com `asyncio.Event`/`wait_for`) +- [ ] `poetry run black --check genesis/ tests/ examples/` passa +- [ ] `poetry run mypy` passa +- [ ] `poetry run pytest tests/` passa (timeout 10s) +- [ ] `poetry run tox` passa (3.10, 3.11, 3.12) +- [ ] Sem assinatura de commit/PR (respeitar `feedback_pr_signature`) +- [ ] Documentação: atualizar `CLAUDE.md` seção Observability Pattern com os novos spans; `[verificar]` se há `docs/` no Genesis para atualizar +- [ ] **Nenhuma mudança no sniffer** — correlação exclusivamente por `sip.call_id` no backend de observabilidade (Grafana/Tempo + exemplars) +- [ ] Documentar no `CLAUDE.md`/docs o fluxo de join Genesis↔sniffer por `sip.call_id` e o fluxo cross-leg via `bridge.a/b_uuid` diff --git a/genesis/channel.py b/genesis/channel.py index 80cb6c1..0bffa78 100644 --- a/genesis/channel.py +++ b/genesis/channel.py @@ -15,66 +15,34 @@ import time from asyncio import Event, wait_for, TimeoutError as AsyncioTimeoutError -from opentelemetry import trace, metrics +from opentelemetry import trace from genesis.protocol import Protocol from genesis.session import Session from genesis.inbound import Inbound from genesis.protocol.parser 
import ESLEvent +from genesis.protocol.metrics import ( + channel_operations_counter, + channel_operation_duration, + hangup_causes_counter, + bridge_operations_counter, + dtmf_received_counter, + call_duration_histogram, + timeout_counter, +) from genesis.types import HangupCause, ChannelState, ContextType from genesis.exceptions import ChannelError, TimeoutError from genesis.observability import logger tracer = trace.get_tracer(__name__) -meter = metrics.get_meter(__name__) - -# Define metrics here to avoid circular imports -channel_operations_counter = meter.create_counter( - "genesis.channel.operations", - description="Number of channel operations", - unit="1", -) - -channel_operation_duration = meter.create_histogram( - "genesis.channel.operation.duration", - description="Duration of channel operations", - unit="s", -) - -hangup_causes_counter = meter.create_counter( - "genesis.channel.hangup.causes", - description="Hangup causes", - unit="1", -) - -bridge_operations_counter = meter.create_counter( - "genesis.channel.bridge.operations", - description="Bridge operations", - unit="1", -) - -dtmf_received_counter = meter.create_counter( - "genesis.channel.dtmf.received", - description="DTMF digits received", - unit="1", -) - -call_duration_histogram = meter.create_histogram( - "genesis.call.duration", - description="Total call duration from creation to hangup", - unit="s", -) - -timeout_counter = meter.create_counter( - "genesis.timeouts", - description="Number of timeouts", - unit="1", -) # Span/attribute names (S1192: avoid duplicated literals) ATTR_CHANNEL_UUID = "channel.uuid" +ATTR_CHANNEL_CALL_UUID = "channel.call_uuid" +ATTR_SIP_CALL_ID = "sip.call_id" ATTR_CHANNEL_STATE = "channel.state" ATTR_HANGUP_CAUSE = "hangup.cause" +ATTR_HANGUP_CAUSE_Q850 = "hangup.cause.q850" ATTR_WAIT_TYPE = "wait.type" ATTR_WAIT_RESULT = "wait.result" ATTR_WAIT_DURATION = "wait.duration" @@ -157,6 +125,14 @@ async def create( raise ChannelError("Failed to retrieve UUID from 
FreeSWITCH") self.uuid = response.body.strip() span.set_attribute(ATTR_CHANNEL_UUID, self.uuid) + # channel.call_uuid groups a-leg/b-leg within the Genesis trace; + # at originate time it equals the origination UUID. + span.set_attribute(ATTR_CHANNEL_CALL_UUID, self.uuid) + # sip.call_id is the join key with the sniffer (voip.call_id). + # It is usually not known yet at originate; attach when present. + sip_call_id = _context_str(self.context, "variable_sip_call_id") + if sip_call_id: + span.set_attribute(ATTR_SIP_CALL_ID, sip_call_id) self.protocol.on("CHANNEL_STATE", self._state_handler) await self.protocol.send(f"filter Unique-ID {self.uuid}") @@ -568,9 +544,25 @@ async def hangup(self, cause: HangupCause = "NORMAL_CLEARING") -> ESLEvent: def on_success(span: Any, result: ESLEvent, duration: float) -> None: hangup_causes_counter.add(1, attributes={ATTR_HANGUP_CAUSE: cause}) + # Q.850 code (authoritative) when FreeSWITCH exposed it on the leg. + q850 = _context_str(self.context, "variable_hangup_cause_q850") + if q850: + span.set_attribute(ATTR_HANGUP_CAUSE_Q850, q850) + # call.duration recorded with low-cardinality attrs so it can be + # partitioned by cause/direction (NO UUID: cardinality rule). if call_duration is not None: span.set_attribute("call.duration", call_duration) - call_duration_histogram.record(call_duration) + direction = _context_str(self.context, "Call-Direction") or "unknown" + call_duration_histogram.record( + call_duration, + attributes={ATTR_HANGUP_CAUSE: cause, "direction": direction}, + ) + # Mark the command-side hangup span; the authoritative marker comes + # from CHANNEL_HANGUP_COMPLETE (see channel_lifecycle_processor). 
+ span.add_event( + "hangup.command_issued", + attributes={ATTR_HANGUP_CAUSE: cause}, + ) def on_error(exc: Exception) -> None: hangup_causes_counter.add( @@ -616,6 +608,24 @@ async def bridge(self, other: Channel | Session) -> ESLEvent: def on_success(span: Any, result: ESLEvent, duration: float) -> None: success = result.get("Reply-Text", "").startswith("+OK") bridge_operations_counter.add(1, attributes={"success": str(success)}) + # Correlation attrs: a/b leg UUIDs let the backend cross the two + # SIP dialogs of a bridged call (each leg has its own sip.call_id). + span.set_attribute("bridge.a_uuid", self.uuid or "unknown") + span.set_attribute("bridge.b_uuid", other_uuid or "unknown") + call_uuid = _context_str(self.context, "Channel-Call-UUID") or ( + self.uuid or "unknown" + ) + span.set_attribute(ATTR_CHANNEL_CALL_UUID, call_uuid) + sip_call_id = _context_str(self.context, "variable_sip_call_id") + if sip_call_id: + span.set_attribute(ATTR_SIP_CALL_ID, sip_call_id) + span.add_event( + "bridge.command_issued", + attributes={ + "bridge.a_uuid": self.uuid or "unknown", + "bridge.b_uuid": other_uuid or "unknown", + }, + ) def on_error(exc: Exception) -> None: bridge_operations_counter.add( diff --git a/genesis/consumer.py b/genesis/consumer.py index d4dcc43..848d5a3 100644 --- a/genesis/consumer.py +++ b/genesis/consumer.py @@ -9,9 +9,13 @@ import re from typing import Any, Callable, Optional +from opentelemetry import trace + from genesis.inbound import Inbound from genesis.observability import logger, observability +tracer = trace.get_tracer(__name__) + async def _invoke_maybe_coro(func: Callable[..., Any], message: Any) -> Any: """Invoke handler and await if it returns a coroutine.""" @@ -131,15 +135,25 @@ async def start(self) -> None: self.protocol.on("HEARTBEAT", observability.record_heartbeat) async with self.protocol as protocol: - logger.debug("Asking freeswitch to send us all events.") - await protocol.send("events plain ALL") - - for event in 
protocol.handlers.keys(): - logger.debug( - "Requesting freeswitch to filter events of type '%s'.", - event, - ) - await protocol.send(self._filter_command(event)) + # The consumer.start span wraps only the setup phase (auth, + # events subscription, filter registration) so it finalizes + # promptly and is observable; the blocking wait() runs outside. + with tracer.start_as_current_span( + "consumer.start", + attributes={ + "consumer.host": self.host, + "consumer.port": self.port, + }, + ): + logger.debug("Asking freeswitch to send us all events.") + await protocol.send("events plain ALL") + + for event in protocol.handlers.keys(): + logger.debug( + "Requesting freeswitch to filter events of type '%s'.", + event, + ) + await protocol.send(self._filter_command(event)) await self.wait() @@ -148,4 +162,5 @@ async def start(self) -> None: raise async def stop(self) -> None: - await self.protocol.stop() + with tracer.start_as_current_span("consumer.stop"): + await self.protocol.stop() diff --git a/genesis/group/ring.py b/genesis/group/ring.py index 97095da..133c477 100644 --- a/genesis/group/ring.py +++ b/genesis/group/ring.py @@ -19,6 +19,11 @@ from genesis.types import ChannelState, HangupCause from genesis.exceptions import TimeoutError from genesis.group.load_balancer import LoadBalancerBackend +from genesis.protocol.metrics import ( + loadbalancer_selections_counter, + loadbalancer_errors_counter, + safe_add, +) tracer = trace.get_tracer(__name__) meter = metrics.get_meter(__name__) @@ -145,6 +150,14 @@ async def ring( balancer is not None and mode == RingMode.BALANCING ), "ring_group.has_variables": str(variables is not None), + "ring_group.balancer_backend": ( + type(balancer).__name__ + if balancer is not None and mode == RingMode.BALANCING + else "none" + ), + "ring_group.context": ( + variables.get("user_context", "unknown") if variables else "unknown" + ), }, ) as span: try: @@ -164,6 +177,16 @@ async def ring( span.set_attribute( 
"ring_group.answered_dial_path", answered.dial_path ) + span.set_attribute( + "ring_group.selected_dial_path", answered.dial_path + ) + span.add_event( + "ring_group.leg_answered", + attributes={ + "ring_group.answered_uuid": answered.uuid or "unknown", + "ring_group.selected_dial_path": answered.dial_path, + }, + ) # Record metrics ring_group_operations_counter.add( @@ -203,6 +226,7 @@ async def ring( span.set_attribute("ring_group.error", str(e)) span.set_attribute("ring_group.duration", duration) span.record_exception(e) + span.set_status(trace.Status(trace.StatusCode.ERROR, str(e))) ring_group_results_counter.add( 1, @@ -332,10 +356,40 @@ async def _ring_balancing( """Ring destinations sequentially using load balancing, return first to answer.""" remaining = list(group) + backend_name = type(balancer).__name__ while remaining: - least_loaded = await balancer.get_least_loaded(remaining) + try: + least_loaded = await balancer.get_least_loaded(remaining) + except Exception as e: + safe_add( + loadbalancer_errors_counter, + 1, + attributes={ + "loadbalancer.backend": backend_name, + "error": type(e).__name__, + }, + ) + least_loaded = None + if not least_loaded: least_loaded = remaining[0] + safe_add( + loadbalancer_selections_counter, + 1, + attributes={ + "loadbalancer.backend": backend_name, + "loadbalancer.result": "fallback", + }, + ) + else: + safe_add( + loadbalancer_selections_counter, + 1, + attributes={ + "loadbalancer.backend": backend_name, + "loadbalancer.result": "selected", + }, + ) await balancer.increment(least_loaded) diff --git a/genesis/inbound.py b/genesis/inbound.py index 2c7b8ae..a114962 100644 --- a/genesis/inbound.py +++ b/genesis/inbound.py @@ -8,33 +8,18 @@ from asyncio import TimeoutError, open_connection, wait_for -from opentelemetry import metrics, trace +from opentelemetry import trace from genesis.exceptions import AuthenticationError, ConnectionTimeoutError from genesis.observability import logger from genesis.protocol import 
Protocol - -tracer = trace.get_tracer(__name__) -meter = metrics.get_meter(__name__) - -active_connections_counter = meter.create_up_down_counter( - "genesis.connections.active", - description="Number of active connections", - unit="1", -) -connection_errors_counter = meter.create_counter( - "genesis.connections.errors", - description="Number of connection errors", - unit="1", +from genesis.protocol.metrics import ( + connection_errors_counter, + connections_active_counter, + safe_add, ) - -def _safe_connection_metric(counter: object, *args: object, **kwargs: object) -> None: - """Add to a counter, swallowing OTel/metrics errors.""" - try: - getattr(counter, "add")(*args, **kwargs) - except Exception: - pass +tracer = trace.get_tracer(__name__) class Inbound(Protocol): @@ -84,7 +69,7 @@ async def authenticate(self) -> None: if response["Reply-Text"] != "+OK accepted": logger.debug("Freeswitch said the passed password is incorrect.") - _safe_connection_metric( + safe_add( connection_errors_counter, 1, attributes={"error": "authentication_failed", "type": "inbound"}, @@ -104,7 +89,7 @@ async def start(self) -> None: await self._connect() except TimeoutError: logger.debug("A timeout occurred when trying to connect to the freeswitch.") - _safe_connection_metric( + safe_add( connection_errors_counter, 1, attributes={"error": "timeout", "type": "inbound"}, @@ -113,9 +98,7 @@ async def start(self) -> None: await super().start() try: - _safe_connection_metric( - active_connections_counter, 1, attributes={"type": "inbound"} - ) + safe_add(connections_active_counter, 1, attributes={"type": "inbound"}) await self.authenticate() except Exception: await self.stop() @@ -124,6 +107,4 @@ async def start(self) -> None: async def stop(self) -> None: """Terminates the connection.""" await super().stop() - _safe_connection_metric( - active_connections_counter, -1, attributes={"type": "inbound"} - ) + safe_add(connections_active_counter, -1, attributes={"type": "inbound"}) diff --git 
a/genesis/outbound.py b/genesis/outbound.py index d2c02b2..89dbc8c 100644 --- a/genesis/outbound.py +++ b/genesis/outbound.py @@ -21,28 +21,18 @@ from collections.abc import Callable from typing import Any, Awaitable, Optional -from opentelemetry import metrics, trace +from opentelemetry import trace from genesis.observability import logger, observability from genesis.channel import Channel from genesis.session import Session - -tracer = trace.get_tracer(__name__) -meter = metrics.get_meter(__name__) - -active_connections_counter = meter.create_up_down_counter( - "genesis.connections.active", - description="Number of active connections", - unit="1", +from genesis.protocol.metrics import ( + connection_errors_counter, + connections_active_counter, + safe_add, ) - -def _safe_connection_metric(counter: object, *args: object, **kwargs: object) -> None: - """Add to a counter, swallowing OTel/metrics errors.""" - try: - getattr(counter, "add")(*args, **kwargs) - except Exception: - pass +tracer = trace.get_tracer(__name__) async def _setup_session(session: Session, server: "Outbound") -> None: @@ -159,9 +149,9 @@ async def handler( "net.peer.name": server.host, "net.peer.port": server.port, }, - ): - _safe_connection_metric( - active_connections_counter, + ) as span: + safe_add( + connections_active_counter, 1, attributes={"type": "outbound"}, ) @@ -169,9 +159,20 @@ async def handler( async with Session(reader, writer) as session: await _setup_session(session, server) await server.app(session) + except Exception as e: + # Record the outbound connection error (gap from mapping: + # outbound previously had no error counter). 
+ safe_add( + connection_errors_counter, + 1, + attributes={"error": type(e).__name__, "type": "outbound"}, + ) + span.record_exception(e) + span.set_status(trace.Status(trace.StatusCode.ERROR, str(e))) + raise finally: - _safe_connection_metric( - active_connections_counter, + safe_add( + connections_active_counter, -1, attributes={"type": "outbound"}, ) diff --git a/genesis/protocol/base.py b/genesis/protocol/base.py index ed5412e..db45f73 100644 --- a/genesis/protocol/base.py +++ b/genesis/protocol/base.py @@ -43,6 +43,9 @@ timeout_counter, channel_routing_counter, global_routing_counter, + event_processing_duration, + register_protocol, + safe_record, ) from genesis.protocol.routing import ( CompositeRoutingStrategy, @@ -73,6 +76,8 @@ def __init__(self): self.handlers: Dict[str, List[EventHandler]] = {} self.channel_registry: Dict[str, List[EventHandler]] = {} self.handler_tasks: set[Task[Any]] = set() + # Register so the ObservableGauges can report this protocol's queue depth. + register_protocol(self) # Initialize routing strategy (Strategy Pattern) self.routing_strategy = CompositeRoutingStrategy( @@ -195,24 +200,42 @@ async def consume(self) -> None: logger.error(f"Error in consumer loop: {outer_e}", exc_info=True) async def _process_one_event(self, event: ESLEvent) -> None: - """Run telemetry, processors, and dispatch for one event.""" + """Run telemetry, processors, and dispatch for one event. + + The ``process_event`` span wraps metrics+logging AND the processor + chain + routing, so the new ``freeswitch.channel.*`` lifecycle spans + (emitted by processors) become children of it and share its trace. 
+ """ try: attributes = build_event_attributes(event) - with tracer.start_as_current_span("process_event", attributes=attributes): - record_event_metrics(event) - log_event(event) except Exception: - record_event_metrics(event) - log_event(event) + attributes = {} - for processor in self.event_processors: - result = processor(self, event) - if asyncio.iscoroutine(result): - await result + start_time = time.perf_counter() + with tracer.start_as_current_span("process_event", attributes=attributes): + try: + record_event_metrics(event) + log_event(event) + except Exception: + record_event_metrics(event) + log_event(event) - handlers, _ = await self.routing_strategy.route(event) - if handlers: - dispatch_to_handlers(handlers, event, self.handler_tasks) + for processor in self.event_processors: + result = processor(self, event) + if asyncio.iscoroutine(result): + await result + + handlers, _ = await self.routing_strategy.route(event) + if handlers: + dispatch_to_handlers(handlers, event, self.handler_tasks) + + safe_record( + event_processing_duration, + time.perf_counter() - start_time, + attributes={ + "event.name": event.get("Event-Name", "UNKNOWN"), + }, + ) def on( self, @@ -288,7 +311,12 @@ async def send(self, cmd: str) -> ESLEvent: try: with tracer.start_as_current_span("send_command") as span: - span.set_attribute("command.name", cmd) + # Use the command verb (first token) as command.name to avoid + # high-cardinality span attributes (raw cmd may carry UUIDs). 
+ span.set_attribute("command.name", command_name) + remainder = cmd[len(command_name) :].strip() + if remainder: + span.set_attribute("command.args", remainder[:200]) return await self._execute_send(cmd, command_name, start_time, span) except Exception: # OTel not initialized - run without tracing @@ -330,6 +358,10 @@ async def _execute_send( reply = result.get("Reply-Text", "") if reply.startswith("-ERR"): self._record_command_error(command_name, "protocol_error") + if span is not None: + span.set_attribute("command.error", "protocol_error") + span.set_status(trace.Status(trace.StatusCode.ERROR, reply)) + span.record_exception(Exception(reply)) if span is not None: reply_text = result.get("Reply-Text") diff --git a/genesis/protocol/lifecycle.py b/genesis/protocol/lifecycle.py new file mode 100644 index 0000000..c7a5a24 --- /dev/null +++ b/genesis/protocol/lifecycle.py @@ -0,0 +1,470 @@ +""" +ESL lifecycle telemetry processors +---------------------------------- + +These processors run after the core protocol processors and emit OpenTelemetry +spans for the semantic FreeSWITCH channel lifecycle (``freeswitch.channel.*``) +and for CUSTOM subclasses (``sofia::``, ``callcenter::``, ``conference::``, +``valet_parking::``). They only enrich telemetry — they never consume events +that route to user handlers. + +Correlation with the passive sniffer (Otoru/sniffer) is attribute-based: +every channel span carries ``sip.call_id`` (= ``variable_sip_call_id``), which +matches the sniffer's ``voip.call_id``. The join happens at the observability +backend (Grafana/Tempo), not in code. No sniffer changes are required. + +Cardinality rule: UUIDs go on spans only; metric attributes use low-cardinality +enums/labels (channel.state, direction, hangup.cause, application.name, ...). 
+""" + +import os +from typing import TYPE_CHECKING, Any, Dict, Optional + +from opentelemetry import trace + +from genesis.protocol.parser import ESLEvent +from genesis.protocol.metrics import ( + calls_active_counter, + channel_bridge_events_counter, + channel_codec_changes_counter, + channel_transfers_counter, + dialplan_applications_counter, + events_without_sip_call_id_counter, + hangup_q850_counter, + safe_add, +) + +if TYPE_CHECKING: + from genesis.protocol.base import Protocol + +tracer = trace.get_tracer(__name__) + +# Feature flags (default on; opt-out via env). Reserved for future W3C +# propagation is intentionally NOT implemented here (out of scope). +_LIFECYCLE_ENABLED = os.environ.get("GENESIS_TRACE_ESL_LIFECYCLE", "1") != "0" +_CUSTOM_ENABLED = os.environ.get("GENESIS_TRACE_CUSTOM_SUBCLASSES", "1") != "0" + + +def _str(event: ESLEvent, key: str) -> Optional[str]: + """Return a single string value for key (list-aware), or None.""" + value = event.get(key) + if value is None: + return None + if isinstance(value, list): + return value[0] if value else None + return value if isinstance(value, str) else str(value) + + +def _set(attrs: Dict[str, Any], dst: str, event: ESLEvent, src: str) -> None: + """Copy event[src] into attrs[dst] when present.""" + value = _str(event, src) + if value: + attrs[dst] = value + + +def _channel_attrs(event: ESLEvent) -> Dict[str, Any]: + """Common channel attributes (uuid, call_uuid, direction, sip.call_id, other_leg).""" + attrs: Dict[str, Any] = {} + _set(attrs, "channel.uuid", event, "Unique-ID") + _set(attrs, "channel.call_uuid", event, "Channel-Call-UUID") + _set(attrs, "channel.direction", event, "Call-Direction") + _set(attrs, "sip.call_id", event, "variable_sip_call_id") + _set(attrs, "other_leg.uuid", event, "Other-Leg-Unique-ID") + return attrs + + +def _record_sip_gap(event: ESLEvent, attrs: Dict[str, Any]) -> None: + """Count channel events that lack the sniffer correlation key.""" + if "sip.call_id" not in 
attrs: + safe_add(events_without_sip_call_id_counter, 1, attributes={}) + + +# Event names handled by the lifecycle processor. +_LIFECYCLE_EVENTS = { + "CHANNEL_CREATE", + "CHANNEL_PROGRESS", + "CHANNEL_PROGRESS_MEDIA", + "CHANNEL_ANSWER", + "CHANNEL_BRIDGE", + "CHANNEL_UNBRIDGE", + "CHANNEL_HANGUP", + "CHANNEL_HANGUP_COMPLETE", + "CHANNEL_DESTROY", + "CHANNEL_EXECUTE", + "CHANNEL_EXECUTE_COMPLETE", + "CHANNEL_PARK", + "CHANNEL_UNPARK", + "CALL_UPDATE", + "CODEC", +} + + +async def channel_lifecycle_processor(protocol: "Protocol", event: ESLEvent) -> None: + """Emit ``freeswitch.channel.*`` spans for channel lifecycle events.""" + if not _LIFECYCLE_ENABLED: + return + + name = _str(event, "Event-Name") + if not name or name not in _LIFECYCLE_EVENTS: + return + + attrs = _channel_attrs(event) + _record_sip_gap(event, attrs) + + if name == "CHANNEL_CREATE": + _emit_create(event, attrs) + elif name == "CHANNEL_PROGRESS": + _emit_progress(event, attrs) + elif name == "CHANNEL_PROGRESS_MEDIA": + _emit_progress_media(event, attrs) + elif name == "CHANNEL_ANSWER": + _emit_answer(event, attrs) + elif name == "CHANNEL_BRIDGE": + _emit_bridge(event, attrs) + elif name == "CHANNEL_UNBRIDGE": + _emit_unbridge(event, attrs) + elif name == "CHANNEL_HANGUP": + _emit_hangup(event, attrs) + elif name == "CHANNEL_HANGUP_COMPLETE": + _emit_hangup_complete(event, attrs) + elif name == "CHANNEL_DESTROY": + _emit_destroy(event, attrs) + elif name == "CHANNEL_EXECUTE": + _emit_execute(event, attrs) + elif name == "CHANNEL_EXECUTE_COMPLETE": + _emit_execute_complete(event, attrs) + elif name in ("CHANNEL_PARK", "CHANNEL_UNPARK"): + _emit_state_span(event, attrs, f"freeswitch.channel.{name.lower()[8:]}") + elif name == "CALL_UPDATE": + _emit_call_update(event, attrs) + elif name == "CODEC": + _emit_codec(event, attrs) + + +def _emit_create(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "channel.name", event, "Channel-Name") + _set(attrs, "channel.destination_number", event, 
"Caller-Destination-Number") + _set(attrs, "channel.context", event, "Caller-Context") + _set(attrs, "channel.dialplan", event, "Caller-Dialplan") + _set(attrs, "channel.caller_id_number", event, "Caller-Caller-ID-Number") + _set(attrs, "channel.caller_id_name", event, "Caller-Caller-ID-Name") + _set(attrs, "channel.network_addr", event, "Caller-Network-Addr") + with tracer.start_as_current_span("freeswitch.channel.create", attributes=attrs): + safe_add( + calls_active_counter, + 1, + attributes={ + "channel.state": _str(event, "Channel-State") or "CS_INIT", + "direction": _str(event, "Call-Direction") or "unknown", + }, + ) + + +def _emit_progress(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "channel.state", event, "Channel-State") + attrs["answer.state"] = _str(event, "Answer-State") or "ringing" + with tracer.start_as_current_span("freeswitch.channel.progress", attributes=attrs): + pass + + +def _emit_progress_media(event: ESLEvent, attrs: Dict[str, Any]) -> None: + attrs["answer.state"] = _str(event, "Answer-State") or "early" + _set(attrs, "channel.read_codec", event, "Channel-Read-Codec-Name") + _set(attrs, "channel.write_codec", event, "Channel-Write-Codec-Name") + with tracer.start_as_current_span( + "freeswitch.channel.progress_media", attributes=attrs + ): + pass + + +def _emit_answer(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "channel.state", event, "Channel-State") + attrs["answer.state"] = "answered" + _set(attrs, "channel.read_codec", event, "Channel-Read-Codec-Name") + _set(attrs, "channel.write_codec", event, "Channel-Write-Codec-Name") + with tracer.start_as_current_span("freeswitch.channel.answer", attributes=attrs): + pass + + +def _emit_bridge(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "bridge.a_uuid", event, "Bridge-A-Unique-ID") + _set(attrs, "bridge.b_uuid", event, "Bridge-B-Unique-ID") + _set(attrs, "other_leg.type", event, "Other-Type") + _set(attrs, "other_leg.destination_number", 
event, "Other-Leg-Destination-Number") + _set(attrs, "other_leg.caller_id_number", event, "Other-Leg-Caller-ID-Number") + with tracer.start_as_current_span( + "freeswitch.channel.bridge", attributes=attrs + ) as span: + a = attrs.get("bridge.a_uuid", "unknown") + b = attrs.get("bridge.b_uuid", "unknown") + span.add_event( + "bridge.established", + attributes={"bridge.a_uuid": a, "bridge.b_uuid": b}, + ) + safe_add( + channel_bridge_events_counter, + 1, + attributes={"bridge.result": "established"}, + ) + + +def _emit_unbridge(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "bridge.a_uuid", event, "Bridge-A-Unique-ID") + # CHANNEL_UNBRIDGE may carry Other-Leg-Unique-ID instead of Bridge-B. + if "bridge.b_uuid" not in attrs: + _set(attrs, "bridge.b_uuid", event, "Other-Leg-Unique-ID") + _set(attrs, "hangup.cause", event, "Hangup-Cause") + with tracer.start_as_current_span( + "freeswitch.channel.unbridge", attributes=attrs + ) as span: + span.add_event( + "bridge.torn_down", + attributes={ + "bridge.a_uuid": attrs.get("bridge.a_uuid", "unknown"), + "bridge.b_uuid": attrs.get("bridge.b_uuid", "unknown"), + }, + ) + metric_attrs: Dict[str, Any] = {"bridge.result": "unbridged"} + cause = attrs.get("hangup.cause") + if cause: + metric_attrs["hangup.cause"] = cause + safe_add(channel_bridge_events_counter, 1, attributes=metric_attrs) + + +def _emit_hangup(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "hangup.cause", event, "Hangup-Cause") + _set(attrs, "channel.state", event, "Channel-State") + attrs["answer.state"] = "hangup" + cause = _str(event, "Hangup-Cause") or "unknown" + normalized = cause.lower().replace(" ", "_") + with tracer.start_as_current_span( + "freeswitch.channel.hangup", attributes=attrs + ) as span: + span.add_event(f"hangup.cause.{normalized}", attributes={"hangup.cause": cause}) + + +def _emit_hangup_complete(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "hangup.cause", event, "Hangup-Cause") + 
_set(attrs, "hangup.cause.q850", event, "variable_hangup_cause_q850") + _set(attrs, "channel.name", event, "Channel-Name") + with tracer.start_as_current_span( + "freeswitch.channel.hangup_complete", attributes=attrs + ) as span: + span.add_event( + "call.finalized", + attributes={"hangup.cause": attrs.get("hangup.cause", "unknown")}, + ) + q850 = _str(event, "variable_hangup_cause_q850") + if q850: + safe_add( + hangup_q850_counter, + 1, + attributes={"hangup.cause.q850": q850}, + ) + + +def _emit_destroy(event: ESLEvent, attrs: Dict[str, Any]) -> None: + with tracer.start_as_current_span("freeswitch.channel.destroy", attributes=attrs): + safe_add( + calls_active_counter, + -1, + attributes={ + "channel.state": "CS_DESTROY", + "direction": _str(event, "Call-Direction") or "unknown", + }, + ) + + +def _emit_execute(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "application.name", event, "Application") + _set(attrs, "application.uuid", event, "Application-UUID") + _set(attrs, "application.data", event, "Application-Data") + with tracer.start_as_current_span("freeswitch.channel.execute", attributes=attrs): + app = _str(event, "Application") or "unknown" + safe_add( + dialplan_applications_counter, + 1, + attributes={"application.name": app, "application.result": "started"}, + ) + + +def _emit_execute_complete(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "application.name", event, "Application") + _set(attrs, "application.uuid", event, "Application-UUID") + _set(attrs, "application.response", event, "Application-Response") + app = _str(event, "Application") or "unknown" + response = _str(event, "Application-Response") or "" + result = "success" if response and not response.startswith("-ERR") else "fail" + with tracer.start_as_current_span( + "freeswitch.channel.execute_complete", attributes=attrs + ) as span: + span.add_event( + f"app.{app}.done", + attributes={"application.name": app, "application.result": result}, + ) + safe_add( + 
dialplan_applications_counter, + 1, + attributes={"application.name": app, "application.result": result}, + ) + + +def _emit_state_span(event: ESLEvent, attrs: Dict[str, Any], span_name: str) -> None: + _set(attrs, "channel.state", event, "Channel-State") + with tracer.start_as_current_span(span_name, attributes=attrs): + pass + + +def _emit_call_update(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "bridged.to", event, "Bridged-To") + _set(attrs, "caller.transfer_source", event, "Caller-Transfer-Source") + _set(attrs, "caller.orig_caller_id_number", event, "Caller-Orig-Caller-ID-Number") + with tracer.start_as_current_span( + "freeswitch.call.update", attributes=attrs + ) as span: + span.add_event("caller_id.mutated", attributes={}) + + +def _emit_codec(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "channel.read_codec.name", event, "Channel-Read-Codec-Name") + _set(attrs, "channel.read_codec.rate", event, "Channel-Read-Codec-Rate") + _set(attrs, "channel.write_codec.name", event, "Channel-Write-Codec-Name") + _set(attrs, "channel.write_codec.rate", event, "Channel-Write-Codec-Rate") + read_codec = _str(event, "Channel-Read-Codec-Name") or "unknown" + write_codec = _str(event, "Channel-Write-Codec-Name") or "unknown" + with tracer.start_as_current_span("freeswitch.channel.codec", attributes=attrs): + safe_add( + channel_codec_changes_counter, + 1, + attributes={ + "channel.read_codec": read_codec, + "channel.write_codec": write_codec, + }, + ) + + +# --------------------------------------------------------------------------- +# CUSTOM subclass processor +# --------------------------------------------------------------------------- +_CUSTOM_MAP = { + "sofia::transferor": "transferor", + "sofia::transferee": "transferee", + "sofia::reinvite": "reinvite", + "sofia::replaced": "replaced", + "sofia::register": "register", + "sofia::unregister": "register", + "sofia::expire": "register", + "sofia::gateway_state": "register", + 
"callcenter::info": "callcenter", + "conference::maintenance": "conference", + "conference::cdr": "conference", + "valet_parking::info": "valet", +} + + +async def custom_subclass_processor(protocol: "Protocol", event: ESLEvent) -> None: + """Emit spans for CUSTOM subclasses (sofia/callcenter/conference/valet).""" + if not _CUSTOM_ENABLED: + return + if _str(event, "Event-Name") != "CUSTOM": + return + subclass = _str(event, "Event-Subclass") + if not subclass or subclass not in _CUSTOM_MAP: + return + + attrs = _channel_attrs(event) + kind = _CUSTOM_MAP[subclass] + + if kind in ("transferor", "transferee"): + _emit_transfer(event, attrs, kind) + elif kind in ("reinvite", "replaced"): + _emit_reinvite(event, attrs, kind) + elif kind == "register": + _emit_register(event, attrs, subclass) + elif kind == "callcenter": + _emit_callcenter(event, attrs) + elif kind == "conference": + _emit_conference(event, attrs, subclass) + elif kind == "valet": + _emit_valet(event, attrs) + + +def _emit_transfer(event: ESLEvent, attrs: Dict[str, Any], role: str) -> None: + attrs["transfer.role"] = role + # Heuristic: transferee only occurs in attended transfers; a lone + # transferor is typically a blind transfer. 
+ attrs["transfer.type"] = "attended" if role == "transferee" else "blind" + _set(attrs, "sofia.profile", event, "variable_sofia_profile_name") + with tracer.start_as_current_span( + "freeswitch.sofia.transfer", attributes=attrs + ) as span: + span.add_event( + "transfer.initiated", + attributes={"transfer.role": role, "transfer.type": attrs["transfer.type"]}, + ) + safe_add( + channel_transfers_counter, + 1, + attributes={ + "transfer.type": attrs["transfer.type"], + "transfer.role": role, + }, + ) + + +def _emit_reinvite(event: ESLEvent, attrs: Dict[str, Any], kind: str) -> None: + _set(attrs, "sofia.profile", event, "variable_sofia_profile_name") + with tracer.start_as_current_span( + f"freeswitch.sofia.{kind}", attributes=attrs + ) as span: + span.add_event("media.renegotiated", attributes={}) + + +def _emit_register(event: ESLEvent, attrs: Dict[str, Any], subclass: str) -> None: + from_user = _str(event, "from-user") + from_host = _str(event, "from-host") + if from_user and from_host: + attrs["register.aor"] = f"{from_user}@{from_host}" + _set(attrs, "register.contact_ip", event, "contact") + _set(attrs, "register.expires_s", event, "expires") + _set(attrs, "register.response_code", event, "status") + _set(attrs, "gateway.name", event, "Gateway-Name") + _set(attrs, "gateway.state", event, "State") + attrs["register.action"] = subclass.split("::")[1] + with tracer.start_as_current_span("freeswitch.sofia.register", attributes=attrs): + pass + + +def _emit_callcenter(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "cc.queue", event, "CC-Queue") + _set(attrs, "cc.action", event, "CC-Action") + _set(attrs, "cc.agent", event, "CC-Agent") + _set(attrs, "cc.member_uuid", event, "CC-Member-UUID") + _set(attrs, "cc.count", event, "CC-Count") + _set(attrs, "cc.selection", event, "CC-Selection") + with tracer.start_as_current_span("freeswitch.callcenter.info", attributes=attrs): + pass + + +def _emit_conference(event: ESLEvent, attrs: Dict[str, Any], 
subclass: str) -> None: + _set(attrs, "conference.name", event, "Conference-Name") + _set(attrs, "conference.profile", event, "Conference-Profile") + _set(attrs, "conference.action", event, "Action") + _set(attrs, "conference.member_id", event, "Member-ID") + _set(attrs, "old.member_id", event, "Old-Member-ID") + span_name = ( + "freeswitch.conference.cdr" + if subclass == "conference::cdr" + else "freeswitch.conference.maintenance" + ) + with tracer.start_as_current_span(span_name, attributes=attrs): + pass + + +def _emit_valet(event: ESLEvent, attrs: Dict[str, Any]) -> None: + _set(attrs, "valet.lot", event, "Valet-Lot-Name") + _set(attrs, "valet.extension", event, "Valet-Extension") + _set(attrs, "valet.action", event, "Action") + _set(attrs, "bridge.to_uuid", event, "Bridge-To-UUID") + with tracer.start_as_current_span("freeswitch.valet.info", attributes=attrs): + pass diff --git a/genesis/protocol/metrics.py b/genesis/protocol/metrics.py index 319fdb9..f77d1e7 100644 --- a/genesis/protocol/metrics.py +++ b/genesis/protocol/metrics.py @@ -2,15 +2,25 @@ Metrics definitions for Protocol operations. This module centralizes all OpenTelemetry metrics used by the Protocol -and related components (Channel, Session, etc.). +and related components (Channel, Session, Inbound, Outbound, etc.). + +Centralization avoids duplicated instrument definitions (which both trip +static analysis and produce OTel SDK warnings when the same metric name is +created with different descriptions in multiple modules). 
""" +import weakref +from typing import Any, Iterable + from opentelemetry import trace, metrics +from opentelemetry.metrics import Observation tracer = trace.get_tracer(__name__) meter = metrics.get_meter(__name__) +# --------------------------------------------------------------------------- # Command metrics +# --------------------------------------------------------------------------- commands_sent_counter = meter.create_counter( "genesis.commands.sent", description="Number of ESL commands sent", @@ -35,7 +45,9 @@ unit="1", ) +# --------------------------------------------------------------------------- # Channel operation metrics +# --------------------------------------------------------------------------- channel_operations_counter = meter.create_counter( "genesis.channel.operations", description="Number of channel operations", @@ -78,7 +90,9 @@ unit="1", ) +# --------------------------------------------------------------------------- # Routing metrics (for O(1) event routing) +# --------------------------------------------------------------------------- channel_routing_counter = meter.create_counter( "genesis.channel.routing.hits", description="Number of O(1) channel routing hits", @@ -90,3 +104,208 @@ description="Number of fallback to O(N) global routing", unit="1", ) + +# --------------------------------------------------------------------------- +# Connection metrics (shared by Inbound and Outbound) +# --------------------------------------------------------------------------- +connections_active_counter = meter.create_up_down_counter( + "genesis.connections.active", + description="Number of active connections", + unit="1", +) + +connection_errors_counter = meter.create_counter( + "genesis.connections.errors", + description="Number of connection errors", + unit="1", +) + +# --------------------------------------------------------------------------- +# New ESL lifecycle / routing correlation metrics +# 
--------------------------------------------------------------------------- +# Cardinality rule: metric attributes NEVER carry UUIDs; only low-cardinality +# enums/labels (channel.state, direction, hangup.cause, application.name, ...). +# UUIDs go on spans only. +calls_active_counter = meter.create_up_down_counter( + "genesis.calls.active", + description="Number of active calls by state and direction", + unit="1", +) + +channel_bridge_events_counter = meter.create_counter( + "genesis.channel.bridge.events", + description="ESL CHANNEL_BRIDGE/UNBRIDGE events (authoritative bridge state)", + unit="1", +) + +channel_transfers_counter = meter.create_counter( + "genesis.channel.transfers", + description="Call transfers observed via sofia::transferor/transferee", + unit="1", +) + +channel_codec_changes_counter = meter.create_counter( + "genesis.channel.codec.changes", + description="Codec renegotiations observed via CODEC events", + unit="1", +) + +dialplan_applications_counter = meter.create_counter( + "genesis.dialplan.applications", + description="Dialplan applications executed (CHANNEL_EXECUTE[_COMPLETE])", + unit="1", +) + +hangup_q850_counter = meter.create_counter( + "genesis.channel.hangup.causes.q850", + description="Hangup causes by Q.850 code", + unit="1", +) + +event_processing_duration = meter.create_histogram( + "genesis.event.processing.duration", + description="Duration of event dispatch (processors + routing)", + unit="s", +) + +events_without_sip_call_id_counter = meter.create_counter( + "genesis.events.without_sip_call_id", + description="Channel events lacking variable_sip_call_id (correlation gap)", + unit="1", +) + +# --------------------------------------------------------------------------- +# Session / consumer / load balancer metrics +# --------------------------------------------------------------------------- +session_commands_counter = meter.create_counter( + "genesis.session.commands", + description="Session sendmsg commands by application", 
+ unit="1", +) + +session_command_duration = meter.create_histogram( + "genesis.session.command.duration", + description="Duration of session sendmsg commands", + unit="s", +) + +consumer_handlers_counter = meter.create_counter( + "genesis.consumer.handlers", + description="Consumer handler invocations by event and match result", + unit="1", +) + +loadbalancer_selections_counter = meter.create_counter( + "genesis.loadbalancer.selections", + description="Load balancer selections by backend and result", + unit="1", +) + +loadbalancer_errors_counter = meter.create_counter( + "genesis.loadbalancer.errors", + description="Load balancer errors by error type", + unit="1", +) + +# --------------------------------------------------------------------------- +# Observable gauges for queue depth (backpressure visibility) +# --------------------------------------------------------------------------- +# Protocols register themselves (weakly) so the gauge callbacks can sum the +# pending events/commands across all live instances without holding them alive. 
+_protocol_registry: "weakref.WeakSet[Any]" = weakref.WeakSet() + + +def register_protocol(protocol: Any) -> None: + """Register a Protocol instance so its queue depths feed the gauges.""" + _protocol_registry.add(protocol) + + +def _commands_queue_depth(_options: Any) -> Iterable[Observation]: + total = 0 + for proto in list(_protocol_registry): + try: + total += proto.commands.qsize() + except Exception: + pass + yield Observation(total, {}) + + +def _events_queue_depth(_options: Any) -> Iterable[Observation]: + total = 0 + for proto in list(_protocol_registry): + try: + total += proto.events.qsize() + except Exception: + pass + yield Observation(total, {}) + + +commands_queue_depth_gauge = meter.create_observable_gauge( + "genesis.commands.queue.depth", + callbacks=[_commands_queue_depth], + description="Depth of the pending command reply queue", + unit="1", +) + +events_queue_depth_gauge = meter.create_observable_gauge( + "genesis.events.queue.depth", + callbacks=[_events_queue_depth], + description="Depth of the pending event queue", + unit="1", +) + + +def safe_add(counter: Any, *args: Any, **kwargs: Any) -> None: + """Add to a counter, swallowing OTel/metrics errors (best-effort).""" + try: + getattr(counter, "add")(*args, **kwargs) + except Exception: + pass + + +def safe_record(histogram: Any, *args: Any, **kwargs: Any) -> None: + """Record on a histogram, swallowing OTel/metrics errors (best-effort).""" + try: + getattr(histogram, "record")(*args, **kwargs) + except Exception: + pass + + +# Re-export for callers that import a batch of instruments (kept alphabetical). 
+__all__ = [ + "tracer", + "meter", + "commands_sent_counter", + "events_received_counter", + "command_duration_histogram", + "command_errors_counter", + "channel_operations_counter", + "channel_operation_duration", + "hangup_causes_counter", + "bridge_operations_counter", + "dtmf_received_counter", + "call_duration_histogram", + "timeout_counter", + "channel_routing_counter", + "global_routing_counter", + "connections_active_counter", + "connection_errors_counter", + "calls_active_counter", + "channel_bridge_events_counter", + "channel_transfers_counter", + "channel_codec_changes_counter", + "dialplan_applications_counter", + "hangup_q850_counter", + "event_processing_duration", + "events_without_sip_call_id_counter", + "session_commands_counter", + "session_command_duration", + "consumer_handlers_counter", + "loadbalancer_selections_counter", + "loadbalancer_errors_counter", + "commands_queue_depth_gauge", + "events_queue_depth_gauge", + "register_protocol", + "safe_add", + "safe_record", +] diff --git a/genesis/protocol/processors.py b/genesis/protocol/processors.py index f7e2e68..ce46cf4 100644 --- a/genesis/protocol/processors.py +++ b/genesis/protocol/processors.py @@ -9,6 +9,10 @@ from typing import TYPE_CHECKING, List, Callable, Awaitable, Union from genesis.protocol.parser import ESLEvent +from genesis.protocol.lifecycle import ( + channel_lifecycle_processor, + custom_subclass_processor, +) if TYPE_CHECKING: from genesis.protocol.base import Protocol @@ -47,10 +51,17 @@ async def disconnect_processor(protocol: "Protocol", event: ESLEvent) -> None: def default_processors() -> List[EventProcessor]: - """Return the default list of event processors (order matters).""" + """Return the default list of event processors (order matters). + + Lifecycle/CUSTOM processors run last so they never interfere with the + core protocol adapters (auth, command reply, disconnect). They only emit + telemetry — they do not consume events routed to user handlers. 
+ """ return [ auth_request_processor, command_reply_processor, api_response_processor, disconnect_processor, + channel_lifecycle_processor, + custom_subclass_processor, ] diff --git a/genesis/protocol/routing/dispatcher.py b/genesis/protocol/routing/dispatcher.py index 8f47131..485b873 100644 --- a/genesis/protocol/routing/dispatcher.py +++ b/genesis/protocol/routing/dispatcher.py @@ -10,6 +10,7 @@ from genesis.observability import logger from genesis.protocol.parser import ESLEvent +from genesis.protocol.metrics import consumer_handlers_counter, safe_add from genesis.types import EventHandler @@ -31,7 +32,15 @@ def dispatch_to_handlers( event: The ESL event to dispatch task_set: Optional set to track live tasks (prevents GC and logs exceptions) """ + event_name = event.get("Event-Name", "UNKNOWN") + if isinstance(event_name, list): + event_name = event_name[0] if event_name else "UNKNOWN" for handler in handlers: + safe_add( + consumer_handlers_counter, + 1, + attributes={"event.name": str(event_name)}, + ) if iscoroutinefunction(handler): task = create_task(handler(event)) else: diff --git a/genesis/protocol/telemetry.py b/genesis/protocol/telemetry.py index 13f3c03..6bf79d5 100644 --- a/genesis/protocol/telemetry.py +++ b/genesis/protocol/telemetry.py @@ -37,6 +37,34 @@ def build_event_attributes(event: ESLEvent) -> Dict[str, Any]: if isinstance(value, (str, int, float, bool, list, tuple)): attributes[attr_name] = value + # Routing / correlation attributes (explicit, low-cardinality keys) so the + # ``process_event`` span carries routing info and the sniffer join key. 
+ _EXPLICIT = { + "Call-Direction": "event.direction", + "Channel-State": "event.channel_state", + "Answer-State": "event.answer_state", + "Hangup-Cause": "event.hangup_cause", + "Event-Subclass": "event.subclass", + "Channel-Call-UUID": "event.call_uuid", + "Other-Leg-Unique-ID": "event.other_leg", + "Caller-Context": "event.context", + "Caller-Destination-Number": "event.destination_number", + } + for src, dst in _EXPLICIT.items(): + if src in event: + value = event[src] + if isinstance(value, list): + value = value[0] if value else "" + attributes[dst] = value + + # sip.call_id is the PRIMARY correlation key with the sniffer + # (sniffer emits voip.call_id = SIP Call-ID). Join happens at the backend. + sip_call_id = event.get("variable_sip_call_id") + if sip_call_id: + attributes["sip.call_id"] = ( + sip_call_id[0] if isinstance(sip_call_id, list) else sip_call_id + ) + return attributes diff --git a/genesis/queue/backends.py b/genesis/queue/backends.py index e172bb6..060ba51 100644 --- a/genesis/queue/backends.py +++ b/genesis/queue/backends.py @@ -60,6 +60,10 @@ async def release(self, queue_id: str) -> None: """Release one slot for the queue.""" ... + def depth(self, queue_id: str) -> int: + """Return the number of items waiting in the queue (not yet acquired).""" + ... 
+ class InMemoryBackend: """ @@ -175,3 +179,9 @@ async def release(self, queue_id: str) -> None: state.semaphore.release() async with state.lock: state.condition.notify_all() + + def depth(self, queue_id: str) -> int: + """Return the number of items waiting in the queue (not yet acquired).""" + if queue_id in self._states: + return len(self._states[queue_id].deque) + return 0 diff --git a/genesis/queue/core.py b/genesis/queue/core.py index 5fe2c59..0bc4d38 100644 --- a/genesis/queue/core.py +++ b/genesis/queue/core.py @@ -78,14 +78,22 @@ async def __aenter__(self) -> "QueueSlot": attributes={ ATTR_QUEUE_ID: self._queue_id, ATTR_QUEUE_ITEM_ID: self._item_id, + # queue.depth as a SPAN attribute (not a metric label) keeps + # backpressure observable without metric cardinality blow-up. + "queue.depth": self._queue._backend.depth(self._queue_id), }, - ): - await self._queue._backend.wait_and_acquire( - self._queue_id, - self._item_id, - self._max_concurrent, - timeout=self._timeout, - ) + ) as span: + try: + await self._queue._backend.wait_and_acquire( + self._queue_id, + self._item_id, + self._max_concurrent, + timeout=self._timeout, + ) + except Exception as e: + span.record_exception(e) + span.set_status(trace.Status(trace.StatusCode.ERROR, str(e))) + raise self._acquired = True elapsed = time.monotonic() - start queue_wait_duration.record(elapsed, attributes={ATTR_QUEUE_ID: self._queue_id}) diff --git a/genesis/session.py b/genesis/session.py index 22f7aa6..4f33d1b 100644 --- a/genesis/session.py +++ b/genesis/session.py @@ -7,18 +7,28 @@ from __future__ import annotations +import time from asyncio import Event, Queue, StreamReader, StreamWriter, wait_for from functools import partial from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple from uuid import uuid4 +from opentelemetry import trace + from genesis.observability import logger from genesis.protocol import Protocol from genesis.protocol.parser import ESLEvent +from genesis.protocol.metrics import 
( + session_command_duration, + session_commands_counter, + safe_record, +) if TYPE_CHECKING: from genesis.channel import Channel +tracer = trace.get_tracer(__name__) + def _build_sendmsg_cmd( command: str, @@ -195,23 +205,56 @@ async def sendmsg( ) logger.debug("Send command to freeswitch: '%s'.", cmd) - if block and command == "execute" and resolved_event_uuid: - logger.debug( - "Waiting for command completion with Application-UUID: %s", - resolved_event_uuid, - ) - command_is_complete = self._awaitable_complete_command( - resolved_event_uuid, timeout + start_time = time.perf_counter() + with tracer.start_as_current_span( + "session.sendmsg", + attributes={ + "channel.uuid": self.uuid or "unknown", + "application.name": application, + "application.uuid": resolved_event_uuid or "unknown", + "application.block": str(block), + }, + ): + safe_add_cmd_attrs = {"application.name": application} + + if block and command == "execute" and resolved_event_uuid: + logger.debug( + "Waiting for command completion with Application-UUID: %s", + resolved_event_uuid, + ) + command_is_complete = self._awaitable_complete_command( + resolved_event_uuid, timeout + ) + response = await self.send(cmd) + logger.debug( + "Received response of execute command with block: %s", + response, + ) + with tracer.start_as_current_span( + "session.await_complete", + attributes={ + "channel.uuid": self.uuid or "unknown", + "application.uuid": resolved_event_uuid, + }, + ): + if timeout is not None: + await wait_for(command_is_complete.wait(), timeout=timeout) + else: + await command_is_complete.wait() + result = await self.fifo.get() + safe_record( + session_command_duration, + time.perf_counter() - start_time, + attributes=safe_add_cmd_attrs, + ) + session_commands_counter.add(1, attributes=safe_add_cmd_attrs) + return result + + result = await self.send(cmd) + safe_record( + session_command_duration, + time.perf_counter() - start_time, + attributes=safe_add_cmd_attrs, ) - response = await 
self.send(cmd) - logger.debug( - "Received response of execute command with block: %s", - response, - ) - if timeout is not None: - await wait_for(command_is_complete.wait(), timeout=timeout) - else: - await command_is_complete.wait() - return await self.fifo.get() - - return await self.send(cmd) + session_commands_counter.add(1, attributes=safe_add_cmd_attrs) + return result diff --git a/tests/payloads.py b/tests/payloads.py index 2dab620..0588869 100644 --- a/tests/payloads.py +++ b/tests/payloads.py @@ -580,3 +580,201 @@ Event-Name: CHANNEL_ANSWER Unique-ID: {unique_id} """) + + +# --------------------------------------------------------------------------- +# Lifecycle / CUSTOM payloads for the telemetry processor tests. +# Kept minimal but carry the correlation key (variable_sip_call_id) and the +# fields the processors turn into span attributes / metric labels. +# --------------------------------------------------------------------------- +UUID_A = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" +UUID_B = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb" +SIP_CALL_ID = "sniffer-correlation-key-123" + +_channel_common = dedent("""\ + Unique-ID: {uuid_a} + Channel-Call-UUID: {uuid_a} + Call-Direction: inbound + variable_sip_call_id: {sip_call_id} + """) + + +channel_progress = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CHANNEL_PROGRESS + Channel-State: CS_ROUTING + Answer-State: ringing + """ +) + + +channel_bridge = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CHANNEL_BRIDGE + Bridge-A-Unique-ID: {uuid_a} + Bridge-B-Unique-ID: {uuid_b} + Other-Leg-Unique-ID: {uuid_b} + Other-Type: bride + Other-Leg-Destination-Number: 1002 + Other-Leg-Caller-ID-Number: 1002 + """.format(uuid_a=UUID_A, uuid_b=UUID_B) +) + + +channel_unbridge = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CHANNEL_UNBRIDGE + Bridge-A-Unique-ID: {uuid_a} + 
Other-Leg-Unique-ID: {uuid_b} + Hangup-Cause: NORMAL_CLEARING + """.format(uuid_a=UUID_A, uuid_b=UUID_B) +) + + +channel_hangup_complete = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CHANNEL_HANGUP_COMPLETE + Hangup-Cause: NORMAL_CLEARING + variable_hangup_cause_q850: 16 + Channel-Name: sofia/internal/100@192.168.50.4 + """ +) + + +channel_destroy = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CHANNEL_DESTROY + Channel-State: CS_DESTROY + """ +) + + +channel_execute = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CHANNEL_EXECUTE + Application: playback + Application-UUID: app-uuid-1 + Application-Data: /tmp/hello.wav + """ +) + + +channel_execute_complete = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CHANNEL_EXECUTE_COMPLETE + Application: playback + Application-UUID: app-uuid-1 + Application-Response: FILE PLAYED + """ +) + + +codec = _channel_common.format(uuid_a=UUID_A, sip_call_id=SIP_CALL_ID) + dedent("""\ + Event-Name: CODEC + Channel-Read-Codec-Name: opus + Channel-Read-Codec-Rate: 48000 + Channel-Write-Codec-Name: opus + Channel-Write-Codec-Rate: 48000 + """) + + +call_update = _channel_common.format(uuid_a=UUID_A, sip_call_id=SIP_CALL_ID) + dedent( + """\ + Event-Name: CALL_UPDATE + Bridged-To: {uuid_b} + Caller-Transfer-Source: transfer_src + Caller-Orig-Caller-ID-Number: 100 + """.format(uuid_b=UUID_B) +) + + +sofia_transferor = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CUSTOM + Event-Subclass: sofia::transferor + variable_sofia_profile_name: internal + """ +) + + +sofia_transferee = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CUSTOM + Event-Subclass: sofia::transferee + variable_sofia_profile_name: internal + """ +) + + 
+callcenter_info = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CUSTOM + Event-Subclass: callcenter::info + CC-Queue: sales + CC-Action: agent-state-change + CC-Agent: agent-1001 + CC-Member-UUID: member-uuid-1 + CC-Count: 1 + CC-Selection: round-robin + """ +) + + +conference_maintenance = _channel_common.format( + uuid_a=UUID_A, sip_call_id=SIP_CALL_ID +) + dedent( + """\ + Event-Name: CUSTOM + Event-Subclass: conference::maintenance + Conference-Name: 3000 + Conference-Profile: default + Action: add-member + Member-ID: 1 + Old-Member-ID: 0 + """ +) + + +valet_info = _channel_common.format(uuid_a=UUID_A, sip_call_id=SIP_CALL_ID) + dedent( + """\ + Event-Name: CUSTOM + Event-Subclass: valet_parking::info + Valet-Lot-Name: default + Valet-Extension: 4100 + Action: bridge + Bridge-To-UUID: {uuid_b} + """.format(uuid_b=UUID_B) +) + + +# A channel event WITHOUT variable_sip_call_id — used to assert the +# events_without_sip_call_id_counter correlation-gap metric fires. +channel_create_no_sip = dedent("""\ + Event-Name: CHANNEL_CREATE + Channel-State: CS_INIT + Unique-ID: {uuid_a} + Channel-Call-UUID: {uuid_a} + Call-Direction: outbound + Caller-Destination-Number: 1002 + Caller-Context: default + """).format(uuid_a=UUID_A) diff --git a/tests/test_channel_lifecycle.py b/tests/test_channel_lifecycle.py new file mode 100644 index 0000000..ad24698 --- /dev/null +++ b/tests/test_channel_lifecycle.py @@ -0,0 +1,186 @@ +"""Tests for the ESL lifecycle / CUSTOM subclass telemetry processors. + +These processors emit ``freeswitch.channel.*`` and ``freeswitch.sofia.*`` / +``freeswitch.callcenter.*`` / ``freeswitch.conference.*`` / ``freeswitch.valet.*`` +spans. The key contract under test is **sniffer correlation**: every channel +span must carry ``sip.call_id`` (= ``variable_sip_call_id``) so the passive +sniffer's ``voip.call_id`` can be joined to it at the observability backend. 
+""" + +from __future__ import annotations + +import pytest +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) + +from genesis.protocol.lifecycle import ( + channel_lifecycle_processor, + custom_subclass_processor, +) +from genesis.protocol.parser import parse_headers +from tests import payloads + + +@pytest.fixture +def memory_exporter(): + exporter = InMemorySpanExporter() + processor = SimpleSpanProcessor(exporter) + provider = trace.get_tracer_provider() + if not hasattr(provider, "add_span_processor"): + provider = TracerProvider() + trace.set_tracer_provider(provider) + provider.add_span_processor(processor) + yield exporter + + +def _event(payload: str): + return parse_headers(payload) + + +def _span(exporter: InMemorySpanExporter, name: str): + spans = [s for s in exporter.get_finished_spans() if s.name == name] + assert ( + spans + ), f"span '{name}' not emitted; got {[s.name for s in exporter.get_finished_spans()]}" + return spans[-1] + + +async def test_channel_create_emits_span_and_sip_call_id(memory_exporter): + event = _event(payloads.channel_create) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + + span = _span(memory_exporter, "freeswitch.channel.create") + # Correlation contract: sip.call_id must be present on the span. 
+ assert "sip.call_id" in span.attributes + assert span.attributes["sip.call_id"] + assert span.attributes["channel.uuid"] == "d0b1da34-a727-11e4-9728-6f83a2e5e50a" + assert span.attributes["channel.destination_number"] == "101" + + +async def test_channel_bridge_carries_cross_leg_uuids(memory_exporter): + event = _event(payloads.channel_bridge) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + + span = _span(memory_exporter, "freeswitch.channel.bridge") + assert span.attributes["bridge.a_uuid"] == payloads.UUID_A + assert span.attributes["bridge.b_uuid"] == payloads.UUID_B + assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID + events = [e for e in span.events if e.name == "bridge.established"] + assert events, "bridge.established event not emitted" + + +async def test_channel_unbridge_emits_torn_down_event(memory_exporter): + event = _event(payloads.channel_unbridge) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + + span = _span(memory_exporter, "freeswitch.channel.unbridge") + assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID + assert [e for e in span.events if e.name == "bridge.torn_down"] + + +async def test_hangup_complete_records_q850(memory_exporter): + event = _event(payloads.channel_hangup_complete) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + + span = _span(memory_exporter, "freeswitch.channel.hangup_complete") + assert span.attributes["hangup.cause.q850"] == "16" + assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID + assert [e for e in span.events if e.name == "call.finalized"] + + +async def test_channel_destroy_emits_span(memory_exporter): + event = _event(payloads.channel_destroy) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.channel.destroy") + assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID + + +async def 
test_execute_and_complete_spans(memory_exporter): + event = _event(payloads.channel_execute) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.channel.execute") + assert span.attributes["application.name"] == "playback" + assert span.attributes["application.uuid"] == "app-uuid-1" + + event = _event(payloads.channel_execute_complete) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.channel.execute_complete") + assert span.attributes["application.name"] == "playback" + + +async def test_codec_span(memory_exporter): + event = _event(payloads.codec) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.channel.codec") + assert span.attributes["channel.read_codec.name"] == "opus" + assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID + + +async def test_call_update_span(memory_exporter): + event = _event(payloads.call_update) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.call.update") + assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID + + +async def test_sofia_transfer_blind_and_attended(memory_exporter): + event = _event(payloads.sofia_transferor) + await custom_subclass_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.sofia.transfer") + assert span.attributes["transfer.role"] == "transferor" + assert span.attributes["transfer.type"] == "blind" + + event = _event(payloads.sofia_transferee) + await custom_subclass_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.sofia.transfer") + assert span.attributes["transfer.role"] == "transferee" + assert span.attributes["transfer.type"] == "attended" + + +async def test_callcenter_info_span(memory_exporter): + event = _event(payloads.callcenter_info) + 
await custom_subclass_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.callcenter.info") + assert span.attributes["cc.queue"] == "sales" + assert span.attributes["cc.action"] == "agent-state-change" + + +async def test_conference_maintenance_span(memory_exporter): + event = _event(payloads.conference_maintenance) + await custom_subclass_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.conference.maintenance") + assert span.attributes["conference.name"] == "3000" + assert span.attributes["conference.action"] == "add-member" + + +async def test_valet_info_span(memory_exporter): + event = _event(payloads.valet_info) + await custom_subclass_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.valet.info") + assert span.attributes["valet.lot"] == "default" + assert span.attributes["bridge.to_uuid"] == payloads.UUID_B + + +async def test_no_sip_call_id_event_still_emits_span(memory_exporter): + """A channel event without the correlation key still traces; the gap is + counted by the events_without_sip_call_id metric (no crash, no missing span).""" + event = _event(payloads.channel_create_no_sip) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + span = _span(memory_exporter, "freeswitch.channel.create") + assert "sip.call_id" not in span.attributes + + +async def test_non_lifecycle_event_is_noop(memory_exporter): + """A HEARTBEAT must not produce a lifecycle span.""" + event = _event(payloads.heartbeat) + await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + spans = exporter_names(memory_exporter) + assert not any(name.startswith("freeswitch.channel.") for name in spans) + + +def exporter_names(exporter: InMemorySpanExporter): + return [s.name for s in exporter.get_finished_spans()] diff --git a/tests/test_consumer_tracing.py b/tests/test_consumer_tracing.py new file mode 100644 index 0000000..d4a7fb3 --- 
/dev/null +++ b/tests/test_consumer_tracing.py @@ -0,0 +1,67 @@ +"""Tests for Consumer start/stop OpenTelemetry instrumentation.""" + +from __future__ import annotations + +import asyncio + +import pytest +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) + +from genesis import Consumer + + +@pytest.fixture +def memory_exporter(): + exporter = InMemorySpanExporter() + processor = SimpleSpanProcessor(exporter) + provider = trace.get_tracer_provider() + if not hasattr(provider, "add_span_processor"): + provider = TracerProvider() + trace.set_tracer_provider(provider) + provider.add_span_processor(processor) + yield exporter + + +async def _wait_for_span( + exporter: InMemorySpanExporter, name: str, timeout: float = 5.0 +): + """Event-based poll (no sleep) for a finished span by name.""" + start = asyncio.get_event_loop().time() + while True: + if any(s.name == name for s in exporter.get_finished_spans()): + return + if asyncio.get_event_loop().time() - start >= timeout: + raise TimeoutError(f"span '{name}' not seen within {timeout}s") + future = asyncio.Future() + asyncio.get_event_loop().call_soon(future.set_result, None) + await future + + +async def test_consumer_start_and_stop_spans(freeswitch, memory_exporter): + """Consumer.start emits consumer.start (setup phase) and stop emits consumer.stop.""" + consumer = Consumer(*freeswitch.address) + + start_task = asyncio.create_task(consumer.start()) + # Wait for the setup-phase span to finalize (before the blocking wait loop). 
+ await _wait_for_span(memory_exporter, "consumer.start", timeout=5.0) + + await consumer.stop() + try: + await asyncio.wait_for(start_task, timeout=5.0) + except (asyncio.TimeoutError, Exception): + start_task.cancel() + + spans = {s.name for s in memory_exporter.get_finished_spans()} + assert "consumer.start" in spans + assert "consumer.stop" in spans + + start_span = next( + s for s in memory_exporter.get_finished_spans() if s.name == "consumer.start" + ) + assert start_span.attributes["consumer.host"] == freeswitch.address[0] + assert "consumer.port" in start_span.attributes diff --git a/tests/test_inbound.py b/tests/test_inbound.py index 69ef23a..6ad3760 100644 --- a/tests/test_inbound.py +++ b/tests/test_inbound.py @@ -132,7 +132,7 @@ async def test_inbound_client_send_command_error(freeswitch): async def test_inbound_metrics_error_on_start(freeswitch): async with freeswitch: with patch( - "genesis.inbound.active_connections_counter.add", + "genesis.inbound.connections_active_counter.add", side_effect=Exception("Metrics error"), ): async with Inbound(*freeswitch.address) as client: @@ -143,7 +143,7 @@ async def test_inbound_metrics_error_on_stop(freeswitch): async with freeswitch: async with Inbound(*freeswitch.address): with patch( - "genesis.inbound.active_connections_counter.add", + "genesis.inbound.connections_active_counter.add", side_effect=Exception("Metrics error"), ): pass diff --git a/tests/test_session_tracing.py b/tests/test_session_tracing.py new file mode 100644 index 0000000..f34a88c --- /dev/null +++ b/tests/test_session_tracing.py @@ -0,0 +1,124 @@ +"""Tests for Session.sendmsg OpenTelemetry instrumentation. + +Verifies the ``session.sendmsg`` span (and the ``session.await_complete`` +child span when blocking) plus the ``genesis.session.commands`` metric +attributes. Uses the Outbound + Dialplan doubles (no real FreeSWITCH). 
+""" + +from __future__ import annotations + +import asyncio + +import pytest +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) + +from genesis import Outbound, Session + + +@pytest.fixture +def memory_exporter(): + exporter = InMemorySpanExporter() + processor = SimpleSpanProcessor(exporter) + provider = trace.get_tracer_provider() + if not hasattr(provider, "add_span_processor"): + provider = TracerProvider() + trace.set_tracer_provider(provider) + provider.add_span_processor(processor) + yield exporter + + +def _spans(exporter: InMemorySpanExporter): + return exporter.get_finished_spans() + + +async def test_sendmsg_non_blocking_emits_span(host, port, dialplan, memory_exporter): + """A non-blocking sendmsg emits session.sendmsg with application metadata.""" + done = asyncio.Event() + handler_started = asyncio.Event() + + async def handler(session: Session) -> None: + handler_started.set() + # Non-blocking execute: returns the +OK reply, no completion wait. 
+ await session.sendmsg("execute", "answer", block=False) + done.set() + + address = (host(), port()) + app = Outbound(handler, *address) + await app.start(block=False) + await dialplan.start(*address) + + await asyncio.wait_for(dialplan.client_connected.wait(), timeout=5.0) + await asyncio.wait_for(handler_started.wait(), timeout=5.0) + await asyncio.wait_for(done.wait(), timeout=5.0) + + await app.stop() + await dialplan.stop() + + sendmsg_spans = [s for s in _spans(memory_exporter) if s.name == "session.sendmsg"] + assert sendmsg_spans, "session.sendmsg span not emitted" + span = sendmsg_spans[-1] + assert span.attributes["application.name"] == "answer" + assert span.attributes["application.block"] == "False" + + +async def test_sendmsg_blocking_emits_await_complete( + host, port, dialplan, memory_exporter +): + """A blocking sendmsg emits session.sendmsg + a session.await_complete child.""" + sendmsg_returned = asyncio.Event() + handler_started = asyncio.Event() + captured = {} + + async def handler(session: Session) -> None: + handler_started.set() + captured["channel_uuid"] = session.uuid + # blocking execute; the Dialplan double stores the pending Event-UUID. + await session.sendmsg("execute", "playback", "/tmp/x.wav", block=True) + sendmsg_returned.set() + + address = (host(), port()) + app = Outbound(handler, *address) + await app.start(block=False) + await dialplan.start(*address) + + await asyncio.wait_for(dialplan.client_connected.wait(), timeout=5.0) + await asyncio.wait_for(handler_started.wait(), timeout=5.0) + + # Wait (event-based, no sleep) until the Dialplan double has recorded the + # pending execute Event-UUID, then broadcast the completion event so the + # blocked sendmsg returns. The complete event must carry the session's + # channel UUID so O(1) channel routing delivers it to the registered handler. 
+ async def _pending_uuid(): + while not dialplan.pending_execute_events: + future = asyncio.Future() + asyncio.get_event_loop().call_soon(future.set_result, None) + await future + return next(iter(dialplan.pending_execute_events)) + + app_uuid = await asyncio.wait_for(_pending_uuid(), timeout=5.0) + + await dialplan.broadcast( + { + "Event-Name": "CHANNEL_EXECUTE_COMPLETE", + "Application-UUID": app_uuid, + "Unique-ID": captured["channel_uuid"], + } + ) + + await asyncio.wait_for(sendmsg_returned.wait(), timeout=5.0) + await app.stop() + await dialplan.stop() + + names = [s.name for s in _spans(memory_exporter)] + assert "session.sendmsg" in names + assert "session.await_complete" in names + sendmsg_span = next( + s for s in _spans(memory_exporter) if s.name == "session.sendmsg" + ) + assert sendmsg_span.attributes["application.name"] == "playback" + assert sendmsg_span.attributes["application.block"] == "True" From 8f594d92e774bdcbcbde907cd3b2ea235bebff5d Mon Sep 17 00:00:00 2001 From: Vitor Hugo Date: Tue, 30 Jun 2026 22:57:55 -0300 Subject: [PATCH 2/5] docs(metrics): rewrite lifecycle/session metrics in human-readable format --- docs/content/docs/Observability/metrics.md | 95 ++++++++++++++++------ 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/docs/content/docs/Observability/metrics.md b/docs/content/docs/Observability/metrics.md index 995b5f1..daa0201 100644 --- a/docs/content/docs/Observability/metrics.md +++ b/docs/content/docs/Observability/metrics.md @@ -91,30 +91,71 @@ For programmatic access to load counts per destination, use the load balancer's - Description: Number of timeouts - Attributes: `timeout.type` (wait, command, connection), `timeout.operation`, `timeout.duration` -**ESL Lifecycle Metrics (sniffer correlation):** - -These metrics are emitted by the lifecycle/CUSTOM processors. Cardinality rule: -attributes carry low-cardinality enums only — UUIDs go on spans, never as -metric labels. 
- -- **`genesis.calls.active`** (UpDownCounter) — Active calls by state and direction; +1 on `CHANNEL_CREATE`, -1 on `CHANNEL_DESTROY`. Attributes: `channel.state`, `direction` -- **`genesis.channel.bridge.events`** (Counter) — Authoritative bridge state from `CHANNEL_BRIDGE`/`UNBRIDGE`. Attributes: `bridge.result` (`established`/`unbridged`), `hangup.cause` -- **`genesis.channel.transfers`** (Counter) — Transfers via `sofia::transferor`/`transferee`. Attributes: `transfer.type` (`blind`/`attended`), `transfer.role` -- **`genesis.channel.codec.changes`** (Counter) — Codec renegotiations from `CODEC` events. Attributes: `channel.read_codec`, `channel.write_codec` -- **`genesis.dialplan.applications`** (Counter) — Dialplan apps from `CHANNEL_EXECUTE`/`_COMPLETE`. Attributes: `application.name`, `application.result` (`started`/`success`/`fail`) -- **`genesis.channel.hangup.causes.q850`** (Counter) — Hangup causes by Q.850 code. Attributes: `hangup.cause.q850` -- **`genesis.event.processing.duration`** (Histogram) — Duration of event dispatch (processors + routing). Attributes: `event.name` -- **`genesis.events.without_sip_call_id`** (Counter) — Channel events lacking `variable_sip_call_id` (a correlation-gap signal vs the passive sniffer). Attributes: (none) - -**Session / Consumer / Load Balancer / Queue Metrics:** -- **`genesis.session.commands`** (Counter) — Session `sendmsg` commands. Attributes: `application.name` -- **`genesis.session.command.duration`** (Histogram) — Duration of session `sendmsg` commands. Attributes: `application.name` -- **`genesis.consumer.handlers`** (Counter) — Consumer handler invocations. Attributes: `event.name` -- **`genesis.loadbalancer.selections`** (Counter) — Load balancer selections. Attributes: `loadbalancer.backend`, `loadbalancer.result` (`selected`/`fallback`) -- **`genesis.loadbalancer.errors`** (Counter) — Load balancer errors. 
Attributes: `loadbalancer.backend`, `error` -- **`genesis.commands.queue.depth`** (ObservableGauge) — Depth of the pending command-reply queue. Attributes: (none) -- **`genesis.events.queue.depth`** (ObservableGauge) — Depth of the pending event queue. Attributes: (none) - -> All metric instruments are centralized in `genesis/protocol/metrics.py`. -> Import them from there rather than re-declaring, and use the `safe_add` / -> `safe_record` helpers so a missing exporter never crashes the protocol. +## Channel lifecycle metrics + +These metrics describe what a call is doing across its lifecycle — from the +moment FreeSWITCH creates the channel until it is destroyed. Use them together +with the [tracing](./tracing) spans to follow a call end to end and to correlate +it with the passive sniffer (see [Sniffer correlation](./tracing#sniffer-correlation-sipcall_id-join)). + +- **`genesis.calls.active`** (UpDownCounter) + - Description: Number of calls currently active, by state and direction. Goes up when a channel is created and back down when it is destroyed. + - Attributes: `channel.state`, `direction` + +- **`genesis.channel.bridge.events`** (Counter) + - Description: Bridges established and torn down, from the authoritative `CHANNEL_BRIDGE` / `CHANNEL_UNBRIDGE` events. + - Attributes: `bridge.result` (`established`, `unbridged`), `hangup.cause` + +- **`genesis.channel.transfers`** (Counter) + - Description: Call transfers observed through the `sofia::transferor` and `sofia::transferee` events. + - Attributes: `transfer.type` (`blind`, `attended`), `transfer.role` + +- **`genesis.channel.codec.changes`** (Counter) + - Description: Codec renegotiations observed through `CODEC` events. + - Attributes: `channel.read_codec`, `channel.write_codec` + +- **`genesis.dialplan.applications`** (Counter) + - Description: Dialplan applications executed, from `CHANNEL_EXECUTE` and `CHANNEL_EXECUTE_COMPLETE`. 
+ - Attributes: `application.name`, `application.result` (`started`, `success`, `fail`) + +- **`genesis.channel.hangup.causes.q850`** (Counter) + - Description: Hangup causes grouped by Q.850 code. + - Attributes: `hangup.cause.q850` + +- **`genesis.event.processing.duration`** (Histogram) + - Description: How long it takes to dispatch a single event through the processors and routing. + - Attributes: `event.name` + +- **`genesis.events.without_sip_call_id`** (Counter) + - Description: Channel events that arrived without a `variable_sip_call_id`. A high value means the Genesis trace and the sniffer trace cannot be joined for those calls. + - Attributes: (none) + +## Session, consumer, load balancer and queue metrics + +- **`genesis.session.commands`** (Counter) + - Description: `sendmsg` commands sent through a session, by application. + - Attributes: `application.name` + +- **`genesis.session.command.duration`** (Histogram) + - Description: How long a session `sendmsg` command takes to complete. + - Attributes: `application.name` + +- **`genesis.consumer.handlers`** (Counter) + - Description: How many times a consumer handler was invoked, by event. + - Attributes: `event.name` + +- **`genesis.loadbalancer.selections`** (Counter) + - Description: Destinations picked by the load balancer, including when it falls back to the first available destination. + - Attributes: `loadbalancer.backend`, `loadbalancer.result` (`selected`, `fallback`) + +- **`genesis.loadbalancer.errors`** (Counter) + - Description: Errors raised while selecting a destination. + - Attributes: `loadbalancer.backend`, `error` + +- **`genesis.commands.queue.depth`** (ObservableGauge) + - Description: How many command replies are still pending. Useful to spot backpressure on the command path. + - Attributes: (none) + +- **`genesis.events.queue.depth`** (ObservableGauge) + - Description: How many events are waiting to be processed. Useful to spot backpressure on the event path. 
+ - Attributes: (none) From ef8b603a290e6b85a8cb23d6f4b3105f292f0cdd Mon Sep 17 00:00:00 2001 From: Vitor Hugo Date: Tue, 30 Jun 2026 23:03:15 -0300 Subject: [PATCH 3/5] docs: de-sniffer tracing/metrics/server docs and format lifecycle spans Remove all references to the sniffer product and its internal voip.call_id attribute from Genesis docs and code comments. sip.call_id is the standard SIP Call-ID; document it as a generic cross-system join key instead. - tracing.md: format lifecycle/CUSTOM/session spans with the standard Description/Attributes pattern; rename 'Sniffer correlation' section to 'Cross-system correlation (sip.call_id)'. - metrics.md: drop sniffer mention from the lifecycle intro and the events.without_sip_call_id description. - server.md: split the opening paragraph into endpoint/config/start-mode lists. - AGENTS.md: rename the correlation subsection and drop sniffer naming. - lifecycle.py / telemetry.py / channel.py / tests: drop sniffer naming from docstrings and comments; rename SIP_CALL_ID test payload value. --- AGENTS.md | 17 +-- docs/content/docs/Observability/metrics.md | 8 +- docs/content/docs/Observability/server.md | 15 ++- docs/content/docs/Observability/tracing.md | 137 ++++++++++++++++----- genesis/channel.py | 4 +- genesis/protocol/lifecycle.py | 11 +- genesis/protocol/telemetry.py | 6 +- tests/payloads.py | 2 +- tests/test_channel_lifecycle.py | 7 +- 9 files changed, 149 insertions(+), 58 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index cbf9892..7293ac3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -325,13 +325,15 @@ Emitted spans (non-exhaustive): `freeswitch.channel.create`, `.progress`, `freeswitch.callcenter.info`, `freeswitch.conference.maintenance`, `freeswitch.conference.cdr`, `freeswitch.valet.info`. 
-### Sniffer correlation (sip.call_id join) +### Cross-system correlation (sip.call_id) -Correlation with the passive sniffer (Otoru/sniffer) is **attribute-based and -happens at the observability backend (Grafana/Tempo), not in code**. Every -channel lifecycle span carries `sip.call_id` (= the ESL `variable_sip_call_id` -header), which matches the sniffer's `voip.call_id`. The two traces are joined -in Grafana/Tempo by filtering/grouping on that attribute. +Every channel lifecycle span carries `sip.call_id` (= the ESL +`variable_sip_call_id` header), the standard SIP `Call-ID`. This is a stable +per-call identifier that any other SIP observer of the same call will also +have, so it is the natural join key when correlating Genesis traces with +another system's traces of the same call. The join happens **at the +observability backend** (Grafana/Tempo), by filtering/grouping on `sip.call_id` +— not in code. Cross-leg grouping: bridge spans carry `bridge.a_uuid` and `bridge.b_uuid` (from `Bridge-A-Unique-ID` / `Bridge-B-Unique-ID`), so the a-leg and b-leg of a @@ -339,7 +341,8 @@ call can be tied together at the backend. The `genesis.events.without_sip_call_id` counter tracks channel events that lack the correlation key (a correlation-gap signal). W3C `traceparent` / -`X-Tracespan` propagation to the sniffer is intentionally **out of scope**. +`X-Tracespan` propagation is intentionally **out of scope**; the attribute join +is sufficient. ### Cardinality rule diff --git a/docs/content/docs/Observability/metrics.md b/docs/content/docs/Observability/metrics.md index daa0201..94fab11 100644 --- a/docs/content/docs/Observability/metrics.md +++ b/docs/content/docs/Observability/metrics.md @@ -95,8 +95,10 @@ For programmatic access to load counts per destination, use the load balancer's These metrics describe what a call is doing across its lifecycle — from the moment FreeSWITCH creates the channel until it is destroyed. 
Use them together -with the [tracing](./tracing) spans to follow a call end to end and to correlate -it with the passive sniffer (see [Sniffer correlation](./tracing#sniffer-correlation-sipcall_id-join)). +with the [tracing](./tracing) spans to follow a call end to end, and see +[Cross-system correlation](./tracing#cross-system-correlation-sipcall_id) for +how `sip.call_id` lets you join these traces with another system's view of the +same call. - **`genesis.calls.active`** (UpDownCounter) - Description: Number of calls currently active, by state and direction. Goes up when a channel is created and back down when it is destroyed. @@ -127,7 +129,7 @@ it with the passive sniffer (see [Sniffer correlation](./tracing#sniffer-correla - Attributes: `event.name` - **`genesis.events.without_sip_call_id`** (Counter) - - Description: Channel events that arrived without a `variable_sip_call_id`. A high value means the Genesis trace and the sniffer trace cannot be joined for those calls. + - Description: Channel events that arrived without a `variable_sip_call_id`. A high value means those calls cannot be joined to another system's view of the same call via `sip.call_id`. - Attributes: (none) ## Session, consumer, load balancer and queue metrics diff --git a/docs/content/docs/Observability/server.md b/docs/content/docs/Observability/server.md index 152943f..adab0af 100644 --- a/docs/content/docs/Observability/server.md +++ b/docs/content/docs/Observability/server.md @@ -3,7 +3,20 @@ title: HTTP Server weight: 30 --- -A built-in HTTP server exposes health, readiness, and metrics. Port **8000** by default; set `GENESIS_OBSERVABILITY_PORT` to change it. With the CLI, the server starts automatically; with the library, you start it yourself (see below). +Genesis ships a built-in HTTP server that exposes three endpoints: + +- **`/health`** — liveness probe (is the process up and connected?) +- **`/ready`** — readiness probe (can the app accept work yet?) 
+- **`/metrics`** — Prometheus scrape endpoint for all Genesis metrics + +The server listens on port **8000** by default. Change it with the +`GENESIS_OBSERVABILITY_PORT` environment variable. + +How it starts depends on how you run Genesis: + +- **CLI** (`genesis consumer` / `genesis outbound`): the server starts + automatically. +- **Library**: you start the server yourself (see [Library](#library) below). ## Endpoints diff --git a/docs/content/docs/Observability/tracing.md b/docs/content/docs/Observability/tracing.md index 1a874c3..8d9a7e0 100644 --- a/docs/content/docs/Observability/tracing.md +++ b/docs/content/docs/Observability/tracing.md @@ -74,49 +74,120 @@ Genesis automatically creates spans for the following operations: - Attributes: `ring_group.mode`, `ring_group.size`, `ring_group.timeout`, `ring_group.has_balancer`, `ring_group.has_variables`, `ring_group.balanced`, `ring_group.result`, `ring_group.duration`, `ring_group.answered_uuid`, `ring_group.answered_dial_path`, `ring_group.error` (if error) **ESL Channel Lifecycle Spans (`freeswitch.channel.*`):** -- Emitted by the `channel_lifecycle_processor` for the semantic FreeSWITCH channel lifecycle. They carry the channel UUIDs and the sniffer correlation key on the span (see [Sniffer correlation](#sniffer-correlation-sipcall_id-join)). 
-- **`freeswitch.channel.create`** — `channel.uuid`, `channel.call_uuid`, `channel.direction`, `sip.call_id`, `channel.destination_number`, `channel.context` -- **`freeswitch.channel.progress`** / **`.progress_media`** — `channel.state`, `answer.state`, codec names -- **`freeswitch.channel.answer`** — `channel.state`, `answer.state`, codec names -- **`freeswitch.channel.bridge`** — `bridge.a_uuid`, `bridge.b_uuid`, `other_leg.*`, span event `bridge.established` -- **`freeswitch.channel.unbridge`** — `bridge.a_uuid`, `bridge.b_uuid`, `hangup.cause`, span event `bridge.torn_down` -- **`freeswitch.channel.hangup`** — `hangup.cause`, `channel.state`, span event `hangup.cause.` -- **`freeswitch.channel.hangup_complete`** — `hangup.cause`, `hangup.cause.q850`, span event `call.finalized` -- **`freeswitch.channel.destroy`** — `channel.uuid`, `sip.call_id` -- **`freeswitch.channel.execute`** / **`.execute_complete`** — `application.name`, `application.uuid`, `application.data`/`application.response`, span event `app..done` -- **`freeswitch.channel.codec`** — `channel.read_codec.*`, `channel.write_codec.*` -- **`freeswitch.call.update`** — `bridged.to`, `caller.transfer_source`, span event `caller_id.mutated` + +These spans follow a call across its FreeSWITCH lifecycle, from channel +creation to destruction. They carry the channel UUIDs and the SIP correlation +key on the span (see [Cross-system correlation](#cross-system-correlation-sipcall_id)). 
+ +- **`freeswitch.channel.create`** + - Description: A new channel was created + - Attributes: `channel.uuid`, `channel.call_uuid`, `channel.direction`, `sip.call_id`, `channel.destination_number`, `channel.context` + +- **`freeswitch.channel.progress`** / **`freeswitch.channel.progress_media`** + - Description: The call is progressing / early media is flowing + - Attributes: `channel.state`, `answer.state`, codec names + +- **`freeswitch.channel.answer`** + - Description: The call was answered + - Attributes: `channel.state`, `answer.state`, codec names + +- **`freeswitch.channel.bridge`** + - Description: Two channels were bridged together + - Attributes: `bridge.a_uuid`, `bridge.b_uuid`, `other_leg.*` + - Events: `bridge.established` + +- **`freeswitch.channel.unbridge`** + - Description: The bridge between two channels was torn down + - Attributes: `bridge.a_uuid`, `bridge.b_uuid`, `hangup.cause` + - Events: `bridge.torn_down` + +- **`freeswitch.channel.hangup`** + - Description: The channel is hanging up + - Attributes: `hangup.cause`, `channel.state` + - Events: `hangup.cause.<cause>` + +- **`freeswitch.channel.hangup_complete`** + - Description: Hangup is complete and the call is finalized + - Attributes: `hangup.cause`, `hangup.cause.q850` + - Events: `call.finalized` + +- **`freeswitch.channel.destroy`** + - Description: The channel was destroyed + - Attributes: `channel.uuid`, `sip.call_id` + +- **`freeswitch.channel.execute`** / **`freeswitch.channel.execute_complete`** + - Description: A dialplan application started / finished executing + - Attributes: `application.name`, `application.uuid`, `application.data` / `application.response` + - Events: `app.<name>.done` + +- **`freeswitch.channel.codec`** + - Description: The channel negotiated (or renegotiated) its codecs + - Attributes: `channel.read_codec.*`, `channel.write_codec.*` + +- **`freeswitch.call.update`** + - Description: The caller ID or bridged state changed + - Attributes: `bridged.to`,
`caller.transfer_source` + - Events: `caller_id.mutated` **CUSTOM Subclass Spans:** -- Emitted by the `custom_subclass_processor` for `CUSTOM` events. -- **`freeswitch.sofia.transfer`** — `transfer.role` (`transferor`/`transferee`), `transfer.type` (`blind`/`attended`), span event `transfer.initiated` -- **`freeswitch.sofia.register`** / **`.reinvite`** / **`.replaced`** — `register.aor`, `register.action`, `gateway.name`/`gateway.state`, `sofia.profile` -- **`freeswitch.callcenter.info`** — `cc.queue`, `cc.action`, `cc.agent`, `cc.member_uuid`, `cc.count`, `cc.selection` -- **`freeswitch.conference.maintenance`** / **`.cdr`** — `conference.name`, `conference.profile`, `conference.action`, `conference.member_id` -- **`freeswitch.valet.info`** — `valet.lot`, `valet.extension`, `valet.action`, `bridge.to_uuid` + +These spans cover the `CUSTOM` event subclasses FreeSWITCH emits for +transfers, registrations, callcenter, conference and valet parking. + +- **`freeswitch.sofia.transfer`** + - Description: A call transfer was observed + - Attributes: `transfer.role` (`transferor` / `transferee`), `transfer.type` (`blind` / `attended`) + - Events: `transfer.initiated` + +- **`freeswitch.sofia.register`** / **`freeswitch.sofia.reinvite`** / **`freeswitch.sofia.replaced`** + - Description: A SIP registration, reinvite or replace was observed + - Attributes: `register.aor`, `register.action`, `gateway.name` / `gateway.state`, `sofia.profile` + +- **`freeswitch.callcenter.info`** + - Description: A callcenter queue event + - Attributes: `cc.queue`, `cc.action`, `cc.agent`, `cc.member_uuid`, `cc.count`, `cc.selection` + +- **`freeswitch.conference.maintenance`** / **`freeswitch.conference.cdr`** + - Description: A conference maintenance or CDR event + - Attributes: `conference.name`, `conference.profile`, `conference.action`, `conference.member_id` + +- **`freeswitch.valet.info`** + - Description: A valet parking event + - Attributes: `valet.lot`, `valet.extension`, 
`valet.action`, `bridge.to_uuid` **Session / Consumer / Queue Spans:** -- **`session.sendmsg`** (`Session` module) — `channel.uuid`, `application.name`, `application.uuid`, `application.block` -- **`session.await_complete`** (`Session` module) — child span of `session.sendmsg` when `block=True`; `channel.uuid`, `application.uuid` -- **`consumer.start`** / **`consumer.stop`** (`Consumer` module) — `consumer.host`, `consumer.port` -- **`queue.wait_and_acquire`** (`Queue` module) — `queue.id`, `queue.item_id`, `queue.depth` (span attribute, not a metric label) -## Sniffer correlation (sip.call_id join) +- **`session.sendmsg`** (`Session` module) + - Description: A `sendmsg` command was sent through a session + - Attributes: `channel.uuid`, `application.name`, `application.uuid`, `application.block` -Correlation with the passive sniffer (Otoru/sniffer) is **attribute-based and -happens at the observability backend (Grafana/Tempo), not in code**: +- **`session.await_complete`** (`Session` module) + - Description: Waits for a blocking `sendmsg` to complete (child of `session.sendmsg` when `block=True`) + - Attributes: `channel.uuid`, `application.uuid` -- Every `freeswitch.channel.*` span carries **`sip.call_id`** (= the ESL - `variable_sip_call_id` header), which matches the sniffer's **`voip.call_id`**. -- Join the two traces in Grafana/Tempo by filtering/grouping on that attribute. +- **`consumer.start`** / **`consumer.stop`** (`Consumer` module) + - Description: The consumer subscribed to events / stopped + - Attributes: `consumer.host`, `consumer.port` + +- **`queue.wait_and_acquire`** (`Queue` module) + - Description: Waiting to acquire an item from the queue + - Attributes: `queue.id`, `queue.item_id`, `queue.depth` (span attribute, not a metric label) + +## Cross-system correlation (sip.call_id) + +Every `freeswitch.channel.*` span carries **`sip.call_id`**, taken from the ESL +`variable_sip_call_id` header. 
This is the standard SIP `Call-ID` header, a +stable per-call identifier that any other SIP observer of the same call will +also have. That makes it a natural join key when you want to correlate Genesis +traces with traces from another system that observed the same call. + +- The join happens **at the observability backend** (Grafana/Tempo or similar), + by filtering or grouping on `sip.call_id` — not in code. - Cross-leg grouping: bridge spans carry **`bridge.a_uuid`** and **`bridge.b_uuid`**, so the a-leg and b-leg of a call can be tied together. - The `genesis.events.without_sip_call_id` metric counts channel events that - lack the correlation key (a correlation-gap signal). - -W3C `traceparent` / `X-Tracespan` propagation to the sniffer is intentionally -**out of scope**; the attribute join is sufficient and requires no sniffer -changes. + arrived without the correlation key — a signal that those calls cannot be + joined to another system's view. The lifecycle/CUSTOM processors are on by default. Opt out with `GENESIS_TRACE_ESL_LIFECYCLE=0` or `GENESIS_TRACE_CUSTOM_SUBCLASSES=0`. diff --git a/genesis/channel.py b/genesis/channel.py index 0bffa78..178ddfe 100644 --- a/genesis/channel.py +++ b/genesis/channel.py @@ -128,8 +128,8 @@ async def create( # channel.call_uuid groups a-leg/b-leg within the Genesis trace; # at originate time it equals the origination UUID. span.set_attribute(ATTR_CHANNEL_CALL_UUID, self.uuid) - # sip.call_id is the join key with the sniffer (voip.call_id). - # It is usually not known yet at originate; attach when present. + # sip.call_id is the standard SIP Call-ID and the cross-system + # join key. It is usually not known yet at originate; attach when present. 
sip_call_id = _context_str(self.context, "variable_sip_call_id") if sip_call_id: span.set_attribute(ATTR_SIP_CALL_ID, sip_call_id) diff --git a/genesis/protocol/lifecycle.py b/genesis/protocol/lifecycle.py index c7a5a24..5f20d54 100644 --- a/genesis/protocol/lifecycle.py +++ b/genesis/protocol/lifecycle.py @@ -8,10 +8,11 @@ ``valet_parking::``). They only enrich telemetry — they never consume events that route to user handlers. -Correlation with the passive sniffer (Otoru/sniffer) is attribute-based: -every channel span carries ``sip.call_id`` (= ``variable_sip_call_id``), which -matches the sniffer's ``voip.call_id``. The join happens at the observability -backend (Grafana/Tempo), not in code. No sniffer changes are required. +Correlation with another system's view of the same call is attribute-based: +every channel span carries ``sip.call_id`` (= ``variable_sip_call_id``, the +standard SIP Call-ID). Any other SIP observer of the same call will carry the +same value, so the join happens at the observability backend (Grafana/Tempo), +not in code. Cardinality rule: UUIDs go on spans only; metric attributes use low-cardinality enums/labels (channel.state, direction, hangup.cause, application.name, ...). 
@@ -74,7 +75,7 @@ def _channel_attrs(event: ESLEvent) -> Dict[str, Any]: def _record_sip_gap(event: ESLEvent, attrs: Dict[str, Any]) -> None: - """Count channel events that lack the sniffer correlation key.""" + """Count channel events that lack the sip.call_id correlation key.""" if "sip.call_id" not in attrs: safe_add(events_without_sip_call_id_counter, 1, attributes={}) diff --git a/genesis/protocol/telemetry.py b/genesis/protocol/telemetry.py index 6bf79d5..43d4632 100644 --- a/genesis/protocol/telemetry.py +++ b/genesis/protocol/telemetry.py @@ -38,7 +38,7 @@ def build_event_attributes(event: ESLEvent) -> Dict[str, Any]: attributes[attr_name] = value # Routing / correlation attributes (explicit, low-cardinality keys) so the - # ``process_event`` span carries routing info and the sniffer join key. + # ``process_event`` span carries routing info and the cross-system join key. _EXPLICIT = { "Call-Direction": "event.direction", "Channel-State": "event.channel_state", @@ -57,8 +57,8 @@ def build_event_attributes(event: ESLEvent) -> Dict[str, Any]: value = value[0] if value else "" attributes[dst] = value - # sip.call_id is the PRIMARY correlation key with the sniffer - # (sniffer emits voip.call_id = SIP Call-ID). Join happens at the backend. + # sip.call_id is the standard SIP Call-ID and the cross-system join key. + # The join happens at the observability backend. 
sip_call_id = event.get("variable_sip_call_id") if sip_call_id: attributes["sip.call_id"] = ( diff --git a/tests/payloads.py b/tests/payloads.py index 0588869..699f0da 100644 --- a/tests/payloads.py +++ b/tests/payloads.py @@ -589,7 +589,7 @@ # --------------------------------------------------------------------------- UUID_A = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" UUID_B = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb" -SIP_CALL_ID = "sniffer-correlation-key-123" +SIP_CALL_ID = "test-sip-call-id-123" _channel_common = dedent("""\ Unique-ID: {uuid_a} diff --git a/tests/test_channel_lifecycle.py b/tests/test_channel_lifecycle.py index ad24698..9dac6f6 100644 --- a/tests/test_channel_lifecycle.py +++ b/tests/test_channel_lifecycle.py @@ -2,9 +2,10 @@ These processors emit ``freeswitch.channel.*`` and ``freeswitch.sofia.*`` / ``freeswitch.callcenter.*`` / ``freeswitch.conference.*`` / ``freeswitch.valet.*`` -spans. The key contract under test is **sniffer correlation**: every channel -span must carry ``sip.call_id`` (= ``variable_sip_call_id``) so the passive -sniffer's ``voip.call_id`` can be joined to it at the observability backend. +spans. The key contract under test is **cross-system correlation**: every channel +span must carry ``sip.call_id`` (= ``variable_sip_call_id``, the standard SIP +Call-ID) so another system's view of the same call can be joined to it at the +observability backend. 
""" from __future__ import annotations From 630fa2adbb5344297fd8956b628b84f62134af9c Mon Sep 17 00:00:00 2001 From: Vitor Hugo Date: Tue, 30 Jun 2026 23:11:00 -0300 Subject: [PATCH 4/5] chore: drop planning artifacts from the PR --- docs/esl-sniffer-traces-mapping.json | 1874 -------------------------- docs/esl-sniffer-traces-plan.md | 456 ------- 2 files changed, 2330 deletions(-) delete mode 100644 docs/esl-sniffer-traces-mapping.json delete mode 100644 docs/esl-sniffer-traces-plan.md diff --git a/docs/esl-sniffer-traces-mapping.json b/docs/esl-sniffer-traces-mapping.json deleted file mode 100644 index 2e65500..0000000 --- a/docs/esl-sniffer-traces-mapping.json +++ /dev/null @@ -1,1874 +0,0 @@ -{ - "genesisSpans": { - "spans": [ - { - "attributes": [ - { - "name": "event.name", - "source": "Event-Name (ESL header)" - }, - { - "name": "event.uuid", - "source": "Unique-ID (ESL header)" - }, - { - "name": "event.content_type", - "source": "Content-Type (ESL header)" - }, - { - "name": "event.header.", - "source": "Every other ESL header key, lowercased with '-' replaced by '_' (built by build_event_attributes in genesis/protocol/telemetry.py:15)" - } - ], - "location": "genesis/protocol/base.py:201", - "name": "process_event", - "spanEvents": [], - "wraps": "Processing one inbound ESL event in the consumer loop: records event metrics (events_received_counter) and logs the event. Dispatch to handlers and event-processor execution happen OUTSIDE this span." - }, - { - "attributes": [ - { - "name": "command.name", - "source": "Full raw cmd string passed to send() (NOT just the command verb; set at base.py:291)" - }, - { - "name": "command.reply", - "source": "Reply-Text field of the resulting ESLEvent (set at base.py:337, only when non-empty)" - } - ], - "location": "genesis/protocol/base.py:290", - "name": "send_command", - "spanEvents": [], - "wraps": "Sending an ESL command over the socket and awaiting the command/reply ESLEvent (Protocol.send)." 
- }, - { - "attributes": [ - { - "name": "channel.dial_path", - "source": "dial_path constructor arg (e.g. 'user/1000')" - }, - { - "name": "channel.has_variables", - "source": "str(variables is not None) — whether originate variables were supplied" - }, - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID; value = self.uuid, which comes from response.body of 'api create_uuid' (stripped). Set at channel.py:159" - }, - { - "name": "channel.create.duration", - "source": "time.time() - start_time (wall-clock seconds of the create flow). Set at channel.py:179" - }, - { - "name": "(status)", - "source": "On exception: span.set_status(StatusCode.ERROR, str(e)) at channel.py:188" - } - ], - "location": "genesis/channel.py:144", - "name": "channel.create", - "spanEvents": [], - "wraps": "Channel.create() factory: requests events plain ALL, generates UUID via 'api create_uuid', registers CHANNEL_STATE handler, applies filter, and issues 'api originate' to place the call." - }, - { - "attributes": [ - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" - }, - { - "name": "channel.state", - "source": "ATTR_CHANNEL_STATE = self.state.name (current ChannelState enum name)" - }, - { - "name": "wait.target", - "source": "str(target) — either ChannelState name or event-name string" - }, - { - "name": "wait.timeout", - "source": "timeout parameter (seconds, default 30.0)" - }, - { - "name": "wait.type", - "source": "ATTR_WAIT_TYPE = 'event' if target is str else 'state'" - }, - { - "name": "operation", - "source": "literal 'wait'" - }, - { - "name": "wait.result", - "source": "ATTR_WAIT_RESULT; 'success' (channel.py:340), 'timeout' (channel.py:360), or 'already_reached' (channel.py:441 when state already satisfies target)" - }, - { - "name": "wait.duration", - "source": "ATTR_WAIT_DURATION = time.time() - start_time (set on success and timeout paths, not on already_reached)" - } - ], - "location": "genesis/channel.py:415", - "name": 
"channel.wait", - "spanEvents": [], - "wraps": "Channel.wait(): waiting for a target ChannelState (via _wait_for_state) or a named event (via _wait_for_event), with timeout." - }, - { - "attributes": [ - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" - }, - { - "name": "channel.state", - "source": "ATTR_CHANNEL_STATE = self.state.name" - }, - { - "name": "operation", - "source": "literal 'answer'" - }, - { - "name": "channel.answer.success", - "source": "result.get('Reply-Text','').startswith('+OK') — set by _execute_operation at channel.py:512" - }, - { - "name": "channel.answer.duration", - "source": "time.time() - start_time (channel.py:513)" - }, - { - "name": "(status)", - "source": "On exception: set_status(StatusCode.ERROR, str(e)) at channel.py:526" - } - ], - "location": "genesis/channel.py:505 (via answer() call at channel.py:541-550)", - "name": "channel.answer", - "spanEvents": [], - "wraps": "Answering the call (execute 'answer' via sendmsg or 'api uuid_execute'). Created by _execute_operation with span_name='channel.answer'." - }, - { - "attributes": [ - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" - }, - { - "name": "channel.state", - "source": "ATTR_CHANNEL_STATE = self.state.name" - }, - { - "name": "operation", - "source": "literal 'park'" - }, - { - "name": "channel.park.success", - "source": "Reply-Text startswith '+OK' (channel.py:512)" - }, - { - "name": "channel.park.duration", - "source": "time.time() - start_time (channel.py:513)" - }, - { - "name": "(status)", - "source": "On exception: set_status ERROR at channel.py:526" - } - ], - "location": "genesis/channel.py:505 (via park() call at channel.py:554-563)", - "name": "channel.park", - "spanEvents": [], - "wraps": "Parking the channel (execute 'park'). Created by _execute_operation with span_name='channel.park'." 
- }, - { - "attributes": [ - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" - }, - { - "name": "channel.state", - "source": "ATTR_CHANNEL_STATE = self.state.name" - }, - { - "name": "hangup.cause", - "source": "ATTR_HANGUP_CAUSE = cause param (HangupCause, default 'NORMAL_CLEARING')" - }, - { - "name": "operation", - "source": "literal 'hangup'" - }, - { - "name": "channel.hangup.success", - "source": "Reply-Text startswith '+OK' (channel.py:512)" - }, - { - "name": "channel.hangup.duration", - "source": "time.time() - start_time (channel.py:513)" - }, - { - "name": "call.duration", - "source": "time.time() - self._created_at (set in on_success at channel.py:572, only when _created_at is set)" - }, - { - "name": "(status)", - "source": "On exception: set_status ERROR at channel.py:526" - } - ], - "location": "genesis/channel.py:505 (via hangup() call at channel.py:588-600)", - "name": "channel.hangup", - "spanEvents": [], - "wraps": "Hanging up the call with a cause (sendmsg 'hangup' or 'api uuid_kill'). Created by _execute_operation with span_name='channel.hangup'; on_success callback records hangup cause + call duration." 
- }, - { - "attributes": [ - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" - }, - { - "name": "channel.other_uuid", - "source": "other_uuid from peer Channel/Session (extracted by _get_peer_uuid; falls back to context['Unique-ID'])" - }, - { - "name": "channel.state", - "source": "ATTR_CHANNEL_STATE = self.state.name" - }, - { - "name": "operation", - "source": "literal 'bridge'" - }, - { - "name": "channel.bridge.success", - "source": "Reply-Text startswith '+OK' (channel.py:512)" - }, - { - "name": "channel.bridge.duration", - "source": "time.time() - start_time (channel.py:513)" - }, - { - "name": "(status)", - "source": "On exception: set_status ERROR at channel.py:526" - } - ], - "location": "genesis/channel.py:505 (via bridge() call at channel.py:632-644)", - "name": "channel.bridge", - "spanEvents": [], - "wraps": "Bridging this channel with another Channel/Session (sendmsg 'bridge uuid:' or 'api uuid_bridge'). Created by _execute_operation with span_name='channel.bridge'." - }, - { - "attributes": [ - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" - }, - { - "name": "channel.state", - "source": "ATTR_CHANNEL_STATE = self.state.name" - }, - { - "name": "playback.path", - "source": "path parameter (audio file path)" - }, - { - "name": "playback.block", - "source": "str(block) — whether execution blocks" - }, - { - "name": "operation", - "source": "literal 'playback'" - }, - { - "name": "channel.playback.success", - "source": "Reply-Text startswith '+OK' (channel.py:512)" - }, - { - "name": "channel.playback.duration", - "source": "time.time() - start_time (channel.py:513)" - }, - { - "name": "(status)", - "source": "On exception: set_status ERROR at channel.py:526" - } - ], - "location": "genesis/channel.py:505 (via playback() call at channel.py:653-666)", - "name": "channel.playback", - "spanEvents": [], - "wraps": "Playing an audio file (execute 'playback '). 
Created by _execute_operation with span_name='channel.playback'." - }, - { - "attributes": [ - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" - }, - { - "name": "channel.state", - "source": "ATTR_CHANNEL_STATE = self.state.name" - }, - { - "name": "say.module", - "source": "module param (e.g. 'en' or 'en:en')" - }, - { - "name": "say.kind", - "source": "kind param (e.g. 'NUMBER')" - }, - { - "name": "say.method", - "source": "method param (e.g. 'pronounced')" - }, - { - "name": "say.gender", - "source": "gender param (e.g. 'FEMININE')" - }, - { - "name": "operation", - "source": "literal 'say'" - }, - { - "name": "channel.say.success", - "source": "Reply-Text startswith '+OK' (channel.py:512)" - }, - { - "name": "channel.say.duration", - "source": "time.time() - start_time (channel.py:513)" - }, - { - "name": "(status)", - "source": "On exception: set_status ERROR at channel.py:526" - } - ], - "location": "genesis/channel.py:505 (via say() call at channel.py:685-700)", - "name": "channel.say", - "spanEvents": [], - "wraps": "Saying text via pre-recorded files (execute 'say '). Created by _execute_operation with span_name='channel.say'." 
- }, - { - "attributes": [ - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" - }, - { - "name": "channel.state", - "source": "ATTR_CHANNEL_STATE = self.state.name" - }, - { - "name": "play_and_get_digits.file", - "source": "file parameter" - }, - { - "name": "play_and_get_digits.tries", - "source": "str(tries)" - }, - { - "name": "play_and_get_digits.timeout", - "source": "str(timeout)" - }, - { - "name": "play_and_get_digits.minimal", - "source": "str(minimal)" - }, - { - "name": "play_and_get_digits.maximum", - "source": "str(maximum)" - }, - { - "name": "operation", - "source": "literal 'play_and_get_digits'" - }, - { - "name": "channel.play_and_get_digits.success", - "source": "Reply-Text startswith '+OK' (channel.py:512)" - }, - { - "name": "channel.play_and_get_digits.duration", - "source": "time.time() - start_time (channel.py:513)" - }, - { - "name": "(status)", - "source": "On exception: set_status ERROR at channel.py:526" - } - ], - "location": "genesis/channel.py:505 (via play_and_get_digits() call at channel.py:736-756)", - "name": "channel.play_and_get_digits", - "spanEvents": [], - "wraps": "Playing a file and collecting digits from the caller (execute 'play_and_get_digits ...'). Created by _execute_operation with span_name='channel.play_and_get_digits'." 
- }, - { - "attributes": [ - { - "name": "channel.uuid", - "source": "ATTR_CHANNEL_UUID = self.uuid or 'unknown'" - }, - { - "name": "dtmf.digit", - "source": "event.get('DTMF-Digit') — DTMF-Digit ESL field" - }, - { - "name": "dtmf.handled", - "source": "literal True, set at channel.py:808 after the user handler returns without raising" - }, - { - "name": "(status)", - "source": "On exception: span.record_exception(e) at channel.py:810 and set_status(StatusCode.ERROR, str(e)) at channel.py:811-813" - } - ], - "location": "genesis/channel.py:796", - "name": "channel.dtmf.received", - "spanEvents": [], - "wraps": "Invoking a user-registered DTMF handler (via @channel.on_dtmf decorator) for a received DTMF event." - }, - { - "attributes": [ - { - "name": "net.peer.name", - "source": "self.host (FreeSWITCH host)" - }, - { - "name": "net.peer.port", - "source": "self.port (FreeSWITCH port)" - } - ], - "location": "genesis/inbound.py:97", - "name": "inbound_connect", - "spanEvents": [], - "wraps": "Inbound.start(): establishing the TCP connection to FreeSWITCH (wraps self._connect() only; authentication and super().start() run outside the span)." - }, - { - "attributes": [ - { - "name": "net.peer.name", - "source": "server.host" - }, - { - "name": "net.peer.port", - "source": "server.port" - } - ], - "location": "genesis/outbound.py:156", - "name": "outbound_handle_connection", - "spanEvents": [], - "wraps": "Outbound.handler() static method: processing a single incoming FreeSWITCH outbound connection — opens a Session, runs _setup_session and server.app(session)." 
- }, - { - "attributes": [ - { - "name": "ring_group.mode", - "source": "mode.value (RingMode enum: 'parallel'/'sequential'/'balancing')" - }, - { - "name": "ring_group.size", - "source": "len(group) — number of destinations" - }, - { - "name": "ring_group.timeout", - "source": "timeout parameter (seconds)" - }, - { - "name": "ring_group.has_balancer", - "source": "str(balancer is not None and mode == RingMode.BALANCING)" - }, - { - "name": "ring_group.has_variables", - "source": "str(variables is not None)" - }, - { - "name": "ring_group.result", - "source": "'answered' if a Channel answered else 'no_answer' (ring.py:156-158); 'error' on exception (ring.py:202)" - }, - { - "name": "ring_group.duration", - "source": "time.time() - start_time (set on both success and error paths, ring.py:159 / 204)" - }, - { - "name": "ring_group.answered_uuid", - "source": "answered.uuid or 'unknown' (set only when answered, ring.py:161-163)" - }, - { - "name": "ring_group.answered_dial_path", - "source": "answered.dial_path (set only when answered, ring.py:164-166)" - }, - { - "name": "ring_group.error", - "source": "str(e) — exception message (set only on error path, ring.py:203)" - }, - { - "name": "(status)", - "source": "On exception: span.record_exception(e) at ring.py:205 (no explicit set_status)" - } - ], - "location": "genesis/group/ring.py:138", - "name": "ring_group.ring", - "spanEvents": [], - "wraps": "RingGroup.ring() classmethod: dispatching a ring group (PARALLEL / SEQUENTIAL / BALANCING) and waiting for the first answering Channel within timeout." 
- }, - { - "attributes": [ - { - "name": "queue.id", - "source": "ATTR_QUEUE_ID = self._queue_id" - }, - { - "name": "queue.item_id", - "source": "ATTR_QUEUE_ITEM_ID = self._item_id (defaults to str(uuid4()) if not supplied)" - } - ], - "location": "genesis/queue/core.py:76", - "name": "queue.wait_and_acquire", - "spanEvents": [], - "wraps": "QueueSlot.__aenter__: enqueuing an item and waiting to acquire a concurrency slot from the queue backend (wraps backend.wait_and_acquire)." - } - ], - "gaps": [ - "No spans in genesis/session.py — Session.sendmsg, Session.start/stop, and the session lifecycle are entirely untraced.", - "No spans in genesis/consumer.py — the high-level Consumer class (decorator API wrapping Inbound) has no span instrumentation.", - "No spans in genesis/observability/ — only logging (custom TRACE logger) and the OTel metrics/tracing server setup; no application-level spans there.", - "genesis/protocol/telemetry.py imports `tracer` from metrics but never creates a span itself; it only supplies build_event_attributes / record_event_metrics / log_event helpers consumed by base.py's process_event span.", - "process_event span (base.py:201) only wraps metrics+logging; event-processor execution (the for-loop at base.py:208-211) and routing+dispatch (base.py:213-215, dispatch_to_handlers) run OUTSIDE the span — no span covers handler dispatch or routing-strategy.route().", - "No span for Protocol.consume() loop body or Protocol.handler() socket-read loop (base.py).", - "Channel._wait_for_state and Channel._wait_for_event internal helpers have no nested spans; only the outer channel.wait span exists.", - "No span for Channel.from_session factory or Channel state-transition handler (_state_handler / _on_answer_received).", - "No span.set_attribute for command errors on send_command (errors are recorded only as metrics via _record_command_error; the span has no error/status attribute and no record_exception on the -ERR reply path).", - "send_command's 
command.name attribute is set to the FULL raw command string (base.py:291), not the parsed command verb — high cardinality and not the documented 'name'.", - "Zero use of span.add_event() anywhere in genesis/ — no span events are ever recorded.", - "Only channel.create, the _execute_operation spans, channel.dtmf.received, and ring_group.ring call record_exception/set_status on errors; inbound_connect, outbound_handle_connection, queue.wait_and_acquire, process_event, and send_command do not record exceptions on the span (errors are surfaced only via metrics counters or re-raised).", - "ring_group.ring sets result='error' and record_exception but does NOT call span.set_status(StatusCode.ERROR) — inconsistent with channel.py error handling.", - "No tracing attributes carry Call-Direction, Hangup-Cause, Answer-State, Channel-State, or Event-Subclass on the process_event span (those are only metric attributes via build_metric_attributes in telemetry.py:43)." - ], - "notes": "Module-level tracer definitions (all via `trace.get_tracer(__name__)`): genesis/protocol/metrics.py:10 (shared by genesis/protocol/base.py via import and by genesis/protocol/telemetry.py via import), genesis/channel.py:28, genesis/inbound.py:17, genesis/outbound.py:30, genesis/group/ring.py:23, genesis/queue/core.py:19. No `tracer.start_span` (only `start_as_current_span`), no `use_span`, and no `span.add_event` calls exist anywhere in genesis/. All spans use the context-manager form `with tracer.start_as_current_span(...) as span:`. Six of the eight channel operation spans (answer, park, hangup, bridge, playback, say, play_and_get_digits) are produced by the single generic helper `_execute_operation` at genesis/channel.py:494-537 (span created at line 505); each caller supplies a distinct span_name and span_attributes dict, and the helper adds the common channel..success and channel..duration attributes plus record_exception/set_status on failure. 
All eight channel operation spans set channel.uuid and channel.state. The process_event span's attribute set is dynamic: it is built by build_event_attributes() in genesis/protocol/telemetry.py:15-40, which maps Event-Name->event.name, Unique-ID->event.uuid, Content-Type->event.content_type, and every other ESL header to event.header..\"" - }, - "genesisMetrics": { - "metrics": [ - { - "attributes": [ - { - "name": "command", - "source": "command_name parsed from the ESL command string sent via Protocol.send" - } - ], - "description": "Number of ESL commands sent", - "location": "genesis/protocol/metrics.py:14 (created); genesis/protocol/base.py:319 (incremented)", - "name": "genesis.commands.sent", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "event_name", - "source": "ESL event header Event-Name (default UNKNOWN)" - }, - { - "name": "content_type", - "source": "ESL event header Content-Type (default UNKNOWN)" - }, - { - "name": "event_subclass", - "source": "ESL event header Event-Subclass (only when present)" - }, - { - "name": "direction", - "source": "ESL event header Call-Direction (only when present)" - }, - { - "name": "channel_state", - "source": "ESL event header Channel-State (only when present)" - }, - { - "name": "answer_state", - "source": "ESL event header Answer-State (only when present)" - }, - { - "name": "hangup_cause", - "source": "ESL event header Hangup-Cause (only when present)" - } - ], - "description": "Number of ESL events received", - "location": "genesis/protocol/metrics.py:20 (created); genesis/protocol/telemetry.py:83 (incremented via record_event_metrics)", - "name": "genesis.events.received", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "command", - "source": "command_name parsed from the ESL command string" - } - ], - "description": "Duration of ESL commands execution", - "location": "genesis/protocol/metrics.py:26 (created); genesis/protocol/base.py:347 (recorded in 
_execute_send finally block)", - "name": "genesis.commands.duration", - "type": "histogram", - "unit": "s" - }, - { - "attributes": [ - { - "name": "command", - "source": "command_name parsed from the ESL command string" - }, - { - "name": "error", - "source": "literal 'protocol_error' when Reply-Text starts with -ERR, otherwise the exception class name (type(e).__name__)" - } - ], - "description": "Number of failed ESL commands", - "location": "genesis/protocol/metrics.py:32 (created); genesis/protocol/base.py:302 (incremented via _record_command_error)", - "name": "genesis.commands.errors", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "operation", - "source": "literal operation label e.g. 'create', 'wait', or the channel operation name (answer/hangup/bridge/park/playback etc.)" - }, - { - "name": "wait.type", - "source": "wait_type argument passed to _record_wait_success/_record_wait_timeout (e.g. the kind of wait); only on wait operations at channel.py:342,370" - }, - { - "name": "success", - "source": "literal 'true'/'false'; on wait sites a fixed bool, on generic _execute_operation derived from result.get('Reply-Text','').startswith('+OK')" - }, - { - "name": "error", - "source": "exception class name type(e).__name__ (e.g. 
'TimeoutError'); only on failure path at channel.py:370,527" - } - ], - "description": "Number of channel operations", - "location": "genesis/protocol/metrics.py:39 (created); also re-declared at genesis/channel.py:32 to avoid circular imports; incremented at genesis/channel.py:180,342,370,514,527", - "name": "genesis.channel.operations", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "operation", - "source": "literal operation label: 'create' at line 181, or the channel operation name passed to _execute_operation at line 518" - } - ], - "description": "Duration of channel operations", - "location": "genesis/protocol/metrics.py:45 (created); also re-declared at genesis/channel.py:38; recorded at genesis/channel.py:181,518", - "name": "genesis.channel.operation.duration", - "type": "histogram", - "unit": "s" - }, - { - "attributes": [ - { - "name": "hangup.cause", - "source": "cause argument passed to Channel.hangup (HangupCause literal, e.g. NORMAL_CLEARING); FreeSWITCH hangup cause value" - }, - { - "name": "error", - "source": "exception class name type(exc).__name__; only on the on_error callback at channel.py:576" - } - ], - "description": "Hangup causes", - "location": "genesis/protocol/metrics.py:51 (created); also re-declared at genesis/channel.py:44; incremented at genesis/channel.py:570,576 (Channel.hangup on_success/on_error callbacks)", - "name": "genesis.channel.hangup.causes", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "success", - "source": "str(bool) derived from result.get('Reply-Text','').startswith('+OK'); 'false' on error path" - }, - { - "name": "error", - "source": "exception class name type(exc).__name__; only on the on_error callback at channel.py:621" - } - ], - "description": "Bridge operations", - "location": "genesis/protocol/metrics.py:57 (created); also re-declared at genesis/channel.py:50; incremented at genesis/channel.py:618,621 (Channel.bridge on_success/on_error)", - 
"name": "genesis.channel.bridge.operations", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "dtmf.digit", - "source": "dtmf_digit extracted from the DTMF ESL event (DTMF digit value)" - } - ], - "description": "DTMF digits received", - "location": "genesis/protocol/metrics.py:63 (created); also re-declared at genesis/channel.py:56; incremented at genesis/channel.py:804 (DTMF event handler)", - "name": "genesis.channel.dtmf.received", - "type": "counter", - "unit": "1" - }, - { - "attributes": [], - "description": "Total call duration from creation to hangup", - "location": "genesis/protocol/metrics.py:69 (created); also re-declared at genesis/channel.py:62; recorded at genesis/channel.py:573 (Channel.hangup on_success)", - "name": "genesis.call.duration", - "type": "histogram", - "unit": "s" - }, - { - "attributes": [ - { - "name": "timeout.type", - "source": "literal 'wait'" - }, - { - "name": "timeout.operation", - "source": "timeout_operation argument passed to _record_wait_timeout (the operation that timed out)" - }, - { - "name": "timeout.duration", - "source": "computed duration = time.time() - start_time (seconds, float)" - } - ], - "description": "Number of timeouts", - "location": "genesis/protocol/metrics.py:75 (created); also re-declared at genesis/channel.py:68; incremented at genesis/channel.py:362 (_record_wait_timeout)", - "name": "genesis.timeouts", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "event_name", - "source": "ESL event header Event-Name, or Event-Subclass when Event-Name == 'CUSTOM' (via get_event_name)" - } - ], - "description": "Number of O(1) channel routing hits", - "location": "genesis/protocol/metrics.py:82 (created); genesis/protocol/routing/channel.py:55 (incremented in ChannelRoutingStrategy.route)", - "name": "genesis.channel.routing.hits", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "event_name", - "source": "ESL event header Event-Name, 
or Event-Subclass when Event-Name == 'CUSTOM' (via get_event_name)" - } - ], - "description": "Number of fallback to O(N) global routing", - "location": "genesis/protocol/metrics.py:88 (created); genesis/protocol/routing/global_.py:50 (incremented in GlobalRoutingStrategy.route)", - "name": "genesis.channel.routing.fallback", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "type", - "source": "literal 'inbound' (Inbound class) or 'outbound' (Outbound class)" - } - ], - "description": "Number of active connections", - "location": "genesis/inbound.py:20 and genesis/outbound.py:33 (both create an UpDownCounter of the same name); incremented/decremented at genesis/inbound.py:117,128 and genesis/outbound.py:163,173", - "name": "genesis.connections.active", - "type": "up_down_counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "error", - "source": "literal 'authentication_failed' (when Reply-Text != '+OK accepted') or 'timeout' (on asyncio.TimeoutError during connect)" - }, - { - "name": "type", - "source": "literal 'inbound'" - } - ], - "description": "Number of connection errors", - "location": "genesis/inbound.py:25 (created); incremented at genesis/inbound.py:87,107", - "name": "genesis.connections.errors", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "mode", - "source": "RingMode enum value (parallel/sequential/balancing)" - }, - { - "name": "has_balancer", - "source": "str(bool): whether a LoadBalancerBackend is configured and mode == RingMode.BALANCING" - } - ], - "description": "Number of ring group operations", - "location": "genesis/group/ring.py:27 (created); genesis/group/ring.py:169 (incremented in RingGroup._ring)", - "name": "genesis.ring_group.operations", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "mode", - "source": "RingMode enum value (parallel/sequential/balancing)" - }, - { - "name": "has_balancer", - "source": "str(bool): whether a 
LoadBalancerBackend is configured and mode == RingMode.BALANCING" - } - ], - "description": "Duration of ring group operations", - "location": "genesis/group/ring.py:33 (created); genesis/group/ring.py:178 (recorded in RingGroup._ring)", - "name": "genesis.ring_group.operation.duration", - "type": "histogram", - "unit": "s" - }, - { - "attributes": [ - { - "name": "mode", - "source": "RingMode enum value (parallel/sequential/balancing)" - }, - { - "name": "result", - "source": "literal 'answered' or 'no_answer' on success path; literal 'error' on exception path" - }, - { - "name": "has_balancer", - "source": "str(bool): whether a LoadBalancerBackend is configured and mode == RingMode.BALANCING" - }, - { - "name": "error", - "source": "exception class name type(e).__name__; only on error path at ring.py:207" - } - ], - "description": "Ring group operation results", - "location": "genesis/group/ring.py:39 (created); genesis/group/ring.py:187,207 (incremented in RingGroup._ring success and error paths)", - "name": "genesis.ring_group.results", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "queue.id", - "source": "queue_id argument passed to Queue.slot()" - }, - { - "name": "op", - "source": "literal 'acquire' (on enter) or 'release' (on exit)" - } - ], - "description": "Queue slot acquire/release operations", - "location": "genesis/queue/core.py:22 (created); genesis/queue/core.py:92,101 (incremented in QueueSlot.__aenter__/__aexit__)", - "name": "genesis.queue.operations", - "type": "counter", - "unit": "1" - }, - { - "attributes": [ - { - "name": "queue.id", - "source": "queue_id argument passed to Queue.slot()" - } - ], - "description": "Time spent waiting for a slot", - "location": "genesis/queue/core.py:27 (created); genesis/queue/core.py:91 (recorded in QueueSlot.__aenter__ after wait_and_acquire)", - "name": "genesis.queue.wait_duration", - "type": "histogram", - "unit": "s" - } - ], - "gaps": [ - "No ObservableCounter / 
ObservableGauge instruments anywhere in genesis/ — only synchronous Counter, Histogram, and UpDownCounter are used. There is no async gauge for in-flight values (e.g. current queue depth, current active calls by state).", - "genesis/connections.errors is only created and incremented in genesis/inbound.py. The Outbound class (genesis/outbound.py) defines no connection error counter — outbound connect/handle failures are not recorded as a metric, only the active_connections UpDownCounter is decremented.", - "No metrics instrumentation in genesis/session.py — Session.send/sendmsg operations are not independently metered (they rely on the inherited Protocol.send counters).", - "No metrics instrumentation in genesis/consumer.py — the high-level Consumer class adds no metrics of its own; it only inherits Inbound counters.", - "No metrics instrumentation in genesis/group/load_balancer.py or the Redis load balancer backend — backend eviction/rotation/redis errors are not metered.", - "genesis/channel.operations, genesis.channel.operation.duration, genesis.channel.hangup.causes, genesis.channel.bridge.operations, genesis.channel.dtmf.received, genesis.call.duration, and genesis.timeouts are each defined twice with identical names: once in genesis/protocol/metrics.py and again in genesis/channel.py (comment: 'to avoid circular imports'). 
Both call meter.create_counter on the same meter/name; OTel deduplicates by identity but the duplicate definitions are a maintenance hazard.", - "call_duration_histogram.record at genesis/channel.py:573 records with NO attributes — the channel UUID / hangup cause are not attached, so call duration cannot be broken down by channel or cause.", - "command_duration_histogram at genesis/protocol/base.py:347 records on every send including commands without a command_name (command_name falsy guard skips both the counter add and the histogram record), so no-op/empty command names are unmeasured.", - "No metric covers ESL event processing/dispatch latency (time from event parse to handler completion) — only events_received (count) and routing hits/fallback (count) exist.", - "No metric covers command queue depth (self.commands queue length in Protocol) or the events queue length — backpressure is not observable via metrics." - ], - "notes": "All instruments are created via the OpenTelemetry API meter (opentelemetry.metrics.get_meter) at module load. Five modules declare their own meter via metrics.get_meter(__name__): genesis/protocol/metrics.py, genesis/channel.py, genesis/inbound.py, genesis/outbound.py, genesis/group/ring.py, genesis/queue/core.py. The CLI (genesis/cli/__init__.py:78) installs a metrics meter provider. Meters are lazily registered with a provider, so instruments are no-ops until a provider is configured. Total: 20 distinct metric instruments (counting the duplicated channel metrics as a single instrument each): 13 counters, 5 histograms, 2 up_down_counters, 0 observable/gauge instruments. All add()/record() call sites are wrapped in try/except (either directly or via the _safe_connection_metric helper in inbound/outbound and the best-effort pattern in base.py and routing modules), so metrics failures never break ESL processing. Attribute naming mixes conventions: dotted OTel-style (e.g. 
'channel.uuid', 'hangup.cause', 'dtmf.digit', 'queue.id') for span/attribute constants defined in channel.py and queue/core.py, snake_case (e.g. 'event_name', 'content_type', 'has_balancer', 'op') for telemetry.py, routing, ring.py and queue op attributes, and bare labels ('command', 'operation', 'success', 'mode', 'result', 'type', 'error') in several places." - }, - "genesisEvents": { - "events": [ - { - "event_name": "HEARTBEAT", - "fields_used": [ - "Event-Name", - "Content-Type", - "Event-Info", - "Up-Time", - "Session-Count", - "Max-Sessions", - "FreeSWITCH-Version" - ], - "handled_in": "genesis/consumer.py: Consumer.start() registers `protocol.on('HEARTBEAT', observability.record_heartbeat)` (global handler). observability/server.py:record_heartbeat is the sink. No channel-specific handler.", - "routing_info_attached": [ - "Event-Name (routing key, global only — no Unique-ID)", - "event.name span attribute", - "event_name metric attribute" - ] - }, - { - "event_name": "CHANNEL_CREATE", - "fields_used": [ - "Event-Name", - "Unique-ID", - "Channel-State", - "Channel-Call-UUID", - "Channel-Name", - "Call-Direction", - "Answer-State", - "Caller-Caller-ID-Number", - "Caller-Destination-Number", - "Caller-Context", - "Caller-Dialplan", - "Caller-Unique-ID", - "variable_uuid", - "variable_call_uuid" - ], - "handled_in": "No explicit handler in genesis/. Emitted by FreeSWITCH on originate (Channel.create sends `api originate ... &park()`). Routed only via GlobalRoutingStrategy (test_payloads.py:channel_create is the canonical fixture). 
Consumer handlers can subscribe via `@consumer.handle('CHANNEL_CREATE')` or `filtrate(...)`.", - "routing_info_attached": [ - "Event-Name + Unique-ID (ChannelRoutingStrategy key, if a channel handler is registered)", - "event.name/event.uuid span attrs", - "direction (Call-Direction), channel_state (Channel-State), answer_state (Answer-State) metric attrs", - "Caller-Destination-Number/Caller-Context/Caller-Dialplan/Caller-Caller-ID-Number propagated as event.header.* span attrs and via context.update(event) when a Channel adopts the event" - ] - }, - { - "event_name": "CHANNEL_STATE", - "fields_used": [ - "Event-Name", - "Unique-ID", - "Channel-State", - "Channel-State-Number", - "Channel-Call-State", - "variable_*" - ], - "handled_in": "genesis/channel.py: Channel._state_handler filters on `event.get('Unique-ID') == self.uuid`, reads `Channel-State`, maps via ChannelState.from_freeswitch (strips 'CS_' prefix), updates self._state and self._state_changes timestamps, then `self.context.update(event)`. Registered globally in Channel.create via `protocol.on('CHANNEL_STATE', self._state_handler)` and per-channel (O(1)) in Channel.from_session via `protocol.register_channel_handler(uuid, 'CHANNEL_STATE', ...)`. 
Also registered transiently in _wait_for_state and _wait_for_event.", - "routing_info_attached": [ - "Unique-ID (O(1) channel_registry key '{uuid}:CHANNEL_STATE')", - "Channel-State → ChannelState enum (NEW/INIT/ROUTING/SOFT_EXECUTE/EXECUTE/EXCHANGE_MEDIA/PARK/CONSUME_MEDIA/HIBERNATE/RESET/HANGUP/REPORTING/DESTROY/NONE)", - "channel_state metric attribute", - "channel.state span attribute on channel.wait spans", - "full event dict merged into Channel.context (ContextType) for downstream use" - ] - }, - { - "event_name": "CHANNEL_ANSWER", - "fields_used": [ - "Event-Name", - "Unique-ID", - "Answer-State" - ], - "handled_in": "genesis/channel.py: _wait_for_state registers an answer_handler via `protocol.on('CHANNEL_ANSWER', ...)` when target_state == EXECUTE; _on_answer_received filters on Unique-ID match and sets the answer_received Event so EXECUTE is considered reached. _wait_for_event lists CHANNEL_ANSWER in channel_specific_events (Unique-ID filtered). No permanent handler.", - "routing_info_attached": [ - "Unique-ID (filter inside handler to match self.uuid)", - "Answer-State metric attribute", - "event.name/event.uuid span attrs", - "transient global registration via protocol.on, removed in finally block" - ] - }, - { - "event_name": "CHANNEL_HANGUP", - "fields_used": [ - "Event-Name", - "Unique-ID", - "Hangup-Cause" - ], - "handled_in": "No dedicated handler. Referenced as a waitable event name in Channel.wait() docstring (e.g. `await channel.wait('CHANNEL_HANGUP')`) and used as a routing fixture in tests/test_routing.py. 
Routed via GlobalRoutingStrategy / _wait_for_event path.", - "routing_info_attached": [ - "Event-Name (global routing key)", - "Hangup-Cause metric attribute (when present)", - "event.name/event.uuid span attrs" - ] - }, - { - "event_name": "CHANNEL_HANGUP_COMPLETE", - "fields_used": [ - "Event-Name", - "Unique-ID", - "Hangup-Cause", - "Channel-Unique-ID" - ], - "handled_in": "genesis/session.py: hangup_complete_handler compares `session.context.get('Channel-Unique-ID') == event.get('Unique-ID')`, then pushes to session.fifo and signals completion. Registered via register_channel_handler (O(1)) when session.uuid is known, else via protocol.on. genesis/channel.py: listed in channel_specific_events in _wait_for_event (Unique-ID filtered).", - "routing_info_attached": [ - "Unique-ID + Channel-Unique-ID (match session/channel leg)", - "O(1) channel_registry key '{uuid}:CHANNEL_HANGUP_COMPLETE' when session.uuid present", - "Hangup-Cause metric attribute → hangup_causes_counter", - "event.name/event.uuid span attrs" - ] - }, - { - "event_name": "CHANNEL_EXECUTE_COMPLETE", - "fields_used": [ - "Event-Name", - "Unique-ID", - "Application-UUID", - "Application", - "Application-Response" - ], - "handled_in": "genesis/session.py: execute_complete_handler matches `event.get('Application-UUID') == event_uuid` (the UUID assigned in _build_sendmsg_cmd via Event-UUID header), pushes to session.fifo, signals the block-completion Event. Registered via register_channel_handler or protocol.on in _awaitable_complete_command. 
genesis/protocol/telemetry.py: _log_channel_event logs Application and Application-Response when name == 'CHANNEL_EXECUTE_COMPLETE'.", - "routing_info_attached": [ - "Application-UUID (correlates the response to the sendmsg execute command)", - "Unique-ID (channel_registry key for O(1) routing)", - "Application/Application-Response → span/log attributes", - "event.name/event.uuid span attrs" - ] - }, - { - "event_name": "CHANNEL_EXECUTE", - "fields_used": [ - "Event-Name", - "Unique-ID", - "Application-UUID", - "Application" - ], - "handled_in": "Referenced in genesis/session.py docstring as the companion to CHANNEL_EXECUTE_COMPLETE (the Application-UUID header appears in both). Covered by tests/test_reader_fsm.py (event-lock splitting). Routed generically; no dedicated handler.", - "routing_info_attached": [ - "Application-UUID (correlation with sendmsg execute)", - "Unique-ID (channel_registry key if registered)", - "event.name span attr" - ] - }, - { - "event_name": "CUSTOM", - "fields_used": [ - "Event-Name", - "Event-Subclass", - "Unique-ID", - "Caller-*", - "Channel-*", - "variable_*" - ], - "handled_in": "genesis/protocol/routing/base.py (and channel.py, global_.py, composite.py): get_event_name returns `event.get('Event-Subclass')` when Event-Name == 'CUSTOM', so routing keys off the subclass (e.g. 'sofia::register', 'mod_audio_stream::play'). Tests/payloads.py provides `register` (sofia::register) and `mod_audio_stream_play` (mod_audio_stream::play) fixtures. 
Consumer._filter_command emits `filter Event-Subclass {event}` for non-uppercase names.", - "routing_info_attached": [ - "Event-Subclass (routing key replaces Event-Name for CUSTOM)", - "Unique-ID (channel_registry key when present)", - "event_subclass metric attribute", - "event.name='CUSTOM' span attr + event.header.event_subclass" - ] - }, - { - "event_name": "DTMF", - "fields_used": [ - "Event-Name", - "Unique-ID", - "DTMF-Digit" - ], - "handled_in": "genesis/channel.py: on_dtmf decorator builds an async dtmf_handler that reads `event.get('DTMF-Digit')`, optionally filters by a specific digit, records dtmf_received_counter, and invokes the user callback. Registered via `protocol.on('DTMF', dtmf_handler)` (global).", - "routing_info_attached": [ - "DTMF-Digit (digit value + dtmf.digit span/metric attribute)", - "Unique-ID (event present but handler does not filter by it — global routing)", - "dtmf_received_counter incremented with dtmf.digit attr" - ] - }, - { - "event_name": "BACKGROUND_JOB", - "fields_used": [ - "Event-Name", - "Job-UUID", - "Job-Command", - "Job-Command-Arg", - "Content-Length", - "Content-Type" - ], - "handled_in": "No dedicated handler. tests/payloads.py:background_job is the fixture (text/event-plain body with +OK result). Routed via GlobalRoutingStrategy. Used to correlate api/bgapi command responses (Job-UUID).", - "routing_info_attached": [ - "Event-Name (global routing key)", - "Job-UUID (correlation with bgapi command, not explicitly consumed in genesis core)", - "event.name span attr" - ] - }, - { - "event_name": "RELOADXML", - "fields_used": [ - "Event-Name", - "Core-UUID", - "FreeSWITCH-Hostname", - "Content-Length" - ], - "handled_in": "No dedicated handler. tests/payloads.py:custom fixture has `Event-Name: RELOADXML` (a standalone event name, NOT a CUSTOM event). 
Routed via GlobalRoutingStrategy under the key 'RELOADXML'.", - "routing_info_attached": [ - "Event-Name (global routing key)", - "event.name span attr" - ] - }, - { - "event_name": "auth/request (Content-Type)", - "fields_used": [ - "Content-Type" - ], - "handled_in": "genesis/protocol/processors.py: auth_request_processor — when `event.get('Content-Type') == 'auth/request'`, sets protocol.authentication_event so Inbound.authenticate() can proceed.", - "routing_info_attached": [ - "Content-Type (processor matching, not routed to user handlers)", - "event.content_type span attr" - ] - }, - { - "event_name": "command/reply (Content-Type)", - "fields_used": [ - "Content-Type", - "Reply-Text" - ], - "handled_in": "genesis/protocol/processors.py: command_reply_processor enqueues the event into protocol.commands queue so Protocol.send() can return the response. Reply-Text is inspected in send() for '-ERR' prefix to record command_errors_counter.", - "routing_info_attached": [ - "Content-Type (processor matching)", - "Reply-Text (command.reply span attr, error detection)", - "routed into commands queue, not to user handlers" - ] - }, - { - "event_name": "api/response (Content-Type)", - "fields_used": [ - "Content-Type", - "body" - ], - "handled_in": "genesis/protocol/processors.py: api_response_processor enqueues into protocol.commands queue for Protocol.send(). FSM treats api/response as a body-only content type (event.body holds the result).", - "routing_info_attached": [ - "Content-Type (processor + FSM _API_RESPONSE_TYPES)", - "event.content_type span attr", - "routed into commands queue, not to user handlers" - ] - }, - { - "event_name": "text/rude-rejection / text/disconnect-notice (Content-Type)", - "fields_used": [ - "Content-Type", - "Content-Disposition" - ], - "handled_in": "genesis/protocol/processors.py: disconnect_processor calls protocol.stop() unless `Content-Disposition == 'linger'`. 
FSM lists rude-rejection in _API_RESPONSE_TYPES.", - "routing_info_attached": [ - "Content-Type (processor matching)", - "Content-Disposition (linger check — when 'linger', session.is_lingering stays True and disconnect is suppressed)", - "event.content_type span attr" - ] - }, - { - "event_name": "text/event-plain (Content-Type)", - "fields_used": [ - "Content-Type", - "Content-Length", - "Event-Name", - "Event-Subclass", - "all body headers" - ], - "handled_in": "genesis/protocol/reader_fsm.py: _parse_headeronly_content merges the body headers (parse_headers) into the event when content_type == 'text/event-plain'. _parse_event_content handles event-lock:true by splitting on 'Event-Name:' boundaries to emit multiple ESLEvent instances.", - "routing_info_attached": [ - "Event-Name / Event-Subclass (set on the merged event for downstream routing)", - "event-lock splitting propagates Content-Length/Content-Type to each split event", - "all body headers become routable fields on the resulting ESLEvent" - ] - } - ], - "routing_strategies": [ - "ChannelRoutingStrategy (O(1)): looks up `channel_registry['{Unique-ID}:{event_name}']`. event_name is Event-Name, except for CUSTOM where Event-Subclass is used. Registered via Protocol.register_channel_handler(uuid, event_name, handler). On hit returns (handlers, should_stop=True), terminating the chain. Increments genesis.channel.routing.hits with event_name attribute. Used by Channel.from_session (CHANNEL_STATE) and Session._awaitable_complete_command (CHANNEL_EXECUTE_COMPLETE, CHANNEL_HANGUP_COMPLETE).", - "GlobalRoutingStrategy (O(N)): looks up `handlers[event_name]` plus wildcard `handlers['*']`; returns (specific + generic, should_stop=False). Registered via Protocol.on(key, handler) / Consumer.handle(event). Increments genesis.channel.routing.fallback. 
Never stops the chain (so composite could continue, though it is the last strategy).", - "CompositeRoutingStrategy: chains [ChannelRoutingStrategy, GlobalRoutingStrategy] in order. Tries each in turn; returns handlers from the first strategy that yields any; honors should_stop. Wired in Protocol.__init__ as self.routing_strategy.", - "Dispatch: dispatch_to_handlers schedules each handler as an asyncio task (create_task for coroutines, to_thread wrapper for sync handlers) and tracks them in Protocol.handler_tasks with a done-callback that logs unhandled exceptions.", - "Event-name extraction (get_event_name, shared by all strategies): `identifier = event.get('Event-Name')`; if identifier == 'CUSTOM', returns `event.get('Event-Subclass')`; else returns identifier. None/missing → strategy returns ([], False).", - "Subscription filters (Outbound/Inbound side, not routing per se but shape what reaches the router): `events plain ALL` subscribes to all events; `filter Unique-ID {uuid}` restrictes to a channel; `filter Event-Name {X}` for uppercase event names; `filter Event-Subclass {X}` for CUSTOM subclasses (Consumer._filter_command)." - ], - "channel_lifecycle": "Channel state is modeled by `ChannelState(IntEnum)` in genesis/types.py, ordered to mirror FreeSWITCH's CS_* progression: NEW(0) → INIT(1) → ROUTING(2) → SOFT_EXECUTE(3) → EXECUTE(4) → EXCHANGE_MEDIA(5) → PARK(6) → CONSUME_MEDIA(7) → HIBERNATE(8) → RESET(9) → HANGUP(10) → REPORTING(11) → DESTROY(12) → NONE(13). `ChannelState.from_freeswitch(state_str)` strips the `CS_` prefix and resolves the enum name. Transitions are event-driven: Channel._state_handler (registered for CHANNEL_STATE) compares `event['Unique-ID'] == self.uuid`, parses `Channel-State`, and if the new state differs, records a timestamp in `_state_changes[new_state]` and updates `_state`. The IntEnum ordering is exploited for guards: `Channel.wait` short-circuits when `self.state >= ChannelState.HANGUP` (i.e. 
HANGUP/REPORTING/DESTROY) and `Channel.bridge` refuses when `self.state >= HANGUP`. `_wait_for_state` waits until `event_state >= ChannelState.HANGUP` OR equals the target; for EXECUTE it additionally waits for a CHANNEL_ANSWER event (answer_received Event) because CS_EXECUTE alone does not guarantee the leg is answered. RingGroup relies on `ch.wait(ChannelState.EXECUTE)` as the 'answered' signal in PARALLEL/SEQUENTIAL/BALANCING modes, and cleans up non-answered legs by checking `ch.state >= ChannelState.HANGUP` before issuing hangup. Hangup is invoked via `api uuid_kill {uuid} {cause}` (Inbound) or sendmsg `execute hangup {cause}` (Session); the cause is recorded in hangup_causes_counter and the total call duration in call_duration_histogram. In Outbound mode, Channel.from_session seeds state from the Session context's `Channel-State` and registers an O(1) CHANNEL_STATE handler so subsequent transitions update the same Channel object. The Session itself models command lifecycle via _awaitable_complete_command, which registers CHANNEL_EXECUTE_COMPLETE (matched by Application-UUID) and CHANNEL_HANGUP_COMPLETE (matched by Channel-Unique-ID/Unique-ID) and signals a per-command Event so blocking sendmsg calls can return the corresponding ESLEvent from session.fifo.", - "notes": "Relevant files (absolute paths): /Users/vitorhugo/Projects/Genesis/genesis/protocol/base.py (Protocol, handler/consume loop, _process_one_event, register_channel_handler/on, send), /Users/vitorhugo/Projects/Genesis/genesis/protocol/reader_fsm.py (ESLReaderFSM: READING_HEADERS→READING_BODY, event-lock splitting, text/event-plain merging), /Users/vitorhugo/Projects/Genesis/genesis/protocol/parser.py (parse_headers, ESLEvent UserDict), /Users/vitorhugo/Projects/Genesis/genesis/protocol/processors.py (auth_request, command_reply, api_response, disconnect processors), /Users/vitorhugo/Projects/Genesis/genesis/protocol/routing/{base,channel,composite,global_,dispatcher}.py (Strategy pattern), 
/Users/vitorhugo/Projects/Genesis/genesis/protocol/telemetry.py (build_event_attributes, build_metric_attributes, log_event), /Users/vitorhugo/Projects/Genesis/genesis/protocol/metrics.py (OTel counters/histograms), /Users/vitorhugo/Projects/Genesis/genesis/channel.py (Channel, _state_handler, wait, on_dtmf, bridge/hangup/answer), /Users/vitorhugo/Projects/Genesis/genesis/session.py (Session, sendmsg, _awaitable_complete_command, CHANNEL_EXECUTE_COMPLETE/CHANNEL_HANGUP_COMPLETE correlation), /Users/vitorhugo/Projects/Genesis/genesis/inbound.py, /Users/vitorhugo/Projects/Genesis/genesis/outbound.py (_setup_session, connect reply → session.context, filter Unique-ID, linger), /Users/vitorhugo/Projects/Genesis/genesis/consumer.py (Consumer.handle decorator, filtrate, filter Event-Name/Event-Subclass), /Users/vitorhugo/Projects/Genesis/genesis/types.py (ChannelState enum, HangupCause literal, EventHandler), /Users/vitorhugo/Projects/Genesis/genesis/group/ring.py (RingGroup EXECUTE wait, hangup cleanup), /Users/vitorhugo/Projects/Genesis/tests/payloads.py (canonical ESL fixtures: heartbeat, channel_create, background_job, custom/RELOADXML, register/sofia::register, mod_audio_stream_play, connect, channel_state, dtmf, channel_answer).\n\nKey observations:\n- Genesis does NOT explicitly handle CHANNEL_PROGRESS, CHANNEL_BRIDGE, CHANNEL_UNBRIDGE, CALL_UPDATE, CODEC, or RING_BACK anywhere in genesis/. These names do not appear as string literals in the source. They would still be *routed* if subscribed (events plain ALL) because routing is generic on Event-Name, but no built-in handler consumes them. CHANNEL_BRIDGE/UNBRIDGE legs (Other-Leg-Unique-ID, Bridge-A-Unique-ID, etc.) 
are not parsed; bridging is performed via `api uuid_bridge` / sendmsg `execute bridge` and tracked only through Reply-Text and bridge_operations_counter, not through bridge events.\n- Routing-relevant fields actually consumed by code: Event-Name, Event-Subclass (CUSTOM), Unique-ID, Channel-State, Channel-Name, Channel-Unique-ID, Application-UUID, Application, Application-Response, DTMF-Digit, Reply-Text, Content-Type, Content-Disposition, Content-Length. Caller-Destination-Number, Caller-Context, Caller-Dialplan, Caller-Caller-ID-Number, Channel-Call-UUID and variable_* are present in payloads and propagated generically (context.update(event) and event.header.* span attributes) but no genesis module explicitly extracts them — they are available to user handlers via the ESLEvent dict and to the `filtrate(key, value)` decorator for arbitrary key/value filtering in Consumer.\n- The `connect` command reply (Outbound mode) is the seed of session.context: it carries Channel-Unique-ID, Channel-Context, Channel-Destination-Number, Channel-Caller-ID-Number, Channel-State, Channel-Name, Unique-ID, plus the full Caller-* and variable_* set. Channel.from_session then derives uuid (session.context['Unique-ID']), dial_path (Channel-Name), and initial state (Channel-State).\n- OTel attribute propagation: build_event_attributes emits EVERY event header as a span attribute (event.name for Event-Name, event.uuid for Unique-ID, event.content_type for Content-Type, event.header. for everything else). build_metric_attributes attaches event_name, content_type, event_subclass, direction (Call-Direction), channel_state (Channel-State), answer_state (Answer-State), hangup_cause (Hangup-Cause) to the events_received_counter. 
These are the routing-relevant fields that flow downstream into traces and metrics.\n- Wildcard handlers: GlobalRoutingStrategy merges `handlers['*']` with `handlers[event_name]`, so a Consumer/Protocol can register a catch-all handler under the key '*'.\n- The FSM splits event-lock:true payloads on '\\nEvent-Name: ' boundaries, producing multiple ESLEvent objects that share the base Content-Length/Content-Type and body — this is how chained execute-app commands (e.g. multi-app sendmsg) appear as separate routable events." - }, - "snifferSignals": { - "repo_summary": "Otoru/sniffer is a passive VoIP observability platform written in Go. It captures SIP signaling and RTP/RTCP media off the wire (AF_PACKET zero-copy on Linux, libpcap elsewhere), correlates packets into calls, computes call quality (MOS via ITU-T G.107 R-factor, jitter, packet loss, silence ratio), detects fraud (RCC/SCD/HFR/auth-flood), records PCAP/WAV per call (with S3 upload), and exports everything as OpenTelemetry traces + metrics over OTLP/HTTP. It ships as two binaries: `sniffer` (capture/correlation) and `sniffer-mcp` (MCP server for AI agents), plus a Claude Code plugin with analysis skills. 
Supports standalone single-sensor mode or distributed mode (1 sensor-sip + N sensor-rtp) sharing a Redis-protocol datastore.", - "captures": "Capture layer is a two-stage decoupled pipeline per sensor (internal/infra/capture, internal/app/pipeline):\nNIC -> kernel ring (AF_PACKET TPACKET_V3 ring, SNIFFER_CAPTURE_BUFFER_MB, default 256 MiB; pcap backend on non-Linux) -> readLoop (drains ring via ZeroCopyReadPacketData on afpacket, no heap copy for L4 bytes) -> rawCh (8192) -> parseLoop (defrag -> dedup -> extract) -> parsedCh (8192) -> sharded workers (sipLoop sharded by Call-ID, rtpLoop sharded by SSRC).\n\nPacket parsing uses gopacket DecodingLayerParser fast path (>99% of RTP = UDP unfragmented, reuses pre-allocated Ethernet/IPv4/IPv6/UDP/TCP structs, no layer-object allocs); fallback creates gopacket.Packet for IP fragments, GRE/VXLAN/ERSPAN tunnels, ICMP. A size-classed payload buffer pool (internal/infra/capture/datapool.go) gives zero-alloc on the hot path (~50k pkt/s) with idempotent slot release (no double-free, get==put balance tested). Backpressure: when rawCh is full the packet is dropped with stage=capture rather than blocking (keeps kernel ring draining); a watermark-pause/backpressure circuit breaker can also shed load (sniffer.pipeline.watermark_drop_total).\n\nWhat it captures: SIP messages (request/response, SDP, REGISTER, OPTIONS keepalives) on SNIFFER_SIP_PORTS (default 5060,5061); RTP streams (per-SSRC seq-gap loss, RFC 3550 jitter, MOS/R-factor, jitter-buffer simulation at 50/200/500 ms, G.711 silence detection); RTCP (peer-reported loss/jitter, LSR/DLSR RTT); SIP registrations; fraud events. Sensors are roles: sensor-sip runs a BPF like \"port 5060\" and emits CDR spans; sensor-rtp runs \"udp portrange 10000-20000\" and only publishes RTP stats to the datastore. PCAP written as LINKTYPE_RAW(101) with synthetic IP+UDP headers; WAV only for decodable G.711. 
Sources: live interface, pcap file, pcap dir (batch), or stdin (`-`).", - "call_identification": "A call is identified by the SIP Call-ID header (voip.call_id). The sensor-sip extracts Call-ID, caller/callee URIs (From/To), SDP media IP:port endpoints and writes them to the shared Redis-protocol datastore under `voip:ep:{ip}|{port}` (EndpointData: call_id, is_caller, codec_pt, connect_time_us) and `voip:call:{call_id}` (state, caller/callee, timestamps). RTP sensors correlate by batch-GETting endpoints for each packet's Src+Dst IP:port; when a packet matches by destination side the sensor inverts the rtp.side marking (so caller-vs-callee is correct even when RTP races the 200 OK). RTP stats are aggregated per call_id+ssrc under `voip:rtp:{call_id}:{ssrc}`. On BYE/CANCEL/4xx-6xx/timeout the sensor-sip PUBLISHes `voip:rtp:flush_req` so RTP sensors flush final samples, then GETs and merges cross-sensor stats before emitting the complete `voip.call` span tree. Orphan RTP (no correlated signaling) is classified as rtp_flow vs noise (sniffer.orphan.classified). Registrations are keyed by AOR (user@domain); keepalives by Call-ID of the OPTIONS dialog.", - "otel_signals": { - "logs": "No OpenTelemetry Logs SDK signals. internal/platform/otel/provider.go only initializes a TracerProvider and MeterProvider (no LoggerProvider). Application logging is plain Go slog (logfmt or json via SNIFFER_LOG_FORMAT), not exported via OTel logs. 
Go runtime metrics are emitted automatically by the OTel contrib runtime instrumentation (process.runtime.go.* goroutines/memory/schedule), and continuous profiling (CPU/alloc/inuse/goroutines) is sent to Grafana Pyroscope when SNIFFER_PYROSCOPE_URL is set.", - "metrics": [ - { - "name": "sniffer_info", - "type": "gauge" - }, - { - "name": "voip.calls.total", - "type": "counter" - }, - { - "name": "voip.calls.answered", - "type": "counter" - }, - { - "name": "voip.calls.failed", - "type": "counter" - }, - { - "name": "voip.calls.timeout", - "type": "counter" - }, - { - "name": "voip.calls.muted", - "type": "counter" - }, - { - "name": "voip.calls.one_way_audio", - "type": "counter" - }, - { - "name": "voip.calls.active", - "type": "gauge" - }, - { - "name": "voip.call.duration_s", - "type": "histogram" - }, - { - "name": "voip.call.mos", - "type": "histogram" - }, - { - "name": "voip.call.jitter_ms", - "type": "histogram" - }, - { - "name": "voip.call.loss_pct", - "type": "histogram" - }, - { - "name": "voip.call.silence_ratio", - "type": "histogram" - }, - { - "name": "voip.rtp.streams.active", - "type": "gauge" - }, - { - "name": "voip.keepalives.total", - "type": "counter" - }, - { - "name": "voip.keepalive.rtt_ms", - "type": "histogram" - }, - { - "name": "voip.registrations.active", - "type": "gauge" - }, - { - "name": "sniffer.packets.dropped", - "type": "counter" - }, - { - "name": "sniffer.orphan.classified", - "type": "counter" - }, - { - "name": "sniffer.pcap.kernel_drops", - "type": "gauge" - }, - { - "name": "sniffer.pcap.if_drops", - "type": "gauge" - }, - { - "name": "sniffer.redis.write_dropped", - "type": "counter" - }, - { - "name": "sniffer.pipeline.watermark_pause_total", - "type": "counter" - }, - { - "name": "sniffer.pipeline.watermark_drop_total", - "type": "counter" - }, - { - "name": "sniffer.sip.filtered", - "type": "counter" - }, - { - "name": "sniffer.spool.usage_gb", - "type": "gauge" - }, - { - "name": "sniffer.spool.free_pct", - 
"type": "gauge" - }, - { - "name": "sniffer.fraud.rcc.threshold", - "type": "gauge" - }, - { - "name": "sniffer.fraud.rcc.window_s", - "type": "gauge" - }, - { - "name": "sniffer.fraud.scd.min_duration_s", - "type": "gauge" - }, - { - "name": "sniffer.fraud.scd.window_s", - "type": "gauge" - }, - { - "name": "sniffer.fraud.hfr.pct", - "type": "gauge" - }, - { - "name": "sniffer.fraud.hfr.window_s", - "type": "gauge" - }, - { - "name": "sniffer.fraud.auth_flood.threshold", - "type": "gauge" - }, - { - "name": "sniffer.fraud.auth_flood.window_s", - "type": "gauge" - }, - { - "name": "sniffer.overload.state", - "type": "gauge" - }, - { - "name": "process.fd.open", - "type": "gauge" - }, - { - "name": "process.fd.limit", - "type": "gauge" - }, - { - "name": "process.fd.ratio", - "type": "gauge" - }, - { - "name": "process.memory.rss_bytes", - "type": "gauge" - }, - { - "name": "process.memory.vsize_bytes", - "type": "gauge" - }, - { - "name": "process.cpu.user_seconds_total", - "type": "counter" - }, - { - "name": "process.cpu.system_seconds_total", - "type": "counter" - }, - { - "name": "process.context_switches.voluntary_total", - "type": "counter" - }, - { - "name": "process.context_switches.nonvoluntary_total", - "type": "counter" - }, - { - "name": "process.page_faults.major_total", - "type": "counter" - }, - { - "name": "process.page_faults.minor_total", - "type": "counter" - }, - { - "name": "process.io.read_bytes_total", - "type": "counter" - }, - { - "name": "process.io.write_bytes_total", - "type": "counter" - }, - { - "name": "process.threads", - "type": "gauge" - }, - { - "name": "process.uptime_seconds", - "type": "gauge" - } - ], - "spans": [ - { - "attributes": [ - "voip.call_id", - "voip.call.state (completed|failed|cancelled|interrupted|timeout)", - "sensor.id", - "sensor.ip", - "voip.caller.number", - "voip.caller.domain", - "voip.caller.ip", - "voip.caller.user_agent", - "voip.callee.number", - "voip.callee.domain", - "voip.callee.ip", - 
"voip.callee.user_agent", - "client.address", - "server.address", - "voip.call.duration_s", - "voip.call.connect_duration_s", - "voip.call.ring_time_s", - "voip.call.post_dial_delay_s", - "voip.call.first_rtp_delay_s", - "voip.sip.final_response", - "voip.sip.final_response_text", - "voip.sip.termination_cause (normal_bye|cancel|timeout|rtp_timeout|failed|redirect|interrupted|unknown)", - "voip.sip.who_hung_up (caller|callee|unknown)", - "voip.recording.pcap", - "voip.recording.audio", - "voip.flags.nat_detected", - "voip.flags.rtp_reordered" - ], - "name": "voip.call" - }, - { - "attributes": [ - "sip.method (INVITE|BYE|CANCEL|ACK|...)", - "sip.from", - "sip.to", - "sip.contact", - "sip.user_agent", - "sip.sdp.media.ip", - "sip.sdp.media.port", - "sip.sdp.media.codec" - ], - "name": "voip.call.sip.request" - }, - { - "attributes": [ - "sip.response_code", - "sip.cseq_method", - "sip.reason", - "sip.from", - "sip.to", - "sip.contact", - "sip.user_agent", - "sip.sdp.media.ip", - "sip.sdp.media.port", - "sip.sdp.media.codec" - ], - "name": "voip.call.sip.response" - }, - { - "attributes": [ - "rtp.side (a=caller,b=callee)", - "rtp.ssrc", - "rtp.packets_received", - "rtp.packets_lost", - "rtp.loss_pct", - "rtp.jitter_avg_ms", - "rtp.jitter_max_ms", - "rtp.mos_avg", - "rtp.r_factor", - "rtp.mos_jb_50ms", - "rtp.mos_jb_200ms", - "rtp.mos_jb_500ms", - "rtp.src_ip", - "rtp.dst_ip", - "rtp.codec", - "rtp.payload_type", - "sensor.ip (local only)", - "rtp.rtcp.loss_pct", - "rtp.rtcp.jitter_avg_ms", - "rtp.rtcp.rtt_ms", - "rtp.silence_ratio" - ], - "name": "voip.rtp.stream" - }, - { - "attributes": [ - "voip.register.aor", - "voip.register.contact_ip", - "voip.register.user_agent", - "voip.register.expires_s", - "voip.register.auth_failed", - "voip.register.response_code", - "voip.register.reason (registered|renewed|failed|deregister|expired)", - "sensor.id", - "sensor.ip" - ], - "name": "voip.register" - }, - { - "attributes": [ - "sip.response_code", - "sip.reason" - ], - 
"name": "voip.register.transaction" - }, - { - "attributes": [ - "keepalive.from", - "keepalive.to", - "keepalive.result (success|timeout|failed)", - "keepalive.rtt_ms", - "sip.response_code", - "sip.reason", - "sensor.id", - "sensor.ip" - ], - "name": "voip.keepalive" - }, - { - "attributes": [ - "voip.fraud.rule (rcc|scd|hfr|auth_flood)", - "voip.fraud.key", - "voip.fraud.value", - "voip.fraud.threshold", - "voip.fraud.window_s" - ], - "name": "voip.fraud.alert" - } - ] - }, - "trace_propagation": "No SIP header injection and no W3C trace-context propagation. The sniffer is strictly passive (read-only on the wire); it never injects X-Tracespan, traceparent, tracestate, or any channel var. The OTel trace_id is generated by the OTel SDK (go.opentelemetry.io/otel/sdk/trace) when EmitCDRSpan / EmitRegistrationSpan / EmitKeepaliveSpan / EmitFraudAlert call tracer.Start() — sampling is head-based via sdktrace.AlwaysSample (default) or TraceIDRatioBased (SNIFFER_OTEL_SAMPLE_RATIO). The SIP Call-ID is NOT used as the trace_id; it is only recorded as the `voip.call_id` span attribute and used as the Redis correlation key. Cross-sensor correlation is done entirely through the Redis-protocol datastore (call_id keys, endpoint pub/sub, flush_req pub/sub), not via trace context propagation. Only the sensor-sip emits the full span tree (voip.call + children); sensor-rtp units emit no CDR spans at all — they publish stats that the sensor-sip merges at span-close. So a call's trace is a single-service tree produced at one node, not a distributed trace spanning sensors. All spans are SpanKind INTERNAL.", - "tech_stack": "Language: Go (module vitoru.fun/sniffer). Capture: gopacket + AF_PACKET TPACKET_V3 zero-copy on Linux (github.com/google/gopacket, internal/infra/capture/live_afpacket.go), libpcap fallback. SIP/SDP parsing: a custom minimal fastParse (~10 allocs vs ~78 for sipgo) in internal/protocol/sip, plus pion/sdp/v3 for SDP. 
RTP/RTCP: custom parsers in internal/protocol/rtp and internal/protocol/rtcp. Datastore: any Redis-protocol (RESP) compatible server — Redis, Dragonfly (recommended for high-volume multi-core), or Valkey — via go-redis (REDIS_URL). Pipeline: in-process channels (rawCh 8192, parsedCh 8192) + sharded workers, no external streaming broker. Observability export: OpenTelemetry SDK (otlptracehttp + otlpmetrichttp to OTEL_EXPORTER_OTLP_ENDPOINT, default http://localhost:4317) -> OTLP collector -> Jaeger/Tempo (traces) + Prometheus (metrics); optional Grafana Pyroscope for continuous profiling. Recording spool: local disk -> optional S3-compatible upload (AWS/OCI/MinIO/Ceph). Packaging: .deb/.rpm, Docker images (Dockerfile.sniffer, Dockerfile.mcp). MCP server: separate Go binary (cmd/mcp) exposing tools/resources/prompts over HTTP transport. CI: Woodpecker. License: proprietary with offline grace (internal/platform/license).", - "notes": "Read real source: internal/platform/otel/{span,metrics,emitter,register,keepalive,fraud,provider}.go, internal/platform/catalog/{spans,metrics}.go, internal/protocol/{sip/parse,register/register,keepalive/keepalive}.go, plus docs/architecture.md, docs/observability.md, docs/config.md, README.md. GitHub code-search API returned empty results (likely unauthenticated quota), so the absence of traceparent/X-Trace/TextMapPropagator was confirmed by reading provider.go (no propagator setup, no SetTextMapPropagator) and the SIP parser (no header extraction/injection of trace context). Span/metric names and attribute lists are authoritative from catalog/spans.go and catalog/metrics.go (the single source of truth used by both instrumentation and the MCP catalog:// resources). 
The docs/observability.md table lists a slightly older metric set; the catalog file is the complete superset (adds sniffer.orphan.classified, sniffer.pcap.kernel_drops, sniffer.pcap.if_drops, sniffer.redis.write_dropped, sniffer.pipeline.watermark_*, sniffer.sip.filtered, sniffer.overload.state, and all sniffer.fraud.* config gauges). Span status: Ok for answered+2xx, Error with description \\\"SIP \\\" when final >= 400 or termination=failed. SIP message child spans form a consecutive waterfall [msg ts -> next msg ts]; ACK capped to 1 ms. RTP stream span timestamps are independent of SIP (expose early-media overlap). Service name default is \\\"sauron\\\" (OTEL_SERVICE_NAME)." - }, - "snifferCorr": { - "trace_id_source": "The OTel trace_id is NOT derived from SIP. It is a random 128-bit ID auto-generated by the OpenTelemetry SDK tracer (internal/platform/otel/provider.go builds a stock sdktrace.TracerProvider; no custom Sampler, no trace.SetSpanContext, no ID-generator override). In internal/platform/otel/span.go EmitCDRSpan calls tracer.Start(ctx, \"voip.call\", ...) with the plain context passed from RunEmitter, so the SDK mints a fresh trace_id per call. The SIP Call-ID is only attached as the span attribute `voip.call_id` (set in setIdentityAttrs). The catalog (internal/platform/catalog/spans.go) labels `voip.call_id` as \"Unique call identifier (trace_id)\", i.e. the sniffer treats the SIP Call-ID as the *logical* trace identifier a human uses to find the trace, but the wire OTel trace_id is unrelated to it. Cross-sensor correlation of the same call is done by SIP Call-ID + From/To/Via tags (calltable keys calls by string(msg.CallID); Redis cross-sensor mode in internal/infra/redis merges ExternalRTPStats by Call-ID), not by propagating an OTel trace context.", - "span_hierarchy": "Single root span per call: `voip.call` (catalog.SpanOpCall), timestamped [InviteTimeUS → EndTimeUS], SpanKindInternal, set in EmitCDRSpan (internal/platform/otel/span.go). 
It carries identity (voip.call_id, sensor.id/ip), participants (caller/callee number/domain/IP/UA + semconv client.address/server.address), timing (duration_s, connect_duration_s, ring_time_s, post_dial_delay_s, first_rtp_delay_s), SIP outcome (final_response, termination_cause, who_hung_up), flags (nat_detected, rtp_reordered), recording flags, and span status (Error on >=400/ReasonFailed). Under that root there are two sibling child-span families, both created by passing the root span's ctx to tracer.Start (standard parent/child via SpanContext parentID — NOT span links):\n\n1) SIP message spans — emitSIPMessageSpans walks call.SIPMsgs in arrival order and emits one child per message: `voip.call.sip.request` (RespCode==0) or `voip.call.sip.response` (RespCode>0). Each span covers [msg.TimestampUS → next msg TimestampUS] (waterfall of SIP phases); the last message's span ends at call.EndTimeUS. ACK spans are capped to 1ms. Attributes: sip.method / sip.response_code + sip.cseq_method, sip.from, sip.to, sip.contact, sip.user_agent, sip.reason, and SDP audio ip/port/codec.\n\n2) RTP stream spans — emitRTPStreamSpans emits one `voip.rtp.stream` (catalog.SpanOpRTPStream) child per RTPStreamEntry (local) plus one per ExternalRTPStats (cross-sensor, merged via Redis). Each spans [FirstPacketUS → LastPacketUS], independently of SIP timing so early-media/183 overlap is visible. Attributes: rtp.side (\"a\"=caller / \"b\"=callee — derived from MediaEndpoints[epIdx].IsCaller with src/dst-match inversion in findOrCreateStream), rtp.ssrc, packets_received/lost, loss_pct, jitter avg/max, MOS avg + R-factor + MOS at JB 50/200/500ms, codec, payload_type, src_ip/dst_ip, sensor.ip, optional RTCP loss/jitter/rtt, optional rtp.silence_ratio.\n\nThere is no separate \"SIP transaction\" span layer keyed by Via/Branch; transactions are implicit in the per-message waterfall. 
Registration has its own root `voip.register` with `voip.register.transaction` children, and keepalive/fraud-alert are separate top-level ops — none of these are children of voip.call.", - "sip_rtp_relation": "SIP and RTP are correlated into the same trace purely through the SDP negotiated on the SIP dialog. Flow (internal/domain/calltable): ProcessSIP → handleINVITE → buildCall populates Call.MediaEndpoints from the SDP c=/m= audio lines (sdpFirstAudioMedia in state.go). addEndpointsToIndex registers each endpoint IPPort into a per-shard O(1) endpointIndex and a global sync.Map (endpointGlobalIndex) for cross-shard O(1) lookup, including NAT aliases (NATAliases maps private SDP IP → public packet-source IP seen on the wire, learned by observing that the packet source differs from the SDP c= IP). For every RTP packet, UpdateRTP → resolveRTPEndpoint looks up pkt.Src and pkt.Dst in the global index, locks only the owning shard, and findOrCreateStream keys the stream by SSRC inside that Call. The matched endpoint index + srcMatched bool determines rtp.side (IsCaller), with dst-match inverting the side. RTCP reception reports (UpdateRTCP) are matched the same way (by packet endpoint → call, then by SSRC inside the call) and feed frac-lost/jitter/RTT back into the matching RTPStreamEntry. So inside a trace the SIP message spans and RTP stream spans are siblings under the shared `voip.call` root; the SIP dialog provides the identity + endpoint map, and the RTP streams provide the quality telemetry. RTP timestamps are independent of SIP (span [FirstPacketUS → LastPacketUS]), so a 183 Session Progress with early media shows RTP-stream spans that start before the 200 OK SIP-response span. RTP that never matches a known endpoint becomes an \"orphan\" (orphanBuffer) and is NOT traced.", - "esl_integration": "No. There is zero FreeSWITCH ESL integration in the repo. GitHub code search over the repo for \"freeswitch\", \"ESL\", and \"event_socket\" returned 0 matches each. 
No file path contains esl/freeswitch/fscli. The sniffer is strictly a passive packet processor: internal/infra/capture uses afpacket/raw-socket live capture (live_afpacket.go) plus pcap read, feeds a pipeline (internal/app/pipeline) that runs protocol detection (internal/protocol/detect) and parses SIP (internal/protocol/sip), RTP (internal/protocol/rtp), RTCP (internal/protocol/rtcp). State is held in the in-memory calltable (internal/domain/calltable) and persisted to Redis (internal/infra/redis) and ClickHouse (docker/clickhouse) for CDRs, with OTel spans exported via OTLP (internal/platform/otel). It never opens the FreeSWITCH Event Socket, never subscribes to CHANNEL_* / CALL_* events, never issues `api` commands, and never reads the spool. The local Genesis/CLAUDE.md describes an ESL library (the `genesis` package) but that is a sibling project, not the sniffer source.", - "missing_routing_info": [ - "Dialplan: which context/extension/condition block matched, and the ordered list of applications FreeSWITCH executed (set, bridge, playback, transfer, park, voicemail, IVR menu choices, queue routing, follow-me, ring-group selection).", - "Bridge a-leg / b-leg linking: FreeSWITCH creates two channels with distinct Channel-UUIDs for a bridge; sniffer keys on SIP Call-ID and sees two separate SIP dialogs with no explicit parent/child tie. 
The CHANNEL_BRIDGE event / Bridge-UUID / `Other-Leg` unique-id that links the legs lives only in ESL events, not in packets.", - "Transfers: blind (REFER) and especially attended transfers create new channels/dialogs; the post-transfer dialplan target and the linkage between the original and transferred leg are invisible from packets alone (and REFER may be absorbed inside FS without surfacing as a new SIP dialog on the sniffed interface).", - "FreeSWITCH Channel-UUID and channel variables: sniffer identifies calls by SIP Call-ID + From/To/Via tags, never by FS Channel-UUID; variables like `sip_from_user`, `destination_number`, `transfer_source`, `bridge_to`, `hangup_cause` (Q.850 cause code), `progress_uv`/`progress_time` are not on the wire.", - "Digit manipulation / translation: the called number after regex/translation rules and the actual egress destination may differ from the SIP R-URI/To visible on the sniffed leg.", - "Call direction from the switch's perspective (inbound vs outbound leg, which side is the FS-controlled channel) — sniffer only knows caller/callee IP/number from SIP headers.", - "Authentication outcome: 401/407 challenges are observed (applyChallenge in state.go extends DestroyAt), but whether the digest auth succeeded/failed and which user authenticated is not in the SIP response.", - "Transcoding and codec negotiation decisions on the FS side (announced codecs come from SDP, but whether FS transcoded PCMU↔opus is an internal FS fact).", - "Hangup cause code (Q.850) and who initiated teardown inside FS vs on the wire — sniffer infers WhoHungUp by comparing the BYE source IP to caller/callee IP (inferWhoHungUp), which is an approximation and returns HungUpUnknown when the BYE comes from the switch itself.", - "Call grouping/queue/agent identity: which agent in a call queue answered, ACD/queue wait time, park orbit pickups, intercept/resume, and conference-room membership — all FS application state.", - "Re-INVITE intent 
(hold/resume/codec-change/T.38 fax) is seen as a dialog update but the FS-side reason (channel hold variable, media renegotiation cause) is not on the wire." - ], - "notes": "Key source files read (all absolute repo paths under https://github.com/Otoru/sniffer): internal/platform/otel/span.go (root/child span emission, hierarchy), internal/platform/otel/emitter.go (RunEmitter consumes CDREvent → EmitCDRSpan), internal/platform/otel/provider.go (stock OTel SDK, no trace_id derivation), internal/platform/catalog/spans.go (span op names + attribute catalog, `voip.call_id` described as \"Unique call identifier (trace_id)\"), internal/domain/calltable/associate.go (RTP↔call correlation by SDP endpoint index + NAT aliases + SSRC), internal/domain/calltable/state.go (SIP FSM, SDP endpoint extraction), internal/domain/calltable/types.go (Call/SIPMsg/RTPStreamEntry/MediaEndpoint structs), internal/domain/calltable/table.go (ProcessSIP, sharded CallTable keyed by SIP Call-ID). No ESL/FreeSWITCH code exists: gh code search for freeswitch/ESL/event_socket each returned total_count 0, and no path under the tree contains those terms. The sniffer is purely passive capture + SIP/RTP/RTCP parsing → calltable → CDR → OTel spans (OTLP) + Redis + ClickHouse. Span linkage is standard OTel parent/child via the root span's context (tracer.Start with the root ctx), not span Links; there is exactly one root per call and one trace per call. The `voip.call_id` attribute is the SIP Call-ID used as the human-facing trace lookup key; the actual OTel trace_id is SDK-random and is not propagated across sensors — cross-sensor call correlation is by SIP Call-ID via Redis (ExternalRTPStats merged into the CDREvent before emission), not by shared trace context." 
- }, - "freeswitch": { - "events": [ - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Channel-Name", - "Channel-State", - "Channel-Call-State", - "Call-Direction", - "Caller-Username", - "Caller-Dialplan", - "Caller-Caller-ID-Name", - "Caller-Caller-ID-Number", - "Caller-Destination-Number", - "Caller-Network-Addr", - "Caller-ANI", - "Caller-Context", - "Caller-Unique-ID", - "Answer-State=ringing" - ], - "name": "CHANNEL_CREATE", - "routing_relevance": "Fired by switch_core_state_machine.c at CS_INIT when a session is created. This is the birth-of-call event for both inbound (new INVITE) and outbound (originate) legs and is the first opportunity to register a channel handler keyed by Unique-ID. Call-Direction tells you a-leg vs b-leg origin; Caller-Destination-Number + Caller-Context drive dialplan routing decisions. Unique-ID here is the per-leg session UUID you must capture to correlate every subsequent event on this leg." - }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Channel-State=CS_RINGING", - "Channel-Call-State=CCS_RINGING", - "Answer-State=ringing", - "Caller-*", - "Other-Leg-* (if originated)" - ], - "name": "CHANNEL_PROGRESS", - "routing_relevance": "Fired by switch_channel.c when a channel enters ringing (SIP 180 Ringing). Marks the start of alerting on a leg. For tracing, this is the 'ringing started' timestamp; for routing it signals that the leg is being alerted and a ringback may be played to the upstream leg. Paired with CHANNEL_PROGRESS_MEDIA for early-media (183) cases." - }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Channel-State=CS_RINGING", - "Channel-Call-State=CCS_EARLY", - "Answer-State=early", - "Channel-Read-Codec-Name", - "Channel-Write-Codec-Name", - "Caller-*", - "Other-Leg-*" - ], - "name": "CHANNEL_PROGRESS_MEDIA", - "routing_relevance": "Fired on early media (SIP 183 Session Progress with SDP). 
Critical for distinguishing 'ringing without media' from 'ringing with early media' (in-band ringback/announcements from carrier). Drives the upstream leg's ringback generation and is essential for accurate call-quality tracing because media flows before ANSWER. Codec headers here are the first real negotiated codec for the leg." - }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Channel-State=CS_EXECUTE", - "Channel-Call-State=CCS_ACTIVE", - "Answer-State=answered", - "Channel-Read-Codec-Name/Rate", - "Channel-Write-Codec-Name/Rate", - "Caller-*", - "Other-Leg-*" - ], - "name": "CHANNEL_ANSWER", - "routing_relevance": "Fired by switch_channel.c when the leg is answered (SIP 200 OK). This is the call-connected moment: media becomes two-way, billing/CDR timers start, and any post-answer dialplan execution begins. For routing it is the gate for executing on_answer hooks; for tracing it is the canonical answer timestamp and the point where the sniffer should expect bidirectional RTP." - }, - { - "key_fields": [ - "Unique-ID (firing leg)", - "Bridge-A-Unique-ID", - "Bridge-B-Unique-ID", - "Channel-Call-UUID", - "Other-Type (originator/originatee)", - "Other-Leg-Unique-ID", - "Other-Leg-Caller-ID-Number", - "Other-Leg-Destination-Number", - "Caller-*" - ], - "name": "CHANNEL_BRIDGE", - "routing_relevance": "Fired by switch_ivr_bridge.c when two legs are bridged (two-party). Bridge-A-Unique-ID is the originating session and Bridge-B-Unique-ID is the peer session UUID. This is THE event for a-leg/b-leg correlation: it links the two Unique-IDs and also sets the peer channel's call_uuid to the originator's UUID, so both legs subsequently share the same Channel-Call-UUID. Other-Leg-* (from the originator/originatee caller profile) gives the partner leg's identity for transfer/intercept logic." 
- }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Bridge-A-Unique-ID (context dependent)", - "Other-Leg-Unique-ID", - "Other-Type", - "Hangup-Cause (sometimes)" - ], - "name": "CHANNEL_UNBRIDGE", - "routing_relevance": "Fired by switch_ivr_bridge.c when a bridge is torn down (either leg hangs up, transfer, or application break). Marks the end of two-party media flow. Essential for tracing bridge duration and for detecting mid-call transfers (unbridge followed by a new bridge on the surviving leg) vs hangup. Pair with CHANNEL_BRIDGE to compute talk time." - }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Hangup-Cause", - "Answer-State=hangup", - "Channel-State=CS_HANGUP", - "Caller-*", - "Other-Leg-*" - ], - "name": "CHANNEL_HANGUP", - "routing_relevance": "Fired by switch_channel.c when a channel enters hangup. Carries the normalized Hangup-Cause string (e.g. NORMAL_CLEARING, USER_BUSY, NO_ANSWER, ORIGINATOR_CANCEL). This is the primary event for call disposition classification and for tearing down per-channel handlers. Fires per-leg, so for a two-leg call you get two CHANNEL_HANGUP events keyed by each Unique-ID." - }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Hangup-Cause", - "hangup_cause_q850 (variable)", - "Channel-Name", - "Caller-*", - "Other-Leg-*", - "variable_* (full channel variables when verbose)", - "CDR-Attached=xml (optional, body=XML CDR)" - ], - "name": "CHANNEL_HANGUP_COMPLETE", - "routing_relevance": "Fired by switch_core_state_machine.c at CS_HANGUP_COMPLETE after all cleanup/hooks/CDR generation. This is the definitive end-of-call event and the richest one: it includes the Q.850 cause code and (optionally, when hangup_complete_with_xml=true) the full XML CDR in the event body. Use this as the commit point for CDR/trace closure because all channel variables, times, and cause codes are final. The single best event for closing a call trace." 
- }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Direction=RECV", - "Bridged-To (partner uuid)", - "Caller-Caller-ID-Name", - "Caller-Caller-ID-Number", - "Caller-Orig-Caller-ID-Name/Number", - "Caller-Transfer-Source" - ], - "name": "CALL_UPDATE", - "routing_relevance": "Fired by switch_channel.c when caller ID / connected line is flipped (SIP UPDATE/re-INVITE changing CID, e.g. on transfer or redirect). Carries Bridged-To (the partner UUID from switch_channel_get_partner_uuid) and the pre/post CID. Critical for tracing caller-ID mutations and transfer mid-call: a CALL_UPDATE followed by a CHANNEL_UNBRIDGE+CHANNEL_BRIDGE on new UUIDs indicates an attended/blind transfer." - }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Channel-State=CS_PARK", - "Channel-Call-State", - "Caller-*" - ], - "name": "CHANNEL_PARK", - "routing_relevance": "Fired by switch_ivr.c when the 'park' application parks a channel. Marks a leg as held in the park subsystem awaiting a bridge (e.g. during attended transfer or valet parking). For routing it indicates the leg is no longer in a normal bridge but is waiting; for tracing it explains a gap in media flow." - }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID", - "Channel-State", - "Caller-*" - ], - "name": "CHANNEL_UNPARK", - "routing_relevance": "Fired by switch_ivr.c when a parked channel is unparked (retrieved/bridged). Pairs with CHANNEL_PARK to bound park duration." - }, - { - "key_fields": [ - "Unique-ID", - "Channel-State", - "Channel-State-Number", - "Channel-Call-State" - ], - "name": "CHANNEL_STATE", - "routing_relevance": "Fired on raw FSM state transitions (CS_INIT/CS_ROUTING/CS_EXECUTE/CS_HANGUP etc.). Lower-level than the semantic events (CREATE/ANSWER/HANGUP); useful for tracing exact state-machine progress and detecting stuck channels, but redundant for most routing logic." 
- }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-State (DOWN/EARLY/ACTIVE/RINGING/HOLD/RESET/HANGUP etc.)" - ], - "name": "CHANNEL_CALLSTATE", - "routing_relevance": "Fired on call-state (CCS_*) transitions, which track the logical call lifecycle independent of the FSM state. Useful for tracing the high-level 'is this leg conceptually ringing/active/hold' view." - }, - { - "key_fields": [ - "Unique-ID", - "Channel-Call-UUID" - ], - "name": "CHANNEL_DESTROY", - "routing_relevance": "Fired by switch_core_session.c at session destruction (after HANGUP_COMPLETE). The final cleanup signal; use it to deregister per-channel handlers and free trace buffers keyed by Unique-ID." - }, - { - "key_fields": [ - "Unique-ID", - "DTMF-Digit", - "DTMF-Duration", - "DTMF-Source (RTP/INBAND_AUDIO/ENDPOINT/APP/UNKNOWN)", - "Channel-Call-UUID", - "Caller-*" - ], - "name": "DTMF", - "routing_relevance": "Fired by switch_channel.c (and switch_ivr_async.c for inband detection) on every received/injected digit. DTMF-Source distinguishes RFC2833 RTP telephone-event, inband audio-detected, endpoint-signaled, and app-injected digits. Drives IVR menu routing and digit-collection apps; for tracing it explains mid-call media interaction and is the basis for transfer/feature digit sequences." - }, - { - "key_fields": [ - "Unique-ID", - "channel-read-codec-name", - "channel-read-codec-rate", - "channel-read-codec-bit-rate", - "channel-reported-read-codec-rate", - "Channel-Write-Codec-Name/Rate/Bit-Rate", - "Channel-Call-UUID" - ], - "name": "CODEC", - "routing_relevance": "Fired by switch_core_codec.c whenever the read or write codec changes (renegotiation, transcoder insertion, codec switch). For tracing this is the authoritative codec timeline per leg and is essential for correlating RTP stream identity (payload type/codec) in the sniffer with the ESL call timeline. Mismatched a-leg/b-leg codec events indicate transcoding." 
- }, - { - "key_fields": [ - "Unique-ID", - "Playback-File-Path", - "Playback-File-Type (local_stream/tone_stream/file)", - "Channel-Call-UUID", - "Caller-*" - ], - "name": "PLAYBACK_START", - "routing_relevance": "Fired by switch_ivr_play_say.c when playback begins. Identifies the media being played (file path, tone stream, local stream). For routing it marks IVR announcements/ringback; for tracing it explains one-way media periods (e.g. ringback tone is being generated) so the sniffer does not misclassify them as mute." - }, - { - "key_fields": [ - "Unique-ID", - "Playback-File-Path", - "Playback-File-Type", - "Playback-Status (done/break)", - "Channel-Call-UUID" - ], - "name": "PLAYBACK_STOP", - "routing_relevance": "Fired when playback ends; Playback-Status=break vs done distinguishes user-interrupted playback from natural completion. Bounds the playback media period for tracing." - }, - { - "key_fields": [ - "Unique-ID", - "Record-File-Path", - "Channel-Call-UUID", - "Caller-*" - ], - "name": "RECORD_START", - "routing_relevance": "Fired by switch_ivr_async.c / switch_ivr_play_say.c when recording begins. The file path ties the recording artifact to the call leg; for tracing it marks the start of recorded media (and implies a media bug is tapping the stream)." - }, - { - "key_fields": [ - "Unique-ID", - "Record-File-Path", - "Record-Completion-Cause", - "Channel-Call-UUID" - ], - "name": "RECORD_STOP", - "routing_relevance": "Fired when recording stops; Record-Completion-Cause gives the reason (e.g. silence, timeout, manual stop). Bounds the recording window for trace correlation and CDR." - }, - { - "key_fields": [ - "Unique-ID", - "Event-Subclass=sofia::transferor|sofia::transferee", - "Other-Leg-Unique-ID", - "Channel-Call-UUID", - "sofia_profile_name", - "Caller-*" - ], - "name": "CUSTOM (sofia::transferor / sofia::transferee)", - "routing_relevance": "Fired by mod_sofia (sofia.c) on SIP transfer (REFER/Replaces). 
transferor = the leg initiating transfer; transferee = the leg being transferred. These are the SIP-level transfer correlation events and are critical for distinguishing a real transfer from a hangup: they precede the CHANNEL_UNBRIDGE/CHANNEL_BRIDGE pair and carry the partner UUID." - }, - { - "key_fields": [ - "Unique-ID", - "Event-Subclass=sofia::reinvite|sofia::replaced", - "Channel-Call-UUID", - "sofia_profile_name" - ], - "name": "CUSTOM (sofia::reinvite / sofia::replaced)", - "routing_relevance": "Fired on SIP re-INVITE (codec/hold/resume renegotiation) and Replaces (call substitution). For tracing, reinvite explains media IP/codec changes mid-call; replaced explains one call supplanting another (attended transfer target). Correlate to RTP IP/port changes in the sniffer." - }, - { - "key_fields": [ - "Event-Subclass=callcenter::info", - "CC-Queue", - "CC-Action (member-queue-end/agent-offering/bridge-agent-start/bridge-agent-end/bridge-agent-fail/agent-state-change/members-count etc.)", - "CC-Count", - "CC-Selection", - "CC-Agent", - "CC-Member-UUID", - "Unique-ID (member/agent leg)" - ], - "name": "CUSTOM (callcenter::info)", - "routing_relevance": "Fired by mod_callcenter for queue routing lifecycle: member queued, agent offered, agent bridged to member, bridge end/fail. CC-Member-UUID ties the caller leg to the queue record; CC-Agent identifies the agent leg. This is the authoritative event stream for ACD/contact-center routing traces and for correlating which agent leg answered which member leg." 
- }, - { - "key_fields": [ - "Event-Subclass=conference::maintenance|conference::cdr", - "Conference-Name", - "Conference-Profile", - "Action (add-member/del-member/mute-member/kick-member/transfer/start-talking/stop-talking/play-file etc.)", - "Member-ID", - "Unique-ID (member leg)", - "Old-Member-ID" - ], - "name": "CUSTOM (conference::maintenance / conference::cdr)", - "routing_relevance": "Fired by mod_conference for every member/energy/DTMF/mute/talk event in a conference. Unique-ID + Member-ID tie each ESL leg to its conference slot; Action tracks join/leave/mute/talk/transfer. For routing this is the multi-party bridge correlation (vs two-party CHANNEL_BRIDGE) and is essential for tracing conference calls and conference-originated transfers." - }, - { - "key_fields": [ - "Event-Subclass=valet_parking::info", - "Valet-Lot-Name", - "Valet-Extension", - "Action (bridge/timeout etc.)", - "Bridge-To-UUID", - "Unique-ID" - ], - "name": "CUSTOM (valet_parking::info)", - "routing_relevance": "Fired by mod_valet_parking when a parked caller is bridged to a retrieving party. Bridge-To-UUID links the parked leg to the retriever leg. This is the park/retrieve correlation event for valet-style call parking." - }, - { - "key_fields": [ - "Event-Subclass=sofia::register|sofia::unregister|sofia::expire|sofia::gateway_state", - "from-user", - "from-host", - "to-user", - "contact", - "expires", - "sip_to_host", - "sip-from-host", - "Gateway-Name (for gateway_state)", - "State (for gateway_state)" - ], - "name": "CUSTOM (sofia::register / unregister / expire / gateway_state)", - "routing_relevance": "Registration/trunk-health events, not call-routing per se, but they gate outbound routing: a failed registration or REGED->DOWN gateway_state transition explains why subsequent outbound origination attempts fail. Useful as a pre-condition signal in routing traces." - } - ], - "channel_uuid_fields": "\"Unique-ID: the per-leg session UUID. 
Set on every channel event by switch_channel_event_set_data from switch_core_session_get_uuid(channel->session). This is the primary per-leg key; a two-leg call has two different Unique-IDs. Always present on channel events. Capture this first to key handlers (the Genesis '{uuid}:{event_name}' channel_registry pattern).\\n\\nChannel-Call-UUID: the CALL-level (not leg-level) correlation ID. In switch_channel.c it is taken from the channel variable 'call_uuid' if set, else falls back to the session UUID. At channel creation (switch_core_state_machine.c) call_uuid is initialised to the leg's own session UUID. At bridge time (switch_ivr_bridge.c lines 1446/1555/1684/1877) the PEER channel's call_uuid is overwritten with the ORIGINATING session's UUID. Therefore after a bridge both legs carry the SAME Channel-Call-UUID == the originating (a-leg) session UUID. Use Channel-Call-UUID to group all events of a single logical call across both legs and across transfers; use Unique-ID to address a single leg. Caveat: on a transfer that creates a new b-leg the call_uuid may roll to the new originator, so re-evaluate at each CHANNEL_BRIDGE.\\n\\nvariable_call_uuid: the raw 'call_uuid' channel variable surfaced in the variable_* namespace when verbose/extended event data is enabled. It is the same value as the Channel-Call-UUID header but lives under the channel-variable dump. Useful when you only have the variable payload (e.g. CHANNEL_DATA / CHANNEL_HANGUP_COMPLETE with verbose events).\\n\\nOther-Leg-Unique-ID: produced by switch_caller_profile_event_set_data with the prefix 'Other-Leg' when the channel has an originator_caller_profile (Other-Type=originator) or originatee_caller_profile (Other-Type=originatee). It is the UUID of the bridged partner leg as recorded in the caller profile. Present on bridge-related and hangup-related channel events. 
This is the standard a-leg<->b-leg link on per-leg events (as opposed to Bridge-A/B-Unique-ID which only appears on CHANNEL_BRIDGE).\\n\\nBridge-A-Unique-ID / Bridge-B-Unique-ID: explicit headers added only on CHANNEL_BRIDGE. Bridge-A-Unique-ID = switch_core_session_get_uuid(session) (the firing/originating leg); Bridge-B-Unique-ID = the peer session UUID passed in the bridge message. Use these to authoritatively link the two legs at the bridge instant.\\n\\nBridged-To: header on CALL_UPDATE, value = switch_channel_get_partner_uuid(channel). The current partner leg UUID at the moment of a CID flip / transfer. Use for transfer correlation.\\n\\nCaller-Unique-ID: from the Caller-* caller-profile dump (prefix 'Caller'), equals caller_profile->uuid, which is the channel's own UUID (same value as Unique-ID). Provided for symmetry with Other-Leg-Unique-ID.\\n\\nThere is NO header literally named 'Bridge-Other-Leg' in the upstream source. The b-leg correlation is carried by Other-Leg-Unique-ID (on per-leg events) and Bridge-A/B-Unique-ID (on CHANNEL_BRIDGE). Sniffer correlation: tie RTP streams to ESL legs by matching the SIP Call-ID at CHANNEL_CREATE (mod_sofia puts it in variable_sip_call_id when verbose) to the sniffer's SIP Call-ID, then use Unique-ID for per-leg and Channel-Call-UUID for call-wide grouping.\"", - "notes": "Source of truth: signalwire/freeswitch HEAD. 
Key files read directly: src/switch_event.c (EVENT_NAMES table, lines 138-237 — the canonical event-name list), src/include/switch_types.h (switch_event_types_t enum + doc comments, lines 1985-2090), src/switch_channel.c (switch_channel_event_set_data lines 2659-2755 — the standard channel header set applied to every channel event; CHANNEL_HANGUP 3447, CHANNEL_PROGRESS 3507, CHANNEL_PROGRESS_MEDIA 3562, CHANNEL_ANSWER 3848, CALL_UPDATE 3279; DTMF 678-705), src/switch_caller.c (switch_caller_profile_event_set_data 322-410 — Caller-*/Other-Leg-* field generation), src/switch_ivr_bridge.c (CHANNEL_BRIDGE 1377 with Bridge-A/B-Unique-ID; CHANNEL_UNBRIDGE 1326/1481/1494/1879; call_uuid propagation 1446/1555/1684/1877), src/switch_core_state_machine.c (CHANNEL_CREATE 626; CHANNEL_HANGUP_COMPLETE 943; call_uuid init 180/232/327), src/switch_core_session.c (CHANNEL_DESTROY 1584), src/switch_core_codec.c (CODEC event 189/300/471/531/579), src/switch_ivr_play_say.c (RECORD_START 770, RECORD_STOP 1033, PLAYBACK_START 1649, PLAYBACK_STOP 2023), src/switch_ivr_async.c (inband DTMF 3931, RECORD_* 1241/1482), src/switch_ivr.c (CHANNEL_PARK 1002, CHANNEL_UNPARK 1213), mod_callcenter.c (#define CALLCENTER_EVENT \\\"callcenter::info\\\"; CC-Action values enumerated), mod_conference (CONF_EVENT_MAINT=\\\"conference::maintenance\\\", CONF_EVENT_CDR=\\\"conference::cdr\\\"; Action values enumerated), mod_valet_parking.c (VALET_EVENT=\\\"valet_parking::info\\\"), mod_sofia.h (sofia::* subclass #defines lines 84-110).\\n\\nEvents the prompt asked for that do NOT exist in upstream FreeSWITCH as ESL events (verified by grep of the full src tree):\\n- RING_BACK: there is no SWITCH_EVENT_RING_BACK in the enum, EVENT_NAMES table, or any source file. Ringback is generated by the 'ringback'/'playback' application and surfaces to ESL as CHANNEL_PROGRESS (CS_RINGING) for SIP 180, CHANNEL_PROGRESS_MEDIA for SIP 183 early-media, and PLAYBACK_START/STOP when a ringback file/tone_stream is played. 
Use PLAYBACK_* + CHANNEL_PROGRESS_MEDIA to trace ringback.\\n- 'fire_call': not an ESL event subclass. The only occurrence in the repo is a ChangeLog line ('Fix conference fire-call') referencing the conference auto-outcall feature; it surfaces as conference::maintenance CUSTOM events with Action values (no 'fire-call' Action string exists in mod_conference either). If you meant a specific custom subclass from your sniffer/Genesis layer, it is not part of upstream FreeSWITCH.\\n- 'dial::' and 'transfer::' and 'park::' as CUSTOM subclasses: no source file reserves or fires these. The closest upstream equivalents are: for transfer, the sofia::transferor / sofia::transferee CUSTOM subclasses plus the 'transfer' Action within conference::maintenance; for park, the core CHANNEL_PARK/CHANNEL_UNPARK events plus the valet_parking::info CUSTOM subclass. The 'transfer::intercept' string in your local freeswitch/conf/vanilla/dialplan/default.xml is a bind_meta_app DTMF-meta-app binding argument, not an ESL event subclass.\\n\\nAdditional events worth tracing not in the prompt's minimum list: CHANNEL_ORIGINATE (leg originated via originate command), CHANNEL_EXECUTE / CHANNEL_EXECUTE_COMPLETE (dialplan app execution, carries Application/App-Data and Application-UUID — useful for tracing which app ran on a leg), CHANNEL_UUID (UUID changed mid-call), CHANNEL_HOLD/CHANNEL_UNHOLD, MEDIA_BUG_START/MEDIA_BUG_STOP (recording/tap/RTCP tap installed — correlates to sniffer media-bug taps), CALL_DETAIL (mod_calldetail CDR event when present).\\n\\nStandard channel headers set on EVERY channel event by switch_channel_event_set_data (in addition to event-specific ones): Channel-State, Channel-Call-State, Channel-State-Number, Channel-Name, Unique-ID, Session-External-ID, Call-Direction, Presence-Call-Direction, Channel-HIT-Dialplan, Channel-Presence-ID, Channel-Presence-Data, Presence-Data-Cols (+ PD-* cols), Channel-Call-UUID, Answer-State (ringing|early|answered|hangup), Hangup-Cause 
(when set), Channel-Read/Write-Codec-Name/Rate/Bit-Rate, Caller-* (full caller profile), Other-Type + Other-Leg-* (when an originator/originatee profile exists). When verbose events are enabled globally or per-channel (CF_VERBOSE_EVENTS) or for the event IDs listed in switch_channel_event_set_extended_data (which includes CREATE, ANSWER, BRIDGE, UNBRIDGE, PROGRESS, PROGRESS_MEDIA, HANGUP, HANGUP_COMPLETE, CALL_UPDATE, PLAYBACK_*, RECORD_*, CUSTOM, etc.), the full channel variable dump is also attached as variable_* and scope_variable_* headers — this is where variable_sip_call_id, variable_sip_to_user, variable_hangup_cause_q850, variable_call_uuid etc. come from.\"" - } -} diff --git a/docs/esl-sniffer-traces-plan.md b/docs/esl-sniffer-traces-plan.md deleted file mode 100644 index eff8932..0000000 --- a/docs/esl-sniffer-traces-plan.md +++ /dev/null @@ -1,456 +0,0 @@ -# Plano de Ação: Integração Sniffer + Genesis ESL para Traces Completos de Chamada - -## 1. Resumo executivo - -O objetivo é produzir **traces distribuídos completos de chamada** que unem a **camada de controle** (FreeSWITCH ESL, consumida pela biblioteca Genesis) e a **camada de captura** (sinalização SIP e mídia RTP/RTCP, observada passivamente pelo sniffer Otoru/sniffer), com **informação de roteamento** (dialplan, contexto, destino, bridge legs, transferências, ring groups, balanceador). 
- -Hoje os dois sistemas operam em silos observacionais: -- O **Genesis** emite spans OTel `process_event`, `send_command`, `channel.*` (answer/park/hangup/bridge/playback/say/play_and_get_digits), `channel.create`, `channel.wait`, `channel.dtmf.received`, `inbound_connect`, `outbound_handle_connection`, `ring_group.ring`, `queue.wait_and_acquire` e ~20 métricas, mas **não cobre** o ciclo de vida semântico do canal FreeSWITCH (`CHANNEL_PROGRESS`, `CHANNEL_BRIDGE`, `CHANNEL_UNBRIDGE`, `CALL_UPDATE`, `CODEC`, `PLAYBACK_*`, `RECORD_*`, transferências sofia::transferor/transferee) e **não propaga trace_context** entre ESL e SIP. -- O **sniffer** produz o span raiz `voip.call` com filhos `voip.call.sip.request/response`, `voip.rtp.stream`, `voip.register`, `voip.keepalive`, `voip.fraud.alert`, e ~50 métricas de qualidade/fraude, mas **não tem integração ESL** (0 matches para freeswitch/ESL/event_socket no repositório) e seu `trace_id` é **aleatório**, não derivado do SIP Call-ID nem propagado para o FreeSWITCH. - -A proposta é (todas as mudanças são no Genesis e na configuração do FreeSWITCH — **sem nenhuma alteração no sniffer**): -1. **Adicionar ao Genesis** spans/métricas que cobrem o lifecycle semântico do canal FreeSWITCH e a informação de roteamento (dialplan/contexto/destino/bridge/transfer/ring group/balanceador), anexando `Channel-Call-UUID`, `Other-Leg-Unique-ID`, `Bridge-A/B-Unique-ID`, `Caller-Context`, `Caller-Destination-Number`, `Application`/`Application-Data`, `Hangup-Cause` (Q.850) e, principalmente, **`sip.call_id`** (= `variable_sip_call_id`) como atributo de span em todos os spans de canal. -2. **Correlacionar Genesis ↔ sniffer por atributo compartilhado**, não por propagação de `trace_id`: o sniffer **já** emite `voip.call_id` = SIP Call-ID em seus spans e já correlaciona chamadas por essa chave no Redis. 
Ao colocar o mesmo `sip.call_id` em todos os spans de controle do Genesis, o **join** entre o trace de controle (Genesis) e o trace de captura (sniffer) passa a acontecer **no backend de observabilidade** (Grafana/Tempo) via query por atributo — sem qualquer mudança no sniffer. Métricas correlacionam-se a traces via **exemplars OTel** (o SDK anexa o `trace_id` do span corrente como exemplar ao registrar a métrica). -3. **Correlacionar a-leg/b-leg** dentro do trace do Genesis via `Other-Leg-Unique-ID` / `Bridge-A-Unique-ID` / `Bridge-B-Unique-ID`, agrupando tudo sob `Channel-Call-UUID`. - -## 2. Estado atual - -### 2.1 Genesis — OTel spans (resumo) - -Spans existentes (todos via `tracer.start_as_current_span`): -- `process_event` — `genesis/protocol/base.py:201` — envolve apenas metrics+logging; **dispatch e routing rodam FORA do span**. -- `send_command` — `genesis/protocol/base.py:290` — `command.name` = **string crua do comando** (alta cardinalidade), sem `record_exception` no caminho `-ERR`. -- `channel.create` — `genesis/channel.py:144` — registra `channel.dial_path`, `channel.uuid`, `channel.create.duration`, status ERROR. -- `channel.wait` — `genesis/channel.py:415` — `wait.target`, `wait.timeout`, `wait.type`, `wait.result`, `wait.duration`. -- `channel.answer`, `channel.park`, `channel.hangup`, `channel.bridge`, `channel.playback`, `channel.say`, `channel.play_and_get_digits` — produzidos pelo helper genérico `_execute_operation` em `genesis/channel.py:505`; cada um com `channel.{operation}.success`, `channel.{operation}.duration`. -- `channel.dtmf.received` — `genesis/channel.py:796` — `dtmf.digit`, `dtmf.handled`. -- `inbound_connect` — `genesis/inbound.py:97` — `net.peer.name/port`. -- `outbound_handle_connection` — `genesis/outbound.py:156`. -- `ring_group.ring` — `genesis/group/ring.py:138` — `ring_group.mode/size/timeout/result/answered_uuid/answered_dial_path`, mas **não chama `set_status(ERROR)`** no caminho de exceção. 
-- `queue.wait_and_acquire` — `genesis/queue/core.py:76`. - -Gaps críticos (do mapeamento): -- **Sem spans em `genesis/session.py`** (sendmsg, lifecycle da Session não instrumentados). -- **Sem spans em `genesis/consumer.py`**. -- **Sem spans para handler dispatch / `routing_strategy.route()` / loop `consume()` / loop `handler()`**. -- **Sem `span.add_event()` em todo o genesis/** — zero span events registrados. -- `process_event` **não carrega** `Call-Direction`, `Hangup-Cause`, `Answer-State`, `Channel-State`, `Event-Subclass` (aparecem só em métricas via `build_metric_attributes`). -- `send_command` não registra erro de span no reply `-ERR`. -- Duplicação de definições de métricas entre `genesis/protocol/metrics.py` e `genesis/channel.py` ("to avoid circular imports"). -- `call_duration_histogram.record` em `channel.py:573` **sem atributos** (sem UUID/cause). - -### 2.2 Genesis — OTel metrics (resumo) - -20 instrumentos: 13 counters, 5 histograms, 2 up_down_counters, 0 observable/gauge. -Relevantes: `genesis.commands.sent/duration/errors`, `genesis.events.received`, `genesis.channel.operations`, `genesis.channel.operation.duration`, `genesis.channel.hangup.causes`, `genesis.channel.bridge.operations`, `genesis.channel.dtmf.received`, `genesis.call.duration`, `genesis.timeouts`, `genesis.channel.routing.hits`, `genesis.channel.routing.fallback`, `genesis.connections.active/errors`, `genesis.ring_group.operations/duration/results`, `genesis.queue.operations/wait_duration`. - -Gaps: -- **Sem gauge de chamadas ativas por estado**, sem gauge de profundidade de fila de comandos/events. -- `genesis.connections.errors` **só existe em `inbound.py`**; Outbound não tem. -- **Sem métricas em `session.py`, `consumer.py`, `group/load_balancer.py`**. -- `call.duration` sem atributos → não particionável por canal/cause. 
- -### 2.3 Sniffer — sinais atuais - -- Span raiz `voip.call` (catálogo `SpanOpCall`) com filhos `voip.call.sip.request`, `voip.call.sip.response` (waterfall por mensagem SIP), `voip.rtp.stream` (por SSRC, lado a/b), `voip.register` + `voip.register.transaction`, `voip.keepalive`, `voip.fraud.alert`. -- Métricas: `voip.calls.total/answered/failed/timeout/muted/one_way_audio/active`, `voip.call.duration_s/mos/jitter_ms/loss_pct/silence_ratio`, `voip.rtp.streams.active`, `voip.keepalives.total/rtt_ms`, `voip.registrations.active`, `sniffer.packets.dropped`, `sniffer.fraud.*`, `sniffer.pipeline.watermark_*`, `process.*`. -- **`trace_id` é aleatório** (SDK `AlwaysSample` ou `TraceIDRatioBased`); `voip.call_id` (SIP Call-ID) é só atributo de lookup humano, **não é o trace_id**. -- **Sem propagador W3C** (`provider.go` não chama `SetTextMapPropagator`, parser SIP não extrai/injeta `traceparent`/`X-Tracespan`). -- **Sem integração ESL**: 0 matches para freeswitch/ESL/event_socket. -- Correlação cross-sensor por **SIP Call-ID via Redis** (`voip:call:{call_id}`, `voip:ep:{ip}|{port}`), não por trace context. -- Service name default `"sauron"` (`OTEL_SERVICE_NAME`). 
- -### 2.4 Sobreposição e divergência - -| Dimensão | Genesis (ESL) | Sniffer (SIP/RTP) | Convergência | -|---|---|---|---| -| Identidade de chamada | `Unique-ID` (per-leg), `Channel-Call-UUID` (call-wide) | `voip.call_id` = SIP `Call-ID` | `variable_sip_call_id` no evento ESL liga os dois | -| Answer | `CHANNEL_ANSWER` → `channel.answer` span (Reply-Text) | SIP 200 OK → `voip.call.sip.response` | Mesmo instante, spans separados | -| Hangup | `channel.hangup` + `Hangup-Cause` | `voip.sip.termination_cause` (inferido por IP do BYE) | Genesis é **autoritativo** para cause/who | -| Bridge | `channel.bridge` (via `api uuid_bridge`/sendmsg) — **sem evento CHANNEL_BRIDGE** | Vê dois SIP dialogs separados, **sem tie a-leg/b-leg** | ESL `Bridge-A/B-Unique-ID` + `Other-Leg-Unique-ID` fecha o gap | -| Transfer | **Não tratado** no Genesis | Vê re-INVITE/REFER sem motivo | `sofia::transferor/transferee` + `CALL_UPDATE.Bridged-To` | -| Codec | Apenas em métricas (`build_metric_attributes` não expõe codec) | `rtp.codec`, `rtp.payload_type` | `CODEC` ESL event fecha o gap | -| Routing/dialplan | `Caller-Context`, `Caller-Destination-Number`, `Application` em `CHANNEL_EXECUTE_COMPLETE` — **não extraídos** | Não visível na camada de pacotes | ESL é a única fonte | -| trace_id | SDK aleatório por trace | SDK aleatório por trace | **Divergem** — proposta: manter traces independentes e correlacionar por atributo `sip.call_id` = `voip.call_id` no backend (seção 4), sem propagar trace context | - -## 3. Proposta de mapeamento ESL → spans/metrics - -Convenção: **ADICIONAR** = novo; **RENOMEAR** = mudar nome/atributos; **MANTER** = sem alteração; **REMOVER** = eliminar. 
- -### 3.1 Spans - -| Evento ESL | Ação | Span (nome, atributos, span events) | Justificativa / fonte ESL | -|---|---|---|---| -| `CHANNEL_CREATE` | **ADICIONAR** | `freeswitch.channel.create` — attrs: `channel.uuid`=Unique-ID, `channel.call_uuid`=Channel-Call-UUID, `channel.direction`=Call-Direction, `channel.name`=Channel-Name, `channel.destination_number`=Caller-Destination-Number, `channel.context`=Caller-Context, `channel.dialplan`=Caller-Dialplan, `channel.caller_id_number`=Caller-Caller-ID-Number, `channel.caller_id_name`=Caller-Caller-ID-Name, `channel.network_addr`=Caller-Network-Addr, `sip.call_id`=variable_sip_call_id (se presente) | `switch_core_state_machine.c:626`; nascimento do leg; único ponto com `Call-Direction` + contexto dialplan; `sip.call_id` é a chave de correlação com o sniffer | -| `CHANNEL_PROGRESS` | **ADICIONAR** | `freeswitch.channel.progress` — attrs: `channel.uuid`, `channel.call_uuid`, `channel.state`=CS_RINGING, `answer.state`=ringing, `other_leg.uuid`=Other-Leg-Unique-ID (se presente); span events: nenhum | `switch_channel.c:3507`; timestamp de alerting | -| `CHANNEL_PROGRESS_MEDIA` | **ADICIONAR** | `freeswitch.channel.progress_media` — attrs: `channel.uuid`, `channel.call_uuid`, `answer.state`=early, `channel.read_codec`=Channel-Read-Codec-Name, `channel.write_codec`=Channel-Write-Codec-Name, `other_leg.uuid` | `switch_channel.c:3562`; early-media (183) — explica RTP antes do ANSWER | -| `CHANNEL_ANSWER` | **ADICIONAR** (span semântico; **MANTER** `channel.answer` que envolve o comando `answer`) | `freeswitch.channel.answer` — attrs: `channel.uuid`, `channel.call_uuid`, `channel.state`=CS_EXECUTE, `answer.state`=answered, `channel.read_codec`, `channel.write_codec`, `other_leg.uuid` | `switch_channel.c:3848`; instante autoritativo de answer | -| `CHANNEL_BRIDGE` | **ADICIONAR** | `freeswitch.channel.bridge` — attrs: `channel.uuid`=Unique-ID (firing leg), `bridge.a_uuid`=Bridge-A-Unique-ID, `bridge.b_uuid`=Bridge-B-Unique-ID, 
`channel.call_uuid`, `other_leg.uuid`=Other-Leg-Unique-ID, `other_leg.type`=Other-Type, `other_leg.destination_number`=Other-Leg-Destination-Number, `other_leg.caller_id_number`=Other-Leg-Caller-ID-Number; span events: `bridge.established` | `switch_ivr_bridge.c:1377`; **correlação autoritativa a-leg/b-leg** | -| `CHANNEL_UNBRIDGE` | **ADICIONAR** | `freeswitch.channel.unbridge` — attrs: `channel.uuid`, `bridge.a_uuid`, `bridge.b_uuid`/`other_leg.uuid`, `channel.call_uuid`, `hangup.cause` (se presente); span events: `bridge.torn_down` | `switch_ivr_bridge.c:1326/1481/1494/1879`; bounds talk time; detecta transfer (unbridge→bridge novo) | -| `CHANNEL_HANGUP` | **ADICIONAR** | `freeswitch.channel.hangup` — attrs: `channel.uuid`, `channel.call_uuid`, `hangup.cause`=Hangup-Cause, `answer.state`=hangup, `channel.state`=CS_HANGUP, `other_leg.uuid`; span events: `hangup.cause.{cause}` | `switch_channel.c:3447`; causa normalizada por leg | -| `CHANNEL_HANGUP_COMPLETE` | **ADICIONAR** | `freeswitch.channel.hangup_complete` — attrs: `channel.uuid`, `channel.call_uuid`, `hangup.cause`, `hangup.cause.q850`=variable_hangup_cause_q850, `channel.name`, `sip.call_id`=variable_sip_call_id, `cdr.xml`=`[verificar]` se `CDR-Attached=xml`; span events: `call.finalized` | `switch_core_state_machine.c:943`; commit point do CDR/trace | -| `CHANNEL_DESTROY` | **ADICIONAR** | `freeswitch.channel.destroy` — attrs: `channel.uuid`, `channel.call_uuid` | `switch_core_session.c:1584`; sinal de desregistro de handler | -| `CHANNEL_EXECUTE` | **ADICIONAR** | `freeswitch.channel.execute` — attrs: `channel.uuid`, `channel.call_uuid`, `application.name`=Application, `application.uuid`=Application-UUID; span events: nenhum | Dialplan app start | -| `CHANNEL_EXECUTE_COMPLETE` | **ADICIONAR** | `freeswitch.channel.execute_complete` — attrs: `channel.uuid`, `channel.call_uuid`, `application.name`=Application, `application.uuid`=Application-UUID, `application.response`=Application-Response; span events: 
`app.{name}.done` | Correlaciona com `Session._awaitable_complete_command` | -| `CHANNEL_PARK` / `CHANNEL_UNPARK` | **ADICIONAR** | `freeswitch.channel.park` / `freeswitch.channel.unpark` — attrs: `channel.uuid`, `channel.call_uuid`, `channel.state` | `switch_ivr.c:1002/1213` | -| `CALL_UPDATE` | **ADICIONAR** | `freeswitch.call.update` — attrs: `channel.uuid`, `channel.call_uuid`, `bridged.to`=Bridged-To, `caller.transfer_source`=Caller-Transfer-Source, `caller.orig_caller_id_number`=Caller-Orig-Caller-ID-Number; span events: `caller_id.mutated` | `switch_channel.c:3279`; detecta transfer mid-call | -| `CODEC` | **ADICIONAR** | `freeswitch.channel.codec` — attrs: `channel.uuid`, `channel.call_uuid`, `channel.read_codec.name/rate`, `channel.write_codec.name/rate`, `channel.reported_read_codec_rate` | `switch_core_codec.c:189/300/471/531/579`; timeline de codec por leg | -| `PLAYBACK_START` / `PLAYBACK_STOP` | **ADICIONAR** | `freeswitch.channel.playback.start/stop` — attrs: `channel.uuid`, `channel.call_uuid`, `playback.file_path`=Playback-File-Path, `playback.file_type`=Playback-File-Type, `playback.status`=Playback-Status (no stop); span events: nenhum | `switch_ivr_play_say.c:1649/2023`; explica mídia one-way (ringback) | -| `RECORD_START` / `RECORD_STOP` | **ADICIONAR** | `freeswitch.channel.record.start/stop` — attrs: `channel.uuid`, `channel.call_uuid`, `record.file_path`=Record-File-Path, `record.completion_cause`=Record-Completion-Cause (no stop) | `switch_ivr_async.c:1241/1482`, `switch_ivr_play_say.c:770/1033` | -| `CUSTOM sofia::transferor` / `sofia::transferee` | **ADICIONAR** | `freeswitch.sofia.transfer` — attrs: `channel.uuid`, `channel.call_uuid`, `transfer.role`=transferor\|transferee, `other_leg.uuid`=Other-Leg-Unique-ID, `sofia.profile`=sofia_profile_name; span events: `transfer.initiated` | `mod_sofia.h:84-110`; distingue transfer de hangup | -| `CUSTOM sofia::reinvite` / `sofia::replaced` | **ADICIONAR** | `freeswitch.sofia.reinvite` / 
`freeswitch.sofia.replaced` — attrs: `channel.uuid`, `channel.call_uuid`, `sofia.profile`; span events: `media.renegotiated` | Correlaciona com mudança de IP/codec no RTP | -| `CUSTOM callcenter::info` | **ADICIONAR** | `freeswitch.callcenter.info` — attrs: `cc.queue`=CC-Queue, `cc.action`=CC-Action, `cc.agent`=CC-Agent, `cc.member_uuid`=CC-Member-UUID, `cc.count`=CC-Count, `cc.selection`=CC-Selection, `channel.uuid`=Unique-ID | ACD routing | -| `CUSTOM conference::maintenance` / `conference::cdr` | **ADICIONAR** | `freeswitch.conference.maintenance` / `freeswitch.conference.cdr` — attrs: `conference.name`, `conference.profile`, `conference.action`=Action, `conference.member_id`=Member-ID, `channel.uuid`, `old.member_id`=Old-Member-ID | Multi-party bridge | -| `CUSTOM valet_parking::info` | **ADICIONAR** | `freeswitch.valet.info` — attrs: `valet.lot`=Valet-Lot-Name, `valet.extension`=Valet-Extension, `valet.action`=Action, `bridge.to_uuid`=Bridge-To-UUID, `channel.uuid` | Park/retrieve | -| `CUSTOM sofia::register/unregister/expire/gateway_state` | **ADICIONAR** | `freeswitch.sofia.register` — attrs: `register.aor`=from-user@from-host, `register.contact_ip`=contact, `register.expires_s`=expires, `register.response_code`, `register.reason`, `gateway.name`=Gateway-Name, `gateway.state`=State | Pre-condição de outbound routing | -| `process_event` (span existente) | **RENOMEAR/MELHORAR** | **MANTER** nome `process_event`, mas **ADICIONAR** attrs: `event.direction`=Call-Direction, `event.channel_state`=Channel-State, `event.answer_state`=Answer-State, `event.hangup_cause`=Hangup-Cause, `event.subclass`=Event-Subclass, `event.call_uuid`=Channel-Call-UUID, `event.other_leg`=Other-Leg-Unique-ID, `sip.call_id`=variable_sip_call_id; **ADICIONAR** span events para bridge/transfer/hangup_reason quando aplicável | Fecha o gap de atributos de routing no span de processo | -| `send_command` (span existente) | **RENOMEAR** `command.name` de string crua → verbo do comando (parse 
first token); **ADICIONAR** `command.error`=`-ERR` detection + `span.set_status(ERROR)` + `record_exception` no reply `-ERR` | Alta cardinalidade hoje; sem erro de span | -| `channel.bridge` (existente em `channel.py`) | **MANTER**, mas **ADICIONAR** span event `bridge.esl_event` quando `CHANNEL_BRIDGE` chega, linkando `bridge.a_uuid`/`bridge.b_uuid` | Span do comando vs span do evento são complementares | -| `channel.hangup` (existente) | **MANTER**, **ADICIONAR** attr `hangup.cause.q850` (via `variable_hangup_cause_q850`) e span event `hangup.authoritative` quando `CHANNEL_HANGUP_COMPLETE` chega | Fecha gap de Q.850 | -| `ring_group.ring` (existente) | **MANTER** attrs atuais, **ADICIONAR** `ring_group.balancer_backend` (nome do backend, NÃO UUID), `ring_group.selected_dial_path`, `ring_group.context`; **CORRIGIR** chamar `span.set_status(StatusCode.ERROR)` no caminho de exceção (gap do mapeamento) | Routing info de ring group | -| `queue.wait_and_acquire` (existente) | **MANTER**, **ADICIONAR** `queue.depth` como atributo de span (NÃO de métrica) | Profundidade só como span attr evita cardinalidade | - -### 3.2 Metrics - -| Sinal | Ação | Nome / tipo / attrs | Justificativa | -|---|---|---|---| -| Chamadas ativas por estado | **ADICIONAR** | `genesis.calls.active` (UpDownCounter) attrs: `channel.state` (enum ChannelState), `direction` | Hoje só `connections.active` por tipo in/out | -| Eventos ESL processados por nome | **MANTER** `genesis.events.received` | — | Já cobre | -| Bridge por par de legs | **ADICIONAR** | `genesis.channel.bridge.events` (Counter) attrs: `bridge.result` (established/unbridged), `hangup.cause` (no unbridge) | Hoje `bridge.operations` mede só o comando | -| Transferências | **ADICIONAR** | `genesis.channel.transfers` (Counter) attrs: `transfer.type` (blind/attended), `transfer.role` (transferor/transferee) | Inexistente | -| Codec changes | **ADICIONAR** | `genesis.channel.codec.changes` (Counter) attrs: `channel.read_codec`, 
`channel.write_codec` (NÃO UUID) | Inexistente | -| Dialplan apps executados | **ADICIONAR** | `genesis.dialplan.applications` (Counter) attrs: `application.name` (set/bridge/playback/transfer/park/voicemail/ivr/queue), `application.result` (success/fail) | Routing info | -| Hangup causes por Q.850 | **ADICIONAR** | `genesis.channel.hangup.causes.q850` (Counter) attrs: `hangup.cause.q850` (NÃO UUID) | Hoje `hangup.causes` só tem cause textual | -| Duration de processamento de evento | **ADICIONAR** | `genesis.event.processing.duration` (Histogram) attrs: `event.name` | Gap: sem latência de dispatch | -| `genesis.call.duration` | **RENOMEAR/REPARAR** | **MANTER** nome, **ADICIONAR** attrs `hangup.cause` e `direction`; **NÃO** adicionar `channel.uuid` (cardinalidade) | Hoje gravado sem attrs | -| `genesis.connections.errors` | **ADICIONAR** em `genesis/outbound.py` | Reaproveitar mesmo nome com attrs `type`=outbound, `error`=... | Gap: outbound sem error counter | -| Métricas duplicadas em `channel.py` e `metrics.py` | **REMOVER** duplicação | Centralizar definição em `genesis/protocol/metrics.py` e importar em `channel.py` (resolver circular import via module lazy import ou mover constants) | Hazard de manutenção | -| Observable gauge de queue depth | **ADICIONAR** | `genesis.commands.queue.depth` (ObservableGauge), `genesis.events.queue.depth` (ObservableGauge) | Backpressure não observável | - -**Regra de cardinalidade**: atributos de métrica **NUNCA** carregam UUIDs (`channel.uuid`, `bridge.a_uuid`, `other_leg.uuid`); apenas enums/labels low-cardinality (`channel.state`, `direction`, `hangup.cause`, `application.name`, `transfer.type`). UUIDs vão **só em spans**. - -## 4. Estratégia de correlação de traces (sniffer ↔ Genesis) - -### 4.1 Opções avaliadas - -| Opção | Mecanismo | Veredito | -|---|---|---| -| **A. 
Correlação por atributo `sip.call_id` no backend** | Genesis anexa `sip.call_id` (= `variable_sip_call_id`) a todos os spans de canal e ao `process_event`; sniffer já emite `voip.call_id` = SIP Call-ID. Join em Grafana/Tempo por query de atributo. Métricas → traces via exemplars OTel. | **RECOMENDADA / ESCOPO DESTE PR** — zero mudança no sniffer; usa chaves que o sniffer já produz; funciona mesmo com trace_ids independentes | -| B. SIP Call-ID **como** trace_id | Usar `variable_sip_call_id` como `trace_id` OTel | Rejeitado: `trace_id` OTel é 128-bit hex; Call-ID é string arbitrária; quebra semântica OTel e o SDK não aceita | - -### 4.2 Modelo de traces (independentes, correlacionados por atributo) - -Genesis e sniffer continuam emitindo **traces OTel independentes** (cada um com seu próprio `trace_id`). A correlação é **lógica**, por `sip.call_id`: - -``` -Trace Genesis (service=genesis) — root: freeswitch.channel.create - freeswitch.channel.progress attrs: sip.call_id, channel.call_uuid - freeswitch.channel.answer attrs: sip.call_id, channel.call_uuid - freeswitch.channel.bridge attrs: sip.call_id, bridge.a_uuid, bridge.b_uuid - freeswitch.channel.execute attrs: sip.call_id, application.name - freeswitch.channel.unbridge attrs: sip.call_id - freeswitch.channel.hangup attrs: sip.call_id, hangup.cause - freeswitch.channel.hangup_complete attrs: sip.call_id, hangup.cause.q850 - -Trace sniffer (service=sniffer) — root: voip.call attrs: voip.call_id (= mesmo SIP Call-ID) - voip.call.sip.request attrs: voip.call_id - voip.call.sip.response attrs: voip.call_id - voip.rtp.stream (a) attrs: voip.call_id - voip.rtp.stream (b) attrs: voip.call_id -``` - -**Join no Grafana/Tempo**: `trace.span.attrs["sip.call_id"] == trace.span.attrs["voip.call_id"]` — uma query por atributo retorna os dois traces; o usuário navega entre eles. Não há parentesco OTel direto (intencional: o sniffer não conhece o trace_id do Genesis). 
- -### 4.3 Hierarquia **dentro** do trace Genesis - -- **Trace raiz lógico = chamada**, identificado por `Channel-Call-UUID` (call-wide). O span `freeswitch.channel.create` do leg originador (`Call-Direction=inbound` ou originador do `originate`) é o root. -- **Spans de controle/dialplan** (`execute`, `execute_complete`, `codec`, `playback.*`, `transfer`) são filhos diretos do root via context OTel propagado pelo `Protocol`/`Channel`. -- **`sip.call_id` e `channel.call_uuid`** são atributos em **todos** os spans de canal — garantem o join com o sniffer e o agrupamento a-leg/b-leg. - -### 4.4 Amarrando a-leg/b-leg (dentro do Genesis) - -- No `CHANNEL_BRIDGE`, o Genesis lê `Bridge-A-Unique-ID` (originador) e `Bridge-B-Unique-ID` (peer). Após o bridge, ambos os legs compartilham o mesmo `Channel-Call-UUID` (`switch_ivr_bridge.c:1446/1555/1684/1877`). -- O span `freeswitch.channel.bridge` carrega `bridge.a_uuid` e `bridge.b_uuid` como **atributos de span** e emite span event `bridge.established` com ambos os UUIDs. -- Cada leg é um dialog SIP distinto com SIP Call-ID próprio — o join cross-leg **não** é por `sip.call_id`, e sim por `channel.call_uuid` (comum aos dois legs após bridge) dentro do trace Genesis, e por `bridge.a_uuid`/`bridge.b_uuid` para cruzar com os traces sniffer de cada dialog. Fluxo: do `sip.call_id` de um leg → abre trace Genesis → lê `bridge.b_uuid` → busca o `sip.call_id`/`voip.call_id` do outro leg. -- Em transferências, `CALL_UPDATE.Bridged-To` + `sofia::transferor/transferee` indicam o novo leg; o Genesis inicia novo span root com **span link** (`Links`) para o trace anterior (não parent, pois é outra chamada lógica). -- **Caveat**: em transfer que cria novo b-leg, o `call_uuid` pode rolar para o novo originador — reavaliar `Channel-Call-UUID` a cada `CHANNEL_BRIDGE` e, se mudar, iniciar novo span root com link para o anterior. - -## 5. 
Informação de roteamento a anexar - -| Informação de routing | Campo ESL fonte | Span/atributo destino | -|---|---|---| -| Contexto dialplan | `Caller-Context` | `freeswitch.channel.create` → `channel.context`; `process_event` → `event.context` | -| Destination number | `Caller-Destination-Number` | `freeswitch.channel.create` → `channel.destination_number` | -| Dialplan | `Caller-Dialplan` | `freeswitch.channel.create` → `channel.dialplan` | -| Direção (a/b leg, inbound/outbound) | `Call-Direction` | `freeswitch.channel.create` → `channel.direction`; `process_event` → `event.direction` | -| Aplicação dialplan executada | `Application` (em `CHANNEL_EXECUTE`/`CHANNEL_EXECUTE_COMPLETE`) | `freeswitch.channel.execute` → `application.name` | -| Argumentos da aplicação | `Application-Data` `[verificar se presente no payload ESL do Genesis]` | `freeswitch.channel.execute` → `application.data` | -| Bridge a-leg/b-leg | `Bridge-A-Unique-ID`, `Bridge-B-Unique-ID` (`CHANNEL_BRIDGE`) | `freeswitch.channel.bridge` → `bridge.a_uuid`, `bridge.b_uuid` | -| Other-Leg (correlação per-leg) | `Other-Leg-Unique-ID`, `Other-Type` | todos os spans de evento de canal → `other_leg.uuid`, `other_leg.type` | -| Transfer (role + partner) | `Event-Subclass` sofia::transferor/transferee, `Other-Leg-Unique-ID` | `freeswitch.sofia.transfer` → `transfer.role`, `other_leg.uuid` | -| Transfer source | `Caller-Transfer-Source` | `freeswitch.call.update` → `caller.transfer_source` | -| Ring group mode/destinations | args de `RingGroup.ring` (`mode`, `destinations`, `timeout`) | `ring_group.ring` (já existe) → adicionar `ring_group.context`, `ring_group.selected_dial_path` | -| Load balancer backend escolhido | `LoadBalancerBackend` em `genesis/group/load_balancer.py` `[verificar nome do método select]` | `ring_group.ring` → `ring_group.balancer_backend` (nome/label, NÃO UUID) | -| Queue/ACD | `CC-Queue`, `CC-Agent`, `CC-Action`, `CC-Member-UUID` | `freeswitch.callcenter.info` | -| Conference | 
`Conference-Name`, `Action`, `Member-ID` | `freeswitch.conference.maintenance` | -| Hangup cause (texto) | `Hangup-Cause` | `freeswitch.channel.hangup` → `hangup.cause` | -| Hangup cause Q.850 | `variable_hangup_cause_q850` | `freeswitch.channel.hangup_complete` → `hangup.cause.q850` | -| Codec negociado | `Channel-Read-Codec-Name`, `Channel-Write-Codec-Name`; `CODEC` event | `freeswitch.channel.codec` → `channel.read_codec.name`, `channel.write_codec.name` | -| SIP Call-ID (correlação com sniffer — chave primária) | `variable_sip_call_id` | todos os spans de canal + `process_event` → `sip.call_id` (join com `voip.call_id` do sniffer no backend) | -| traceparent (OPCIONAL/futuro — exige sniffer) | `variable_sip_h_X_Tracespan` | **Fora do escopo deste PR**: exigiria o sniffer consumir o header. A correlação real é por `sip.call_id` (linha acima) | - -## 6. Mudanças concretas no Genesis (por arquivo) - -### `genesis/protocol/metrics.py` -- **ADICIONAR** instrumentos: `genesis.calls.active` (UpDownCounter, attrs `channel.state`, `direction`), `genesis.channel.bridge.events` (Counter, attrs `bridge.result`, `hangup.cause`), `genesis.channel.transfers` (Counter, attrs `transfer.type`, `transfer.role`), `genesis.channel.codec.changes` (Counter, attrs `channel.read_codec`, `channel.write_codec`), `genesis.dialplan.applications` (Counter, attrs `application.name`, `application.result`), `genesis.channel.hangup.causes.q850` (Counter, attrs `hangup.cause.q850`), `genesis.event.processing.duration` (Histogram, attrs `event.name`), `genesis.commands.queue.depth` (ObservableGauge), `genesis.events.queue.depth` (ObservableGauge). -- **REMOVER** definições duplicadas que também existem em `channel.py` (resolver circular import movendo os `meter.create_*` para cá e importando os objetos prontos em `channel.py`). -- **MANTER** `genesis.events.received`, `genesis.commands.*`, `genesis.channel.routing.*`, `genesis.connections.active`. 
- -### `genesis/protocol/base.py` -- **MANTER** span `process_event` (linha 201); **ADICIONAR** atributos `event.direction`, `event.channel_state`, `event.answer_state`, `event.hangup_cause`, `event.subclass`, `event.call_uuid`, `event.other_leg`, `sip.call_id` via extensão de `build_event_attributes` (`sip.call_id` é a chave de correlação com o sniffer). -- **ESTENDER** `process_event` para envolver **também** o dispatch (`dispatch_to_handlers`) e `routing_strategy.route()` — mover o `with` para fora do bloco metrics+logging. Alternativa: **ADICIONAR** span `dispatch_handlers` aninhado. -- **ADICIONAR** span `route_event` envolvendo `self.routing_strategy.route(event)` em `_process_one_event`. -- **RENOMEAR** atributo `command.name` do span `send_command` (linha 291) de string crua para verbo parseado (primeiro token, ex. `api`, `sendmsg`, `event`, `filter`); **ADICIONAR** `command.args` com o restante (truncado a 200 chars) se necessário para debug. -- **ADICIONAR** no caminho `-ERR` de `_execute_send` (linha 302): `span.set_status(StatusCode.ERROR, Reply-Text)`, `span.record_exception(Exception(Reply-Text))`, atributo `command.error=protocol_error`. -- **ADICIONAR** span `consume_loop`/`handler_loop` (opcional, baixa cardinalidade) envolvendo o corpo de `consume()` e `handler()`. -- **ADICIONAR** ObservableGauge callbacks para `self.commands.qsize()` e `self.events.qsize()` (usar `asyncio` safe snapshot ou pular se non-async-safe). - -### `genesis/protocol/telemetry.py` -- **ESTENDER** `build_event_attributes` (linha 15-40) para incluir: `event.direction` (Call-Direction), `event.channel_state` (Channel-State), `event.answer_state` (Answer-State), `event.hangup_cause` (Hangup-Cause), `event.subclass` (Event-Subclass), `event.call_uuid` (Channel-Call-UUID), `event.other_leg` (Other-Leg-Unique-ID), `sip.call_id` (variable_sip_call_id) — chave de correlação com o sniffer, presente em todos os spans de canal. -- **MANTER** `build_metric_attributes` e `log_event`. 
- -### `genesis/protocol/processors.py` -- **ADICIONAR** novo event processor `channel_lifecycle_processor` (ou um por evento semântico) que: - - Detecta `CHANNEL_CREATE/PROGRESS/PROGRESS_MEDIA/ANSWER/BRIDGE/UNBRIDGE/HANGUP/HANGUP_COMPLETE/DESTROY/EXECUTE/EXECUTE_COMPLETE/PARK/UNPARK/CALL_UPDATE/CODEC/PLAYBACK_START/PLAYBACK_STOP/RECORD_START/RECORD_STOP`. - - Extrai `Channel-Call-UUID`, `Unique-ID`, `Other-Leg-Unique-ID`, `Bridge-A/B-Unique-ID`, `Application`, `Application-Data` `[verificar]`, `Hangup-Cause`, `variable_hangup_cause_q850`, `variable_sip_call_id` (chave de correlação com o sniffer). - - Dispara a criação do span semântico correspondente (via um novo `EventSpanEmitter` injetado no Protocol) e incrementa as métricas new. -- **MANTER** `auth_request_processor`, `command_reply_processor`, `api_response_processor`, `disconnect_processor`. -- **ADICIONAR** processador `sofia_custom_processor` para subclasses `sofia::transferor/transferee/reinvite/replaced/register/unregister/expire/gateway_state` e `callcenter::info`, `conference::maintenance/cdr`, `valet_parking::info`. - -### `genesis/protocol/routing/{base,channel,composite,global_}.py` -- **MANTER** lógica de routing; **ADICIONAR** span event `routing.hit` no `ChannelRoutingStrategy.route` (linha 55) e `routing.fallback` no `GlobalRoutingStrategy.route` (linha 50), ambos no span `route_event` corrente (se ativo). - -### `genesis/channel.py` -- **MANTER** spans `channel.create/wait/answer/park/hangup/bridge/playback/say/play_and_get_digits/dtmf.received` e o helper `_execute_operation` (linha 494-537). -- **ADICIONAR** em `channel.create` (linha 144): registrar attrs `sip.call_id` (lido do evento/variável) e `channel.call_uuid` no span — chaves de correlação com o sniffer e de agrupamento a-leg/b-leg. -- **ADICIONAR** em `channel.bridge` (linha 632-644): attrs `bridge.a_uuid`, `bridge.b_uuid`, `other_leg.uuid`; span event `bridge.esl_event` quando `CHANNEL_BRIDGE` é recebido e correlacionado. 
-- **ADICIONAR** em `channel.hangup` (linha 588-600): attr `hangup.cause.q850` lendo `variable_hangup_cause_q850` do contexto; span event `hangup.authoritative` em `CHANNEL_HANGUP_COMPLETE`. -- **ADICIONAR** em `_state_handler`: registrar `channel.state` transitions como span events no span `channel.wait` ativo (se houver). -- **REMOVER** as 7 re-definições duplicadas de métricas (linhas 32-68) — importar de `genesis/protocol/metrics.py`. -- **REPARAR** `call_duration_histogram.record` (linha 573) para gravar com attrs `hangup.cause` e `direction` (NÃO `channel.uuid`). - -### `genesis/session.py` -- **ADICIONAR** tracer a nível de módulo (`trace.get_tracer(__name__)`). -- **ADICIONAR** span `session.sendmsg` envolvendo `Session.sendmsg` (attrs: `channel.uuid`, `application.name`, `application.uuid`=Event-UUID, `application.block`). -- **ADICIONAR** span `session.start` / `session.stop` para o lifecycle. -- **ADICIONAR** span `session.await_complete` em `_awaitable_complete_command` (attrs: `channel.uuid`, `application.uuid`, `event.name`=CHANNEL_EXECUTE_COMPLETE/CHANNEL_HANGUP_COMPLETE, `wait.duration`). -- **ADICIONAR** métricas `genesis.session.commands` (Counter, attrs `application.name`), `genesis.session.command.duration` (Histogram). - -### `genesis/consumer.py` -- **ADICIONAR** tracer a nível de módulo. -- **ADICIONAR** span `consumer.start` / `consumer.stop` (attrs: `consumer.host`, `consumer.port`). -- **ADICIONAR** span `consumer.dispatch` envolvendo a invocação de handlers registrados via `@consumer.handle`. -- **ADICIONAR** métrica `genesis.consumer.handlers` (Counter, attrs `event.name`, `handler.matched`). - -### `genesis/inbound.py` -- **MANTER** `inbound_connect` (linha 97) e `genesis.connections.active/errors`. -- **ADICIONAR** `record_exception` + `set_status(ERROR)` no span `inbound_connect` em falha de connect/timeout. - -### `genesis/outbound.py` -- **MANTER** `outbound_handle_connection` (linha 156) e `genesis.connections.active`. 
-- **ADICIONAR** contador `genesis.connections.errors` (attrs `type=outbound`, `error=...`) — gap do mapeamento. - -### `genesis/group/ring.py` -- **MANTER** `ring_group.ring` (linha 138); **CORRIGIR** chamar `span.set_status(StatusCode.ERROR, str(e))` no caminho de exceção (linha 202-205). -- **ADICIONAR** attrs `ring_group.balancer_backend` (label, NÃO UUID), `ring_group.selected_dial_path`, `ring_group.context`. -- **ADICIONAR** span event `ring_group.leg_answered` com `answered_uuid` quando `result=answered`. - -### `genesis/group/load_balancer.py` -- **ADICIONAR** métricas `genesis.loadbalancer.selections` (Counter, attrs `balancer.backend`, `balancer.result`), `genesis.loadbalancer.errors` (Counter, attrs `error`). -- `[verificar]` nome do método de seleção no backend (InMemoryLoadBalancer/RedisLoadBalancer). - -### `genesis/queue/core.py` -- **MANTER** `queue.wait_and_acquire` (linha 76); **ADICIONAR** attr `queue.depth` no span. -- **ADICIONAR** `record_exception`/`set_status(ERROR)` em falha de acquire. - -### `genesis/types.py` -- **MANTER** `ChannelState` IntEnum; **ADICIONAR** helper `HangupCause.q850` mapping `[verificar se já existe]`. - -### `genesis/cli/__init__.py` -- **MANTER** instalação do metrics meter provider (linha 78); **ADICIONAR** instalação de `TracerProvider` com `BatchSpanProcessor` (OTLP) — necessário para emitir os novos spans `freeswitch.channel.*`. `TextMapPropagator(TraceContextPropagator())` só é necessário se/when a propagação W3C via `X-Tracespan` for implementada (opcional/futuro). - -## 7. Configuração no FreeSWITCH - -### 7.1 Event Socket (ESL inbound) -- Em `freeswitch/conf/autoload_configs/event_socket.conf.xml`: - - `` (ou IP restrito à rede do Genesis) - - `` - - `` (restringir) -- O Genesis `Inbound` (`genesis/inbound.py`) conecta e autentica via `ClueCon` (padrão). - -### 7.2 Subscrição de eventos -- Genesis já faz `events plain ALL` em `Channel.create` (`genesis/channel.py:144`). **MANTER**. 
-- Para os novos eventos semânticos, garantir que `events plain ALL` cubra: `CHANNEL_PROGRESS`, `CHANNEL_PROGRESS_MEDIA`, `CHANNEL_BRIDGE`, `CHANNEL_UNBRIDGE`, `CALL_UPDATE`, `CODEC`, `PLAYBACK_START`, `PLAYBACK_STOP`, `RECORD_START`, `RECORD_STOP`, `CHANNEL_PARK`, `CHANNEL_UNPARK`, `CHANNEL_EXECUTE`, `CHANNEL_EXECUTE_COMPLETE`, `CHANNEL_DESTROY`. -- Para CUSTOM subclasses, o `Consumer._filter_command` já emite `filter Event-Subclass {X}` para nomes não-uppercase. Garantir subscrição de: `sofia::transferor`, `sofia::transferee`, `sofia::reinvite`, `sofia::replaced`, `sofia::register`, `sofia::unregister`, `sofia::expire`, `sofia::gateway_state`, `callcenter::info`, `conference::maintenance`, `conference::cdr`, `valet_parking::info`. -- Habilitar **verbose events** globais em `freeswitch.conf.xml`: `` ou por canal via `verbose_events=true` channel var — necessário para ter `variable_sip_call_id` (chave de correlação com o sniffer) e `variable_hangup_cause_q850`. - -### 7.3 Módulos relevantes -- `mod_sofia` (SIP) — obrigatório. -- `mod_event_socket` — obrigatório (ESL). -- `mod_callcenter` (se ACD), `mod_conference` (se conferência), `mod_valet_parking` (se valet) — opcionais conforme deploy. -- `mod_otel` `[verificar]` — existe um módulo comunitário mod_otel; se presente, pode complementar, mas **não é necessário** para esta proposta (tudo via ESL + Genesis). - -## 8. Correlação no backend de observabilidade (sem mudanças no sniffer) - -**Diretriz**: o sniffer **não é modificado**. Toda a correlação acontece por atributos compartilhados, no backend. 
- -### 8.1 Chaves de correlação - -| Chave | Genesis (span attr) | Sniffer (span attr, já existe) | Uso | -|---|---|---|---| -| SIP Call-ID | `sip.call_id` (= `variable_sip_call_id`) | `voip.call_id` | **Join principal** trace de controle ↔ trace de captura | -| Channel-Call-UUID | `channel.call_uuid` | — (não visível no SIP) | Agrupar a-leg/b-leg **dentro** do trace Genesis | -| Bridge legs | `bridge.a_uuid`, `bridge.b_uuid` | — | Cross-leg: do `sip.call_id` de um leg, ler `bridge.b_uuid` para achar o outro | -| Network | `channel.network_addr`, `sip.remote_ip` `[verificar ESL field]` | IPs/ports do RTP/SIP | Correlação secundária quando `sip.call_id` ausente | - -### 8.2 Join no Grafana/Tempo - -1. **Traces**: query por atributo — `span.attrs["sip.call_id"] = ""` retorna o trace Genesis (service=genesis) e o trace sniffer (service=sniffer) lado a lado. Não há parentesco OTel direto (intencional). -2. **Métricas → traces**: usar **OTel exemplars**. O SDK anexa o `trace_id` do span corrente como exemplar ao registrar cada métrica dentro de um span. No Grafana, painéis de `genesis.*` e `voip.*` passam a ter exemplars que linkam direto para o trace — correlação métrica↔trace sem label de alta cardinalidade. -3. **Métricas↔métricas**: **não** usar `sip.call_id`/UUIDs como label de métrica (cardinalidade). Agregar por labels low-cardinality (`channel.state`, `direction`, `hangup.cause`, `application.name`) e correlacionar via o `trace_id` do exemplar quando precisar cruzar `genesis.call.duration` com `voip.call.duration_s`. - -### 8.3 Fluxo de investigação de chamada (para o painel/dashboard) - -1. Usuário parte do número discado ou caller → busca no sniffer `voip.call_id` (SIP Call-ID). -2. Query Tempo por `sip.call_id` → abre trace Genesis (`freeswitch.channel.*`) e trace sniffer (`voip.call.*`/`voip.rtp.stream`). -3. No span `freeswitch.channel.bridge` lê `bridge.b_uuid` → segunda query por `sip.call_id` do leg B (dialog SIP distinto). -4. 
Em `freeswitch.channel.hangup_complete` lê `hangup.cause` + `hangup.cause.q850` (autoritativo) e cruza com `voip.call.duration_s`/MOS do sniffer para correlacionar causa de controle × qualidade de mídia. - -### 8.4 Resource attributes (Genesis) - -- `service.name=genesis`, `service.namespace=control` `[verificar convenção atual]` para distinguir de `service.name=sniffer`/`service.namespace=voip` no backend. -- Garantir que o `TracerProvider` (item 6, `cli/__init__.py`) exporte com o mesmo endpoint OTLP do sniffer (ou para o mesmo collector) — o join só funciona se ambos chegarem ao mesmo backend. - -## 9. Testes (Genesis) - -Regras do `CLAUDE.md`: **proibido `asyncio.sleep`**; usar `asyncio.Event`/`Condition`/`Future`/`wait_for`; fixtures em `tests/conftest.py`, doubles em `tests/doubles.py`, payloads em `tests/payloads.py`; `asyncio_mode=auto`; timeout 10s. - -### Novos payloads em `tests/payloads.py` -- `channel_progress` (CS_RINGING, Answer-State=ringing) -- `channel_progress_media` (CS_RINGING, CCS_EARLY, Channel-Read/Write-Codec-Name) -- `channel_bridge` (Bridge-A-Unique-ID, Bridge-B-Unique-ID, Other-Leg-Unique-ID, Other-Type) -- `channel_unbridge` -- `call_update` (Bridged-To, Caller-Transfer-Source) -- `codec` (channel-read-codec-name/rate) -- `playback_start`, `playback_stop` -- `record_start`, `record_stop` -- `channel_execute` (Application, Application-UUID) -- `channel_execute_complete` (Application, Application-UUID, Application-Response) -- `channel_destroy` -- `sofia_transferor`, `sofia_transferee` (Event-Subclass, Other-Leg-Unique-ID) -- `sofia_reinvite` -- `callcenter_info` (CC-Queue, CC-Action, CC-Agent, CC-Member-UUID) -- `conference_maintenance` (Conference-Name, Action, Member-ID) -- `valet_info` (Valet-Lot-Name, Bridge-To-UUID) -- `channel_create_verbose` (com `variable_sip_call_id`, `Caller-Context`, `Caller-Destination-Number`) - -### Novos testes (em `tests/test_channel_lifecycle.py` `[novo]`) -- `test_channel_create_span_attrs` — dispara 
`channel_create_verbose` num `FakeProtocol` (doubles.py), verifica span `freeswitch.channel.create` com attrs `channel.context`, `channel.destination_number`, `channel.direction`, `sip.call_id`. -- `test_channel_bridge_span_links_a_b_leg` — dispara `channel_bridge`, verifica span `freeswitch.channel.bridge` com `bridge.a_uuid` e `bridge.b_uuid` e span event `bridge.established`. -- `test_channel_unbridge_span_event` — verifica span event `bridge.torn_down`. -- `test_channel_hangup_complete_q850` — dispara `channel_hangup_complete` com `variable_hangup_cause_q850=16`, verifica attr `hangup.cause.q850=16` e span event `call.finalized`. -- `test_channel_progress_media_early_codec` — verifica attrs `channel.read_codec`, `answer.state=early`. -- `test_call_update_transfer_correlation` — dispara `call_update` + `sofia_transferor`, verifica `transfer.role=transferor` e `bridged.to`. -- `test_process_event_routing_attrs` — dispara evento com `Call-Direction`, `Channel-State`, `Other-Leg-Unique-ID`, verifica attrs no span `process_event`. -- `test_send_command_error_span_status` — duplo que responde `-ERR`, verifica `span.status=ERROR` e `command.error=protocol_error` e `command.name=api` (verbo, não string crua). -- `test_channel_create_sip_call_id_attr` — `Channel.create` com evento contendo `variable_sip_call_id`, verifica attr `sip.call_id` presente no span (chave de correlação com o sniffer). -- `test_ring_group_set_status_on_error` — `RingGroup.ring` com backend que levanta, verifica `span.status=ERROR` (gap do mapeamento). -- `test_call_duration_histogram_has_attrs` — hangup com cause, verifica métrica `genesis.call.duration` gravada com attrs `hangup.cause`, `direction`. -- `test_observable_gauge_queue_depth` — `FakeProtocol` com N eventos na queue, callback do ObservableGauge retorna o tamanho esperado. 
- -### Novos testes em `tests/test_session_tracing.py` `[novo]` -- `test_session_sendmsg_span` -- `test_session_await_complete_span` - -### Novos testes em `tests/test_consumer_tracing.py` `[novo]` -- `test_consumer_dispatch_span` - -### Doubles em `tests/doubles.py` -- **ADICIONAR** `FakeTracer`/`FakeSpan` que registre attrs, events, status, links em listas inspecionáveis (se já não existir). -- **ADICIONAR** `FakeMeter` que capture `add()`/`record()` calls com attrs. - -## 10. Rollout / migração - -### Ordem de implementação (fases) -1. **Fase 0 — Refactor sem mudança observável**: centralizar métricas em `genesis/protocol/metrics.py`, remover duplicações em `channel.py` (resolver circular import via import lazy ou mover constants para `genesis/protocol/_metrics_constants.py` `[novo]`). -2. **Fase 1 — Correções em spans existentes**: `send_command` (verbo + erro), `process_event` (atributos routing), `ring_group.ring` (`set_status`), `call.duration` (attrs). -3. **Fase 2 — Spans de lifecycle ESL**: novos processors + spans `freeswitch.channel.*` (CREATE/PROGRESS/ANSWER/BRIDGE/UNBRIDGE/HANGUP/HANGUP_COMPLETE/DESTROY/EXECUTE/CODEC/PLAYBACK/RECORD). -4. **Fase 3 — CUSTOM subclasses**: sofia::transfer*, callcenter, conference, valet. -5. **Fase 4 — Session/Consumer instrumentation**: spans em `session.py` e `consumer.py`. -6. **Fase 5 — Métricas novas e ObservableGauges**. - -### Compatibilidade -- Todos os novos spans/métricas são **aditivos**; consumers atuais (`Consumer.handle`, `Channel.on_dtmf`, `protocol.on`) continuam funcionando. -- Novos event processors **não devem consumir** eventos que já roteiam para handlers de usuário — apenas enriquecem telemetria. -- `command.name` rename: spans OTel são opacos para a API pública; nenhum consumidor do Genesis lê spans programaticamente (só o backend OTel). -- `call.duration` com attrs: backends OTel agregam por attrs; sem attr continua funcionando (label vazio). 
- -### Feature flags -- `GENESIS_TRACE_ESL_LIFECYCLE=1` (default on) — habilita spans `freeswitch.channel.*`. -- `GENESIS_TRACE_SIP_HEADER=0` (default **off**) — injeção de `` (propagação W3C); **fora do escopo deste PR**, flag reservada para o futuro. -- `GENESIS_TRACE_CUSTOM_SUBCLASSES=1` (default on) — habilita spans de sofia::/callcenter::/conference::/valet::. -- Implementar via `os.environ.get` no módulo de telemetria, guards nos processors. - -### Checklist de PR (pré-merge) -1. `poetry run black --check genesis/ tests/ examples/` -2. `poetry run mypy` -3. `poetry run pytest tests/` -4. `poetry run tox` (Python 3.10, 3.11, 3.12) -5. Não assinar commits/PR (memória `feedback_pr_signature`). - -## 11. Riscos e trade-offs - -| Risco | Mitigação | -|---|---| -| **Cardinalidade de métricas com UUIDs** | UUIDs (`channel.uuid`, `bridge.a_uuid`, `other_leg.uuid`, `application.uuid`) **só em spans**, nunca em métricas. Métricas usam enums/labels low-cardinality (`channel.state`, `direction`, `hangup.cause`, `application.name`). | -| **Volume de spans por chamada** | Uma chamada simples de 2 legs pode gerar ~20-30 spans (Genesis controle) + ~5-10 (sniffer SIP/RTP). Mitigar com `TraceIDRatioBased` no Genesis (`GENESIS_OTEL_SAMPLE_RATIO`) e mantendo `AlwaysSample` só em dev. Spans `process_event` e `route_event` podem ser desligáveis via flag. | -| **Falha de correlação por `sip.call_id`** | Se o evento ESL não trouxer `variable_sip_call_id` (leg não-SIP, gateway sem `verbose_events`), o span Genesis fica sem a chave. Mitigar: exigir `verbose_events=true` no FS; registrar `sip.call_id=unknown` explícito para não mascarar o gap; métrica `genesis.events.without_sip_call_id` (Counter) para medir adoção. | -| **`Channel-Call-UUID` roll em transfer** | Reavaliar a cada `CHANNEL_BRIDGE`; se mudar, novo span root com span Link para o trace anterior (não quebra o trace anterior, ramifica). 
| -| **Verbosidade de eventos ESL** | Habilitar `verbose_events` só onde necessário (produção pode gerar payload grande em `CHANNEL_HANGUP_COMPLETE` com XML CDR). Feature flag `GENESIS_ESL_VERBOSE_CDR=0` para não ingerir o body XML. | -| **Duplicação `channel.bridge` (comando) vs `freeswitch.channel.bridge` (evento)** | Documentar: o span do comando mede a chamada `api uuid_bridge`; o span do evento mede o instante autoritativo do bridge no FS. Namespaces distintos (`channel.*` vs `freeswitch.channel.*`). | -| **Overhead de `process_event` estendido** | Envolver dispatch no span pode aumentar duração do span (mas não do código). Medir com `genesis.event.processing.duration`. | -| **Circular import ao centralizar métricas** | Resolver movendo constantes de atributo para um módulo sem dependência de `protocol/base.py` (ex. `genesis/protocol/_attr_constants.py` `[novo]`), e importando os instrumentos prontos em `channel.py` via import direto de `metrics.py`. | -| **`sip.call_id` divergente entre a-leg/b-leg** | Cada leg é um dialog SIP distinto com Call-ID próprio; o join cross-leg não é por `sip.call_id` mas por `channel.call_uuid` + `bridge.a/b_uuid` dentro do trace Genesis. Documentar o fluxo de navegação no painel Grafana. | -| **`variable_sip_call_id` ausente em legs originate** | Em `originate` outbound o SIP Call-ID pode não estar disponível no `CHANNEL_CREATE` (só após o primeiro response SIP). Mitigar: anexar `sip.call_id` no `process_event` assim que o campo aparecer, e regravar no span de lifecycle posterior (PROGRESS/ANSWER). | - -## 12. 
Checklist do PR - -- [ ] Métricas duplicadas removidas de `genesis/channel.py` (linhas 32-68) e centralizadas em `genesis/protocol/metrics.py` -- [ ] `send_command` (`base.py:290`): `command.name` = verbo parseado; `command.error` + `set_status(ERROR)` + `record_exception` no `-ERR` -- [ ] `process_event` (`base.py:201`): attrs `event.direction/channel_state/answer_state/hangup_cause/subclass/call_uuid/other_leg` + `sip.call_id` (correlação sniffer) via `build_event_attributes` -- [ ] `build_event_attributes` (`telemetry.py:15`) estendido com os 9 novos attrs -- [ ] Novo `channel_lifecycle_processor` em `genesis/protocol/processors.py` emitindo spans `freeswitch.channel.{create,progress,progress_media,answer,bridge,unbridge,hangup,hangup_complete,destroy,execute,execute_complete,park,unpark}` -- [ ] Novo `sofia_custom_processor` (e callcenter/conference/valet) em `processors.py` -- [ ] `freeswitch.channel.bridge` carrega `bridge.a_uuid`/`bridge.b_uuid` + span event `bridge.established` -- [ ] `freeswitch.channel.hangup_complete` carrega `hangup.cause.q850` + span event `call.finalized` -- [ ] `freeswitch.call.update` + `freeswitch.sofia.transfer` com `transfer.role`/`bridged.to` -- [ ] `channel.create` (`channel.py:144`) registra attrs `sip.call_id` + `channel.call_uuid` no span -- [ ] `genesis/cli/__init__.py:78` instala `TracerProvider` + `BatchSpanProcessor` (OTLP) — `TextMapPropagator` só se propagação W3C futura -- [ ] `ring_group.ring` (`ring.py:138`) chama `set_status(ERROR)` na exceção; novos attrs `ring_group.balancer_backend/selected_dial_path/context` -- [ ] `call_duration_histogram.record` (`channel.py:573`) com attrs `hangup.cause`/`direction` -- [ ] Spans em `genesis/session.py`: `session.sendmsg`, `session.start/stop`, `session.await_complete` -- [ ] Spans em `genesis/consumer.py`: `consumer.start/stop`, `consumer.dispatch` -- [ ] `genesis/outbound.py` incrementa `genesis.connections.errors` (type=outbound) -- [ ] Métricas novas: 
`genesis.calls.active`, `genesis.channel.bridge.events`, `genesis.channel.transfers`, `genesis.channel.codec.changes`, `genesis.dialplan.applications`, `genesis.channel.hangup.causes.q850`, `genesis.event.processing.duration`, `genesis.commands.queue.depth`, `genesis.events.queue.depth` -- [ ] `genesis/group/load_balancer.py` instrumentado com `genesis.loadbalancer.selections/errors` -- [ ] Payloads novos em `tests/payloads.py` (channel_progress/bridge/unbridge/call_update/codec/playback/record/execute/destroy/sofia_transferor/callcenter_info/conference_maintenance/valet_info/channel_create_verbose) -- [ ] Doubles `FakeTracer`/`FakeSpan`/`FakeMeter` em `tests/doubles.py` -- [ ] Testes novos em `tests/test_channel_lifecycle.py`, `tests/test_session_tracing.py`, `tests/test_consumer_tracing.py` (sem `asyncio.sleep`, com `asyncio.Event`/`wait_for`) -- [ ] `poetry run black --check genesis/ tests/ examples/` passa -- [ ] `poetry run mypy` passa -- [ ] `poetry run pytest tests/` passa (timeout 10s) -- [ ] `poetry run tox` passa (3.10, 3.11, 3.12) -- [ ] Sem assinatura de commit/PR (respeitar `feedback_pr_signature`) -- [ ] Documentação: atualizar `CLAUDE.md` seção Observability Pattern com os novos spans; `[verificar]` se há `docs/` no Genesis para atualizar -- [ ] **Nenhuma mudança no sniffer** — correlação exclusivamente por `sip.call_id` no backend de observabilidade (Grafana/Tempo + exemplars) -- [ ] Documentar no `CLAUDE.md`/docs o fluxo de join Genesis↔sniffer por `sip.call_id` e o fluxo cross-leg via `bridge.a/b_uuid` From 05056f54e076b8d1df770419b5990fbb0647e2e9 Mon Sep 17 00:00:00 2001 From: Vitor Hugo Date: Tue, 30 Jun 2026 23:28:12 -0300 Subject: [PATCH 5/5] refactor(telemetry): clear SonarCloud issues on PR #83 - lifecycle: extract attribute-key constants (S1192), replace empty `with start_as_current_span(...): pass` blocks with an _attr_span helper using start_span + end() (S108), make channel/custom processors sync and use `protocol` in a debug log (S1172, 
S7503), replace the 13-branch if/elif with a _LIFECYCLE_EMITTERS dispatch dict (S3776) - telemetry: split build_event_attributes into _header_attr_name / _scalar helpers and hoist _EXPLICIT_ATTRS to module scope (S3776) - metrics: drop unnecessary list() on the WeakSet (S7504) - base: record GenesisError instead of generic Exception (S112) - ring: centralise the loadbalancer.backend attribute key (S1192) - tests: call the now-sync processors without await --- genesis/group/ring.py | 9 +- genesis/protocol/base.py | 4 +- genesis/protocol/lifecycle.py | 202 ++++++++++++++++++-------------- genesis/protocol/metrics.py | 4 +- genesis/protocol/telemetry.py | 66 ++++++----- tests/test_channel_lifecycle.py | 32 ++--- 6 files changed, 174 insertions(+), 143 deletions(-) diff --git a/genesis/group/ring.py b/genesis/group/ring.py index 133c477..4d29956 100644 --- a/genesis/group/ring.py +++ b/genesis/group/ring.py @@ -28,6 +28,9 @@ tracer = trace.get_tracer(__name__) meter = metrics.get_meter(__name__) +# Repeated metric attribute key (centralised so Sonar S1192 stays quiet). 
+_ATTR_LB_BACKEND = "loadbalancer.backend" + # Ring group metrics ring_group_operations_counter = meter.create_counter( "genesis.ring_group.operations", @@ -365,7 +368,7 @@ async def _ring_balancing( loadbalancer_errors_counter, 1, attributes={ - "loadbalancer.backend": backend_name, + _ATTR_LB_BACKEND: backend_name, "error": type(e).__name__, }, ) @@ -377,7 +380,7 @@ async def _ring_balancing( loadbalancer_selections_counter, 1, attributes={ - "loadbalancer.backend": backend_name, + _ATTR_LB_BACKEND: backend_name, "loadbalancer.result": "fallback", }, ) @@ -386,7 +389,7 @@ async def _ring_balancing( loadbalancer_selections_counter, 1, attributes={ - "loadbalancer.backend": backend_name, + _ATTR_LB_BACKEND: backend_name, "loadbalancer.result": "selected", }, ) diff --git a/genesis/protocol/base.py b/genesis/protocol/base.py index db45f73..2da44f7 100644 --- a/genesis/protocol/base.py +++ b/genesis/protocol/base.py @@ -24,7 +24,7 @@ from opentelemetry import trace -from genesis.exceptions import ConnectionError, UnconnectedError +from genesis.exceptions import ConnectionError, GenesisError, UnconnectedError from genesis.observability import logger, TRACE_LEVEL_NUM from genesis.protocol.parser import ESLEvent, parse_headers from genesis.protocol.reader_fsm import ESLReaderFSM @@ -361,7 +361,7 @@ async def _execute_send( if span is not None: span.set_attribute("command.error", "protocol_error") span.set_status(trace.Status(trace.StatusCode.ERROR, reply)) - span.record_exception(Exception(reply)) + span.record_exception(GenesisError(reply)) if span is not None: reply_text = result.get("Reply-Text") diff --git a/genesis/protocol/lifecycle.py b/genesis/protocol/lifecycle.py index 5f20d54..5b68e16 100644 --- a/genesis/protocol/lifecycle.py +++ b/genesis/protocol/lifecycle.py @@ -23,6 +23,7 @@ from opentelemetry import trace +from genesis.observability import logger from genesis.protocol.parser import ESLEvent from genesis.protocol.metrics import ( calls_active_counter, @@ 
-45,6 +46,20 @@ _LIFECYCLE_ENABLED = os.environ.get("GENESIS_TRACE_ESL_LIFECYCLE", "1") != "0" _CUSTOM_ENABLED = os.environ.get("GENESIS_TRACE_CUSTOM_SUBCLASSES", "1") != "0" +# Repeated span/metric attribute keys (centralised so Sonar S1192 stays quiet +# and renames touch one place). +ATTR_CHANNEL_STATE = "channel.state" +ATTR_ANSWER_STATE = "answer.state" +ATTR_READ_CODEC = "channel.read_codec" +ATTR_WRITE_CODEC = "channel.write_codec" +ATTR_BRIDGE_A_UUID = "bridge.a_uuid" +ATTR_BRIDGE_B_UUID = "bridge.b_uuid" +ATTR_HANGUP_CAUSE = "hangup.cause" +ATTR_APPLICATION_NAME = "application.name" +ATTR_APPLICATION_RESULT = "application.result" +ATTR_TRANSFER_ROLE = "transfer.role" +ATTR_TRANSFER_TYPE = "transfer.type" + def _str(event: ESLEvent, key: str) -> Optional[str]: """Return a single string value for key (list-aware), or None.""" @@ -80,6 +95,18 @@ def _record_sip_gap(event: ESLEvent, attrs: Dict[str, Any]) -> None: safe_add(events_without_sip_call_id_counter, 1, attributes={}) +def _attr_span(name: str, attrs: Dict[str, Any]) -> None: + """Emit a span that exists only to carry attributes (no interior work). + + Uses ``start_span`` + explicit ``end()`` instead of an empty + ``with start_as_current_span(...): pass`` block. Parent context is resolved + the same way (from the current span at call time) and the span is exported + identically. + """ + span = tracer.start_span(name, attributes=attrs) + span.end() + + # Event names handled by the lifecycle processor. 
_LIFECYCLE_EVENTS = { "CHANNEL_CREATE", @@ -100,7 +127,7 @@ def _record_sip_gap(event: ESLEvent, attrs: Dict[str, Any]) -> None: } -async def channel_lifecycle_processor(protocol: "Protocol", event: ESLEvent) -> None: +def channel_lifecycle_processor(protocol: "Protocol", event: ESLEvent) -> None: """Emit ``freeswitch.channel.*`` spans for channel lifecycle events.""" if not _LIFECYCLE_ENABLED: return @@ -109,37 +136,16 @@ async def channel_lifecycle_processor(protocol: "Protocol", event: ESLEvent) -> if not name or name not in _LIFECYCLE_EVENTS: return + logger.debug("lifecycle %s on %s", name, type(protocol).__name__) + attrs = _channel_attrs(event) _record_sip_gap(event, attrs) - if name == "CHANNEL_CREATE": - _emit_create(event, attrs) - elif name == "CHANNEL_PROGRESS": - _emit_progress(event, attrs) - elif name == "CHANNEL_PROGRESS_MEDIA": - _emit_progress_media(event, attrs) - elif name == "CHANNEL_ANSWER": - _emit_answer(event, attrs) - elif name == "CHANNEL_BRIDGE": - _emit_bridge(event, attrs) - elif name == "CHANNEL_UNBRIDGE": - _emit_unbridge(event, attrs) - elif name == "CHANNEL_HANGUP": - _emit_hangup(event, attrs) - elif name == "CHANNEL_HANGUP_COMPLETE": - _emit_hangup_complete(event, attrs) - elif name == "CHANNEL_DESTROY": - _emit_destroy(event, attrs) - elif name == "CHANNEL_EXECUTE": - _emit_execute(event, attrs) - elif name == "CHANNEL_EXECUTE_COMPLETE": - _emit_execute_complete(event, attrs) + emit = _LIFECYCLE_EMITTERS.get(name) + if emit is not None: + emit(event, attrs) elif name in ("CHANNEL_PARK", "CHANNEL_UNPARK"): _emit_state_span(event, attrs, f"freeswitch.channel.{name.lower()[8:]}") - elif name == "CALL_UPDATE": - _emit_call_update(event, attrs) - elif name == "CODEC": - _emit_codec(event, attrs) def _emit_create(event: ESLEvent, attrs: Dict[str, Any]) -> None: @@ -155,52 +161,47 @@ def _emit_create(event: ESLEvent, attrs: Dict[str, Any]) -> None: calls_active_counter, 1, attributes={ - "channel.state": _str(event, "Channel-State") or 
"CS_INIT", + ATTR_CHANNEL_STATE: _str(event, "Channel-State") or "CS_INIT", "direction": _str(event, "Call-Direction") or "unknown", }, ) def _emit_progress(event: ESLEvent, attrs: Dict[str, Any]) -> None: - _set(attrs, "channel.state", event, "Channel-State") - attrs["answer.state"] = _str(event, "Answer-State") or "ringing" - with tracer.start_as_current_span("freeswitch.channel.progress", attributes=attrs): - pass + _set(attrs, ATTR_CHANNEL_STATE, event, "Channel-State") + attrs[ATTR_ANSWER_STATE] = _str(event, "Answer-State") or "ringing" + _attr_span("freeswitch.channel.progress", attrs) def _emit_progress_media(event: ESLEvent, attrs: Dict[str, Any]) -> None: - attrs["answer.state"] = _str(event, "Answer-State") or "early" - _set(attrs, "channel.read_codec", event, "Channel-Read-Codec-Name") - _set(attrs, "channel.write_codec", event, "Channel-Write-Codec-Name") - with tracer.start_as_current_span( - "freeswitch.channel.progress_media", attributes=attrs - ): - pass + attrs[ATTR_ANSWER_STATE] = _str(event, "Answer-State") or "early" + _set(attrs, ATTR_READ_CODEC, event, "Channel-Read-Codec-Name") + _set(attrs, ATTR_WRITE_CODEC, event, "Channel-Write-Codec-Name") + _attr_span("freeswitch.channel.progress_media", attrs) def _emit_answer(event: ESLEvent, attrs: Dict[str, Any]) -> None: - _set(attrs, "channel.state", event, "Channel-State") - attrs["answer.state"] = "answered" - _set(attrs, "channel.read_codec", event, "Channel-Read-Codec-Name") - _set(attrs, "channel.write_codec", event, "Channel-Write-Codec-Name") - with tracer.start_as_current_span("freeswitch.channel.answer", attributes=attrs): - pass + _set(attrs, ATTR_CHANNEL_STATE, event, "Channel-State") + attrs[ATTR_ANSWER_STATE] = "answered" + _set(attrs, ATTR_READ_CODEC, event, "Channel-Read-Codec-Name") + _set(attrs, ATTR_WRITE_CODEC, event, "Channel-Write-Codec-Name") + _attr_span("freeswitch.channel.answer", attrs) def _emit_bridge(event: ESLEvent, attrs: Dict[str, Any]) -> None: - _set(attrs, 
"bridge.a_uuid", event, "Bridge-A-Unique-ID") - _set(attrs, "bridge.b_uuid", event, "Bridge-B-Unique-ID") + _set(attrs, ATTR_BRIDGE_A_UUID, event, "Bridge-A-Unique-ID") + _set(attrs, ATTR_BRIDGE_B_UUID, event, "Bridge-B-Unique-ID") _set(attrs, "other_leg.type", event, "Other-Type") _set(attrs, "other_leg.destination_number", event, "Other-Leg-Destination-Number") _set(attrs, "other_leg.caller_id_number", event, "Other-Leg-Caller-ID-Number") with tracer.start_as_current_span( "freeswitch.channel.bridge", attributes=attrs ) as span: - a = attrs.get("bridge.a_uuid", "unknown") - b = attrs.get("bridge.b_uuid", "unknown") + a = attrs.get(ATTR_BRIDGE_A_UUID, "unknown") + b = attrs.get(ATTR_BRIDGE_B_UUID, "unknown") span.add_event( "bridge.established", - attributes={"bridge.a_uuid": a, "bridge.b_uuid": b}, + attributes={ATTR_BRIDGE_A_UUID: a, ATTR_BRIDGE_B_UUID: b}, ) safe_add( channel_bridge_events_counter, @@ -210,42 +211,45 @@ def _emit_bridge(event: ESLEvent, attrs: Dict[str, Any]) -> None: def _emit_unbridge(event: ESLEvent, attrs: Dict[str, Any]) -> None: - _set(attrs, "bridge.a_uuid", event, "Bridge-A-Unique-ID") + _set(attrs, ATTR_BRIDGE_A_UUID, event, "Bridge-A-Unique-ID") # CHANNEL_UNBRIDGE may carry Other-Leg-Unique-ID instead of Bridge-B. 
- if "bridge.b_uuid" not in attrs: - _set(attrs, "bridge.b_uuid", event, "Other-Leg-Unique-ID") - _set(attrs, "hangup.cause", event, "Hangup-Cause") + if ATTR_BRIDGE_B_UUID not in attrs: + _set(attrs, ATTR_BRIDGE_B_UUID, event, "Other-Leg-Unique-ID") + _set(attrs, ATTR_HANGUP_CAUSE, event, "Hangup-Cause") with tracer.start_as_current_span( "freeswitch.channel.unbridge", attributes=attrs ) as span: span.add_event( "bridge.torn_down", attributes={ - "bridge.a_uuid": attrs.get("bridge.a_uuid", "unknown"), - "bridge.b_uuid": attrs.get("bridge.b_uuid", "unknown"), + ATTR_BRIDGE_A_UUID: attrs.get(ATTR_BRIDGE_A_UUID, "unknown"), + ATTR_BRIDGE_B_UUID: attrs.get(ATTR_BRIDGE_B_UUID, "unknown"), }, ) metric_attrs: Dict[str, Any] = {"bridge.result": "unbridged"} - cause = attrs.get("hangup.cause") + cause = attrs.get(ATTR_HANGUP_CAUSE) if cause: - metric_attrs["hangup.cause"] = cause + metric_attrs[ATTR_HANGUP_CAUSE] = cause safe_add(channel_bridge_events_counter, 1, attributes=metric_attrs) def _emit_hangup(event: ESLEvent, attrs: Dict[str, Any]) -> None: - _set(attrs, "hangup.cause", event, "Hangup-Cause") - _set(attrs, "channel.state", event, "Channel-State") - attrs["answer.state"] = "hangup" + _set(attrs, ATTR_HANGUP_CAUSE, event, "Hangup-Cause") + _set(attrs, ATTR_CHANNEL_STATE, event, "Channel-State") + attrs[ATTR_ANSWER_STATE] = "hangup" cause = _str(event, "Hangup-Cause") or "unknown" normalized = cause.lower().replace(" ", "_") with tracer.start_as_current_span( "freeswitch.channel.hangup", attributes=attrs ) as span: - span.add_event(f"hangup.cause.{normalized}", attributes={"hangup.cause": cause}) + span.add_event( + f"{ATTR_HANGUP_CAUSE}.{normalized}", + attributes={ATTR_HANGUP_CAUSE: cause}, + ) def _emit_hangup_complete(event: ESLEvent, attrs: Dict[str, Any]) -> None: - _set(attrs, "hangup.cause", event, "Hangup-Cause") + _set(attrs, ATTR_HANGUP_CAUSE, event, "Hangup-Cause") _set(attrs, "hangup.cause.q850", event, "variable_hangup_cause_q850") _set(attrs, 
"channel.name", event, "Channel-Name") with tracer.start_as_current_span( @@ -253,7 +257,7 @@ def _emit_hangup_complete(event: ESLEvent, attrs: Dict[str, Any]) -> None: ) as span: span.add_event( "call.finalized", - attributes={"hangup.cause": attrs.get("hangup.cause", "unknown")}, + attributes={ATTR_HANGUP_CAUSE: attrs.get(ATTR_HANGUP_CAUSE, "unknown")}, ) q850 = _str(event, "variable_hangup_cause_q850") if q850: @@ -270,14 +274,14 @@ def _emit_destroy(event: ESLEvent, attrs: Dict[str, Any]) -> None: calls_active_counter, -1, attributes={ - "channel.state": "CS_DESTROY", + ATTR_CHANNEL_STATE: "CS_DESTROY", "direction": _str(event, "Call-Direction") or "unknown", }, ) def _emit_execute(event: ESLEvent, attrs: Dict[str, Any]) -> None: - _set(attrs, "application.name", event, "Application") + _set(attrs, ATTR_APPLICATION_NAME, event, "Application") _set(attrs, "application.uuid", event, "Application-UUID") _set(attrs, "application.data", event, "Application-Data") with tracer.start_as_current_span("freeswitch.channel.execute", attributes=attrs): @@ -285,12 +289,12 @@ def _emit_execute(event: ESLEvent, attrs: Dict[str, Any]) -> None: safe_add( dialplan_applications_counter, 1, - attributes={"application.name": app, "application.result": "started"}, + attributes={ATTR_APPLICATION_NAME: app, ATTR_APPLICATION_RESULT: "started"}, ) def _emit_execute_complete(event: ESLEvent, attrs: Dict[str, Any]) -> None: - _set(attrs, "application.name", event, "Application") + _set(attrs, ATTR_APPLICATION_NAME, event, "Application") _set(attrs, "application.uuid", event, "Application-UUID") _set(attrs, "application.response", event, "Application-Response") app = _str(event, "Application") or "unknown" @@ -301,19 +305,18 @@ def _emit_execute_complete(event: ESLEvent, attrs: Dict[str, Any]) -> None: ) as span: span.add_event( f"app.{app}.done", - attributes={"application.name": app, "application.result": result}, + attributes={ATTR_APPLICATION_NAME: app, ATTR_APPLICATION_RESULT: result}, 
) safe_add( dialplan_applications_counter, 1, - attributes={"application.name": app, "application.result": result}, + attributes={ATTR_APPLICATION_NAME: app, ATTR_APPLICATION_RESULT: result}, ) def _emit_state_span(event: ESLEvent, attrs: Dict[str, Any], span_name: str) -> None: - _set(attrs, "channel.state", event, "Channel-State") - with tracer.start_as_current_span(span_name, attributes=attrs): - pass + _set(attrs, ATTR_CHANNEL_STATE, event, "Channel-State") + _attr_span(span_name, attrs) def _emit_call_update(event: ESLEvent, attrs: Dict[str, Any]) -> None: @@ -338,12 +341,32 @@ def _emit_codec(event: ESLEvent, attrs: Dict[str, Any]) -> None: channel_codec_changes_counter, 1, attributes={ - "channel.read_codec": read_codec, - "channel.write_codec": write_codec, + ATTR_READ_CODEC: read_codec, + ATTR_WRITE_CODEC: write_codec, }, ) +# Dispatch table for the lifecycle events that map 1:1 to an emitter. Park / +# unpark are handled inline by the processor (parameterised span name) and so +# are intentionally absent here. 
+_LIFECYCLE_EMITTERS = { + "CHANNEL_CREATE": _emit_create, + "CHANNEL_PROGRESS": _emit_progress, + "CHANNEL_PROGRESS_MEDIA": _emit_progress_media, + "CHANNEL_ANSWER": _emit_answer, + "CHANNEL_BRIDGE": _emit_bridge, + "CHANNEL_UNBRIDGE": _emit_unbridge, + "CHANNEL_HANGUP": _emit_hangup, + "CHANNEL_HANGUP_COMPLETE": _emit_hangup_complete, + "CHANNEL_DESTROY": _emit_destroy, + "CHANNEL_EXECUTE": _emit_execute, + "CHANNEL_EXECUTE_COMPLETE": _emit_execute_complete, + "CALL_UPDATE": _emit_call_update, + "CODEC": _emit_codec, +} + + # --------------------------------------------------------------------------- # CUSTOM subclass processor # --------------------------------------------------------------------------- @@ -363,7 +386,7 @@ def _emit_codec(event: ESLEvent, attrs: Dict[str, Any]) -> None: } -async def custom_subclass_processor(protocol: "Protocol", event: ESLEvent) -> None: +def custom_subclass_processor(protocol: "Protocol", event: ESLEvent) -> None: """Emit spans for CUSTOM subclasses (sofia/callcenter/conference/valet).""" if not _CUSTOM_ENABLED: return @@ -373,6 +396,8 @@ async def custom_subclass_processor(protocol: "Protocol", event: ESLEvent) -> No if not subclass or subclass not in _CUSTOM_MAP: return + logger.debug("custom %s on %s", subclass, type(protocol).__name__) + attrs = _channel_attrs(event) kind = _CUSTOM_MAP[subclass] @@ -391,24 +416,27 @@ async def custom_subclass_processor(protocol: "Protocol", event: ESLEvent) -> No def _emit_transfer(event: ESLEvent, attrs: Dict[str, Any], role: str) -> None: - attrs["transfer.role"] = role + attrs[ATTR_TRANSFER_ROLE] = role # Heuristic: transferee only occurs in attended transfers; a lone # transferor is typically a blind transfer. 
- attrs["transfer.type"] = "attended" if role == "transferee" else "blind" + attrs[ATTR_TRANSFER_TYPE] = "attended" if role == "transferee" else "blind" _set(attrs, "sofia.profile", event, "variable_sofia_profile_name") with tracer.start_as_current_span( "freeswitch.sofia.transfer", attributes=attrs ) as span: span.add_event( "transfer.initiated", - attributes={"transfer.role": role, "transfer.type": attrs["transfer.type"]}, + attributes={ + ATTR_TRANSFER_ROLE: role, + ATTR_TRANSFER_TYPE: attrs[ATTR_TRANSFER_TYPE], + }, ) safe_add( channel_transfers_counter, 1, attributes={ - "transfer.type": attrs["transfer.type"], - "transfer.role": role, + ATTR_TRANSFER_TYPE: attrs[ATTR_TRANSFER_TYPE], + ATTR_TRANSFER_ROLE: role, }, ) @@ -432,8 +460,7 @@ def _emit_register(event: ESLEvent, attrs: Dict[str, Any], subclass: str) -> Non _set(attrs, "gateway.name", event, "Gateway-Name") _set(attrs, "gateway.state", event, "State") attrs["register.action"] = subclass.split("::")[1] - with tracer.start_as_current_span("freeswitch.sofia.register", attributes=attrs): - pass + _attr_span("freeswitch.sofia.register", attrs) def _emit_callcenter(event: ESLEvent, attrs: Dict[str, Any]) -> None: @@ -443,8 +470,7 @@ def _emit_callcenter(event: ESLEvent, attrs: Dict[str, Any]) -> None: _set(attrs, "cc.member_uuid", event, "CC-Member-UUID") _set(attrs, "cc.count", event, "CC-Count") _set(attrs, "cc.selection", event, "CC-Selection") - with tracer.start_as_current_span("freeswitch.callcenter.info", attributes=attrs): - pass + _attr_span("freeswitch.callcenter.info", attrs) def _emit_conference(event: ESLEvent, attrs: Dict[str, Any], subclass: str) -> None: @@ -458,8 +484,7 @@ def _emit_conference(event: ESLEvent, attrs: Dict[str, Any], subclass: str) -> N if subclass == "conference::cdr" else "freeswitch.conference.maintenance" ) - with tracer.start_as_current_span(span_name, attributes=attrs): - pass + _attr_span(span_name, attrs) def _emit_valet(event: ESLEvent, attrs: Dict[str, Any]) -> 
None: @@ -467,5 +492,4 @@ def _emit_valet(event: ESLEvent, attrs: Dict[str, Any]) -> None: _set(attrs, "valet.extension", event, "Valet-Extension") _set(attrs, "valet.action", event, "Action") _set(attrs, "bridge.to_uuid", event, "Bridge-To-UUID") - with tracer.start_as_current_span("freeswitch.valet.info", attributes=attrs): - pass + _attr_span("freeswitch.valet.info", attrs) diff --git a/genesis/protocol/metrics.py b/genesis/protocol/metrics.py index f77d1e7..9ec1a28 100644 --- a/genesis/protocol/metrics.py +++ b/genesis/protocol/metrics.py @@ -222,7 +222,7 @@ def register_protocol(protocol: Any) -> None: def _commands_queue_depth(_options: Any) -> Iterable[Observation]: total = 0 - for proto in list(_protocol_registry): + for proto in tuple(_protocol_registry): try: total += proto.commands.qsize() except Exception: @@ -232,7 +232,7 @@ def _commands_queue_depth(_options: Any) -> Iterable[Observation]: def _events_queue_depth(_options: Any) -> Iterable[Observation]: total = 0 - for proto in list(_protocol_registry): + for proto in tuple(_protocol_registry): try: total += proto.events.qsize() except Exception: diff --git a/genesis/protocol/telemetry.py b/genesis/protocol/telemetry.py index 43d4632..10ee338 100644 --- a/genesis/protocol/telemetry.py +++ b/genesis/protocol/telemetry.py @@ -11,6 +11,36 @@ from genesis.protocol.metrics import tracer, events_received_counter from genesis.observability import logger, TRACE_LEVEL_NUM +_EXPLICIT_ATTRS = { + "Call-Direction": "event.direction", + "Channel-State": "event.channel_state", + "Answer-State": "event.answer_state", + "Hangup-Cause": "event.hangup_cause", + "Event-Subclass": "event.subclass", + "Channel-Call-UUID": "event.call_uuid", + "Other-Leg-Unique-ID": "event.other_leg", + "Caller-Context": "event.context", + "Caller-Destination-Number": "event.destination_number", +} + + +def _header_attr_name(key: str) -> str: + """Map an ESL header key to its OpenTelemetry attribute name.""" + if key == "Event-Name": + 
return "event.name" + if key == "Unique-ID": + return "event.uuid" + if key == "Content-Type": + return "event.content_type" + return f"event.header.{key.lower().replace('-', '_')}" + + +def _scalar(value: Any) -> Any: + """Collapse a single-element list to its element; pass other values through.""" + if isinstance(value, list): + return value[0] if value else "" + return value + def build_event_attributes(event: ESLEvent) -> Dict[str, Any]: """Build OpenTelemetry attributes from an ESL event. @@ -21,49 +51,23 @@ def build_event_attributes(event: ESLEvent) -> Dict[str, Any]: Returns: Dictionary of attributes suitable for OTel spans and metrics """ - attributes = {} + attributes: Dict[str, Any] = {} for key, value in event.items(): - if key == "Event-Name": - attr_name = "event.name" - elif key == "Unique-ID": - attr_name = "event.uuid" - elif key == "Content-Type": - attr_name = "event.content_type" - else: - slug = key.lower().replace("-", "_") - attr_name = f"event.header.{slug}" - if isinstance(value, (str, int, float, bool, list, tuple)): - attributes[attr_name] = value + attributes[_header_attr_name(key)] = value # Routing / correlation attributes (explicit, low-cardinality keys) so the # ``process_event`` span carries routing info and the cross-system join key. 
- _EXPLICIT = { - "Call-Direction": "event.direction", - "Channel-State": "event.channel_state", - "Answer-State": "event.answer_state", - "Hangup-Cause": "event.hangup_cause", - "Event-Subclass": "event.subclass", - "Channel-Call-UUID": "event.call_uuid", - "Other-Leg-Unique-ID": "event.other_leg", - "Caller-Context": "event.context", - "Caller-Destination-Number": "event.destination_number", - } - for src, dst in _EXPLICIT.items(): + for src, dst in _EXPLICIT_ATTRS.items(): if src in event: - value = event[src] - if isinstance(value, list): - value = value[0] if value else "" - attributes[dst] = value + attributes[dst] = _scalar(event[src]) # sip.call_id is the standard SIP Call-ID and the cross-system join key. # The join happens at the observability backend. sip_call_id = event.get("variable_sip_call_id") if sip_call_id: - attributes["sip.call_id"] = ( - sip_call_id[0] if isinstance(sip_call_id, list) else sip_call_id - ) + attributes["sip.call_id"] = _scalar(sip_call_id) return attributes diff --git a/tests/test_channel_lifecycle.py b/tests/test_channel_lifecycle.py index 9dac6f6..ea7d93f 100644 --- a/tests/test_channel_lifecycle.py +++ b/tests/test_channel_lifecycle.py @@ -52,7 +52,7 @@ def _span(exporter: InMemorySpanExporter, name: str): async def test_channel_create_emits_span_and_sip_call_id(memory_exporter): event = _event(payloads.channel_create) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.channel.create") # Correlation contract: sip.call_id must be present on the span. 
@@ -64,7 +64,7 @@ async def test_channel_create_emits_span_and_sip_call_id(memory_exporter): async def test_channel_bridge_carries_cross_leg_uuids(memory_exporter): event = _event(payloads.channel_bridge) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.channel.bridge") assert span.attributes["bridge.a_uuid"] == payloads.UUID_A @@ -76,7 +76,7 @@ async def test_channel_bridge_carries_cross_leg_uuids(memory_exporter): async def test_channel_unbridge_emits_torn_down_event(memory_exporter): event = _event(payloads.channel_unbridge) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.channel.unbridge") assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID @@ -85,7 +85,7 @@ async def test_channel_unbridge_emits_torn_down_event(memory_exporter): async def test_hangup_complete_records_q850(memory_exporter): event = _event(payloads.channel_hangup_complete) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.channel.hangup_complete") assert span.attributes["hangup.cause.q850"] == "16" @@ -95,27 +95,27 @@ async def test_hangup_complete_records_q850(memory_exporter): async def test_channel_destroy_emits_span(memory_exporter): event = _event(payloads.channel_destroy) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.channel.destroy") assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID async def test_execute_and_complete_spans(memory_exporter): event = _event(payloads.channel_execute) - await channel_lifecycle_processor(None, event) # 
type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.channel.execute") assert span.attributes["application.name"] == "playback" assert span.attributes["application.uuid"] == "app-uuid-1" event = _event(payloads.channel_execute_complete) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.channel.execute_complete") assert span.attributes["application.name"] == "playback" async def test_codec_span(memory_exporter): event = _event(payloads.codec) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.channel.codec") assert span.attributes["channel.read_codec.name"] == "opus" assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID @@ -123,20 +123,20 @@ async def test_codec_span(memory_exporter): async def test_call_update_span(memory_exporter): event = _event(payloads.call_update) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.call.update") assert span.attributes["sip.call_id"] == payloads.SIP_CALL_ID async def test_sofia_transfer_blind_and_attended(memory_exporter): event = _event(payloads.sofia_transferor) - await custom_subclass_processor(None, event) # type: ignore[arg-type] + custom_subclass_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.sofia.transfer") assert span.attributes["transfer.role"] == "transferor" assert span.attributes["transfer.type"] == "blind" event = _event(payloads.sofia_transferee) - await custom_subclass_processor(None, event) # type: ignore[arg-type] + custom_subclass_processor(None, event) # type: ignore[arg-type] span = 
_span(memory_exporter, "freeswitch.sofia.transfer") assert span.attributes["transfer.role"] == "transferee" assert span.attributes["transfer.type"] == "attended" @@ -144,7 +144,7 @@ async def test_sofia_transfer_blind_and_attended(memory_exporter): async def test_callcenter_info_span(memory_exporter): event = _event(payloads.callcenter_info) - await custom_subclass_processor(None, event) # type: ignore[arg-type] + custom_subclass_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.callcenter.info") assert span.attributes["cc.queue"] == "sales" assert span.attributes["cc.action"] == "agent-state-change" @@ -152,7 +152,7 @@ async def test_callcenter_info_span(memory_exporter): async def test_conference_maintenance_span(memory_exporter): event = _event(payloads.conference_maintenance) - await custom_subclass_processor(None, event) # type: ignore[arg-type] + custom_subclass_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.conference.maintenance") assert span.attributes["conference.name"] == "3000" assert span.attributes["conference.action"] == "add-member" @@ -160,7 +160,7 @@ async def test_conference_maintenance_span(memory_exporter): async def test_valet_info_span(memory_exporter): event = _event(payloads.valet_info) - await custom_subclass_processor(None, event) # type: ignore[arg-type] + custom_subclass_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.valet.info") assert span.attributes["valet.lot"] == "default" assert span.attributes["bridge.to_uuid"] == payloads.UUID_B @@ -170,7 +170,7 @@ async def test_no_sip_call_id_event_still_emits_span(memory_exporter): """A channel event without the correlation key still traces; the gap is counted by the events_without_sip_call_id metric (no crash, no missing span).""" event = _event(payloads.channel_create_no_sip) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + 
channel_lifecycle_processor(None, event) # type: ignore[arg-type] span = _span(memory_exporter, "freeswitch.channel.create") assert "sip.call_id" not in span.attributes @@ -178,7 +178,7 @@ async def test_no_sip_call_id_event_still_emits_span(memory_exporter): async def test_non_lifecycle_event_is_noop(memory_exporter): """A HEARTBEAT must not produce a lifecycle span.""" event = _event(payloads.heartbeat) - await channel_lifecycle_processor(None, event) # type: ignore[arg-type] + channel_lifecycle_processor(None, event) # type: ignore[arg-type] spans = exporter_names(memory_exporter) assert not any(name.startswith("freeswitch.channel.") for name in spans)