Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions agent_debugger_sdk/core/context/trace_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@ async def restore(
ctx._restored_state = restored_state
ctx.replayed_events: list[dict[str, Any]] = []
ctx._drift_detector = None
ctx._drift_events: list[Any] = []
ctx._drift_compare_index = 0
ctx._hook_errors: list[Exception] = []
ctx._restored_target: Any = None

Expand Down
30 changes: 28 additions & 2 deletions agent_debugger_sdk/core/recorders.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,9 @@ async def record_decision(
self,
reasoning: str,
confidence: float,
evidence: list[dict[str, Any]],
chosen_action: str,
*,
evidence: list[dict[str, Any]] | None = None,
evidence_event_ids: list[str] | None = None,
upstream_event_ids: list[str] | None = None,
alternatives: list[dict[str, Any]] | None = None,
Expand All @@ -114,14 +115,39 @@ async def record_decision(
name=name,
reasoning=reasoning,
confidence=max(0.0, min(1.0, confidence)),
evidence=evidence,
evidence=evidence or [],
evidence_event_ids=evidence_event_ids or [],
alternatives=alternatives or [],
chosen_action=chosen_action,
importance=0.7,
upstream_event_ids=upstream_event_ids or [],
)
await self._emit_event(event)

# Detect drift against the original execution if a detector is active
drift_detector = getattr(self, "_drift_detector", None)
if drift_detector is not None:
drift_index = getattr(self, "_drift_compare_index", 0)
event_dict = {
"event_type": "decision",
"data": {
"chosen_action": chosen_action,
"action": chosen_action,
"confidence": event.confidence,
},
}
drift = drift_detector.compare(event_dict, drift_index)
# Advance to the next decision event in the baseline, skipping non-decision events
next_index = drift_index + 1
original_events = getattr(drift_detector, "original_events", [])
while next_index < len(original_events) and original_events[next_index].get("event_type") != "decision":
next_index += 1
self._drift_compare_index = next_index
if drift is not None:
drift_events_list = getattr(self, "_drift_events", None)
if drift_events_list is not None:
drift_events_list.append(drift)
Comment on lines +127 to +149
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in f380508. Three changes: (1) the drift `event_dict` now uses `event.confidence` (the clamped value from `max(0.0, min(1.0, confidence))`) instead of the raw `confidence` argument, eliminating false confidence drift for out-of-range inputs; (2) `action` is added as an alias alongside `chosen_action` in `event_dict`, so baselines that use either field key are matched correctly; (3) `_drift_compare_index` now advances to the next decision event in the baseline by scanning forward past any non-decision events, preventing index misalignment when the baseline contains tool calls or other event types between decisions.


return event.id

async def record_tool_call(
Expand Down
26 changes: 14 additions & 12 deletions tests/test_replay_depth_l3.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,11 +758,6 @@ async def test_drift_detected_during_replay_emits_event(self):
try:
from agent_debugger_sdk import TraceContext

emitted_events = []

async def capture_event(event):
emitted_events.append(event)

mock_checkpoint_data = {
"id": "cp-drift-emit",
"session_id": "sess-original",
Expand All @@ -774,9 +769,17 @@ async def capture_event(event):
"importance": 0.5,
}

# Original events show different action than what will be replayed
# Original events show different action than what will be replayed.
# Timestamp must be after the checkpoint timestamp so the event passes
# the post-checkpoint filter in TraceContext.restore.
mock_events = [
{"id": "evt-2", "sequence": 2, "event_type": "decision", "data": {"chosen_action": "tool_a"}},
{
"id": "evt-2",
"sequence": 2,
"event_type": "decision",
"timestamp": "2026-03-24T13:00:00Z",
"data": {"chosen_action": "tool_a"},
},
]

with patch("httpx.AsyncClient.get", new_callable=AsyncMock) as mock_get:
Expand All @@ -785,8 +788,8 @@ def side_effect(url, *args, **kwargs):
mock_response = MagicMock()
if "checkpoints" in url:
mock_response.json.return_value = mock_checkpoint_data
elif "events" in url:
mock_response.json.return_value = {"events": mock_events}
elif "traces" in url:
mock_response.json.return_value = {"traces": mock_events}
mock_response.raise_for_status = MagicMock()
return mock_response

Expand All @@ -805,9 +808,8 @@ def side_effect(url, *args, **kwargs):
chosen_action="tool_b", # Different from original "tool_a"
)

# Drift event should have been emitted
drift_events = [e for e in emitted_events if getattr(e, "event_type", None) == "drift"]
assert len(drift_events) > 0
# Drift events are collected in ctx._drift_events by record_decision
assert len(ctx._drift_events) > 0
except (TypeError, ImportError, AttributeError) as e:
pytest.skip(f"Drift event emission not yet implemented: {e}")

Expand Down