diff --git a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py index 7f336ada..03acccce 100644 --- a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py +++ b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py @@ -10,6 +10,7 @@ """ import logging +import warnings from dataclasses import dataclass from typing import Any, Dict, List, Optional @@ -46,13 +47,89 @@ class ResourceInfo: scope_version: str -@dataclass class ConversationTurn: - """A single user-assistant conversation turn.""" + """A single user-assistant conversation turn, with prior history preserved. + + ``input_messages`` holds every input event (``gen_ai.user.message`` and + ``gen_ai.assistant.message``) in the order observed on the span, so + downstream consumers can reconstruct the actual conversation flow. + ``assistant_messages`` holds only the current turn's output, derived from + ``gen_ai.choice`` events. ``tool_results`` holds any tool outputs observed + on the span. + + The ``user_message`` attribute is retained as a backwards-compatible alias + returning the most recent user turn's content; new code should iterate + ``input_messages`` filtered by role. + """ + + def __init__( + self, + user_message: Optional[str] = None, + assistant_messages: Optional[List[Dict[str, Any]]] = None, + tool_results: Optional[List[str]] = None, + input_messages: Optional[List[Dict[str, Any]]] = None, + ): + """Initialize a conversation turn. + + ``input_messages`` is the chronological list of input events + (``{"role": "user"|"assistant", "content": {...}}``). For backwards + compatibility, callers may instead pass a legacy ``user_message`` + scalar, which is converted into a single-entry ``input_messages`` + list. Supplying both is an error. + """ + if input_messages is not None and user_message is not None: + raise ValueError("Provide either input_messages or user_message, not both") + if input_messages is not None: + self.input_messages = list(input_messages) + elif user_message is not None: + self.input_messages = [{"content": {"content": user_message}, "role": "user"}] + else: + self.input_messages = [] + self.assistant_messages = list(assistant_messages or []) + self.tool_results = list(tool_results or []) + + def __repr__(self) -> str: + """Return a debug representation listing every stored field.""" + return ( + f"ConversationTurn(input_messages={self.input_messages!r}, " + f"assistant_messages={self.assistant_messages!r}, " + f"tool_results={self.tool_results!r})" + ) + + def __eq__(self, other: object) -> bool: + """Compare turns by the full set of instance fields.""" + if not isinstance(other, ConversationTurn): + return NotImplemented + return ( + self.input_messages == other.input_messages + and self.assistant_messages == other.assistant_messages + and self.tool_results == other.tool_results + ) - user_message: str - assistant_messages: List[Dict[str, Any]] - tool_results: List[str] + @property + def user_message(self) -> str: + """Return the most recent user turn's content (backwards-compatible alias). + + Deprecated: iterate ``input_messages`` to access the full conversation + history. This alias returns only the last user entry. + """ + user_entries = [m for m in self.input_messages if m.get("role") == "user"] + if not user_entries: + return "" + if len(user_entries) > 1 or any(m.get("role") == "assistant" for m in self.input_messages): + warnings.warn( + "ConversationTurn.user_message drops prior user turns and history " + "when the span carries multiple input events. Iterate " + "ConversationTurn.input_messages for the full conversation.", + DeprecationWarning, + stacklevel=2, + ) + last = user_entries[-1] + content = last.get("content") + if isinstance(content, dict): + inner = content.get("content", "") + return inner if isinstance(inner, str) else "" + return content if isinstance(content, str) else "" @dataclass @@ -191,7 +268,14 @@ def build_conversation_log_record( metadata: SpanMetadata, resource_info: ResourceInfo, ) -> Dict[str, Any]: - """Build ADOT log record for conversation turn.""" + """Build ADOT log record for conversation turn. + + ``body.input.messages`` carries ``ConversationTurn.input_messages`` in + event arrival order, so downstream consumers reconstruct the exact + conversation the model received (user and prior assistant turns + interleaved). ``body.output.messages`` carries only the current turn's + output. + """ output_messages = [] for i, msg in enumerate(conversation.assistant_messages): output_msg = msg.copy() @@ -206,7 +290,7 @@ def build_conversation_log_record( body = { "output": {"messages": output_messages}, - "input": {"messages": [{"content": {"content": conversation.user_message}, "role": "user"}]}, + "input": {"messages": list(conversation.input_messages)}, } return cls._build_log_record_base(metadata, resource_info, body) diff --git a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py index 7ff3fc07..27d78d22 100644 --- a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py +++ b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py @@ -37,17 +37,29 @@ class StrandsEventParser: @classmethod def extract_conversation_turn(cls, events: List[Any]) -> Optional[ConversationTurn]: - """Extract conversation turn from Strands span events.""" - user_message = None - assistant_messages = [] - tool_results = [] + """Extract conversation turn from Strands span events. + + Per OTel GenAI semantic conventions, ``gen_ai.{user,assistant,tool}.message`` + events describe input context (including prior assistant turns replayed + as history), while ``gen_ai.choice`` describes the model's current-turn + output. ``input_messages`` preserves event arrival order so downstream + consumers can reconstruct the actual conversation flow (``[user, + assistant, user, assistant, user]`` rather than role-grouped). + """ + input_messages: List[Dict[str, Any]] = [] + assistant_messages: List[Dict[str, Any]] = [] + tool_results: List[str] = [] for event in events: event_attrs = dict(event.attributes) if hasattr(event, "attributes") and event.attributes else {} match event.name: case cls.EVENT_USER_MESSAGE: - user_message = event_attrs.get("content", "") + content = event_attrs.get("content", "") + if content: + input_messages.append({"content": {"content": content}, "role": "user"}) + else: + logger.debug("Skipping gen_ai.user.message with empty content") case cls.EVENT_CHOICE: message = event_attrs.get("message", "") @@ -66,16 +78,21 @@ def extract_conversation_turn(cls, events: List[Any]) -> Optional[ConversationTu case cls.EVENT_ASSISTANT_MESSAGE: content = event_attrs.get("content", "") if content: - assistant_messages.append({"content": {"content": content}, "role": "assistant"}) + input_messages.append({"content": {"content": content}, "role": "assistant"}) + else: + logger.debug("Skipping gen_ai.assistant.message with empty content") case cls.EVENT_TOOL_MESSAGE: content = event_attrs.get("content", "") if content: tool_results.append(content) + else: + logger.debug("Skipping gen_ai.tool.message with empty content") - if user_message and assistant_messages: + has_user_input = any(m.get("role") == "user" for m in input_messages) + if has_user_input and assistant_messages: return ConversationTurn( - user_message=user_message, + input_messages=input_messages, assistant_messages=assistant_messages, tool_results=tool_results, ) diff --git a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/conftest.py b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/conftest.py new file mode 100644 index 00000000..fa79adf9 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/conftest.py @@ -0,0 +1,105 @@ +"""Shared fixtures for span_to_adot_serializer tests.""" + +from unittest.mock import Mock + +import pytest + +from bedrock_agentcore.evaluation.span_to_adot_serializer.adot_models import ( + ResourceInfo, + SpanMetadata, +) + + +@pytest.fixture +def mock_span_context(): + """Create a mock span context.""" + context = Mock() + context.trace_id = 0x1234567890ABCDEF1234567890ABCDEF + context.span_id = 0x1234567890ABCDEF + context.trace_flags = 1 + return context + + +@pytest.fixture +def mock_resource(): + """Create a mock resource.""" + resource = Mock() + resource.attributes = {"service.name": "test-service"} + return resource + + +@pytest.fixture +def mock_instrumentation_scope(): + """Create a mock instrumentation scope.""" + scope = Mock() + scope.name = "strands.agent" + scope.version = "1.0.0" + return scope + + +@pytest.fixture +def mock_status(): + """Create a mock status.""" + status = Mock() + status.status_code = Mock() + status.status_code.__str__ = Mock(return_value="StatusCode.OK") + return status + + +@pytest.fixture +def mock_span(mock_span_context, mock_resource, mock_instrumentation_scope, mock_status): + """Create a mock OTel span.""" + span = Mock() + span.context = mock_span_context + span.resource = mock_resource + span.instrumentation_scope = mock_instrumentation_scope + span.status = mock_status + span.parent = None + span.name = "test-span" + span.start_time = 1000000000 + span.end_time = 2000000000 + span.kind = Mock() + span.kind.__str__ = Mock(return_value="SpanKind.INTERNAL") + span.attributes = {"gen_ai.operation.name": "chat"} + span.events = [] + return span + + +@pytest.fixture +def mock_event(): + """Create a mock span event factory.""" + + def _create_event(name, attributes): + event = Mock() + event.name = name + event.attributes = attributes + return event + + return _create_event + + +@pytest.fixture +def span_metadata(): + """Create test SpanMetadata.""" + return SpanMetadata( + trace_id="1234567890abcdef1234567890abcdef", + span_id="1234567890abcdef", + parent_span_id=None, + name="test-span", + start_time=1000000000, + end_time=2000000000, + duration=1000000000, + kind="INTERNAL", + flags=1, + status_code="OK", + ) + + +@pytest.fixture +def resource_info(): + """Create test ResourceInfo.""" + return ResourceInfo( + resource_attributes={"service.name": "test-service"}, + scope_name="strands.agent", + scope_version="1.0.0", + ) diff --git a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py index 5c38edd2..a2b9eb15 100644 --- a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py +++ b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py @@ -1,5 +1,6 @@ """Tests for framework-agnostic ADOT models and builders.""" +import warnings from unittest.mock import Mock import pytest @@ -13,93 +14,6 @@ ToolExecution, ) -# ============================================================================== -# Fixtures -# ============================================================================== - - -@pytest.fixture -def mock_span_context(): - """Create a mock span context.""" - context = Mock() - context.trace_id = 0x1234567890ABCDEF1234567890ABCDEF - context.span_id = 0x1234567890ABCDEF - context.trace_flags = 1 - return context - - -@pytest.fixture -def mock_resource(): - """Create a mock resource.""" - resource = Mock() - resource.attributes = {"service.name": "test-service"} - return resource - - -@pytest.fixture -def mock_instrumentation_scope(): - """Create a mock instrumentation scope.""" - scope = Mock() - scope.name = "strands.agent" - scope.version = "1.0.0" - return scope - - -@pytest.fixture -def mock_status(): - """Create a mock status.""" - status = Mock() - status.status_code = Mock() - status.status_code.__str__ = Mock(return_value="StatusCode.OK") - return status - - -@pytest.fixture -def mock_span(mock_span_context, mock_resource, mock_instrumentation_scope, mock_status): - """Create a mock OTel span.""" - span = Mock() - span.context = mock_span_context - span.resource = mock_resource - span.instrumentation_scope = mock_instrumentation_scope - span.status = mock_status - span.parent = None - span.name = "test-span" - span.start_time = 1000000000 - span.end_time = 2000000000 - span.kind = Mock() - span.kind.__str__ = Mock(return_value="SpanKind.INTERNAL") - span.attributes = {"gen_ai.operation.name": "chat"} - span.events = [] - return span - - -@pytest.fixture -def span_metadata(): - """Create test SpanMetadata.""" - return SpanMetadata( - trace_id="1234567890abcdef1234567890abcdef", - span_id="1234567890abcdef", - parent_span_id=None, - name="test-span", - start_time=1000000000, - end_time=2000000000, - duration=1000000000, - kind="INTERNAL", - flags=1, - status_code="OK", - ) - - -@pytest.fixture -def resource_info(): - """Create test ResourceInfo.""" - return ResourceInfo( - resource_attributes={"service.name": "test-service"}, - scope_name="strands.agent", - scope_version="1.0.0", - ) - - # ============================================================================== # Domain Model Tests # ============================================================================== @@ -160,19 +74,77 @@ def test_creation(self): class TestConversationTurn: - """Test ConversationTurn dataclass.""" + """Test ConversationTurn class.""" - def test_creation(self): - """Test ConversationTurn creation.""" + def test_creation_legacy_scalar(self): + """Legacy ``user_message`` scalar becomes a single-entry input_messages list.""" turn = ConversationTurn( user_message="Hello", assistant_messages=[{"content": {"message": "Hi"}, "role": "assistant"}], tool_results=["result1"], ) assert turn.user_message == "Hello" + assert turn.input_messages == [{"content": {"content": "Hello"}, "role": "user"}] assert len(turn.assistant_messages) == 1 assert len(turn.tool_results) == 1 + def test_creation_with_input_messages(self): + """ConversationTurn accepts a chronological input_messages list.""" + input_msgs = [ + {"content": {"content": "first"}, "role": "user"}, + {"content": {"content": "prior"}, "role": "assistant"}, + {"content": {"content": "second"}, "role": "user"}, + ] + turn = ConversationTurn( + input_messages=input_msgs, + assistant_messages=[{"content": {"message": "ok"}, "role": "assistant"}], + ) + assert turn.input_messages == input_msgs + + def test_user_message_alias_returns_latest_user_entry(self): + """user_message returns the last user entry and warns when history is present.""" + turn = ConversationTurn( + input_messages=[ + {"content": {"content": "a"}, "role": "user"}, + {"content": {"content": "prior"}, "role": "assistant"}, + {"content": {"content": "b"}, "role": "user"}, + ], + assistant_messages=[{}], + ) + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + assert turn.user_message == "b" + assert any(issubclass(w.category, DeprecationWarning) for w in caught) + + def test_user_message_and_input_messages_rejected(self): + """Supplying both user_message and input_messages is an error.""" + with pytest.raises(ValueError): + ConversationTurn( + user_message="x", + input_messages=[{"content": {"content": "y"}, "role": "user"}], + ) + + def test_equality_compares_full_instance_state(self): + """__eq__ compares input_messages, assistant_messages, and tool_results.""" + a = ConversationTurn( + input_messages=[{"content": {"content": "u"}, "role": "user"}], + assistant_messages=[{"content": {"message": "out"}, "role": "assistant"}], + tool_results=["t"], + ) + b = ConversationTurn( + input_messages=[{"content": {"content": "u"}, "role": "user"}], + assistant_messages=[{"content": {"message": "out"}, "role": "assistant"}], + tool_results=["t"], + ) + c = ConversationTurn( + input_messages=[{"content": {"content": "different"}, "role": "user"}], + assistant_messages=[{"content": {"message": "out"}, "role": "assistant"}], + tool_results=["t"], + ) + assert a == b + assert a != c + class TestToolExecution: """Test ToolExecution dataclass.""" @@ -308,6 +280,41 @@ def test_build_conversation_log_record(self, span_metadata, resource_info): assert doc["body"]["input"]["messages"][0]["content"]["content"] == "Hello" assert doc["body"]["output"]["messages"][0]["content"]["message"] == "Hi" + def test_build_conversation_log_record_preserves_chronological_order(self, span_metadata, resource_info): + """Builder emits input_messages in event arrival order (user/assistant interleaved).""" + conversation = ConversationTurn( + input_messages=[ + {"content": {"content": "u1"}, "role": "user"}, + {"content": {"content": "prior-1"}, "role": "assistant"}, + {"content": {"content": "u2"}, "role": "user"}, + {"content": {"content": "prior-2"}, "role": "assistant"}, + {"content": {"content": "u3"}, "role": "user"}, + ], + assistant_messages=[{"content": {"message": "new-output"}, "role": "assistant"}], + ) + + doc = ADOTDocumentBuilder.build_conversation_log_record(conversation, span_metadata, resource_info) + + input_msgs = doc["body"]["input"]["messages"] + assert [m.get("role") for m in input_msgs] == [ + "user", + "assistant", + "user", + "assistant", + "user", + ] + assert [m["content"].get("content") for m in input_msgs] == [ + "u1", + "prior-1", + "u2", + "prior-2", + "u3", + ] + + output_msgs = doc["body"]["output"]["messages"] + assert len(output_msgs) == 1 + assert output_msgs[0]["content"]["message"] == "new-output" + def test_build_conversation_log_record_with_tool_results(self, span_metadata, resource_info): """Test building conversation log record with tool results.""" conversation = ConversationTurn( diff --git a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py index 76801273..de3eaa29 100644 --- a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py +++ b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py @@ -2,87 +2,12 @@ from unittest.mock import Mock -import pytest - from bedrock_agentcore.evaluation.span_to_adot_serializer import convert_strands_to_adot from bedrock_agentcore.evaluation.span_to_adot_serializer.strands_converter import ( StrandsEventParser, StrandsToADOTConverter, ) -# ============================================================================== -# Fixtures -# ============================================================================== - - -@pytest.fixture -def mock_span_context(): - """Create a mock span context.""" - context = Mock() - context.trace_id = 0x1234567890ABCDEF1234567890ABCDEF - context.span_id = 0x1234567890ABCDEF - context.trace_flags = 1 - return context - - -@pytest.fixture -def mock_resource(): - """Create a mock resource.""" - resource = Mock() - resource.attributes = {"service.name": "test-service"} - return resource - - -@pytest.fixture -def mock_instrumentation_scope(): - """Create a mock instrumentation scope.""" - scope = Mock() - scope.name = "strands.agent" - scope.version = "1.0.0" - return scope - - -@pytest.fixture -def mock_status(): - """Create a mock status.""" - status = Mock() - status.status_code = Mock() - status.status_code.__str__ = Mock(return_value="StatusCode.OK") - return status - - -@pytest.fixture -def mock_span(mock_span_context, mock_resource, mock_instrumentation_scope, mock_status): - """Create a mock OTel span.""" - span = Mock() - span.context = mock_span_context - span.resource = mock_resource - span.instrumentation_scope = mock_instrumentation_scope - span.status = mock_status - span.parent = None - span.name = "test-span" - span.start_time = 1000000000 - span.end_time = 2000000000 - span.kind = Mock() - span.kind.__str__ = Mock(return_value="SpanKind.INTERNAL") - span.attributes = {"gen_ai.operation.name": "chat"} - span.events = [] - return span - - -@pytest.fixture -def mock_event(): - """Create a mock span event.""" - - def _create_event(name, attributes): - event = Mock() - event.name = name - event.attributes = attributes - return event - - return _create_event - - # ============================================================================== # Strands Event Parser Tests # ============================================================================== @@ -101,7 +26,7 @@ def test_extract_conversation_turn(self, mock_event): turn = StrandsEventParser.extract_conversation_turn(events) assert turn is not None - assert turn.user_message == "Hello" + assert turn.input_messages == [{"content": {"content": "Hello"}, "role": "user"}] assert len(turn.assistant_messages) == 1 assert turn.assistant_messages[0]["content"]["message"] == "Hi there" assert turn.assistant_messages[0]["content"]["finish_reason"] == "stop" @@ -119,8 +44,8 @@ def test_extract_conversation_turn_with_tool_result(self, mock_event): assert len(turn.tool_results) == 1 assert turn.tool_results[0] == "4" - def test_extract_conversation_turn_assistant_message(self, mock_event): - """Test extracting assistant message event.""" + def test_extract_conversation_turn_assistant_message_only_returns_none(self, mock_event): + """No gen_ai.choice means no current-turn output, so no ConversationTurn is emitted.""" events = [ mock_event("gen_ai.user.message", {"content": "Hello"}), mock_event("gen_ai.assistant.message", {"content": "Hi there"}), @@ -128,8 +53,64 @@ def test_extract_conversation_turn_assistant_message(self, mock_event): turn = StrandsEventParser.extract_conversation_turn(events) + assert turn is None + + def test_extract_conversation_turn_tool_results_only_returns_none(self, mock_event): + """User + tool_result with no choice and no history must not emit a record.""" + events = [ + mock_event("gen_ai.user.message", {"content": "Hello"}), + mock_event("gen_ai.tool.message", {"content": "tool output"}), + ] + + turn = StrandsEventParser.extract_conversation_turn(events) + + assert turn is None + + def test_extract_conversation_turn_preserves_all_user_messages(self, mock_event): + """Multiple gen_ai.user.message events are all preserved, not deduped.""" + events = [ + mock_event("gen_ai.user.message", {"content": "first"}), + mock_event("gen_ai.user.message", {"content": "second"}), + mock_event("gen_ai.user.message", {"content": "third"}), + mock_event("gen_ai.choice", {"message": "ok"}), + ] + + turn = StrandsEventParser.extract_conversation_turn(events) + + assert turn is not None + user_contents = [m["content"]["content"] for m in turn.input_messages if m["role"] == "user"] + assert user_contents == ["first", "second", "third"] + + def test_extract_conversation_turn_preserves_chronological_order(self, mock_event): + """input_messages must interleave user and prior assistant turns in event arrival order.""" + events = [ + mock_event("gen_ai.user.message", {"content": "u1"}), + mock_event("gen_ai.assistant.message", {"content": "prior-assistant-1"}), + mock_event("gen_ai.user.message", {"content": "u2"}), + mock_event("gen_ai.assistant.message", {"content": "prior-assistant-2"}), + mock_event("gen_ai.user.message", {"content": "u3"}), + mock_event("gen_ai.choice", {"message": "new-model-output", "finish_reason": "end_turn"}), + ] + + turn = StrandsEventParser.extract_conversation_turn(events) + assert turn is not None - assert turn.assistant_messages[0]["content"]["content"] == "Hi there" + assert [m["role"] for m in turn.input_messages] == [ + "user", + "assistant", + "user", + "assistant", + "user", + ] + assert [m["content"]["content"] for m in turn.input_messages] == [ + "u1", + "prior-assistant-1", + "u2", + "prior-assistant-2", + "u3", + ] + assert len(turn.assistant_messages) == 1 + assert turn.assistant_messages[0]["content"]["message"] == "new-model-output" def test_extract_conversation_turn_tool_message(self, mock_event): """Test extracting tool message as tool result."""