From e7409563e77063d982fd9bb740c37d6ef349e00d Mon Sep 17 00:00:00 2001 From: Tejas Kashinath Date: Tue, 5 May 2026 11:25:09 -0400 Subject: [PATCH 1/2] fix: preserve multi-turn history in Strands ConversationTurn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit StrandsEventParser.extract_conversation_turn previously stored the gen_ai.user.message content in a scalar, so spans carrying multiple user-message events (common in multi-turn agent conversations) dropped all but the newest entry. It also appended gen_ai.assistant.message events — which per OTel GenAI semconv are prior-turn input context — into the same list as gen_ai.choice outputs, mixing history with the current model response. ConversationTurn now exposes user_messages (List[str]) and history_messages (List[Dict]) alongside the existing assistant_messages and tool_results. user_message is retained as a backwards-compatible property that returns the most recent user turn and emits a DeprecationWarning when it truncates. ADOTDocumentBuilder now emits every user turn plus all prior assistant turns as input.messages, so downstream consumers (Online Evaluations, Observability dashboards) see the full context the model received. Refs: V2197498381. Companion fix needed upstream in aws-opentelemetry-distro LLOHandler, which has the same class of bug. --- .../span_to_adot_serializer/adot_models.py | 82 +++++++++++++++++-- .../strands_converter.py | 29 +++++-- .../test_adot_models.py | 67 ++++++++++++++- .../test_strands_converter.py | 40 ++++++++- 4 files changed, 200 insertions(+), 18 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py index 7f336ada..327a2e3c 100644 --- a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py +++ b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py @@ -10,7 +10,8 @@ """ import logging -from dataclasses import dataclass +import warnings +from dataclasses import dataclass, field from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) @@ -48,11 +49,67 @@ class ResourceInfo: @dataclass class ConversationTurn: - """A single user-assistant conversation turn.""" + """A single user-assistant conversation turn, with prior history preserved. - user_message: str - assistant_messages: List[Dict[str, Any]] - tool_results: List[str] + ``user_messages`` holds every ``gen_ai.user.message`` event observed on the + span (conversation history + the current user turn). ``history_messages`` + holds every ``gen_ai.assistant.message`` event — these are prior assistant + turns replayed as input context, not the current model output. + ``assistant_messages`` holds only the current turn's output, derived from + ``gen_ai.choice`` events. + + The ``user_message`` attribute is retained as a backwards-compatible alias + returning the most recent user turn; new code should read ``user_messages``. + """ + + user_messages: List[str] = field(default_factory=list) + assistant_messages: List[Dict[str, Any]] = field(default_factory=list) + tool_results: List[str] = field(default_factory=list) + history_messages: List[Dict[str, Any]] = field(default_factory=list) + + def __init__( + self, + user_message: Optional[str] = None, + assistant_messages: Optional[List[Dict[str, Any]]] = None, + tool_results: Optional[List[str]] = None, + user_messages: Optional[List[str]] = None, + history_messages: Optional[List[Dict[str, Any]]] = None, + ): + """Initialize a conversation turn. + + Accepts either the new ``user_messages`` list or the legacy + ``user_message`` scalar for backwards compatibility. + """ + if user_messages is not None and user_message is not None: + raise ValueError("Provide either user_messages or user_message, not both") + if user_messages is not None: + self.user_messages = list(user_messages) + elif user_message is not None: + self.user_messages = [user_message] + else: + self.user_messages = [] + self.assistant_messages = list(assistant_messages) if assistant_messages is not None else [] + self.tool_results = list(tool_results) if tool_results is not None else [] + self.history_messages = list(history_messages) if history_messages is not None else [] + + @property + def user_message(self) -> str: + """Return the most recent user turn (backwards-compatible alias). + + Deprecated: read ``user_messages`` to access the full list of user + turns on the span. This alias drops all but the last entry. + """ + if not self.user_messages: + return "" + if len(self.user_messages) > 1: + warnings.warn( + "ConversationTurn.user_message drops prior turns when the span " + "carries multiple gen_ai.user.message events. Read " + "ConversationTurn.user_messages for the full list.", + DeprecationWarning, + stacklevel=2, + ) + return self.user_messages[-1] @dataclass @@ -191,7 +248,13 @@ def build_conversation_log_record( metadata: SpanMetadata, resource_info: ResourceInfo, ) -> Dict[str, Any]: - """Build ADOT log record for conversation turn.""" + """Build ADOT log record for conversation turn. + + Input messages preserve conversation history: user turns and prior + assistant turns (``history_messages``) are merged in the order they + were observed on the span so downstream consumers see the full context + the model received. Output messages carry only the current turn. + """ output_messages = [] for i, msg in enumerate(conversation.assistant_messages): output_msg = msg.copy() @@ -204,9 +267,14 @@ def build_conversation_log_record( for tool_result in conversation.tool_results: output_messages.append({"content": tool_result, "role": "assistant"}) + input_messages: List[Dict[str, Any]] = [ + {"content": {"content": user_msg}, "role": "user"} for user_msg in conversation.user_messages + ] + input_messages.extend(conversation.history_messages) + body = { "output": {"messages": output_messages}, - "input": {"messages": [{"content": {"content": conversation.user_message}, "role": "user"}]}, + "input": {"messages": input_messages}, } return cls._build_log_record_base(metadata, resource_info, body) diff --git a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py index 7ff3fc07..5d2af807 100644 --- a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py +++ b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py @@ -37,17 +37,29 @@ class StrandsEventParser: @classmethod def extract_conversation_turn(cls, events: List[Any]) -> Optional[ConversationTurn]: - """Extract conversation turn from Strands span events.""" - user_message = None - assistant_messages = [] - tool_results = [] + """Extract conversation turn from Strands span events. + + Per OTel GenAI semantic conventions, ``gen_ai.{user,assistant,tool}.message`` + events describe input context (including prior assistant turns replayed + as history), while ``gen_ai.choice`` describes the model's current-turn + output. This parser preserves every event it sees: multiple + ``gen_ai.user.message`` events are all captured in ``user_messages``, + and ``gen_ai.assistant.message`` events land in ``history_messages`` + so they are not mixed with the current-turn choice output. + """ + user_messages: List[str] = [] + assistant_messages: List[Dict[str, Any]] = [] + history_messages: List[Dict[str, Any]] = [] + tool_results: List[str] = [] for event in events: event_attrs = dict(event.attributes) if hasattr(event, "attributes") and event.attributes else {} match event.name: case cls.EVENT_USER_MESSAGE: - user_message = event_attrs.get("content", "") + content = event_attrs.get("content", "") + if content: + user_messages.append(content) case cls.EVENT_CHOICE: message = event_attrs.get("message", "") @@ -66,18 +78,19 @@ def extract_conversation_turn(cls, events: List[Any]) -> Optional[ConversationTu case cls.EVENT_ASSISTANT_MESSAGE: content = event_attrs.get("content", "") if content: - assistant_messages.append({"content": {"content": content}, "role": "assistant"}) + history_messages.append({"content": {"content": content}, "role": "assistant"}) case cls.EVENT_TOOL_MESSAGE: content = event_attrs.get("content", "") if content: tool_results.append(content) - if user_message and assistant_messages: + if user_messages and (assistant_messages or history_messages): return ConversationTurn( - user_message=user_message, + user_messages=user_messages, assistant_messages=assistant_messages, tool_results=tool_results, + history_messages=history_messages, ) return None diff --git a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py index 5c38edd2..56fdc92f 100644 --- a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py +++ b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py @@ -163,15 +163,53 @@ class TestConversationTurn: """Test ConversationTurn dataclass.""" def test_creation(self): - """Test ConversationTurn creation.""" + """Test ConversationTurn creation via legacy scalar user_message.""" turn = ConversationTurn( user_message="Hello", assistant_messages=[{"content": {"message": "Hi"}, "role": "assistant"}], tool_results=["result1"], ) assert turn.user_message == "Hello" + assert turn.user_messages == ["Hello"] assert len(turn.assistant_messages) == 1 assert len(turn.tool_results) == 1 + assert turn.history_messages == [] + + def test_creation_with_user_messages_list(self): + """ConversationTurn accepts a list of user messages preserving order.""" + turn = ConversationTurn( + user_messages=["first", "second", "third"], + assistant_messages=[{"content": {"message": "ok"}, "role": "assistant"}], + ) + assert turn.user_messages == ["first", "second", "third"] + + def test_user_message_alias_returns_latest(self): + """user_message returns the latest entry; accessing it with history emits a DeprecationWarning.""" + import warnings + + turn = ConversationTurn(user_messages=["a", "b", "c"], assistant_messages=[{}]) + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + assert turn.user_message == "c" + assert any(issubclass(w.category, DeprecationWarning) for w in caught) + + def test_user_message_and_user_messages_rejected(self): + """Supplying both user_message and user_messages is an error.""" + import pytest + + with pytest.raises(ValueError): + ConversationTurn(user_message="x", user_messages=["y"]) + + def test_history_messages_preserved(self): + """history_messages are kept distinct from assistant_messages.""" + turn = ConversationTurn( + user_messages=["u"], + assistant_messages=[{"content": {"message": "out"}, "role": "assistant"}], + history_messages=[{"content": {"content": "prior"}, "role": "assistant"}], + ) + assert turn.assistant_messages[0]["content"]["message"] == "out" + assert turn.history_messages[0]["content"]["content"] == "prior" class TestToolExecution: @@ -308,6 +346,33 @@ def test_build_conversation_log_record(self, span_metadata, resource_info): assert doc["body"]["input"]["messages"][0]["content"]["content"] == "Hello" assert doc["body"]["output"]["messages"][0]["content"]["message"] == "Hi" + def test_build_conversation_log_record_multi_turn_history(self, span_metadata, resource_info): + """Builder emits all user turns + prior assistant history as input messages.""" + conversation = ConversationTurn( + user_messages=["u1", "u2", "u3"], + assistant_messages=[{"content": {"message": "new-output"}, "role": "assistant"}], + history_messages=[ + {"content": {"content": "prior-1"}, "role": "assistant"}, + {"content": {"content": "prior-2"}, "role": "assistant"}, + ], + ) + + doc = ADOTDocumentBuilder.build_conversation_log_record(conversation, span_metadata, resource_info) + + input_msgs = doc["body"]["input"]["messages"] + assert [m.get("role") for m in input_msgs] == ["user", "user", "user", "assistant", "assistant"] + assert [m["content"].get("content") for m in input_msgs] == [ + "u1", + "u2", + "u3", + "prior-1", + "prior-2", + ] + + output_msgs = doc["body"]["output"]["messages"] + assert len(output_msgs) == 1 + assert output_msgs[0]["content"]["message"] == "new-output" + def test_build_conversation_log_record_with_tool_results(self, span_metadata, resource_info): """Test building conversation log record with tool results.""" conversation = ConversationTurn( diff --git a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py index 76801273..9deee408 100644 --- a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py +++ b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py @@ -120,7 +120,7 @@ def test_extract_conversation_turn_with_tool_result(self, mock_event): assert turn.tool_results[0] == "4" def test_extract_conversation_turn_assistant_message(self, mock_event): - """Test extracting assistant message event.""" + """gen_ai.assistant.message events are history input, not current output.""" events = [ mock_event("gen_ai.user.message", {"content": "Hello"}), mock_event("gen_ai.assistant.message", {"content": "Hi there"}), @@ -129,7 +129,43 @@ def test_extract_conversation_turn_assistant_message(self, mock_event): turn = StrandsEventParser.extract_conversation_turn(events) assert turn is not None - assert turn.assistant_messages[0]["content"]["content"] == "Hi there" + assert turn.assistant_messages == [] + assert turn.history_messages[0]["content"]["content"] == "Hi there" + + def test_extract_conversation_turn_preserves_all_user_messages(self, mock_event): + """Multiple gen_ai.user.message events are all preserved, not deduped.""" + events = [ + mock_event("gen_ai.user.message", {"content": "first"}), + mock_event("gen_ai.user.message", {"content": "second"}), + mock_event("gen_ai.user.message", {"content": "third"}), + mock_event("gen_ai.choice", {"message": "ok"}), + ] + + turn = StrandsEventParser.extract_conversation_turn(events) + + assert turn is not None + assert turn.user_messages == ["first", "second", "third"] + + def test_extract_conversation_turn_separates_history_from_choice(self, mock_event): + """Prior assistant turns land in history_messages; choice lands in assistant_messages.""" + events = [ + mock_event("gen_ai.user.message", {"content": "u1"}), + mock_event("gen_ai.assistant.message", {"content": "prior-assistant-1"}), + mock_event("gen_ai.user.message", {"content": "u2"}), + mock_event("gen_ai.assistant.message", {"content": "prior-assistant-2"}), + mock_event("gen_ai.user.message", {"content": "u3"}), + mock_event("gen_ai.choice", {"message": "new-model-output", "finish_reason": "end_turn"}), + ] + + turn = StrandsEventParser.extract_conversation_turn(events) + + assert turn is not None + assert turn.user_messages == ["u1", "u2", "u3"] + assert len(turn.history_messages) == 2 + assert turn.history_messages[0]["content"]["content"] == "prior-assistant-1" + assert turn.history_messages[1]["content"]["content"] == "prior-assistant-2" + assert len(turn.assistant_messages) == 1 + assert turn.assistant_messages[0]["content"]["message"] == "new-model-output" def test_extract_conversation_turn_tool_message(self, mock_event): """Test extracting tool message as tool result.""" From 66ffbec87447b769e8b4ce3b7b95e2e6e84c8b10 Mon Sep 17 00:00:00 2001 From: Tejas Kashinath Date: Tue, 5 May 2026 16:12:02 -0400 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20chronological=20input=5Fmessages,=20cleaner=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review comments from jariy17, aidandaly24, notgitika on #454. Data model: replace `user_messages: List[str]` + `history_messages: List[Dict]` split with a single `input_messages: List[Dict]` field that preserves the chronological arrival order of `gen_ai.user.message` and `gen_ai.assistant.message` events. For a span `[u1, a1, u2, a2, u3, choice]`, downstream now receives `[user, assistant, user, assistant, user]` so the actual conversation flow is reconstructable, per OTel GenAI semconv. ConversationTurn: drop @dataclass decorator (it was conflicting with the custom __init__ and leaving the class-level field defaults dead), write explicit __init__, __repr__, __eq__ so equality compares actual instance state. `user_message: str` property retained for backwards compatibility, now returns the most recent user entry from `input_messages` and warns when history is present. Parser: gate tightened back to require `assistant_messages` so a span with no `gen_ai.choice` does not emit an empty-output ConversationTurn. Empty event content now logs at debug level instead of silently skipping. Tests: move duplicated fixtures into a shared conftest.py; hoist `import warnings` / `import pytest` to module-level; update the multi-turn regression test to assert chronological ordering rather than role-grouped; add regression tests for no-choice-path and user+tool_result-only-path returning None. --- .../span_to_adot_serializer/adot_models.py | 106 +++++----- .../strands_converter.py | 26 ++- .../span_to_adot_serializer/conftest.py | 105 ++++++++++ .../test_adot_models.py | 188 ++++++------------ .../test_strands_converter.py | 121 +++-------- 5 files changed, 279 insertions(+), 267 deletions(-) create mode 100644 tests/bedrock_agentcore/evaluation/span_to_adot_serializer/conftest.py diff --git a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py index 327a2e3c..03acccce 100644 --- a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py +++ b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/adot_models.py @@ -11,7 +11,7 @@ import logging import warnings -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) @@ -47,69 +47,89 @@ class ResourceInfo: scope_version: str -@dataclass class ConversationTurn: """A single user-assistant conversation turn, with prior history preserved. - ``user_messages`` holds every ``gen_ai.user.message`` event observed on the - span (conversation history + the current user turn). ``history_messages`` - holds every ``gen_ai.assistant.message`` event — these are prior assistant - turns replayed as input context, not the current model output. + ``input_messages`` holds every input event (``gen_ai.user.message`` and + ``gen_ai.assistant.message``) in the order observed on the span, so + downstream consumers can reconstruct the actual conversation flow. ``assistant_messages`` holds only the current turn's output, derived from - ``gen_ai.choice`` events. + ``gen_ai.choice`` events. ``tool_results`` holds any tool outputs observed + on the span. The ``user_message`` attribute is retained as a backwards-compatible alias - returning the most recent user turn; new code should read ``user_messages``. + returning the most recent user turn's content; new code should iterate + ``input_messages`` filtered by role. """ - user_messages: List[str] = field(default_factory=list) - assistant_messages: List[Dict[str, Any]] = field(default_factory=list) - tool_results: List[str] = field(default_factory=list) - history_messages: List[Dict[str, Any]] = field(default_factory=list) - def __init__( self, user_message: Optional[str] = None, assistant_messages: Optional[List[Dict[str, Any]]] = None, tool_results: Optional[List[str]] = None, - user_messages: Optional[List[str]] = None, - history_messages: Optional[List[Dict[str, Any]]] = None, + input_messages: Optional[List[Dict[str, Any]]] = None, ): """Initialize a conversation turn. - Accepts either the new ``user_messages`` list or the legacy - ``user_message`` scalar for backwards compatibility. + ``input_messages`` is the chronological list of input events + (``{"role": "user"|"assistant", "content": {...}}``). For backwards + compatibility, callers may instead pass a legacy ``user_message`` + scalar, which is converted into a single-entry ``input_messages`` + list. Supplying both is an error. """ - if user_messages is not None and user_message is not None: - raise ValueError("Provide either user_messages or user_message, not both") - if user_messages is not None: - self.user_messages = list(user_messages) + if input_messages is not None and user_message is not None: + raise ValueError("Provide either input_messages or user_message, not both") + if input_messages is not None: + self.input_messages = list(input_messages) elif user_message is not None: - self.user_messages = [user_message] + self.input_messages = [{"content": {"content": user_message}, "role": "user"}] else: - self.user_messages = [] - self.assistant_messages = list(assistant_messages) if assistant_messages is not None else [] - self.tool_results = list(tool_results) if tool_results is not None else [] - self.history_messages = list(history_messages) if history_messages is not None else [] + self.input_messages = [] + self.assistant_messages = list(assistant_messages or []) + self.tool_results = list(tool_results or []) + + def __repr__(self) -> str: + """Return a debug representation listing every stored field.""" + return ( + f"ConversationTurn(input_messages={self.input_messages!r}, " + f"assistant_messages={self.assistant_messages!r}, " + f"tool_results={self.tool_results!r})" + ) + + def __eq__(self, other: object) -> bool: + """Compare turns by the full set of instance fields.""" + if not isinstance(other, ConversationTurn): + return NotImplemented + return ( + self.input_messages == other.input_messages + and self.assistant_messages == other.assistant_messages + and self.tool_results == other.tool_results + ) @property def user_message(self) -> str: - """Return the most recent user turn (backwards-compatible alias). + """Return the most recent user turn's content (backwards-compatible alias). - Deprecated: read ``user_messages`` to access the full list of user - turns on the span. This alias drops all but the last entry. + Deprecated: iterate ``input_messages`` to access the full conversation + history. This alias returns only the last user entry. """ - if not self.user_messages: + user_entries = [m for m in self.input_messages if m.get("role") == "user"] + if not user_entries: return "" - if len(self.user_messages) > 1: + if len(user_entries) > 1 or any(m.get("role") == "assistant" for m in self.input_messages): warnings.warn( - "ConversationTurn.user_message drops prior turns when the span " - "carries multiple gen_ai.user.message events. Read " - "ConversationTurn.user_messages for the full list.", + "ConversationTurn.user_message drops prior user turns and history " + "when the span carries multiple input events. Iterate " + "ConversationTurn.input_messages for the full conversation.", DeprecationWarning, stacklevel=2, ) - return self.user_messages[-1] + last = user_entries[-1] + content = last.get("content") + if isinstance(content, dict): + inner = content.get("content", "") + return inner if isinstance(inner, str) else "" + return content if isinstance(content, str) else "" @dataclass @@ -250,10 +270,11 @@ def build_conversation_log_record( ) -> Dict[str, Any]: """Build ADOT log record for conversation turn. - Input messages preserve conversation history: user turns and prior - assistant turns (``history_messages``) are merged in the order they - were observed on the span so downstream consumers see the full context - the model received. Output messages carry only the current turn. + ``body.input.messages`` carries ``ConversationTurn.input_messages`` in + event arrival order, so downstream consumers reconstruct the exact + conversation the model received (user and prior assistant turns + interleaved). ``body.output.messages`` carries only the current turn's + output. """ output_messages = [] for i, msg in enumerate(conversation.assistant_messages): @@ -267,14 +288,9 @@ def build_conversation_log_record( for tool_result in conversation.tool_results: output_messages.append({"content": tool_result, "role": "assistant"}) - input_messages: List[Dict[str, Any]] = [ - {"content": {"content": user_msg}, "role": "user"} for user_msg in conversation.user_messages - ] - input_messages.extend(conversation.history_messages) - body = { "output": {"messages": output_messages}, - "input": {"messages": input_messages}, + "input": {"messages": list(conversation.input_messages)}, } return cls._build_log_record_base(metadata, resource_info, body) diff --git a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py index 5d2af807..27d78d22 100644 --- a/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py +++ b/src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py @@ -42,14 +42,12 @@ def extract_conversation_turn(cls, events: List[Any]) -> Optional[ConversationTu Per OTel GenAI semantic conventions, ``gen_ai.{user,assistant,tool}.message`` events describe input context (including prior assistant turns replayed as history), while ``gen_ai.choice`` describes the model's current-turn - output. This parser preserves every event it sees: multiple - ``gen_ai.user.message`` events are all captured in ``user_messages``, - and ``gen_ai.assistant.message`` events land in ``history_messages`` - so they are not mixed with the current-turn choice output. + output. ``input_messages`` preserves event arrival order so downstream + consumers can reconstruct the actual conversation flow (``[user, + assistant, user, assistant, user]`` rather than role-grouped). """ - user_messages: List[str] = [] + input_messages: List[Dict[str, Any]] = [] assistant_messages: List[Dict[str, Any]] = [] - history_messages: List[Dict[str, Any]] = [] tool_results: List[str] = [] for event in events: @@ -59,7 +57,9 @@ def extract_conversation_turn(cls, events: List[Any]) -> Optional[ConversationTu case cls.EVENT_USER_MESSAGE: content = event_attrs.get("content", "") if content: - user_messages.append(content) + input_messages.append({"content": {"content": content}, "role": "user"}) + else: + logger.debug("Skipping gen_ai.user.message with empty content") case cls.EVENT_CHOICE: message = event_attrs.get("message", "") @@ -78,19 +78,23 @@ def extract_conversation_turn(cls, events: List[Any]) -> Optional[ConversationTu case cls.EVENT_ASSISTANT_MESSAGE: content = event_attrs.get("content", "") if content: - history_messages.append({"content": {"content": content}, "role": "assistant"}) + input_messages.append({"content": {"content": content}, "role": "assistant"}) + else: + logger.debug("Skipping gen_ai.assistant.message with empty content") case cls.EVENT_TOOL_MESSAGE: content = event_attrs.get("content", "") if content: tool_results.append(content) + else: + logger.debug("Skipping gen_ai.tool.message with empty content") - if user_messages and (assistant_messages or history_messages): + has_user_input = any(m.get("role") == "user" for m in input_messages) + if has_user_input and assistant_messages: return ConversationTurn( - user_messages=user_messages, + input_messages=input_messages, assistant_messages=assistant_messages, tool_results=tool_results, - history_messages=history_messages, ) return None diff --git a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/conftest.py b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/conftest.py new file mode 100644 index 00000000..fa79adf9 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/conftest.py @@ -0,0 +1,105 @@ +"""Shared fixtures for span_to_adot_serializer tests.""" + +from unittest.mock import Mock + +import pytest + +from bedrock_agentcore.evaluation.span_to_adot_serializer.adot_models import ( + ResourceInfo, + SpanMetadata, +) + + +@pytest.fixture +def mock_span_context(): + """Create a mock span context.""" + context = Mock() + context.trace_id = 0x1234567890ABCDEF1234567890ABCDEF + context.span_id = 0x1234567890ABCDEF + context.trace_flags = 1 + return context + + +@pytest.fixture +def mock_resource(): + """Create a mock resource.""" + resource = Mock() + resource.attributes = {"service.name": "test-service"} + return resource + + +@pytest.fixture +def mock_instrumentation_scope(): + """Create a mock instrumentation scope.""" + scope = Mock() + scope.name = "strands.agent" + scope.version = "1.0.0" + return scope + + +@pytest.fixture +def mock_status(): + """Create a mock status.""" + status = Mock() + status.status_code = Mock() + status.status_code.__str__ = Mock(return_value="StatusCode.OK") + return status + + +@pytest.fixture +def mock_span(mock_span_context, mock_resource, mock_instrumentation_scope, mock_status): + """Create a mock OTel span.""" + span = Mock() + span.context = mock_span_context + span.resource = mock_resource + span.instrumentation_scope = mock_instrumentation_scope + span.status = mock_status + span.parent = None + span.name = "test-span" + span.start_time = 1000000000 + span.end_time = 2000000000 + span.kind = Mock() + span.kind.__str__ = Mock(return_value="SpanKind.INTERNAL") + span.attributes = {"gen_ai.operation.name": "chat"} + span.events = [] + return span + + +@pytest.fixture +def mock_event(): + """Create a mock span event factory.""" + + def _create_event(name, attributes): + event = Mock() + event.name = name + event.attributes = attributes + return event + + return _create_event + + +@pytest.fixture +def span_metadata(): + """Create test SpanMetadata.""" + return SpanMetadata( + trace_id="1234567890abcdef1234567890abcdef", + span_id="1234567890abcdef", + parent_span_id=None, + name="test-span", + start_time=1000000000, + end_time=2000000000, + duration=1000000000, + kind="INTERNAL", + flags=1, + status_code="OK", + ) + + +@pytest.fixture +def resource_info(): + """Create test ResourceInfo.""" + return ResourceInfo( + resource_attributes={"service.name": "test-service"}, + scope_name="strands.agent", + scope_version="1.0.0", + ) diff --git a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py index 56fdc92f..a2b9eb15 100644 --- a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py +++ b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py @@ -1,5 +1,6 @@ """Tests for framework-agnostic ADOT models and builders.""" +import warnings from unittest.mock import Mock import pytest @@ -13,93 +14,6 @@ ToolExecution, ) -# ============================================================================== -# Fixtures -# ============================================================================== - - -@pytest.fixture -def mock_span_context(): - """Create a mock span context.""" - context = Mock() - context.trace_id = 0x1234567890ABCDEF1234567890ABCDEF - context.span_id = 0x1234567890ABCDEF - context.trace_flags = 1 - return context - - -@pytest.fixture -def mock_resource(): - """Create a mock resource.""" - resource = Mock() - resource.attributes = {"service.name": "test-service"} - return resource - - -@pytest.fixture -def mock_instrumentation_scope(): - """Create a mock instrumentation scope.""" - scope = Mock() - scope.name = "strands.agent" - scope.version = "1.0.0" - return scope - - -@pytest.fixture -def mock_status(): - """Create a mock status.""" - status = Mock() - status.status_code = Mock() - status.status_code.__str__ = Mock(return_value="StatusCode.OK") - return status - - -@pytest.fixture -def mock_span(mock_span_context, mock_resource, mock_instrumentation_scope, mock_status): - """Create a mock OTel span.""" - span = Mock() - span.context = mock_span_context - span.resource = mock_resource - span.instrumentation_scope = mock_instrumentation_scope - span.status = mock_status - span.parent = None - span.name = "test-span" - span.start_time = 1000000000 - span.end_time = 2000000000 - span.kind = Mock() - span.kind.__str__ = Mock(return_value="SpanKind.INTERNAL") - span.attributes = {"gen_ai.operation.name": "chat"} - span.events = [] - return span - - -@pytest.fixture -def span_metadata(): - """Create test SpanMetadata.""" - return SpanMetadata( - trace_id="1234567890abcdef1234567890abcdef", - span_id="1234567890abcdef", - parent_span_id=None, - name="test-span", - start_time=1000000000, - end_time=2000000000, - duration=1000000000, - kind="INTERNAL", - flags=1, - status_code="OK", - ) - - -@pytest.fixture -def resource_info(): - """Create test ResourceInfo.""" - return ResourceInfo( - resource_attributes={"service.name": "test-service"}, - scope_name="strands.agent", - scope_version="1.0.0", - ) - - # ============================================================================== # Domain Model Tests # ============================================================================== @@ -160,56 +74,76 @@ def test_creation(self): class TestConversationTurn: - """Test ConversationTurn dataclass.""" + """Test ConversationTurn class.""" - def test_creation(self): - """Test ConversationTurn creation via legacy scalar user_message.""" + def test_creation_legacy_scalar(self): + """Legacy ``user_message`` scalar becomes a single-entry input_messages list.""" turn = ConversationTurn( user_message="Hello", assistant_messages=[{"content": {"message": "Hi"}, "role": "assistant"}], tool_results=["result1"], ) assert turn.user_message == "Hello" - assert turn.user_messages == ["Hello"] + assert turn.input_messages == [{"content": {"content": "Hello"}, "role": "user"}] assert len(turn.assistant_messages) == 1 assert len(turn.tool_results) == 1 - assert turn.history_messages == [] - def test_creation_with_user_messages_list(self): - """ConversationTurn accepts a list of user messages preserving order.""" + def test_creation_with_input_messages(self): + """ConversationTurn accepts a chronological input_messages list.""" + input_msgs = [ + {"content": {"content": "first"}, "role": "user"}, + {"content": {"content": "prior"}, "role": "assistant"}, + {"content": {"content": "second"}, "role": "user"}, + ] turn = ConversationTurn( - user_messages=["first", "second", "third"], + input_messages=input_msgs, assistant_messages=[{"content": {"message": "ok"}, "role": "assistant"}], ) - assert turn.user_messages == ["first", "second", "third"] - - def test_user_message_alias_returns_latest(self): - """user_message returns the latest entry; accessing it with history emits a DeprecationWarning.""" - import warnings + assert turn.input_messages == input_msgs - turn = ConversationTurn(user_messages=["a", "b", "c"], assistant_messages=[{}]) + def test_user_message_alias_returns_latest_user_entry(self): + """user_message returns the last user entry and warns when history is present.""" + turn = ConversationTurn( + input_messages=[ + {"content": {"content": "a"}, "role": "user"}, + {"content": {"content": "prior"}, "role": "assistant"}, + {"content": {"content": "b"}, "role": "user"}, + ], + assistant_messages=[{}], + ) with warnings.catch_warnings(record=True) as caught: warnings.simplefilter("always") - assert turn.user_message == "c" + assert turn.user_message == "b" assert any(issubclass(w.category, DeprecationWarning) for w in caught) - def test_user_message_and_user_messages_rejected(self): - """Supplying both user_message and user_messages is an error.""" - import pytest - + def test_user_message_and_input_messages_rejected(self): + """Supplying both user_message and input_messages is an error.""" with pytest.raises(ValueError): - ConversationTurn(user_message="x", user_messages=["y"]) - - def test_history_messages_preserved(self): - """history_messages are kept distinct from assistant_messages.""" - turn = ConversationTurn( - user_messages=["u"], + ConversationTurn( + user_message="x", + input_messages=[{"content": {"content": "y"}, "role": "user"}], + ) + + def test_equality_compares_full_instance_state(self): + """__eq__ compares input_messages, assistant_messages, and tool_results.""" + a = ConversationTurn( + input_messages=[{"content": {"content": "u"}, "role": "user"}], + assistant_messages=[{"content": {"message": "out"}, "role": "assistant"}], + tool_results=["t"], + ) + b = ConversationTurn( + input_messages=[{"content": {"content": "u"}, "role": "user"}], assistant_messages=[{"content": {"message": "out"}, "role": "assistant"}], - history_messages=[{"content": {"content": "prior"}, "role": "assistant"}], + tool_results=["t"], ) - assert turn.assistant_messages[0]["content"]["message"] == "out" - assert turn.history_messages[0]["content"]["content"] == "prior" + c = ConversationTurn( + input_messages=[{"content": {"content": "different"}, "role": "user"}], + assistant_messages=[{"content": {"message": "out"}, "role": "assistant"}], + tool_results=["t"], + ) + assert a == b + assert a != c class TestToolExecution: @@ -346,27 +280,35 @@ def test_build_conversation_log_record(self, span_metadata, resource_info): assert doc["body"]["input"]["messages"][0]["content"]["content"] == "Hello" assert doc["body"]["output"]["messages"][0]["content"]["message"] == "Hi" - def test_build_conversation_log_record_multi_turn_history(self, span_metadata, resource_info): - """Builder emits all user turns + prior assistant history as input messages.""" + def test_build_conversation_log_record_preserves_chronological_order(self, span_metadata, resource_info): + """Builder emits input_messages in event arrival order (user/assistant interleaved).""" conversation = ConversationTurn( - user_messages=["u1", "u2", "u3"], - assistant_messages=[{"content": {"message": "new-output"}, "role": "assistant"}], - history_messages=[ + input_messages=[ + {"content": {"content": "u1"}, "role": "user"}, {"content": {"content": "prior-1"}, "role": "assistant"}, + {"content": {"content": "u2"}, "role": "user"}, {"content": {"content": "prior-2"}, "role": "assistant"}, + {"content": {"content": "u3"}, "role": "user"}, ], + assistant_messages=[{"content": {"message": "new-output"}, "role": "assistant"}], ) doc = ADOTDocumentBuilder.build_conversation_log_record(conversation, span_metadata, resource_info) input_msgs = doc["body"]["input"]["messages"] - assert [m.get("role") for m in input_msgs] == ["user", "user", "user", "assistant", "assistant"] + assert [m.get("role") for m in input_msgs] == [ + "user", + "assistant", + "user", + "assistant", + "user", + ] assert [m["content"].get("content") for m in input_msgs] == [ "u1", - "u2", - "u3", "prior-1", + "u2", "prior-2", + "u3", ] output_msgs = doc["body"]["output"]["messages"] diff --git a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py index 9deee408..de3eaa29 100644 --- a/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py +++ b/tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py @@ -2,87 +2,12 @@ from unittest.mock import Mock -import pytest - from bedrock_agentcore.evaluation.span_to_adot_serializer import convert_strands_to_adot from bedrock_agentcore.evaluation.span_to_adot_serializer.strands_converter import ( StrandsEventParser, StrandsToADOTConverter, ) -# ============================================================================== -# Fixtures -# ============================================================================== - - -@pytest.fixture -def mock_span_context(): - """Create a mock span context.""" - context = Mock() - context.trace_id = 0x1234567890ABCDEF1234567890ABCDEF - context.span_id = 0x1234567890ABCDEF - context.trace_flags = 1 - return context - - -@pytest.fixture -def mock_resource(): - """Create a mock resource.""" - resource = Mock() - resource.attributes = {"service.name": "test-service"} - return resource - - -@pytest.fixture -def mock_instrumentation_scope(): - """Create a mock instrumentation scope.""" - scope = Mock() - scope.name = "strands.agent" - scope.version = "1.0.0" - return scope - - -@pytest.fixture -def mock_status(): - """Create a mock status.""" - status = Mock() - status.status_code = Mock() - status.status_code.__str__ = Mock(return_value="StatusCode.OK") - return status - - -@pytest.fixture -def mock_span(mock_span_context, mock_resource, mock_instrumentation_scope, mock_status): - """Create a mock OTel span.""" - span = Mock() - span.context = mock_span_context - span.resource = mock_resource - span.instrumentation_scope = mock_instrumentation_scope - span.status = mock_status - span.parent = None - span.name = "test-span" - span.start_time = 1000000000 - span.end_time = 2000000000 - span.kind = Mock() - span.kind.__str__ = Mock(return_value="SpanKind.INTERNAL") - span.attributes = {"gen_ai.operation.name": "chat"} - span.events = [] - return span - - -@pytest.fixture -def mock_event(): - """Create a mock span event.""" - - def _create_event(name, attributes): - event = Mock() - event.name = name - event.attributes = attributes - return event - - return _create_event - - # ============================================================================== # Strands Event Parser Tests # ============================================================================== @@ -101,7 +26,7 @@ def test_extract_conversation_turn(self, mock_event): turn = StrandsEventParser.extract_conversation_turn(events) assert turn is not None - assert turn.user_message == "Hello" + assert turn.input_messages == [{"content": {"content": "Hello"}, "role": "user"}] assert len(turn.assistant_messages) == 1 assert turn.assistant_messages[0]["content"]["message"] == "Hi there" assert turn.assistant_messages[0]["content"]["finish_reason"] == "stop" @@ -119,8 +44,8 @@ def test_extract_conversation_turn_with_tool_result(self, mock_event): assert len(turn.tool_results) == 1 assert turn.tool_results[0] == "4" - def test_extract_conversation_turn_assistant_message(self, mock_event): - """gen_ai.assistant.message events are history input, not current output.""" + def test_extract_conversation_turn_assistant_message_only_returns_none(self, mock_event): + """No gen_ai.choice means no current-turn output, so no ConversationTurn is emitted.""" events = [ mock_event("gen_ai.user.message", {"content": "Hello"}), mock_event("gen_ai.assistant.message", {"content": "Hi there"}), @@ -128,9 +53,18 @@ def test_extract_conversation_turn_assistant_message(self, mock_event): turn = StrandsEventParser.extract_conversation_turn(events) - assert turn is not None - assert turn.assistant_messages == [] - assert turn.history_messages[0]["content"]["content"] == "Hi there" + assert turn is None + + def test_extract_conversation_turn_tool_results_only_returns_none(self, mock_event): + """User + tool_result with no choice and no history must not emit a record.""" + events = [ + mock_event("gen_ai.user.message", {"content": "Hello"}), + mock_event("gen_ai.tool.message", {"content": "tool output"}), + ] + + turn = StrandsEventParser.extract_conversation_turn(events) + + assert turn is None def test_extract_conversation_turn_preserves_all_user_messages(self, mock_event): """Multiple gen_ai.user.message events are all preserved, not deduped.""" @@ -144,10 +78,11 @@ def test_extract_conversation_turn_preserves_all_user_messages(self, mock_event) turn = StrandsEventParser.extract_conversation_turn(events) assert turn is not None - assert turn.user_messages == ["first", "second", "third"] + user_contents = [m["content"]["content"] for m in turn.input_messages if m["role"] == "user"] + assert user_contents == ["first", "second", "third"] - def test_extract_conversation_turn_separates_history_from_choice(self, mock_event): - """Prior assistant turns land in history_messages; choice lands in assistant_messages.""" + def test_extract_conversation_turn_preserves_chronological_order(self, mock_event): + """input_messages must interleave user and prior assistant turns in event arrival order.""" events = [ mock_event("gen_ai.user.message", {"content": "u1"}), mock_event("gen_ai.assistant.message", {"content": "prior-assistant-1"}), @@ -160,10 +95,20 @@ def test_extract_conversation_turn_separates_history_from_choice(self, mock_even turn = StrandsEventParser.extract_conversation_turn(events) assert turn is not None - assert turn.user_messages == ["u1", "u2", "u3"] - assert len(turn.history_messages) == 2 - assert turn.history_messages[0]["content"]["content"] == "prior-assistant-1" - assert turn.history_messages[1]["content"]["content"] == "prior-assistant-2" + assert [m["role"] for m in turn.input_messages] == [ + "user", + "assistant", + "user", + "assistant", + "user", + ] + assert [m["content"]["content"] for m in turn.input_messages] == [ + "u1", + "prior-assistant-1", + "u2", + "prior-assistant-2", + "u3", + ] assert len(turn.assistant_messages) == 1 assert turn.assistant_messages[0]["content"]["message"] == "new-model-output"