Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions src/strands_evals/mappers/cloudwatch_session_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from typing import Any

from ..mappers.session_mapper import SessionMapper
from ..mappers.utils import get_body
from ..mappers.utils import get_body, join_tool_result_content
from ..types.trace import (
AgentInvocationSpan,
AssistantMessage,
Expand Down Expand Up @@ -318,11 +318,7 @@ def _extract_tool_results(self, body: dict) -> list[ToolResult]:

def _extract_tool_result_text(self, content: Any) -> str:
"""Extract text from tool result content."""
if not content:
return ""
if isinstance(content, list) and content:
return content[0].get("text", "")
return str(content)
return join_tool_result_content(content)
Comment thread
poshinchen marked this conversation as resolved.

# --- Body-to-messages conversion ---

Expand Down
40 changes: 14 additions & 26 deletions src/strands_evals/mappers/strands_in_memory_session_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@
UserMessage,
)
from .session_mapper import SessionMapper
from .utils import join_tool_result_content


def _response_to_text(response: Any) -> str:
"""Normalize a tool_call_response value to a plain string."""
if isinstance(response, list):
return join_tool_result_content(response)
return response if isinstance(response, str) else ""


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -194,10 +203,7 @@ def _process_tool_results(self, content_list: list[dict[str, Any]]) -> list[Text
continue

tool_result = item["toolResult"]
result_text = ""
if "content" in tool_result and tool_result["content"]:
content = tool_result["content"]
result_text = content[0].get("text", "") if isinstance(content, list) else str(content)
result_text = join_tool_result_content(tool_result.get("content"))

result.append(
ToolResultContent(
Expand Down Expand Up @@ -324,17 +330,7 @@ def _convert_inference_messages(self, otel_msg: dict[str, Any]) -> UserMessage |
content.append(TextContent(text=part.get("content", "")))

if part_type == "tool_call_response":
# Extract text from response array if present
response = part.get("response", [])
response_text = ""

## To-do: Compare the differences for multiple toolResults
if isinstance(response, list) and response:
response_text = (
response[0].get("text", "") if isinstance(response[0], dict) else str(response[0])
)
elif isinstance(response, str):
response_text = response
response_text = _response_to_text(part.get("response", []))

content.append(
ToolResultContent(
Expand Down Expand Up @@ -380,17 +376,9 @@ def _convert_tool_execution_span(self, span: ReadableSpan, session_id: str) -> T
if output_messages and output_messages[0].get("parts"):
part = output_messages[0]["parts"][0]
if part.get("type") == "tool_call_response":
response = part.get("response", [])
if isinstance(response, list) and response:
tool_result_content = (
response[0].get("text", "")
if isinstance(response[0], dict)
else str(response[0])
)
elif isinstance(response, str):
tool_result_content = response
tool_result_content = _response_to_text(part.get("response", []))
except Exception as e:
logger.warning(f"Failed to process tool event {event.name}: {e}")
logger.warning("Failed to process tool event %s: %s", event.name, e)
else:
for event in span.events:
try:
Expand All @@ -403,7 +391,7 @@ def _convert_tool_execution_span(self, span: ReadableSpan, session_id: str) -> T
message_list = self._parse_json_attr(event_attributes, "message")
tool_result_content = message_list[0].get("text", "") if message_list else ""
except Exception as e:
logger.warning(f"Failed to process tool event {event.name}: {e}")
logger.warning("Failed to process tool event %s: %s", event.name, e)

tool_call = ToolCall(name=tool_name, arguments=tool_arguments, tool_call_id=tool_call_id)
tool_result = ToolResult(content=tool_result_content, error=tool_error, tool_call_id=tool_call_id)
Expand Down
52 changes: 52 additions & 0 deletions src/strands_evals/mappers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,63 @@
Utility functions for mapper selection and detection.
"""

import json
import logging
from typing import Any

from .constants import SCOPE_LANGCHAIN_OTEL, SCOPE_OPENINFERENCE, SCOPE_STRANDS
from .session_mapper import SessionMapper

logger = logging.getLogger(__name__)


def join_tool_result_content(content: Any) -> str:
    """Flatten a Bedrock-style toolResult content value into a single string.

    Each entry of a content list is rendered on its own and the rendered
    pieces are joined with a newline:

    * non-dict entries are converted with ``str()``;
    * ``text`` blocks contribute their text (``None`` counts as empty);
    * ``json`` blocks are serialized with ``json.dumps(..., sort_keys=True)``;
      a block that cannot be serialized is logged at debug level and dropped;
    * ``image`` / ``document`` / ``video`` blocks become the placeholders
      ``[image]`` / ``[document]`` / ``[video]``;
    * any other dict is logged at debug level and dropped.

    Args:
        content: A list of block dicts, any non-list value (returned as
            ``str(content)``), or ``None``.

    Returns:
        The newline-joined rendering. ``None`` and an empty list give an empty
        string. Pieces that render to an empty string are left out of the join,
        so a list holding only empty text blocks also gives an empty string.
    """
    if content is None:
        return ""
    if not isinstance(content, list):
        return str(content)

    # Checked in this order so a dict carrying several media keys resolves
    # to the first match, exactly like an if/elif chain would.
    media_kinds = ("image", "document", "video")

    rendered: list[str] = []
    for entry in content:
        if not isinstance(entry, dict):
            rendered.append(str(entry))
        elif "text" in entry:
            text_value = entry["text"]
            rendered.append("" if text_value is None else str(text_value))
        elif "json" in entry:
            try:
                rendered.append(json.dumps(entry["json"], sort_keys=True))
            except (TypeError, ValueError) as exc:
                logger.debug("json_error=<%s> | join_tool_result_content: could not serialize json block", exc)
        else:
            placeholder = next((f"[{kind}]" for kind in media_kinds if kind in entry), None)
            if placeholder is None:
                logger.debug(
                    "block_keys=<%s> | join_tool_result_content: unknown block type, skipping", list(entry.keys())
                )
            else:
                rendered.append(placeholder)

    # Empty pieces add nothing visible, so they are filtered out before joining.
    return "\n".join(filter(None, rendered))


def detect_otel_mapper(spans: list[Any]) -> SessionMapper:
"""Detect the appropriate mapper based on span scope and data format.
Expand Down
61 changes: 61 additions & 0 deletions tests/strands_evals/mappers/test_cloudwatch_session_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,64 @@ def test_record_with_no_body_skipped(self, mapper):
session = mapper.map_to_session(records, "sess-1")
assert len(session.traces) == 1
assert len(session.traces[0].spans) > 0


# --- Regression tests: multi-block toolResult.content ---


def _make_multi_block_tool_result_message(tool_use_id, content_blocks):
"""Build a tool result message with arbitrary content blocks."""
return {
"role": "tool",
"content": {"content": json.dumps([{"toolResult": {"content": content_blocks, "toolUseId": tool_use_id}}])},
}


class TestMultiBlockToolResult:
    """Regression tests for toolResult.content lists holding more than one block."""

    @staticmethod
    def _map_tool_round_trip(mapper, tool_name, tool_use_id, result_blocks):
        """Map a tool-use record plus its tool-result record and return the ToolExecutionSpans.

        The first record has the assistant requesting ``tool_name``; the second
        carries the tool result built from ``result_blocks`` as an input message.
        """
        tool_use_record = make_log_record(
            trace_id="t1",
            span_id="s1",
            input_messages=[make_user_message("hi")],
            output_messages=[_make_assistant_tool_use_message(tool_name, {}, tool_use_id)],
            time_nano=1000,
        )
        tool_result_record = make_log_record(
            trace_id="t1",
            span_id="s2",
            input_messages=[
                make_user_message("hi"),
                _make_multi_block_tool_result_message(tool_use_id, result_blocks),
            ],
            output_messages=[make_assistant_text_message("ok")],
            time_nano=2000,
        )
        session = mapper.map_to_session([tool_use_record, tool_result_record], "sess-1")
        return [span for span in session.traces[0].spans if isinstance(span, ToolExecutionSpan)]

    def test_multi_text_blocks_joined(self, mapper):
        """Multiple text blocks in toolResult.content are joined, not truncated to [0]."""
        tool_spans = self._map_tool_round_trip(
            mapper, "tool_x", "tu-1", [{"text": "first"}, {"text": "second"}]
        )
        assert len(tool_spans) == 1
        assert tool_spans[0].tool_result.content == "first\nsecond"

    def test_text_and_json_blocks_joined(self, mapper):
        """Mixed text+json blocks are both included in the joined string."""
        tool_spans = self._map_tool_round_trip(
            mapper, "tool_y", "tu-2", [{"text": "val:"}, {"json": {"x": 1}}]
        )
        assert len(tool_spans) == 1
        assert tool_spans[0].tool_result.content == 'val:\n{"x": 1}'
143 changes: 143 additions & 0 deletions tests/strands_evals/mappers/test_strands_in_memory_mapper.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json

import pytest
from opentelemetry.sdk.trace import ReadableSpan, TracerProvider
from opentelemetry.trace import SpanContext, SpanKind, TraceFlags
Expand Down Expand Up @@ -573,3 +575,144 @@ def test_session_id_filtering_gen_ai_conversation_id_takes_precedence(provider):
# Should NOT match on session.id when gen_ai.conversation.id is present
session2 = mapper.map_to_session([span], "session-456")
assert len(session2.traces) == 0


# --- Regression tests: multi-part toolResult.content ---


def test_legacy_process_tool_results_multi_text(provider):
    """Legacy-convention tool results keep every text block, newline-joined, not only content[0]."""
    tool_result = {
        "toolUseId": "tr1",
        "content": [{"text": "first"}, {"text": "second"}],
    }
    tool_message_payload = json.dumps([{"toolResult": tool_result}])

    chat_span = make_span(
        provider,
        0xAAA,
        0xBBB,
        0xCCC,
        "chat",
        {"gen_ai.operation.name": "chat"},
        lambda s: (
            s.add_event("gen_ai.tool.message", {"content": tool_message_payload}),
            s.add_event("gen_ai.choice", {"message": '[{"text": "ok"}]'}),
        ),
    )

    mapped = StrandsInMemorySessionMapper().map_to_session([chat_span], "sid")
    first_message = mapped.traces[0].spans[0].messages[0]
    assert first_message.content[0].content == "first\nsecond"


def test_legacy_process_tool_results_text_and_json(provider):
    """Legacy-convention tool results with a text block and a json block include both in the output."""
    tool_result = {
        "toolUseId": "tr2",
        "content": [{"text": "label:"}, {"json": {"value": 42}}],
    }
    tool_message_payload = json.dumps([{"toolResult": tool_result}])

    chat_span = make_span(
        provider,
        0xAAA,
        0xBBB,
        0xCCC,
        "chat",
        {"gen_ai.operation.name": "chat"},
        lambda s: (
            s.add_event("gen_ai.tool.message", {"content": tool_message_payload}),
            s.add_event("gen_ai.choice", {"message": '[{"text": "ok"}]'}),
        ),
    )

    mapped = StrandsInMemorySessionMapper().map_to_session([chat_span], "sid")
    first_message = mapped.traces[0].spans[0].messages[0]
    assert first_message.content[0].content == 'label:\n{"value": 42}'


def test_latest_convention_inference_multi_text_tool_result(provider):
    """Latest-convention inference input keeps every block of a tool_call_response, newline-joined."""
    tool_response_part = {
        "type": "tool_call_response",
        "id": "t1",
        "response": [{"text": "alpha"}, {"text": "beta"}],
    }
    input_messages = json.dumps([{"role": "user", "parts": [tool_response_part]}])
    output_messages = '[{"role": "assistant", "parts": [{"type": "text", "content": "done"}]}]'

    chat_span = make_span(
        provider,
        0xAAA,
        0xBBB,
        0xCCC,
        "chat",
        {"gen_ai.operation.name": "chat", "gen_ai.provider.name": "strands-agents"},
        lambda s: s.add_event(
            "gen_ai.client.inference.operation.details",
            {
                "gen_ai.input.messages": input_messages,
                "gen_ai.output.messages": output_messages,
            },
        ),
    )

    mapped = StrandsInMemorySessionMapper().map_to_session([chat_span], "sid")
    inference_span = mapped.traces[0].spans[0]
    assert inference_span.messages[0].content[0].content == "alpha\nbeta"


def test_latest_convention_tool_execution_multi_text(provider):
    """Latest-convention execute_tool span keeps every block of its tool_call_response, newline-joined."""
    tool_response_part = {
        "type": "tool_call_response",
        "id": "t1",
        "response": [{"text": "part1"}, {"text": "part2"}],
    }
    output_messages = json.dumps([{"role": "tool", "parts": [tool_response_part]}])

    span_attributes = {
        "gen_ai.operation.name": "execute_tool",
        "gen_ai.provider.name": "strands-agents",
        "gen_ai.tool.name": "search",
        "gen_ai.tool.call.id": "t1",
        "gen_ai.tool.status": "success",
    }
    tool_span = make_span(
        provider,
        0xAAA,
        0xBBB,
        0xCCC,
        "execute_tool",
        span_attributes,
        lambda s: s.add_event(
            "gen_ai.client.inference.operation.details",
            {"gen_ai.output.messages": output_messages},
        ),
    )

    mapped = StrandsInMemorySessionMapper().map_to_session([tool_span], "sid")
    mapped_tool_span = mapped.traces[0].spans[0]
    assert isinstance(mapped_tool_span, ToolExecutionSpan)
    assert mapped_tool_span.tool_result.content == "part1\npart2"
Loading
Loading