Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 3 additions & 13 deletions src/strands_evals/mappers/cloudwatch_session_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from typing import Any

from ..mappers.session_mapper import SessionMapper
from ..mappers.utils import get_body
from ..mappers.utils import get_body, join_tool_result_content
from ..types.trace import (
AgentInvocationSpan,
AssistantMessage,
Expand Down Expand Up @@ -305,25 +305,16 @@ def _extract_tool_results(self, body: dict) -> list[ToolResult]:
for item in parsed:
if isinstance(item, dict) and "toolResult" in item:
tr_data = item["toolResult"]
result_text = self._extract_tool_result_text(tr_data.get("content"))
tool_results.append(
ToolResult(
content=result_text,
content=join_tool_result_content(tr_data.get("content")),
error=tr_data.get("error"),
tool_call_id=tr_data.get("toolUseId"),
)
)

return tool_results

def _extract_tool_result_text(self, content: Any) -> str:
"""Extract text from tool result content."""
if not content:
return ""
if isinstance(content, list) and content:
return content[0].get("text", "")
return str(content)

# --- Body-to-messages conversion ---

def _body_to_messages(self, body: dict) -> list[UserMessage | AssistantMessage]:
Expand Down Expand Up @@ -392,10 +383,9 @@ def _process_tool_results(self, content_list: list[dict[str, Any]]) -> list[Text
if "toolResult" not in item:
continue
tool_result = item["toolResult"]
result_text = self._extract_tool_result_text(tool_result.get("content"))
result.append(
ToolResultContent(
content=result_text,
content=join_tool_result_content(tool_result.get("content")),
error=tool_result.get("error"),
tool_call_id=tool_result.get("toolUseId"),
)
Expand Down
32 changes: 4 additions & 28 deletions src/strands_evals/mappers/strands_in_memory_session_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
UserMessage,
)
from .session_mapper import SessionMapper
from .utils import join_tool_result_content

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -194,14 +195,9 @@ def _process_tool_results(self, content_list: list[dict[str, Any]]) -> list[Text
continue

tool_result = item["toolResult"]
result_text = ""
if "content" in tool_result and tool_result["content"]:
content = tool_result["content"]
result_text = content[0].get("text", "") if isinstance(content, list) else str(content)

result.append(
ToolResultContent(
content=result_text,
content=join_tool_result_content(tool_result.get("content")),
error=tool_result.get("error"),
tool_call_id=tool_result.get("toolUseId"),
)
Expand Down Expand Up @@ -324,21 +320,9 @@ def _convert_inference_messages(self, otel_msg: dict[str, Any]) -> UserMessage |
content.append(TextContent(text=part.get("content", "")))

if part_type == "tool_call_response":
# Extract text from response array if present
response = part.get("response", [])
response_text = ""

## To-do: Compare the differences for multiple toolResults
if isinstance(response, list) and response:
response_text = (
response[0].get("text", "") if isinstance(response[0], dict) else str(response[0])
)
elif isinstance(response, str):
response_text = response

content.append(
ToolResultContent(
content=response_text,
content=join_tool_result_content(part.get("response")),
tool_call_id=part.get("id"),
)
)
Expand Down Expand Up @@ -380,15 +364,7 @@ def _convert_tool_execution_span(self, span: ReadableSpan, session_id: str) -> T
if output_messages and output_messages[0].get("parts"):
part = output_messages[0]["parts"][0]
if part.get("type") == "tool_call_response":
response = part.get("response", [])
if isinstance(response, list) and response:
tool_result_content = (
response[0].get("text", "")
if isinstance(response[0], dict)
else str(response[0])
)
elif isinstance(response, str):
tool_result_content = response
tool_result_content = join_tool_result_content(part.get("response"))
except Exception as e:
logger.warning(f"Failed to process tool event {event.name}: {e}")
else:
Expand Down
38 changes: 38 additions & 0 deletions src/strands_evals/mappers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,50 @@
Utility functions for mapper selection and detection.
"""

import json
from typing import Any

from .constants import SCOPE_LANGCHAIN_OTEL, SCOPE_OPENINFERENCE, SCOPE_STRANDS
from .session_mapper import SessionMapper


def serialize_tool_result_block(block: Any) -> str:
    """Serialize a single Bedrock-style tool result content block to text.

    Handles text, json, image, document, and video block types so values in
    any block are visible to evaluators (not just block[0]).

    Args:
        block: One entry of a tool result's content list. Normally a dict
            keyed by block type; non-dict values are stringified.

    Returns:
        Always a ``str``:
        - ``""`` for ``None``, for a dict with none of the recognised keys,
          and for an empty/``None`` text value;
        - the text value (coerced to ``str``) for ``text`` blocks;
        - the JSON encoding for ``json`` blocks, falling back to ``str()``
          when the value cannot be encoded;
        - a ``"[<type> content]"`` placeholder for image/document/video.
    """
    if not isinstance(block, dict):
        return str(block) if block is not None else ""
    if "text" in block:
        text = block["text"] or ""
        # A non-string text value (e.g. a number) must still come back as
        # str; callers join the results with "\n".join, which raises
        # TypeError on anything else.
        return text if isinstance(text, str) else str(text)
    if "json" in block:
        try:
            # default=str keeps otherwise unserializable values visible.
            return json.dumps(block["json"], default=str)
        except (TypeError, ValueError):
            # e.g. unsupported key types or circular references.
            return str(block["json"])
    for key in ("image", "document", "video"):
        if key in block:
            # Binary payloads are replaced by a short marker.
            return f"[{key} content]"
    return ""


# TODO(review): serialize_tool_result_block and join_tool_result_content have
# several code paths (text, json, image/document/video, non-dict, None, str,
# list) but no direct unit tests. Add focused tests in
# tests/strands_evals/mappers/test_utils.py covering: None, [], "", a plain
# string, a list with mixed block types, and a non-list/non-str value.
def join_tool_result_content(content: Any) -> str:
    """Flatten a tool result's content into one newline-separated string.

    Every block contributes to the output, replacing the earlier
    `content[0]`-only behavior that hid values in later blocks from
    faithfulness/correctness judges.

    Args:
        content: The tool result content: a list of blocks, a string, or
            any other value (including ``None``).

    Returns:
        ``""`` for falsy content, the string itself for a ``str``, the
        non-empty serialized blocks joined by ``"\\n"`` for a list, and
        ``str(content)`` otherwise.
    """
    if not content:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)
    # Blocks that serialize to an empty string are dropped so they do not
    # leave blank lines in the joined output.
    rendered_blocks = (serialize_tool_result_block(entry) for entry in content)
    return "\n".join(text for text in rendered_blocks if text)


def detect_otel_mapper(spans: list[Any]) -> SessionMapper:
"""Detect the appropriate mapper based on span scope and data format.

Expand Down
38 changes: 38 additions & 0 deletions tests/strands_evals/mappers/test_cloudwatch_session_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,41 @@ def test_record_with_no_body_skipped(self, mapper):
session = mapper.map_to_session(records, "sess-1")
assert len(session.traces) == 1
assert len(session.traces[0].spans) > 0

def test_multi_block_tool_result_preserves_all_blocks(self, mapper):
    """Regression for #235: a toolResult with several content blocks keeps all of them."""
    # Tool message whose toolResult carries two text blocks instead of one.
    tool_result_payload = json.dumps(
        [
            {
                "toolResult": {
                    "content": [{"text": "summary"}, {"text": "Output: 2"}],
                    "toolUseId": "tu-1",
                }
            }
        ]
    )
    tool_message = {"role": "tool", "content": {"content": tool_result_payload}}

    # First record: the assistant requests the tool call.
    tool_use_record = make_log_record(
        trace_id="t1",
        span_id="s1",
        input_messages=[make_user_message("run it")],
        output_messages=[_make_assistant_tool_use_message("runner", {}, "tu-1")],
        time_nano=1000,
    )
    # Second record: the tool result is fed back and the assistant answers.
    tool_result_record = make_log_record(
        trace_id="t1",
        span_id="s2",
        input_messages=[make_user_message("run it"), tool_message],
        output_messages=[make_assistant_text_message("done")],
        time_nano=2000,
    )

    session = mapper.map_to_session([tool_use_record, tool_result_record], "sess-1")

    tool_spans = [span for span in session.traces[0].spans if isinstance(span, ToolExecutionSpan)]
    assert len(tool_spans) == 1
    for expected in ("summary", "Output: 2"):
        assert expected in tool_spans[0].tool_result.content
131 changes: 131 additions & 0 deletions tests/strands_evals/mappers/test_strands_in_memory_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,3 +573,134 @@ def test_session_id_filtering_gen_ai_conversation_id_takes_precedence(provider):
# Should NOT match on session.id when gen_ai.conversation.id is present
session2 = mapper.map_to_session([span], "session-456")
assert len(session2.traces) == 0


# Regression tests for issue #235: multi-block toolResult.content was dropped


def test_legacy_tool_result_preserves_all_text_blocks(provider):
    """Legacy convention: a two-block toolResult keeps the text of both blocks."""
    # Serialized gen_ai.tool.message payload with two text blocks.
    tool_message_json = (
        '[{"toolResult": {"toolUseId": "t1", "content": [{"text": "summary line"},{"text": "Output: 2"}]}}]'
    )
    chat_span = make_span(
        provider,
        0xAAA,
        0xBBB,
        0xCCC,
        "chat",
        {"gen_ai.operation.name": "chat"},
        lambda s: s.add_event("gen_ai.tool.message", {"content": tool_message_json}),
    )

    mapped = StrandsInMemorySessionMapper().map_to_session([chat_span], "sid")

    first_block = mapped.traces[0].spans[0].messages[0].content[0]
    for expected in ("summary line", "Output: 2"):
        assert expected in first_block.content


def test_legacy_tool_result_serializes_json_block(provider):
    """Legacy convention: a json block is serialized next to the text block."""
    # Serialized gen_ai.tool.message payload mixing a text and a json block.
    tool_message_json = (
        '[{"toolResult": {"toolUseId": "t1", "content": [{"text": "see data"},{"json": {"answer": 42}}]}}]'
    )
    chat_span = make_span(
        provider,
        0xAAA,
        0xBBB,
        0xCCC,
        "chat",
        {"gen_ai.operation.name": "chat"},
        lambda s: s.add_event("gen_ai.tool.message", {"content": tool_message_json}),
    )

    mapped = StrandsInMemorySessionMapper().map_to_session([chat_span], "sid")

    first_block = mapped.traces[0].spans[0].messages[0].content[0]
    for expected in ("see data", '"answer": 42'):
        assert expected in first_block.content


def test_latest_convention_tool_result_preserves_all_blocks(provider):
    """Latest convention inference span: every response block ends up in the content."""
    # Input side: a user turn carrying a two-block tool_call_response.
    input_messages_json = """
[
{"role": "user", "parts": [
{"type": "tool_call_response", "id": "t1", "response": [
{"text": "summary"},
{"text": "Output: 2"}
]}
]}
]
"""
    # Output side: a plain assistant text reply.
    output_messages_json = """
[{"role": "assistant", "parts": [
{"type": "text", "content": "done"}
], "finish_reason": "stop"}]
"""
    span_attributes = {"gen_ai.operation.name": "chat", "gen_ai.provider.name": "strands-agents"}
    chat_span = make_span(
        provider,
        0xAAA,
        0xBBB,
        0xCCC,
        "chat",
        span_attributes,
        lambda s: s.add_event(
            "gen_ai.client.inference.operation.details",
            {
                "gen_ai.input.messages": input_messages_json,
                "gen_ai.output.messages": output_messages_json,
            },
        ),
    )

    mapped = StrandsInMemorySessionMapper().map_to_session([chat_span], "sid")

    first_block = mapped.traces[0].spans[0].messages[0].content[0]
    for expected in ("summary", "Output: 2"):
        assert expected in first_block.content


def test_latest_convention_tool_execution_span_preserves_all_blocks(provider):
    """Latest convention tool execution span: every response block ends up in the tool result."""
    # Tool output message carrying a two-block tool_call_response.
    output_messages_json = """[
{"role": "tool", "parts": [
{"type": "tool_call_response", "id": "t1", "response": [
{"text": "summary"},
{"text": "Output: 2"}
]}
], "finish_reason": "stop"}
]"""
    span_attributes = {
        "gen_ai.operation.name": "execute_tool",
        "gen_ai.provider.name": "strands-agents",
        "gen_ai.tool.name": "calc",
        "gen_ai.tool.call.id": "t1",
        "gen_ai.tool.status": "success",
    }
    tool_span = make_span(
        provider,
        0xAAA,
        0xBBB,
        0xCCC,
        "execute_tool",
        span_attributes,
        lambda s: s.add_event(
            "gen_ai.client.inference.operation.details",
            {"gen_ai.output.messages": output_messages_json},
        ),
    )

    mapped = StrandsInMemorySessionMapper().map_to_session([tool_span], "sid")

    mapped_span = mapped.traces[0].spans[0]
    assert isinstance(mapped_span, ToolExecutionSpan)
    for expected in ("summary", "Output: 2"):
        assert expected in mapped_span.tool_result.content
Loading