From e29e4b98c3da505b82062b09ec0365ff1bc8ec5b Mon Sep 17 00:00:00 2001 From: Alexey Tyurin <> Date: Thu, 18 Jun 2026 20:30:38 -0500 Subject: [PATCH 1/3] feat(tool-loader): native load_tools recovery + escape-hatch rate (#1450) --- docs/plans/tool-loader.mdx | 67 +++++++-- src/gaia/agents/base/agent.py | 15 +- src/gaia/agents/base/tool_loader.py | 141 +++++++++++++++++- src/gaia/agents/chat/agent.py | 83 ++++++++--- src/gaia/agents/chat/tool_bundles.py | 12 +- src/gaia/eval/tool_cost.py | 31 +++- src/gaia/eval/tool_recall.py | 173 +++++++++++++++++++---- tests/unit/test_chat_dynamic_tools.py | 131 ++++++++++++++--- tests/unit/test_chat_tool_bundles.py | 17 ++- tests/unit/test_tool_loader_selection.py | 142 ++++++++++++++++++- tests/unit/test_tool_recall.py | 147 +++++++++++++++++++ 11 files changed, 866 insertions(+), 93 deletions(-) diff --git a/docs/plans/tool-loader.mdx b/docs/plans/tool-loader.mdx index 2bf0484b8..bafefc11a 100644 --- a/docs/plans/tool-loader.mdx +++ b/docs/plans/tool-loader.mdx @@ -9,7 +9,7 @@ title: "Dynamic Tool Loader" **Component:** Per-turn tool visibility for agents (issue [#688](https://github.com/amd/gaia/issues/688)) **Module:** `gaia.agents.base.tool_loader` -**Status:** **Part 0 (#1448) + Part 1 (#1449) landed.** Part 1 ships the selection mechanism behind a default-off toggle on the ChatAgent `doc` profile. Parts 2–3 (explicit escape hatch, skill signal) are still proposed. +**Status:** **Part 0 (#1448) + Part 1 (#1449) + Part 2 (#1450) landed.** Part 1 ships the selection mechanism behind a default-off toggle on the ChatAgent `doc` profile; Part 2 adds the explicit `load_tools` escape hatch (so native tool-calling models can recover a semantic miss) plus the escape-hatch activation-rate tuning signal. Part 3 (skill signal) is still proposed. **Target agent (v1):** `ChatAgent` (`doc` profile), behind a default-off toggle. @@ -297,11 +297,14 @@ backend KV prefix stays warm. 
When a filter is active the tools block moves **after** the response-format template (volatile content last); with no filter the legacy order and bytes are preserved exactly. -**Native known gap (Amendment 2).** `_execute_tool` is never tightened, so a -non-tool-calling model that names an unlisted tool still runs it (free recovery) -and the loader logs `TOOL_LOADER_ESCAPE_HATCH`. Native tool-calling models have -no such hatch until Part 2's `load_tools`; on first activation the agent logs the -miss as a *known gap* rather than padding the loaded set. +**Native known gap (Amendment 2) — closed by Part 2.** `_execute_tool` is never +tightened, so a non-tool-calling model that names an unlisted tool still runs it +(free recovery) and the loader logs `TOOL_LOADER_ESCAPE_HATCH`. In Part 1 native +tool-calling models had no such hatch — a semantic miss could not self-recover. +[Part 2](#part-2-explicit-escape-hatch--tuning-1450) closes the recovery gap with +the always-on `load_tools` meta-tool (the model loads the bundle it needs and +calls the tool on its next step), and the recall gate's native exemption is +removed accordingly. **Approved deviations from this sketch** (flagged in the #1449 PR): @@ -323,7 +326,7 @@ baseline — meaning **CORE-only is the ~60%-reduction best case** and a full `test_tool_loader_token_budget.py` pins these filtered costs as a static guard. -### Part 2 — Explicit escape hatch + tuning +### Part 2 — Explicit escape hatch + tuning ✅ landed (#1450) - Add bundle re-surfacing + a discoverability menu of bundle names, and the `load_tools` meta-tool that native tool-calling models need (the free recovery @@ -340,6 +343,50 @@ baseline — meaning **CORE-only is the ~60%-reduction best case** and a full - **Escape-hatch activation rate** is logged per session and usable as the threshold-tuning signal (rising rate ⇒ τ too strict). 
+#### How Part 2 shipped (implementation reference) + +**`load_tools` is always-on via CORE.** `load_tools` is added to +[`DOC_CORE_TOOLS`](https://github.com/amd/gaia/blob/main/src/gaia/agents/chat/tool_bundles.py) +(CORE = 11), so once registered it renders in **both** the text prompt and the +native `tools=` schema every active turn and is cap-/eviction-exempt. It is +registered **only when the loader is active** (`self.tool_loader is not None`), +so the default-off `doc` path stays byte-identical — the unfiltered 37-tool +baseline is unchanged. + +**Recovery lands on the next model *step*, not the next user turn.** The +`load_tools(bundle)` handler calls `ToolLoader.load_bundle`, then +`Agent._apply_tool_filter` — the one place the active filter and the cached +system prompt move together. Because `system_prompt` and `_openai_tools` are +read live at every LLM call, the expanded set is visible to the very next step +in the same query, which is what lets `smart_discovery` recover on turn 1. + +**`load_bundle` is cap-aware.** It resolves a bundle name (or a bare tool name, +via the reverse index) and admits members with the same LRU-evict path `select()` +uses — protecting CORE and the members being loaded now — so `max_tools` holds at +all times. It emits a same-turn `TOOL_LOADER {…, "event": "load_tools", …}` +superset line. + +**Menu is stable and native-only.** A compact bundle menu (name + one-line +description, from `ToolBundle.description`) is injected into the **stable** prefix +of the doc system prompt (before the volatile tools tail → no KV thrash), and +**only for native tool-calling models** — non-native models already have free +recovery and are the TTFT-sensitive path. + +**Tuning signal is log-derived.** The loader counts escape-hatch (free) and +`load_tools` (explicit) activations per session and emits a `TOOL_LOADER_SESSION` +summary on `reset_session()` (`escape_hatch_rate = (escape_hatch + load_tools) / +turns`). 
`gaia.eval.tool_recall` aggregates these from the server log and reports +the per-turn rate alongside recall — no UI-DB migration. + +**Recall gate flipped correctly.** `tool_recall.py` unions same-turn +`load_tools` superset lines into that turn's loaded set and treats `load_tools` +as always-satisfied; **only then** is the native "known gap" exemption removed, +so a successful recovery passes the gate and a genuinely unrecovered miss fails +it on every model. + +**Cap unchanged at 14** (→ 3 dynamic slots now that CORE = 11). The eval gates +recall; bump the default only if recall or the escape-hatch rate regresses. + ### Part 3 — Skill-driven signal (gated on #887) A third selection signal, added **only after** [#887](https://github.com/amd/gaia/issues/887) @@ -407,7 +454,11 @@ via the base `_select_tools_for_turn` hook, and both render paths filter from th same selection. The old keyword/bundle-policy skeleton was removed; the class name `ToolLoader` and `reset_session()` were kept so the existing (guarded) call sites in `cli.py` / `chat/app.py` needed no change. Recall recovery for native -tool-calling models (the `load_tools` meta-tool) is still **Part 2**. +tool-calling models has shipped (Part 2, #1450): the loader exposes +`bundle_names` / `format_bundle_menu` / `load_bundle` and per-session escape-hatch +counters; `ChatAgent` registers the `load_tools` meta-tool and injects the +native-only bundle menu; and `gaia.eval.tool_recall` unions mid-loop `load_tools` +lines, drops the native exemption, and reports the escape-hatch activation rate. 
## Dependencies diff --git a/src/gaia/agents/base/agent.py b/src/gaia/agents/base/agent.py index be37fc0d3..09f2d25a2 100644 --- a/src/gaia/agents/base/agent.py +++ b/src/gaia/agents/base/agent.py @@ -817,8 +817,19 @@ def _refresh_active_tool_filter(self, user_input: str) -> None: # pylint: disable-next=assignment-from-none new_filter = self._select_tools_for_turn(user_input) if new_filter != self._active_tool_filter: - self._active_tool_filter = new_filter - self._system_prompt_cache = self._compose_system_prompt() + self._apply_tool_filter(new_filter) + + def _apply_tool_filter(self, new_filter: Optional[List[str]]) -> None: + """Swap the active tool filter and recompute the cached system prompt. + + The single place the "filter and prompt move together" invariant lives. + Called from :meth:`_refresh_active_tool_filter` (per user turn) and from + the ``load_tools`` escape-hatch handler (mid-loop), so a mid-query + expansion is visible to the very next model step — both render paths + (``system_prompt`` and ``_openai_tools``) read these live. + """ + self._active_tool_filter = new_filter + self._system_prompt_cache = self._compose_system_prompt() def rebuild_system_prompt(self) -> None: """Rebuild system prompt with current tools from _TOOL_REGISTRY. diff --git a/src/gaia/agents/base/tool_loader.py b/src/gaia/agents/base/tool_loader.py index 2afd1f612..d7003d4fb 100644 --- a/src/gaia/agents/base/tool_loader.py +++ b/src/gaia/agents/base/tool_loader.py @@ -156,6 +156,13 @@ def __init__( self._loaded: Dict[str, _ToolState] = {} self._turn = 0 self._session_disabled = False + # Escape-hatch activation counters (Part 2, #1450). Both recovery paths + # feed the τ-tuning signal: the non-tool-calling free recovery + # (record_tool_use on an unlisted tool) and the native explicit recovery + # (load_bundle). Summarized on reset_session(), aggregated from logs by + # the eval. A rising per-turn rate ⇒ τ too strict. 
+ self._escape_hatch_count = 0 + self._load_tools_count = 0 # ── public API ─────────────────────────────────────────────────────── @@ -284,21 +291,23 @@ def record_tool_use(self, tool_name: str) -> None: If the tool is loaded, refresh its ``last_call_ts``. If it is **not** loaded, the model reached a tool the prompt didn't list (a free - non-tool-calling recovery via the full registry); log it as the - escape-hatch signal. This does *not* auto-load the tool — that is - Part 2's job. + non-tool-calling recovery via the full registry); count and log it as the + escape-hatch signal. This does *not* auto-load the tool; a native model + re-surfaces a missed tool through the explicit :meth:`load_bundle` path + (the ``load_tools`` meta-tool). """ state = self._loaded.get(tool_name) if state is not None: state.last_call_ts = time.time() return + self._escape_hatch_count += 1 logger.info( json.dumps( { "event": "TOOL_LOADER_ESCAPE_HATCH", "tool": tool_name, "turn": self._turn, - "note": "executed unlisted tool via full registry (Part-2 gap)", + "note": "executed unlisted tool via full registry (free recovery)", } ) ) @@ -306,15 +315,113 @@ def record_tool_use(self, tool_name: str) -> None: def reset_session(self) -> None: """Clear per-session state for a new conversation. - The content-keyed embedding cache survives — embeddings depend only on - the tool docs, not on the conversation. + Emits the per-session escape-hatch summary (the τ-tuning signal) for the + conversation just ending **before** clearing, then zeroes the counters + alongside the existing state clears. The content-keyed embedding cache + survives — embeddings depend only on the tool docs, not the conversation. 
""" + if self._turn > 0: + self._log_session_summary() self._loaded.clear() self._turn = 0 self._session_disabled = False + self._escape_hatch_count = 0 + self._load_tools_count = 0 + + def bundle_names(self) -> List[str]: + """Return the configured bundle names, sorted (the ``load_tools`` menu).""" + return sorted(b.name for b in self._bundles) + + def format_bundle_menu(self) -> str: + """Return a compact ``"- {name}: {description}"`` menu over all bundles. + + Used both for the native-model system-prompt menu and for the + unknown-bundle error text, so the model always sees the same valid names. + """ + return "\n".join( + f"- {b.name}: {b.description}" if b.description else f"- {b.name}" + for b in self._bundles + ) + + def load_bundle(self, bundle: str, registry: Dict[str, dict]) -> List[str]: + """Admit a bundle's tools into the loaded set (the explicit escape hatch). + + Resolves *bundle* to a :class:`ToolBundle` — exact bundle-name match + first, else (robustness nicety) a bare tool name resolved to its + bundle(s) via the reverse index — and admits each member present in + *registry* and not already loaded, **cap-aware**: under the cap via + :meth:`_admit`; at the cap by LRU-evicting a non-CORE tool that is not + being loaded right now (or skipping + logging if nothing is evictable), + mirroring :meth:`select`'s admission loop. So ``max_tools`` holds at all + times. Emits a same-turn ``TOOL_LOADER`` *loaded superset* line so the + recall parser sees the mid-loop expansion. + + Args: + bundle: A bundle name from the menu, or a bare tool name to resolve + to its owning bundle(s). + registry: The live tool registry (same object passed to + :meth:`select`); members absent from it are not admitted. + + Returns: + The sorted loaded set after admission. + + Raises: + KeyError: *bundle* is neither a known bundle name nor a known tool + name — the caller turns this into an actionable error listing the + valid bundle names. 
+ """ + members = self._resolve_bundle_members(bundle) + resolved_name = bundle + + protected = set(self._core) | set(members) + sel = _Selection() + for member in sorted(members): + if member not in registry or member in self._loaded: + continue + if len(self._loaded) < self._max_tools: + self._admit(member, sel) + continue + victim = self._pick_eviction_victim(protected) + if victim is None: + sel.skipped_at_cap.append(member) + continue + del self._loaded[victim] + sel.evicted.append(victim) + self._admit(member, sel) + + self._load_tools_count += 1 + logger.info( + "TOOL_LOADER %s", + json.dumps( + { + "turn": self._turn, + "event": "load_tools", + "bundle": resolved_name, + "admitted": sorted(sel.admitted), + "evicted": sorted(sel.evicted), + "skipped_at_cap": sorted(sel.skipped_at_cap), + "loaded": sorted(self._loaded), + } + ), + ) + return sorted(self._loaded) # ── internals ──────────────────────────────────────────────────────── + def _resolve_bundle_members(self, bundle: str) -> FrozenSet[str]: + """Resolve *bundle* to its member set, or raise ``KeyError``. + + Exact bundle-name match first; else a bare tool name resolved to the + union of its owning bundles' members via the reverse index. + """ + for b in self._bundles: + if b.name == bundle: + return b.members + owning = self._tool_to_bundles.get(bundle) + if owning: + return frozenset().union(*(b.members for b in owning)) + raise KeyError(bundle) + def _admit(self, name: str, sel: _Selection) -> None: """Add *name* to the loaded set with fresh bookkeeping.""" self._loaded[name] = _ToolState(loaded_at=time.time(), load_turn=self._turn) @@ -404,6 +511,28 @@ def _log_selection( ), ) + def _log_session_summary(self) -> None: + """Emit one ``TOOL_LOADER_SESSION`` INFO line — the τ-tuning signal. + + ``escape_hatch_rate`` is per turn over both recovery paths (free + non-tool-calling recovery + native ``load_tools``); the two component + counts are reported separately so the tuner can see which path fired. 
+ """ + logger.info( + "TOOL_LOADER_SESSION %s", + json.dumps( + { + "turns": self._turn, + "escape_hatch_count": self._escape_hatch_count, + "load_tools_count": self._load_tools_count, + "escape_hatch_rate": ( + self._escape_hatch_count + self._load_tools_count + ) + / max(self._turn, 1), + } + ), + ) + def _sha256(text: str) -> str: """Hex SHA-256 of *text* (UTF-8).""" diff --git a/src/gaia/agents/chat/agent.py b/src/gaia/agents/chat/agent.py index 1704faf31..f9754162b 100644 --- a/src/gaia/agents/chat/agent.py +++ b/src/gaia/agents/chat/agent.py @@ -342,7 +342,6 @@ def __init__(self, config: Optional[ChatAgentConfig] = None): # None → full registry / legacy prompt. Embedding fns are injected so the # loader never imports MemoryMixin; they resolve lazily on first select(), # by which point init_memory() has probed the embedder. - self._dynamic_tools_native_warned = False self._dynamic_tools_validated = False self.tool_loader = self._maybe_build_tool_loader() @@ -521,7 +520,6 @@ def _select_tools_for_turn(self, user_input: str) -> Optional[List[str]]: """Return this turn's sorted tool subset, or ``None`` for the full registry.""" if not self._dynamic_tools_active(): return None - self._maybe_warn_native_tool_gap() if not self._dynamic_tools_validated: # Fail loudly on first activation if a CORE/bundle name doesn't exist # in the live registry (drift). The reverse direction is the CI test. @@ -553,22 +551,6 @@ def _build_tool_selection_query(self, user_input: str) -> str: combined = f"{prev}\n{user_input}" if prev else user_input return combined[-4000:] - def _maybe_warn_native_tool_gap(self) -> None: - """Log the Amendment-2 known gap once, on first activation for a native model. - - Native tool-calling models have no escape hatch until Part 2's - ``load_tools`` lands, so a semantic miss can't self-recover. We log it as - a known gap rather than padding the loaded set. 
- """ - if self._dynamic_tools_native_warned: - return - self._dynamic_tools_native_warned = True - if is_tool_calling_model(getattr(self, "model_id", None)): - logger.warning( - "tool_loader: native tool-calling model — no escape hatch until " - "Part 2; semantic misses are a known gap" - ) - def _post_process_tool_result( self, tool_name: str, @@ -891,7 +873,22 @@ def _get_system_prompt(self) -> str: return base_prompt + extras if profile == "doc": - # Document Q&A: RAG tools + hallucination prevention + # Document Q&A: RAG tools + hallucination prevention. + # Native-only escape-hatch menu (#1450): non-native models already + # self-recover via the free full-registry path and are the + # TTFT-sensitive case, so we don't tax them with the menu. Lives in + # this stable prefix (before the volatile tools tail) → no KV thrash. + load_tools_menu = "" + if self.tool_loader is not None and is_tool_calling_model( + getattr(self, "model_id", None) + ): + load_tools_menu = ( + "\n\n==== LOADABLE TOOL BUNDLES ====\n" + "Your visible tools are trimmed to what this turn needs. If a " + "capability you need is missing, call load_tools(bundle) with " + "one of these names; its tools become available on your next " + "step:\n" + self.tool_loader.format_bundle_menu() + ) return ( base_prompt + indexed_docs_section @@ -899,6 +896,7 @@ def _get_system_prompt(self) -> str: + discovery_rules + discovery_rules_tail + rag_query_rules + + load_tools_menu ) if profile == "file": @@ -1192,6 +1190,53 @@ def _register_tools(self) -> None: self._register_external_tools_conditional() self._register_loop_control_tools() # set_loop_state, request_user_input + # load_tools escape hatch (#1450, Part 2) — registered ONLY when the + # dynamic loader is active, so the default-off doc path stays + # byte-identical. It is in DOC_CORE_TOOLS, so once registered it renders + # in both prompt paths every active turn (cap- and eviction-exempt). 
+ if self.tool_loader is not None: + + @tool + def load_tools(bundle: str) -> dict: + """Load a bundle of tools so you can call them on your next step. + + Call this when the capability you need is not in your current + tool list — pick a bundle name from the "Loadable tool bundles" + menu in your instructions (a bare tool name also works; it loads + that tool's whole bundle). The bundle's tools become available on + your **next** step; then call the one you need. + + Args: + bundle: A bundle name from the menu (e.g. "file_search", + "rag_index"), or a specific tool name to load its bundle. + + Returns: + Dictionary with status, the resolved bundle, and the full + loaded_tools list now available to call. + """ + loader = self.tool_loader + if loader is None: + return { + "status": "error", + "error": "Dynamic tool loading is not active; all tools " + "are already available.", + } + try: + loaded = loader.load_bundle(bundle, self._tools_registry) + except KeyError: + return { + "status": "error", + "error": f"Unknown bundle '{bundle}'. Choose one of: " + f"{', '.join(loader.bundle_names())}", + } + # Make the expansion visible to the next model step in this query. + self._apply_tool_filter(loaded) + return { + "status": "success", + "bundle": bundle, + "loaded_tools": loaded, + } + # Inline list_files — only for profiles that need file operations if profile in ("file", "data", "full"): diff --git a/src/gaia/agents/chat/tool_bundles.py b/src/gaia/agents/chat/tool_bundles.py index eac7ed975..afc0ef516 100644 --- a/src/gaia/agents/chat/tool_bundles.py +++ b/src/gaia/agents/chat/tool_bundles.py @@ -20,9 +20,13 @@ from gaia.agents.base.tool_loader import ToolBundle -# Always-on set (10 tools): memory v2, file-read + RAG-query entry points, and -# loop control. The design sketch listed a "finish" tool, dropped here — turn -# completion is protocol-level in GAIA, there is no such registry tool. 
+# Always-on set (11 tools): memory v2, file-read + RAG-query entry points, loop +# control, and the Part-2 escape hatch. The design sketch listed a "finish" tool, +# dropped here — turn completion is protocol-level in GAIA, there is no such +# registry tool. ``load_tools`` (#1450) is CORE-only — never in a bundle — so it +# renders in both the text prompt and the native ``tools=`` schema every active +# turn, cap- and eviction-exempt, giving native models a way back to any tool a +# semantic miss didn't surface. DOC_CORE_TOOLS = frozenset( { # memory v2 — persistent recall is always relevant @@ -38,6 +42,8 @@ # loop control — autonomous-turn signalling "set_loop_state", "request_user_input", + # escape hatch (#1450) — always-on explicit tool loader for native models + "load_tools", } ) diff --git a/src/gaia/eval/tool_cost.py b/src/gaia/eval/tool_cost.py index 20977adad..05e97b6d1 100644 --- a/src/gaia/eval/tool_cost.py +++ b/src/gaia/eval/tool_cost.py @@ -147,8 +147,30 @@ def _isolated_registry(): tools_mod._TOOL_REGISTRY.update(saved) +def _build_skeleton_tool_loader(dynamic_tools: bool): + """Return a real ToolLoader over the doc config, or ``None`` when off. + + Registration only consults ``self.tool_loader is not None``; it never embeds + or selects, so a trivial zero-vector embedder is enough to attach a loader. + """ + if not dynamic_tools: + return None + import numpy as np + + from gaia.agents.base.tool_loader import ToolLoader + from gaia.agents.chat.tool_bundles import DOC_BUNDLES, DOC_CORE_TOOLS + + return ToolLoader( + core_tools=DOC_CORE_TOOLS, + bundles=DOC_BUNDLES, + embed_fn=lambda text: np.zeros(1, dtype=np.float32), + ) + + def build_doc_agent_skeleton( - profile: str = DEFAULT_PROFILE, deterministic: bool = True + profile: str = DEFAULT_PROFILE, + deterministic: bool = True, + dynamic_tools: bool = False, ) -> "ChatAgent": """Build a ChatAgent skeleton with the *profile* tools registered. 
@@ -158,6 +180,12 @@ def build_doc_agent_skeleton( (memory store, rag) are enough to let ``_register_tools`` populate the registry. + With ``dynamic_tools=True`` a real :class:`ToolLoader` is attached **before** + ``_register_tools`` runs, so the ``load_tools`` meta-tool (#1450) registers + (registry +1, ``load_tools``). A trivial embedder suffices — registration + never embeds or selects; only the loader's presence is consulted. Default + ``False`` keeps the unfiltered baseline path unchanged (no ``load_tools``). + With ``deterministic=True`` the environment-conditional external tools (``search_documentation`` / ``search_web``, gated on npx and ``PERPLEXITY_API_KEY``) are forced off so the tool set — and therefore the @@ -213,6 +241,7 @@ def build_doc_agent_skeleton( agent._web_client = None agent._fs_index = None agent._scratchpad = None + agent.tool_loader = _build_skeleton_tool_loader(dynamic_tools) agent._register_tools() agent._instance_tools = dict(tools_mod._TOOL_REGISTRY) diff --git a/src/gaia/eval/tool_recall.py b/src/gaia/eval/tool_recall.py index 91daf9c92..8d08e2402 100644 --- a/src/gaia/eval/tool_recall.py +++ b/src/gaia/eval/tool_recall.py @@ -23,9 +23,21 @@ * **called sets** — ``scorecard.json`` in the eval run dir (``scenarios[].turns[].agent_tools``). -Amendment 2: on native tool-calling models a semantic miss can't self-recover -until Part 2, so misses there are reported as a *known gap* and do not fail the -gate. On non-native models recall below ``--min-recall`` exits non-zero. +Part 2 (#1450): native tool-calling models recover a semantically-missed tool +via the always-on ``load_tools`` meta-tool, so the Amendment-2 native exemption +is **removed** — a miss fails the gate on every model. 
Two parser changes make +that correct: a mid-loop ``load_tools`` line (same ``turn``, ``"event": +"load_tools"``) is **unioned** into that turn's loaded set so a successful +recovery shows the tool as loaded; and ``load_tools`` itself counts as +always-satisfied. Recall below ``--min-recall`` exits non-zero. + +Escape-hatch activation rate (the τ-tuning signal, rising ⇒ τ too strict) is +derived from the **raw per-turn log events** — explicit ``load_tools`` lines and +free-recovery ``TOOL_LOADER_ESCAPE_HATCH`` lines over the turn count — because +those appear on every run. The per-session ``TOOL_LOADER_SESSION`` summary +(emitted only on ``reset_session``, i.e. the ``gaia chat``/CLI path, **not** the +UI-server/eval path) is a convenience for CLI logs; the recall gate does not +depend on it, so the rate is reported for eval runs too. (#1450) """ from __future__ import annotations @@ -39,6 +51,12 @@ from typing import Dict, List, Optional _TOOL_LOADER_RE = re.compile(r"TOOL_LOADER (\{.*\})\s*$") +_SESSION_RE = re.compile(r"TOOL_LOADER_SESSION (\{.*\})\s*$") +_ESCAPE_HATCH_RE = re.compile(r'"event"\s*:\s*"TOOL_LOADER_ESCAPE_HATCH"') + +# Tools that never count as a recall miss: ``load_tools`` is the always-on +# escape hatch (CORE), so calling it is always satisfiable by construction. +_ALWAYS_SATISFIED = frozenset({"load_tools"}) # ── pure join logic (unit-tested) ───────────────────────────────────────── @@ -110,7 +128,9 @@ def compute_recall( loaded = list(loaded_turns[t]) called = list(called_turns[t]) loaded_set = set(loaded) - missing = sorted(c for c in called if c not in loaded_set) + missing = sorted( + c for c in called if c not in loaded_set and c not in _ALWAYS_SATISFIED + ) turns.append( TurnRecall( scenario_idx=s, @@ -130,8 +150,13 @@ def compute_recall( def parse_loaded_sets_from_log(text: str) -> List[List[List[str]]]: """Extract per-scenario, per-turn loaded sets from server-log TOOL_LOADER lines. 
- A new scenario begins at each ``"turn": 1`` selection line (the loader resets - its turn counter per conversation). + A new scenario begins at each ``"turn": 1`` *selection* line (the loader + resets its turn counter per conversation). A mid-loop ``load_tools`` line + (Part 2) shares its turn's number but carries ``"event": "load_tools"``; it + is **unioned** into that turn's loaded set rather than opening a new turn, so + a within-turn recovery shows the loaded set as it stood *after* the load. + Only ``event``-less selection lines move the turn/scenario cursor, so two + consecutive single-turn scenarios still split correctly. Assumption: every scenario emits a ``turn == 1`` line. A turn-1 *embedder failure* session-disables the loader before ``_log_selection`` runs, so that @@ -150,10 +175,20 @@ def parse_loaded_sets_from_log(text: str) -> List[List[List[str]]]: payload = json.loads(m.group(1)) if "loaded" not in payload or "turn" not in payload: continue # not a selection line (e.g. escape-hatch event) + loaded = list(payload["loaded"]) + if payload.get("event") == "load_tools": + # Mid-loop expansion: union into the current turn's loaded set. A + # load_tools line always follows its turn's selection line, so + # ``current`` is non-empty in a well-formed log; tolerate the start. 
+ if current: + current[-1] = sorted(set(current[-1]) | set(loaded)) + else: + current.append(loaded) + continue if payload["turn"] == 1 and current: scenarios.append(current) current = [] - current.append(list(payload["loaded"])) + current.append(loaded) if current: scenarios.append(current) return scenarios @@ -168,16 +203,81 @@ def parse_called_sets_from_scorecard(scorecard: Dict) -> List[List[List[str]]]: return out -def _model_is_native(scorecard: Dict) -> bool: - """Whether the scorecard's model uses native tool-calling (Amendment-2 gate).""" - model = (scorecard.get("config") or {}).get("model") - try: - from gaia.llm.lemonade_client import is_tool_calling_model +def parse_session_summaries_from_log(text: str) -> List[Dict]: + """Extract ``TOOL_LOADER_SESSION`` payloads (one per finished conversation).""" + out: List[Dict] = [] + for line in text.splitlines(): + m = _SESSION_RE.search(line) + if not m: + continue + out.append(json.loads(m.group(1))) + return out - return is_tool_calling_model(model) - except ImportError: - # Can't import the classifier — treat as native so misses don't hard-fail. - return True + +def aggregate_escape_hatch(summaries: List[Dict]) -> Dict: + """Aggregate per-session ``TOOL_LOADER_SESSION`` summaries into a per-turn rate. + + Only ``gaia chat``/CLI logs carry these summaries (they emit on + ``reset_session()``). For the canonical τ-tuning path — eval logs — use + :func:`escape_hatch_rate_from_log`, which derives the same rate from the raw + per-turn lines that are always present. The rate (free non-tool-calling + recovery + native ``load_tools``, per turn) is the tuning signal: rising ⇒ τ + too strict; the two component counts are kept separate so the tuner sees + which path fired. 
+ """ + turns = sum(int(s.get("turns", 0)) for s in summaries) + escape = sum(int(s.get("escape_hatch_count", 0)) for s in summaries) + loads = sum(int(s.get("load_tools_count", 0)) for s in summaries) + return { + "sessions": len(summaries), + "turns": turns, + "escape_hatch_count": escape, + "load_tools_count": loads, + "escape_hatch_rate": (escape + loads) / max(turns, 1), + } + + +def count_recovery_events_from_log(text: str) -> tuple: + """Count the two escape-hatch recovery paths from raw per-turn log lines. + + Returns ``(free_recovery_count, load_tools_count)`` — free non-tool-calling + recovery (``TOOL_LOADER_ESCAPE_HATCH`` lines) and explicit native recovery + (``TOOL_LOADER {… "event": "load_tools" …}`` lines). These per-turn lines are + emitted on **every** run (eval and CLI), independent of ``reset_session()``, + so they — not the per-session ``TOOL_LOADER_SESSION`` summary — are the + source of truth for the activation rate. (The UI-server/eval path never calls + ``reset_session()``, so eval logs carry no summary; #1450.) + """ + free = loads = 0 + for line in text.splitlines(): + if _ESCAPE_HATCH_RE.search(line): + free += 1 + continue + m = _TOOL_LOADER_RE.search(line) + if m and json.loads(m.group(1)).get("event") == "load_tools": + loads += 1 + return free, loads + + +def escape_hatch_rate_from_log( + text: str, loaded_per_scenario: List[List[List[str]]] +) -> Dict: + """Per-turn escape-hatch activation rate derived from the raw log. + + ``rate = (free recoveries + explicit load_tools) / total turns``, where total + turns is the number of per-turn selection lines across all scenarios. Works + on eval logs (which lack ``TOOL_LOADER_SESSION``) — this is the τ-tuning + signal the recall gate reports. 
+ """ + free, loads = count_recovery_events_from_log(text) + turns = sum(len(scenario) for scenario in loaded_per_scenario) + return { + "turns": turns, + "free_recovery_count": free, + "load_tools_count": loads, + "escape_hatch_rate": (free + loads) / max(turns, 1), + "session_summaries": len(parse_session_summaries_from_log(text)), + } # ── CLI ─────────────────────────────────────────────────────────────────── @@ -197,9 +297,9 @@ def _discover_log(run_dir: Path) -> Optional[Path]: return None -def _format_report(report: RecallReport, native: bool) -> str: +def _format_report(report: RecallReport, escape_hatch: Optional[Dict] = None) -> str: lines = [ - "# Tool-recall gate (#1449)", + "# Tool-recall gate (#1449, #1450)", "", f"Turns scored: {len(report.turns)} | recall: {report.recall:.1%}", "", @@ -210,8 +310,9 @@ def _format_report(report: RecallReport, native: bool) -> str: lines.append("") misses = [t for t in report.turns if not t.ok] if misses: - label = "known gap (native model, Part 2)" if native else "RECALL MISS" - lines.append(f"## {label}") + # Part 2 removed the native exemption: a miss is a miss on every model + # (native models recover via load_tools, which the parser unions in). + lines.append("## RECALL MISS") for t in misses: lines.append( f"- scenario {t.scenario_idx} turn {t.turn_idx}: called " @@ -219,6 +320,21 @@ def _format_report(report: RecallReport, native: bool) -> str: ) else: lines.append("All called tools were loaded when called. 
✅") + if escape_hatch is not None: + lines.extend( + [ + "", + "## Escape-hatch activation (τ-tuning signal)", + f"turns: {escape_hatch['turns']} | rate/turn: " + f"{escape_hatch['escape_hatch_rate']:.3f} " + f"(free recovery: {escape_hatch['free_recovery_count']}, " + f"load_tools: {escape_hatch['load_tools_count']}) — " + "rising ⇒ τ too strict.", + f"(derived from per-turn log events; " + f"{escape_hatch['session_summaries']} TOOL_LOADER_SESSION " + "summaries present)", + ] + ) return "\n".join(lines) @@ -242,7 +358,8 @@ def main(argv: Optional[List[str]] = None) -> int: "--min-recall", type=float, default=1.0, - help="Minimum recall for a PASS on non-native models (default: 1.0).", + help="Minimum recall for a PASS (default: 1.0). Applies to every model " + "— Part 2 removed the native exemption.", ) args = parser.parse_args(argv) @@ -264,7 +381,8 @@ def main(argv: Optional[List[str]] = None) -> int: "(`... 2>&1 | tee server.log`, NOT `2> server.log`) and pass it via " "--log. The loader emits one `TOOL_LOADER {json}` line per turn." ) - loaded = parse_loaded_sets_from_log(log_path.read_text(encoding="utf-8")) + log_text = log_path.read_text(encoding="utf-8") + loaded = parse_loaded_sets_from_log(log_text) if not loaded: raise SystemExit( f"{log_path} contained no TOOL_LOADER selection lines — was the loader " @@ -273,13 +391,14 @@ def main(argv: Optional[List[str]] = None) -> int: called = parse_called_sets_from_scorecard(scorecard) report = compute_recall(loaded, called) - native = _model_is_native(scorecard) - print(_format_report(report, native)) + # Derive the τ-tuning rate from the raw per-turn events (present in eval logs); + # the per-session TOOL_LOADER_SESSION summary only exists on the CLI path. 
+ escape_hatch = escape_hatch_rate_from_log(log_text, loaded) + print(_format_report(report, escape_hatch)) - if not native and report.recall < args.min_recall: + if report.recall < args.min_recall: print( - f"\nFAIL: recall {report.recall:.1%} < {args.min_recall:.1%} " - "on a non-native model.", + f"\nFAIL: recall {report.recall:.1%} < {args.min_recall:.1%}.", file=sys.stderr, ) return 1 diff --git a/tests/unit/test_chat_dynamic_tools.py b/tests/unit/test_chat_dynamic_tools.py index fce83f558..cd4edb362 100644 --- a/tests/unit/test_chat_dynamic_tools.py +++ b/tests/unit/test_chat_dynamic_tools.py @@ -1,12 +1,13 @@ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -"""ChatAgent wiring for the dynamic tool loader (#1449). +"""ChatAgent wiring for the dynamic tool loader (#1449, Part 2 #1450). Covers the ChatAgent-level glue without a Lemonade backend: loader construction gating (profile + toggle + env), the three off-states reverting to the full registry (``None`` filter), the selection-query builder, the LRU record hook, -env-override parsing (incl. loud failure on malformed values), and the -native-model known-gap warning. +env-override parsing (incl. loud failure on malformed values), the ``load_tools`` +escape hatch + native-only menu, and that the Part-1 native known-gap warning is +gone now that Part 2 closes the gap. ChatAgent is built via ``__new__`` with only the attributes each method needs — ``Agent.__init__`` (Lemonade) is never run. 
@@ -32,6 +33,7 @@ from gaia.agents.base.tool_loader import ToolLoader # noqa: E402 from gaia.agents.chat.agent import ChatAgent, ChatAgentConfig # noqa: E402 +from gaia.eval.tool_cost import build_doc_agent_skeleton # noqa: E402 for _mod in _stubbed: sys.modules.pop(_mod, None) @@ -44,7 +46,6 @@ def _bare_agent(**attrs) -> ChatAgent: a.conversation_history = [] a.tool_loader = None a._memory_store = object() - a._dynamic_tools_native_warned = False a._dynamic_tools_validated = False a.model_id = None for k, v in attrs.items(): @@ -183,7 +184,7 @@ def test_query_builder_excludes_assistant_and_truncates(): assert q.endswith("C" * 100) # current turn always fully included -# ── record hook + known gap ─────────────────────────────────────────────── +# ── record hook ──────────────────────────────────────────────────────────── def test_on_tool_invoked_forwards_to_loader(): @@ -198,29 +199,119 @@ def test_on_tool_invoked_noop_when_no_loader(): a._on_tool_invoked("read_file") # must not raise -def test_native_model_known_gap_warned_once(caplog): +def test_native_model_no_longer_warns_known_gap(caplog): + """Part 2 (#1450) closed the native gap via load_tools — the warning is gone.""" loader = MagicMock() loader.session_disabled = False loader.select.return_value = ["c1"] - a = _bare_agent(tool_loader=loader, model_id="Gemma-4-E4B-it-GGUF") + a = _bare_agent(tool_loader=loader, model_id="Gemma-4-E4B-it-GGUF") # native with patch.object( ChatAgent, "_tools_registry", new_callable=lambda: property(lambda self: {}) ): with caplog.at_level(logging.WARNING): a._select_tools_for_turn("q1") a._select_tools_for_turn("q2") - gap_logs = [r for r in caplog.records if "known gap" in r.getMessage()] - assert len(gap_logs) == 1 # logged exactly once + assert not any( + "known gap" in r.getMessage() or "no escape hatch" in r.getMessage() + for r in caplog.records + ) -def test_non_native_model_no_known_gap_warning(caplog): - loader = MagicMock() - loader.session_disabled = False - 
loader.select.return_value = ["c1"] - a = _bare_agent(tool_loader=loader, model_id=None) # non-tool-calling - with patch.object( - ChatAgent, "_tools_registry", new_callable=lambda: property(lambda self: {}) - ): - with caplog.at_level(logging.WARNING): - a._select_tools_for_turn("q1") - assert not any("known gap" in r.getMessage() for r in caplog.records) +# ── _apply_tool_filter invariant (Part 2 mid-loop recovery) ──────────────── + + +def test_apply_tool_filter_swaps_filter_and_recomputes_prompt(): + """The base helper moves the filter and the cached prompt together.""" + a = ChatAgent.__new__(ChatAgent) + a.observers = [] # quiet __del__ during GC + a._active_tool_filter = None + a._system_prompt_cache = "OLD" + a._compose_system_prompt = lambda: f"PROMPT::{a._active_tool_filter}" + a._apply_tool_filter(["load_tools", "search_file"]) + assert a._active_tool_filter == ["load_tools", "search_file"] + assert a._system_prompt_cache == "PROMPT::['load_tools', 'search_file']" + + +# ── load_tools registration + handler (Part 2, #1450) ────────────────────── + + +def test_load_tools_registered_only_when_loader_active(): + on = build_doc_agent_skeleton(profile="doc", deterministic=True, dynamic_tools=True) + off = build_doc_agent_skeleton( + profile="doc", deterministic=True, dynamic_tools=False + ) + assert "load_tools" in on._tools_registry + assert "load_tools" not in off._tools_registry + + +def test_load_tools_handler_admits_bundle_and_applies_filter(): + agent = build_doc_agent_skeleton( + profile="doc", deterministic=True, dynamic_tools=True + ) + applied: dict = {} + agent._apply_tool_filter = lambda f: applied.__setitem__("filter", f) + load_tools = agent._tools_registry["load_tools"]["function"] + + result = load_tools("file_search") + assert result["status"] == "success" + assert result["bundle"] == "file_search" + # The bundle's tools are now in the loaded set, and that set was applied as + # the active filter so the next model step sees them. 
+ assert "search_file" in result["loaded_tools"] + assert applied["filter"] == result["loaded_tools"] + + +def test_load_tools_handler_resolves_bare_tool_name(): + agent = build_doc_agent_skeleton( + profile="doc", deterministic=True, dynamic_tools=True + ) + agent._apply_tool_filter = lambda f: None + load_tools = agent._tools_registry["load_tools"]["function"] + result = load_tools("search_file") # bare tool name → its bundle + assert result["status"] == "success" + assert "search_file" in result["loaded_tools"] + + +def test_load_tools_handler_unknown_bundle_returns_actionable_error(): + agent = build_doc_agent_skeleton( + profile="doc", deterministic=True, dynamic_tools=True + ) + agent._apply_tool_filter = lambda f: None + load_tools = agent._tools_registry["load_tools"]["function"] + result = load_tools("does_not_exist") + assert result["status"] == "error" + assert "Unknown bundle 'does_not_exist'" in result["error"] + assert "file_search" in result["error"] # lists valid bundle names + + +# ── native-only escape-hatch menu ────────────────────────────────────────── + + +def test_native_doc_prompt_includes_load_tools_menu(): + agent = build_doc_agent_skeleton( + profile="doc", deterministic=True, dynamic_tools=True + ) + agent.rag = None # no-docs branch keeps _get_system_prompt light + prompt = agent._get_system_prompt() + assert "LOADABLE TOOL BUNDLES" in prompt + assert "load_tools(bundle)" in prompt + assert "- file_search:" in prompt # a real bundle line from the menu + + +def test_non_native_doc_prompt_omits_load_tools_menu(): + agent = build_doc_agent_skeleton( + profile="doc", deterministic=True, dynamic_tools=True + ) + agent.rag = None + agent.model_id = None # non-tool-calling → free recovery, no menu + prompt = agent._get_system_prompt() + assert "LOADABLE TOOL BUNDLES" not in prompt + + +def test_loader_off_doc_prompt_omits_load_tools_menu(): + agent = build_doc_agent_skeleton( + profile="doc", deterministic=True, dynamic_tools=False + ) + 
agent.rag = None + prompt = agent._get_system_prompt() + assert "LOADABLE TOOL BUNDLES" not in prompt diff --git a/tests/unit/test_chat_tool_bundles.py b/tests/unit/test_chat_tool_bundles.py index d274d9d8e..6782115d6 100644 --- a/tests/unit/test_chat_tool_bundles.py +++ b/tests/unit/test_chat_tool_bundles.py @@ -28,7 +28,11 @@ def _bundle_union() -> set[str]: def test_core_and_bundles_cover_doc_registry_exactly(): - agent = build_doc_agent_skeleton(profile="doc", deterministic=True) + # Loader-on skeleton so the CORE-only load_tools meta-tool (#1450) is + # registered — the doc registry must balance against CORE∪bundles with it. + agent = build_doc_agent_skeleton( + profile="doc", deterministic=True, dynamic_tools=True + ) registry = set(agent._tools_registry) covered = _bundle_union() @@ -44,12 +48,17 @@ def test_core_and_bundles_cover_doc_registry_exactly(): f"CORE/bundle names absent from the doc registry: {dangling}. " "Remove them or fix the name — validate_registry rejects these at runtime." ) + # The escape hatch is present in both CORE and the live registry (#1450). 
+ assert "load_tools" in DOC_CORE_TOOLS + assert "load_tools" in registry def test_core_is_subset_of_bundle_union(): - """CORE tools are also covered by bundles (the union is the registry).""" - covered = _bundle_union() - assert DOC_CORE_TOOLS <= covered + """Every CORE tool is in a bundle too, except the CORE-only load_tools (#1450).""" + bundle_members: set[str] = set() + for bundle in DOC_BUNDLES: + bundle_members |= set(bundle.members) + assert DOC_CORE_TOOLS - bundle_members == {"load_tools"} def test_bundles_have_unique_names(): diff --git a/tests/unit/test_tool_loader_selection.py b/tests/unit/test_tool_loader_selection.py index 06ab635a3..43cfc29df 100644 --- a/tests/unit/test_tool_loader_selection.py +++ b/tests/unit/test_tool_loader_selection.py @@ -255,6 +255,124 @@ def test_record_tool_use_logs_escape_hatch_for_unloaded(): assert any("TOOL_LOADER_ESCAPE_HATCH" in r.getMessage() for r in records) +# ── load_bundle / menu / counters (Part 2, #1450) ────────────────────────── + + +def _loader_with_bundles(max_tools: int = 14): + """A loader over a tiny CORE + two bundles, with a never-matching embedder.""" + tools = ["c1", "a1", "a2", "b1"] + embed = _make_embed_fn(tools, {"q": {"c1": 0.0, "a1": 0.0, "a2": 0.0, "b1": 0.0}}) + bundles = [ + ToolBundle(name="A", members=frozenset({"a1", "a2"}), description="A tools"), + ToolBundle(name="B", members=frozenset({"b1"}), description="B tools"), + ] + loader = ToolLoader( + frozenset({"c1"}), bundles, embed, threshold=0.55, max_tools=max_tools + ) + return loader, _registry(tools) + + +def test_bundle_names_are_sorted(): + loader, _ = _loader_with_bundles() + assert loader.bundle_names() == ["A", "B"] + + +def test_format_bundle_menu_lists_name_and_description(): + loader, _ = _loader_with_bundles() + menu = loader.format_bundle_menu() + assert "- A: A tools" in menu + assert "- B: B tools" in menu + + +def test_load_bundle_by_bundle_name_admits_members(): + loader, reg = _loader_with_bundles() + 
loader.select("q", reg) # turn 1: CORE only (c1) + loaded = loader.load_bundle("A", reg) + assert {"c1", "a1", "a2"} <= set(loaded) + assert loader._load_tools_count == 1 + + +def test_load_bundle_by_tool_name_resolves_to_owning_bundle(): + loader, reg = _loader_with_bundles() + loader.select("q", reg) + loaded = loader.load_bundle("a1", reg) # bare tool name → bundle A + assert {"a1", "a2"} <= set(loaded) + + +def test_load_bundle_unknown_name_raises_keyerror(): + loader, reg = _loader_with_bundles() + loader.select("q", reg) + with pytest.raises(KeyError): + loader.load_bundle("does_not_exist", reg) + + +def test_load_bundle_skips_members_absent_from_registry(): + tools = ["c1", "a1"] + embed = _make_embed_fn(tools, {"q": {"c1": 0.0, "a1": 0.0}}) + bundles = [ToolBundle(name="A", members=frozenset({"a1", "ghost"}))] + loader = ToolLoader(frozenset({"c1"}), bundles, embed, threshold=0.55, max_tools=14) + reg = _registry(tools) + loader.select("q", reg) + loaded = loader.load_bundle("A", reg) + assert "a1" in loaded and "ghost" not in loaded + + +def test_load_bundle_is_cap_aware_and_protects_just_loaded(): + """At cap, load_bundle evicts an LRU non-CORE tool, never CORE or just-loaded.""" + tools = ["c1", "d1", "a1", "a2"] + embed = _make_embed_fn(tools, {"q": {"c1": 0.0, "d1": 0.9, "a1": 0.0, "a2": 0.0}}) + bundles = [ToolBundle(name="A", members=frozenset({"a1", "a2"}), description="A")] + loader = ToolLoader(frozenset({"c1"}), bundles, embed, threshold=0.55, max_tools=3) + reg = _registry(tools) + assert loader.select("q", reg) == ["c1", "d1"] # CORE + matched d1 (2 of 3) + loaded = loader.load_bundle("A", reg) # wants a1,a2 with 1 slot free → evict + assert set(loaded) == {"c1", "a1", "a2"} # cap held; d1 evicted + assert "d1" not in loaded + + +def test_load_bundle_emits_same_turn_loaded_superset_line(): + loader, reg = _loader_with_bundles() + loader.select("q", reg) + with _capture("gaia.agents.base.tool_loader") as records: + loader.load_bundle("A", reg) + 
events = [p for p in _loader_payloads(records) if p.get("event") == "load_tools"] + assert events, "no load_tools TOOL_LOADER line captured" + assert events[0]["turn"] == loader._turn + assert {"a1", "a2"} <= set(events[0]["loaded"]) + + +def test_escape_hatch_and_load_counters_increment(): + loader, reg = _loader_with_bundles() + loader.select("q", reg) + loader.record_tool_use("never_loaded") # free recovery + loader.load_bundle("A", reg) # explicit recovery + assert loader._escape_hatch_count == 1 + assert loader._load_tools_count == 1 + + +def test_reset_session_emits_summary_then_zeroes_counters(): + loader, reg = _loader_with_bundles() + loader.select("q", reg) + loader.record_tool_use("never_loaded") + loader.load_bundle("A", reg) + with _capture("gaia.agents.base.tool_loader") as records: + loader.reset_session() + summary = _session_payload(records) + assert summary["turns"] == 1 + assert summary["escape_hatch_count"] == 1 + assert summary["load_tools_count"] == 1 + assert summary["escape_hatch_rate"] == pytest.approx(2.0) # (1+1)/1 + assert loader._escape_hatch_count == 0 + assert loader._load_tools_count == 0 + + +def test_reset_session_emits_no_summary_when_no_turns(): + loader, _ = _loader_with_bundles() + with _capture("gaia.agents.base.tool_loader") as records: + loader.reset_session() # turn == 0 → nothing to summarize + assert not any("TOOL_LOADER_SESSION" in r.getMessage() for r in records) + + # ── embedder failure ───────────────────────────────────────────────────── @@ -343,10 +461,28 @@ def __exit__(self, *exc) -> None: self._logger.propagate = self._prev_propagate -def _selection_payload(records: list[logging.LogRecord]) -> dict: - """Extract the JSON payload from the TOOL_LOADER selection log line.""" +def _loader_payloads(records: list[logging.LogRecord]) -> list[dict]: + """All JSON payloads from ``TOOL_LOADER {...}`` lines (selection + load_tools).""" + out: list[dict] = [] for r in records: msg = r.getMessage() if 
msg.startswith("TOOL_LOADER {"): - return json.loads(msg[len("TOOL_LOADER ") :]) + out.append(json.loads(msg[len("TOOL_LOADER ") :])) + return out + + +def _selection_payload(records: list[logging.LogRecord]) -> dict: + """Extract the JSON payload from the TOOL_LOADER selection log line.""" + for payload in _loader_payloads(records): + if "event" not in payload: # the per-turn select line (not load_tools) + return payload raise AssertionError("no TOOL_LOADER selection line captured") + + +def _session_payload(records: list[logging.LogRecord]) -> dict: + """Extract the JSON payload from the TOOL_LOADER_SESSION summary line.""" + for r in records: + msg = r.getMessage() + if msg.startswith("TOOL_LOADER_SESSION {"): + return json.loads(msg[len("TOOL_LOADER_SESSION ") :]) + raise AssertionError("no TOOL_LOADER_SESSION line captured") diff --git a/tests/unit/test_tool_recall.py b/tests/unit/test_tool_recall.py index 16e53ac80..d0d2e9cbf 100644 --- a/tests/unit/test_tool_recall.py +++ b/tests/unit/test_tool_recall.py @@ -8,9 +8,13 @@ from __future__ import annotations from gaia.eval.tool_recall import ( + aggregate_escape_hatch, compute_recall, + count_recovery_events_from_log, + escape_hatch_rate_from_log, parse_called_sets_from_scorecard, parse_loaded_sets_from_log, + parse_session_summaries_from_log, ) @@ -107,6 +111,149 @@ def test_parse_called_sets_from_scorecard(): assert called == [[["read_file"], []], [["remember"]]] +# ── Part 2 (#1450): load_tools coalesce + gate flip ──────────────────────── + + +def test_parse_loaded_sets_unions_load_tools_lines_within_a_turn(): + """A mid-loop load_tools line unions into its turn (not a new turn/scenario).""" + log = "\n".join( + [ + 'TOOL_LOADER {"turn": 1, "loaded": ["read_file", "load_tools"]}', + 'TOOL_LOADER {"turn": 1, "event": "load_tools", "bundle": ' + '"file_search", "loaded": ["read_file", "load_tools", "search_file"]}', + 'TOOL_LOADER {"turn": 2, "loaded": ["read_file", "load_tools", ' + '"search_file"]}', + ] + 
) + scenarios = parse_loaded_sets_from_log(log) + assert len(scenarios) == 1 + assert len(scenarios[0]) == 2 # two turns, not three log lines + assert scenarios[0][0] == ["load_tools", "read_file", "search_file"] # unioned + + +def test_parse_loaded_sets_splits_consecutive_single_turn_scenarios(): + """Two single-turn scenarios still split — only event-less lines move cursor.""" + log = "\n".join( + [ + 'TOOL_LOADER {"turn": 1, "loaded": ["read_file"]}', + 'TOOL_LOADER {"turn": 1, "loaded": ["remember"]}', + ] + ) + assert parse_loaded_sets_from_log(log) == [[["read_file"]], [["remember"]]] + + +def test_load_tools_call_is_always_satisfied(): + """Calling load_tools never counts as a recall miss (it is always-on CORE).""" + loaded = [[["read_file"]]] + called = [[["read_file", "load_tools"]]] + report = compute_recall(loaded, called) + assert report.recall == 1.0 + assert report.all_missing == [] + + +def test_native_recovery_within_turn_passes_gate(): + """A tool surfaced mid-turn via load_tools is in the loaded set when called.""" + log = "\n".join( + [ + 'TOOL_LOADER {"turn": 1, "loaded": ["read_file", "load_tools"]}', + 'TOOL_LOADER {"turn": 1, "event": "load_tools", "bundle": ' + '"file_search", "loaded": ["read_file", "load_tools", "search_file"]}', + ] + ) + scorecard = { + "scenarios": [{"turns": [{"agent_tools": ["load_tools", "search_file"]}]}] + } + report = compute_recall( + parse_loaded_sets_from_log(log), + parse_called_sets_from_scorecard(scorecard), + ) + assert report.recall == 1.0 + + +def test_unrecovered_miss_still_counts_against_recall(): + """A semantic miss with no load_tools recovery fails the gate (exemption gone).""" + log = 'TOOL_LOADER {"turn": 1, "loaded": ["read_file", "load_tools"]}' + scorecard = {"scenarios": [{"turns": [{"agent_tools": ["search_file"]}]}]} + report = compute_recall( + parse_loaded_sets_from_log(log), + parse_called_sets_from_scorecard(scorecard), + ) + assert report.recall == 0.0 + assert report.all_missing == 
["search_file"] + + +# ── escape-hatch session summaries (τ-tuning signal) ─────────────────────── + + +def test_parse_and_aggregate_session_summaries(): + log = "\n".join( + [ + 'TOOL_LOADER_SESSION {"turns": 4, "escape_hatch_count": 1, ' + '"load_tools_count": 1, "escape_hatch_rate": 0.5}', + 'TOOL_LOADER_SESSION {"turns": 6, "escape_hatch_count": 0, ' + '"load_tools_count": 2, "escape_hatch_rate": 0.333}', + ] + ) + summaries = parse_session_summaries_from_log(log) + assert len(summaries) == 2 + agg = aggregate_escape_hatch(summaries) + assert agg["sessions"] == 2 + assert agg["turns"] == 10 # 4 + 6 + assert agg["escape_hatch_count"] == 1 # 1 + 0 + assert agg["load_tools_count"] == 3 # 1 + 2 + assert agg["escape_hatch_rate"] == (1 + 3) / 10 + + +def test_count_recovery_events_from_log(): + """Both escape-hatch paths counted from raw per-turn lines (no summary needed).""" + log = "\n".join( + [ + 'TOOL_LOADER {"turn": 1, "loaded": ["read_file", "load_tools"]}', + 'TOOL_LOADER {"turn": 1, "event": "load_tools", "bundle": ' + '"file_search", "loaded": ["read_file", "load_tools", "search_file"]}', + '{"event": "TOOL_LOADER_ESCAPE_HATCH", "tool": "write_file", "turn": 2}', + 'TOOL_LOADER {"turn": 2, "loaded": ["read_file", "load_tools", ' + '"search_file"]}', + ] + ) + free, loads = count_recovery_events_from_log(log) + assert free == 1 # the ESCAPE_HATCH line + assert loads == 1 # the load_tools event line + + +def test_escape_hatch_rate_from_log_works_without_session_summary(): + """Eval case: no TOOL_LOADER_SESSION line, rate derived from per-turn events.""" + log = "\n".join( + [ + 'TOOL_LOADER {"turn": 1, "loaded": ["load_tools"]}', + 'TOOL_LOADER {"turn": 1, "event": "load_tools", "bundle": ' + '"file_search", "loaded": ["load_tools", "search_file"]}', + 'TOOL_LOADER {"turn": 2, "loaded": ["load_tools", "search_file"]}', + 'TOOL_LOADER {"turn": 2, "event": "load_tools", "bundle": ' + '"rag_index", "loaded": ["load_tools", "search_file", "index_document"]}', + ] 
+ ) + loaded = parse_loaded_sets_from_log(log) # 1 scenario, 2 turns + eh = escape_hatch_rate_from_log(log, loaded) + assert eh["turns"] == 2 + assert eh["load_tools_count"] == 2 + assert eh["free_recovery_count"] == 0 + assert eh["escape_hatch_rate"] == 1.0 # (0 + 2) / 2 — high ⇒ τ too strict here + assert eh["session_summaries"] == 0 # eval logs carry none + + +def test_session_and_selection_parsers_do_not_cross_contaminate(): + log = "\n".join( + [ + 'TOOL_LOADER {"turn": 1, "loaded": ["read_file"]}', + 'TOOL_LOADER_SESSION {"turns": 1, "escape_hatch_count": 0, ' + '"load_tools_count": 0, "escape_hatch_rate": 0.0}', + ] + ) + assert parse_loaded_sets_from_log(log) == [[["read_file"]]] + assert len(parse_session_summaries_from_log(log)) == 1 + + def test_end_to_end_log_and_scorecard_join(): log = "\n".join( [ From ecf62c44cecc879bcc9bc9e4a002c7901a154179 Mon Sep 17 00:00:00 2001 From: Alexey Tyurin <> Date: Thu, 18 Jun 2026 21:11:18 -0500 Subject: [PATCH 2/3] fix(tool-loader): correct stale cap comments and tighten escape-hatch tests --- src/gaia/agents/base/tool_loader.py | 5 ++-- src/gaia/agents/chat/agent.py | 11 +++---- tests/unit/test_tool_loader_token_budget.py | 33 +++++++++++++++++---- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/gaia/agents/base/tool_loader.py b/src/gaia/agents/base/tool_loader.py index d7003d4fb..f1d9f14c1 100644 --- a/src/gaia/agents/base/tool_loader.py +++ b/src/gaia/agents/base/tool_loader.py @@ -57,8 +57,9 @@ # tools (index/summarize/RAG) for doc-oriented turns while excluding lower- # scoring noise; plain content questions fall back to the CORE set. Overridable. DEFAULT_THRESHOLD = 0.20 -# Default cap: 10 CORE + 4 dynamic slots = 14 (≈62% shrink on the 37-tool doc -# profile, clears the ≥60% Part-0 TTFT-reduction gate). See the plan deviations. +# Default cap: 11 CORE (doc profile, incl. 
the load_tools escape hatch) + 3 +# dynamic slots = 14 (≈62% shrink on the 37-tool doc profile, clears the +# ≥60% Part-0 TTFT-reduction gate). See the plan deviations. DEFAULT_MAX_TOOLS = 14 diff --git a/src/gaia/agents/chat/agent.py b/src/gaia/agents/chat/agent.py index f9754162b..a3014a1fd 100644 --- a/src/gaia/agents/chat/agent.py +++ b/src/gaia/agents/chat/agent.py @@ -136,7 +136,7 @@ class ChatAgentConfig: # __init__: GAIA_DYNAMIC_TOOLS / GAIA_DYNAMIC_TOOLS_TAU / GAIA_DYNAMIC_TOOLS_MAX. dynamic_tools: bool = False dynamic_tools_threshold: float = 0.20 # inclusive cosine; calibrated #1449 - dynamic_tools_max: int = 14 # cap (10 CORE + 4 dynamic slots) + dynamic_tools_max: int = 14 # cap (11 CORE + 3 dynamic slots) # Per-agent identity for the connectors activation filter (#1005). # Must be set BEFORE ``Agent.__init__`` runs ``_register_tools``, because @@ -1214,13 +1214,10 @@ def load_tools(bundle: str) -> dict: Dictionary with status, the resolved bundle, and the full loaded_tools list now available to call. """ + # load_tools is registered only inside ``if self.tool_loader is + # not None`` and the loader is never re-nulled after construction, + # so the loader is always live here. loader = self.tool_loader - if loader is None: - return { - "status": "error", - "error": "Dynamic tool loading is not active; all tools " - "are already available.", - } try: loaded = loader.load_bundle(bundle, self._tools_registry) except KeyError: diff --git a/tests/unit/test_tool_loader_token_budget.py b/tests/unit/test_tool_loader_token_budget.py index 4a581662d..5c9cb433f 100644 --- a/tests/unit/test_tool_loader_token_budget.py +++ b/tests/unit/test_tool_loader_token_budget.py @@ -68,10 +68,29 @@ def _within(value: float, baseline: float, tol: float = TOLERANCE) -> bool: @pytest.fixture(scope="module") def doc_agent(): - """A deterministic doc-profile skeleton (built once for the module).""" + """A deterministic doc-profile skeleton (built once for the module). 
+ + Loader-off, so the registry is the pinned 37-tool unfiltered baseline + (``load_tools`` is *not* registered) — keep it that way for the baseline + and slope/distribution pins below. + """ return build_doc_agent_skeleton(profile="doc", deterministic=True) +@pytest.fixture(scope="module") +def doc_agent_loader_on(): + """Doc skeleton with the loader active, so ``load_tools`` is registered. + + The CORE-floor guard must measure the set that actually ships every active + turn, which includes the always-on ``load_tools`` escape hatch (#1450). The + loader-off ``doc_agent`` fixture omits it, and a filtered render silently + drops any name absent from the registry — so the floor would under-count. + """ + return build_doc_agent_skeleton( + profile="doc", deterministic=True, dynamic_tools=True + ) + + def test_harness_runs_and_pins_baseline(doc_agent): """The harness runs and the measured cost matches the pinned baseline.""" cost = measure_tool_prompt_cost(doc_agent) @@ -191,14 +210,18 @@ def _filtered_text_tokens(agent, names, tok) -> int: return len(tok.encode(agent._format_tools_for_prompt(filter_to=names))) -def test_core_only_is_the_reduction_best_case(doc_agent): - """CORE-only (the always-on floor) renders well under half the baseline cost.""" +def test_core_only_is_the_reduction_best_case(doc_agent_loader_on): + """CORE-only (the always-on floor) renders well under half the baseline cost. + + Uses the loader-on skeleton so ``load_tools`` — a CORE member that ships + every active turn — is in the registry and counted in the floor. 
+ """ tok = get_tokenizer() if tok is None: pytest.skip("tiktoken not installed — token proxy unavailable") core = sorted(DOC_CORE_TOOLS) - native = _filtered_native_tokens(doc_agent, core, tok) - text = _filtered_text_tokens(doc_agent, core, tok) + native = _filtered_native_tokens(doc_agent_loader_on, core, tok) + text = _filtered_text_tokens(doc_agent_loader_on, core, tok) # Headroom over the measured ~40% native / ~37% text so an incidental # docstring edit doesn't flip the gate, but real CORE bloat is caught. assert native <= 0.45 * BASELINE_NATIVE_TOKENS, ( From 6c05962dade27a077c5913e2cbdff78d395ca027 Mon Sep 17 00:00:00 2001 From: Alexey Tyurin <> Date: Thu, 18 Jun 2026 22:44:06 -0500 Subject: [PATCH 3/3] fix(tool-loader): repair full-suite unit tests + review nits (#1450) --- src/gaia/agents/base/tool_loader.py | 15 +++++++++------ src/gaia/agents/chat/agent.py | 19 +++++++++++-------- src/gaia/eval/tool_recall.py | 4 ++-- tests/unit/test_dynamic_tool_filtering.py | 5 ++++- tests/unit/test_tool_loader_selection.py | 6 +++++- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/gaia/agents/base/tool_loader.py b/src/gaia/agents/base/tool_loader.py index f1d9f14c1..c4480cade 100644 --- a/src/gaia/agents/base/tool_loader.py +++ b/src/gaia/agents/base/tool_loader.py @@ -371,8 +371,7 @@ def load_bundle(self, bundle: str, registry: Dict[str, dict]) -> List[str]: name — the caller turns this into an actionable error listing the valid bundle names. """ - members = self._resolve_bundle_members(bundle) - resolved_name = bundle + members, resolved_name = self._resolve_bundle_members(bundle) protected = set(self._core) | set(members) sel = _Selection() @@ -409,18 +408,22 @@ def load_bundle(self, bundle: str, registry: Dict[str, dict]) -> List[str]: # ── internals ──────────────────────────────────────────────────────── - def _resolve_bundle_members(self, bundle: str) -> FrozenSet[str]: - """Resolve *bundle* to its member set, or raise ``KeyError``. 
+ def _resolve_bundle_members(self, bundle: str) -> tuple["FrozenSet[str]", str]: + """Resolve *bundle* to ``(members, resolved_name)``, or raise ``KeyError``. Exact bundle-name match first; else a bare tool name resolved to the union of its owning bundles' members via the reverse index. + ``resolved_name`` is the matched bundle name (exact match) or the owning + bundle name(s) joined with ``+`` (tool-name match), so the ``load_tools`` + log line records the bundle actually pulled, not the bare tool name. """ for b in self._bundles: if b.name == bundle: - return b.members + return b.members, b.name owning = self._tool_to_bundles.get(bundle) if owning: - return frozenset().union(*(b.members for b in owning)) + members = frozenset().union(*(b.members for b in owning)) + return members, "+".join(b.name for b in owning) raise KeyError(bundle) def _admit(self, name: str, sel: _Selection) -> None: diff --git a/src/gaia/agents/chat/agent.py b/src/gaia/agents/chat/agent.py index a3014a1fd..3f82b79d5 100644 --- a/src/gaia/agents/chat/agent.py +++ b/src/gaia/agents/chat/agent.py @@ -879,7 +879,8 @@ def _get_system_prompt(self) -> str: # TTFT-sensitive case, so we don't tax them with the menu. Lives in # this stable prefix (before the volatile tools tail) → no KV thrash. load_tools_menu = "" - if self.tool_loader is not None and is_tool_calling_model( + loader = getattr(self, "tool_loader", None) + if loader is not None and is_tool_calling_model( getattr(self, "model_id", None) ): load_tools_menu = ( @@ -887,7 +888,7 @@ def _get_system_prompt(self) -> str: "Your visible tools are trimmed to what this turn needs. 
If a " "capability you need is missing, call load_tools(bundle) with " "one of these names; its tools become available on your next " - "step:\n" + self.tool_loader.format_bundle_menu() + "step:\n" + loader.format_bundle_menu() ) return ( base_prompt @@ -1201,14 +1202,16 @@ def load_tools(bundle: str) -> dict: """Load a bundle of tools so you can call them on your next step. Call this when the capability you need is not in your current - tool list — pick a bundle name from the "Loadable tool bundles" - menu in your instructions (a bare tool name also works; it loads - that tool's whole bundle). The bundle's tools become available on - your **next** step; then call the one you need. + tool list. If a "Loadable tool bundles" menu is shown in your + instructions, pick a bundle name from it; otherwise pass the name + of the specific tool you need and its bundle is loaded. The + bundle's tools become available on your **next** step; then call + the one you need. Args: - bundle: A bundle name from the menu (e.g. "file_search", - "rag_index"), or a specific tool name to load its bundle. + bundle: A bundle name (e.g. "file_search", "rag_index") — from + the menu when one is shown — or a specific tool name to + load its owning bundle. 
Returns: Dictionary with status, the resolved bundle, and the full diff --git a/src/gaia/eval/tool_recall.py b/src/gaia/eval/tool_recall.py index 8d08e2402..a9d822cd7 100644 --- a/src/gaia/eval/tool_recall.py +++ b/src/gaia/eval/tool_recall.py @@ -48,7 +48,7 @@ import sys from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple _TOOL_LOADER_RE = re.compile(r"TOOL_LOADER (\{.*\})\s*$") _SESSION_RE = re.compile(r"TOOL_LOADER_SESSION (\{.*\})\s*$") @@ -237,7 +237,7 @@ def aggregate_escape_hatch(summaries: List[Dict]) -> Dict: } -def count_recovery_events_from_log(text: str) -> tuple: +def count_recovery_events_from_log(text: str) -> Tuple[int, int]: """Count the two escape-hatch recovery paths from raw per-turn log lines. Returns ``(free_recovery_count, load_tools_count)`` — free non-tool-calling diff --git a/tests/unit/test_dynamic_tool_filtering.py b/tests/unit/test_dynamic_tool_filtering.py index b7854e15f..c3ef676bc 100644 --- a/tests/unit/test_dynamic_tool_filtering.py +++ b/tests/unit/test_dynamic_tool_filtering.py @@ -128,10 +128,13 @@ def _compose_system_prompt(self): self.compose_calls += 1 return f"PROMPT::{self._active_tool_filter}" - # Bind the real method under test. + # Bind the real methods under test. ``_refresh_active_tool_filter`` now + # delegates the filter+prompt swap to ``_apply_tool_filter`` (#1450), so the + # spy must borrow both to exercise the real recompute-on-change path. 
from gaia.agents.base.agent import Agent _refresh_active_tool_filter = Agent._refresh_active_tool_filter + _apply_tool_filter = Agent._apply_tool_filter def test_recompute_only_on_change(): diff --git a/tests/unit/test_tool_loader_selection.py b/tests/unit/test_tool_loader_selection.py index 43cfc29df..e8b0061dd 100644 --- a/tests/unit/test_tool_loader_selection.py +++ b/tests/unit/test_tool_loader_selection.py @@ -295,8 +295,12 @@ def test_load_bundle_by_bundle_name_admits_members(): def test_load_bundle_by_tool_name_resolves_to_owning_bundle(): loader, reg = _loader_with_bundles() loader.select("q", reg) - loaded = loader.load_bundle("a1", reg) # bare tool name → bundle A + with _capture("gaia.agents.base.tool_loader") as records: + loaded = loader.load_bundle("a1", reg) # bare tool name → bundle A assert {"a1", "a2"} <= set(loaded) + # The log records the resolved bundle ("A"), not the bare tool name ("a1"). + events = [p for p in _loader_payloads(records) if p.get("event") == "load_tools"] + assert events and events[0]["bundle"] == "A" def test_load_bundle_unknown_name_raises_keyerror():