From e29e4b98c3da505b82062b09ec0365ff1bc8ec5b Mon Sep 17 00:00:00 2001
From: Alexey Tyurin <>
Date: Thu, 18 Jun 2026 20:30:38 -0500
Subject: [PATCH 1/3] feat(tool-loader): native load_tools recovery +
 escape-hatch rate (#1450)

---
 docs/plans/tool-loader.mdx               |  67 +++++++--
 src/gaia/agents/base/agent.py            |  15 +-
 src/gaia/agents/base/tool_loader.py      | 141 +++++++++++++++++-
 src/gaia/agents/chat/agent.py            |  83 ++++++++---
 src/gaia/agents/chat/tool_bundles.py     |  12 +-
 src/gaia/eval/tool_cost.py               |  31 +++-
 src/gaia/eval/tool_recall.py             | 173 +++++++++++++++++++----
 tests/unit/test_chat_dynamic_tools.py    | 131 ++++++++++++++---
 tests/unit/test_chat_tool_bundles.py     |  17 ++-
 tests/unit/test_tool_loader_selection.py | 142 ++++++++++++++++++-
 tests/unit/test_tool_recall.py           | 147 +++++++++++++++++++
 11 files changed, 866 insertions(+), 93 deletions(-)
diff --git a/docs/plans/tool-loader.mdx b/docs/plans/tool-loader.mdx
index 2bf0484b8..bafefc11a 100644
--- a/docs/plans/tool-loader.mdx
+++ b/docs/plans/tool-loader.mdx
@@ -9,7 +9,7 @@ title: "Dynamic Tool Loader"
 <Note>
 **Component:** Per-turn tool visibility for agents (issue [#688](https://github.com/amd/gaia/issues/688))
 **Module:** `gaia.agents.base.tool_loader`
-**Status:** **Part 0 (#1448) + Part 1 (#1449) landed.** Part 1 ships the selection mechanism behind a default-off toggle on the ChatAgent `doc` profile. Parts 2–3 (explicit escape hatch, skill signal) are still proposed.
+**Status:** **Part 0 (#1448) + Part 1 (#1449) + Part 2 (#1450) landed.** Part 1 ships the selection mechanism behind a default-off toggle on the ChatAgent `doc` profile; Part 2 adds the explicit `load_tools` escape hatch (so native tool-calling models can recover a semantic miss) plus the escape-hatch activation-rate tuning signal. Part 3 (skill signal) is still proposed.
 **Target agent (v1):** `ChatAgent` (`doc` profile), behind a default-off toggle.
 </Note>
 
@@ -297,11 +297,14 @@ backend KV prefix stays warm. When a filter is active the tools block moves
 **after** the response-format template (volatile content last); with no filter
 the legacy order and bytes are preserved exactly.
 
-**Native known gap (Amendment 2).** `_execute_tool` is never tightened, so a
-non-tool-calling model that names an unlisted tool still runs it (free recovery)
-and the loader logs `TOOL_LOADER_ESCAPE_HATCH`. Native tool-calling models have
-no such hatch until Part 2's `load_tools`; on first activation the agent logs the
-miss as a *known gap* rather than padding the loaded set.
+**Native known gap (Amendment 2) — closed by Part 2.** `_execute_tool` is never
+tightened, so a non-tool-calling model that names an unlisted tool still runs it
+(free recovery) and the loader logs `TOOL_LOADER_ESCAPE_HATCH`. In Part 1 native
+tool-calling models had no such hatch — a semantic miss could not self-recover.
+[Part 2](#part-2-explicit-escape-hatch--tuning-1450) closes the recovery gap with
+the always-on `load_tools` meta-tool (the model loads the bundle it needs and
+calls the tool on its next step), and the recall gate's native exemption is
+removed accordingly.
 
 **Approved deviations from this sketch** (flagged in the #1449 PR):
 
@@ -323,7 +326,7 @@ baseline — meaning **CORE-only is the ~60%-reduction best case** and a full
 `test_tool_loader_token_budget.py` pins these filtered costs as a static guard.
 </Warning>
 
-### Part 2 — Explicit escape hatch + tuning
+### Part 2 — Explicit escape hatch + tuning ✅ landed (#1450)
 
 - Add bundle re-surfacing + a discoverability menu of bundle names, and the
   `load_tools` meta-tool that native tool-calling models need (the free recovery
@@ -340,6 +343,50 @@ baseline — meaning **CORE-only is the ~60%-reduction best case** and a full
 - **Escape-hatch activation rate** is logged per session and usable as the
   threshold-tuning signal (rising rate ⇒ τ too strict).
 
+#### How Part 2 shipped (implementation reference)
+
+**`load_tools` is always-on via CORE.** `load_tools` is added to
+[`DOC_CORE_TOOLS`](https://github.com/amd/gaia/blob/main/src/gaia/agents/chat/tool_bundles.py)
+(CORE = 11), so once registered it renders in **both** the text prompt and the
+native `tools=` schema every active turn and is cap-/eviction-exempt. It is
+registered **only when the loader is active** (`self.tool_loader is not None`),
+so the default-off `doc` path stays byte-identical — the unfiltered 37-tool
+baseline is unchanged.
+
+**Recovery lands on the next model *step*, not the next user turn.** The
+`load_tools(bundle)` handler calls `ToolLoader.load_bundle`, then
+`Agent._apply_tool_filter` — the one place the active filter and the cached
+system prompt move together. Because `system_prompt` and `_openai_tools` are
+read live at every LLM call, the expanded set is visible to the very next step
+in the same query, which is what lets `smart_discovery` recover on turn 1.
+
+**`load_bundle` is cap-aware.** It resolves a bundle name (or a bare tool name,
+via the reverse index) and admits members with the same LRU-evict path `select()`
+uses — protecting CORE and the members being loaded now — so `max_tools` holds at
+all times. It emits a same-turn `TOOL_LOADER {…, "event": "load_tools", …}`
+superset line.
+
+**Menu is stable and native-only.** A compact bundle menu (name + one-line
+description, from `ToolBundle.description`) is injected into the **stable** prefix
+of the doc system prompt (before the volatile tools tail → no KV thrash), and
+**only for native tool-calling models** — non-native models already have free
+recovery and are the TTFT-sensitive path.
+
+**Tuning signal is log-derived.** The loader counts escape-hatch (free) and
+`load_tools` (explicit) activations per session and emits a `TOOL_LOADER_SESSION`
+summary on `reset_session()` (`escape_hatch_rate = (escape_hatch + load_tools) /
+turns`). `gaia.eval.tool_recall` derives the same rate from the raw per-turn log
+events (the summary is CLI-only) and reports it alongside recall — no UI-DB migration.
+
+**Recall gate flipped correctly.** `tool_recall.py` unions same-turn
+`load_tools` superset lines into that turn's loaded set and treats `load_tools`
+as always-satisfied; **only then** is the native "known gap" exemption removed,
+so a successful recovery passes the gate and a genuinely unrecovered miss fails
+it on every model.
+
+**Cap unchanged at 14** (→ 3 dynamic slots now that CORE = 11). The eval gates
+recall; bump the default only if recall or the escape-hatch rate regresses.
+
 ### Part 3 — Skill-driven signal (gated on #887)
 
 A third selection signal, added **only after** [#887](https://github.com/amd/gaia/issues/887)
@@ -407,7 +454,11 @@ via the base `_select_tools_for_turn` hook, and both render paths filter from th
 same selection. The old keyword/bundle-policy skeleton was removed; the class name
 `ToolLoader` and `reset_session()` were kept so the existing (guarded) call sites
 in `cli.py` / `chat/app.py` needed no change. Recall recovery for native
-tool-calling models (the `load_tools` meta-tool) is still **Part 2**.
+tool-calling models has shipped (Part 2, #1450): the loader exposes
+`bundle_names` / `format_bundle_menu` / `load_bundle` and per-session escape-hatch
+counters; `ChatAgent` registers the `load_tools` meta-tool and injects the
+native-only bundle menu; and `gaia.eval.tool_recall` unions mid-loop `load_tools`
+lines, drops the native exemption, and reports the escape-hatch activation rate.
 
 ## Dependencies
 
diff --git a/src/gaia/agents/base/agent.py b/src/gaia/agents/base/agent.py
index be37fc0d3..09f2d25a2 100644
--- a/src/gaia/agents/base/agent.py
+++ b/src/gaia/agents/base/agent.py
@@ -817,8 +817,19 @@ def _refresh_active_tool_filter(self, user_input: str) -> None:
         # pylint: disable-next=assignment-from-none
         new_filter = self._select_tools_for_turn(user_input)
         if new_filter != self._active_tool_filter:
-            self._active_tool_filter = new_filter
-            self._system_prompt_cache = self._compose_system_prompt()
+            self._apply_tool_filter(new_filter)
+
+    def _apply_tool_filter(self, new_filter: Optional[List[str]]) -> None:
+        """Swap the active tool filter and recompute the cached system prompt.
+
+        The single place the "filter and prompt move together" invariant lives.
+        Called from :meth:`_refresh_active_tool_filter` (per user turn) and from
+        the ``load_tools`` escape-hatch handler (mid-loop), so a mid-query
+        expansion is visible to the very next model step — both render paths
+        (``system_prompt`` and ``_openai_tools``) read these live.
+        """
+        self._active_tool_filter = new_filter
+        self._system_prompt_cache = self._compose_system_prompt()
 
     def rebuild_system_prompt(self) -> None:
         """Rebuild system prompt with current tools from _TOOL_REGISTRY.
diff --git a/src/gaia/agents/base/tool_loader.py b/src/gaia/agents/base/tool_loader.py
index 2afd1f612..d7003d4fb 100644
--- a/src/gaia/agents/base/tool_loader.py
+++ b/src/gaia/agents/base/tool_loader.py
@@ -156,6 +156,13 @@ def __init__(
         self._loaded: Dict[str, _ToolState] = {}
         self._turn = 0
         self._session_disabled = False
+        # Escape-hatch activation counters (Part 2, #1450). Both recovery paths
+        # feed the τ-tuning signal: the non-tool-calling free recovery
+        # (record_tool_use on an unlisted tool) and the native explicit recovery
+        # (load_bundle). Summarized on reset_session(), aggregated from logs by
+        # the eval. A rising per-turn rate ⇒ τ too strict.
+        self._escape_hatch_count = 0
+        self._load_tools_count = 0
 
     # ── public API ───────────────────────────────────────────────────────
 
@@ -284,21 +291,23 @@ def record_tool_use(self, tool_name: str) -> None:
 
         If the tool is loaded, refresh its ``last_call_ts``. If it is **not**
         loaded, the model reached a tool the prompt didn't list (a free
-        non-tool-calling recovery via the full registry); log it as the
-        escape-hatch signal. This does *not* auto-load the tool — that is
-        Part 2's job.
+        non-tool-calling recovery via the full registry); count and log it as the
+        escape-hatch signal. This does *not* auto-load the tool; a native model
+        re-surfaces a missed tool through the explicit :meth:`load_bundle` path
+        (the ``load_tools`` meta-tool).
         """
         state = self._loaded.get(tool_name)
         if state is not None:
             state.last_call_ts = time.time()
             return
+        self._escape_hatch_count += 1
         logger.info(
             json.dumps(
                 {
                     "event": "TOOL_LOADER_ESCAPE_HATCH",
                     "tool": tool_name,
                     "turn": self._turn,
-                    "note": "executed unlisted tool via full registry (Part-2 gap)",
+                    "note": "executed unlisted tool via full registry (free recovery)",
                 }
             )
         )
@@ -306,15 +315,113 @@ def record_tool_use(self, tool_name: str) -> None:
     def reset_session(self) -> None:
         """Clear per-session state for a new conversation.
 
-        The content-keyed embedding cache survives — embeddings depend only on
-        the tool docs, not on the conversation.
+        Emits the per-session escape-hatch summary (the τ-tuning signal) for the
+        conversation just ending **before** clearing, then zeroes the counters
+        alongside the existing state clears. The content-keyed embedding cache
+        survives — embeddings depend only on the tool docs, not the conversation.
         """
+        if self._turn > 0:
+            self._log_session_summary()
         self._loaded.clear()
         self._turn = 0
         self._session_disabled = False
+        self._escape_hatch_count = 0
+        self._load_tools_count = 0
+
+    def bundle_names(self) -> List[str]:
+        """Return the configured bundle names, sorted (the ``load_tools`` menu)."""
+        return sorted(b.name for b in self._bundles)
+
+    def format_bundle_menu(self) -> str:
+        """Return a compact ``"- {name}: {description}"`` menu over all bundles.
+
+        Used for the native-model system-prompt menu; the unknown-bundle error
+        text lists the same bundle names via :meth:`bundle_names`.
+        """
+        return "\n".join(
+            f"- {b.name}: {b.description}" if b.description else f"- {b.name}"
+            for b in self._bundles
+        )
+
+    def load_bundle(self, bundle: str, registry: Dict[str, dict]) -> List[str]:
+        """Admit a bundle's tools into the loaded set (the explicit escape hatch).
+
+        Resolves *bundle* to a :class:`ToolBundle` — exact bundle-name match
+        first, else (robustness nicety) a bare tool name resolved to its
+        bundle(s) via the reverse index — and admits each member present in
+        *registry* and not already loaded, **cap-aware**: under the cap via
+        :meth:`_admit`; at the cap by LRU-evicting a non-CORE tool that is not
+        being loaded right now (or skipping + logging if nothing is evictable),
+        mirroring :meth:`select`'s admission loop. So ``max_tools`` holds at all
+        times. Emits a same-turn ``TOOL_LOADER`` *loaded superset* line so the
+        recall parser sees the mid-loop expansion.
+
+        Args:
+            bundle: A bundle name from the menu, or a bare tool name to resolve
+                to its owning bundle(s).
+            registry: The live tool registry (same object passed to
+                :meth:`select`); members absent from it are not admitted.
+
+        Returns:
+            The sorted loaded set after admission.
+
+        Raises:
+            KeyError: *bundle* is neither a known bundle name nor a known tool
+                name — the caller turns this into an actionable error listing the
+                valid bundle names.
+        """
+        members = self._resolve_bundle_members(bundle)
+        resolved_name = bundle
+
+        protected = set(self._core) | set(members)
+        sel = _Selection()
+        for member in sorted(members):
+            if member not in registry or member in self._loaded:
+                continue
+            if len(self._loaded) < self._max_tools:
+                self._admit(member, sel)
+                continue
+            victim = self._pick_eviction_victim(protected)
+            if victim is None:
+                sel.skipped_at_cap.append(member)
+                continue
+            del self._loaded[victim]
+            sel.evicted.append(victim)
+            self._admit(member, sel)
+
+        self._load_tools_count += 1
+        logger.info(
+            "TOOL_LOADER %s",
+            json.dumps(
+                {
+                    "turn": self._turn,
+                    "event": "load_tools",
+                    "bundle": resolved_name,
+                    "admitted": sorted(sel.admitted),
+                    "evicted": sorted(sel.evicted),
+                    "skipped_at_cap": sorted(sel.skipped_at_cap),
+                    "loaded": sorted(self._loaded),
+                }
+            ),
+        )
+        return sorted(self._loaded)
 
     # ── internals ────────────────────────────────────────────────────────
 
+    def _resolve_bundle_members(self, bundle: str) -> FrozenSet[str]:
+        """Resolve *bundle* to its member set, or raise ``KeyError``.
+
+        Exact bundle-name match first; else a bare tool name resolved to the
+        union of its owning bundles' members via the reverse index.
+        """
+        for b in self._bundles:
+            if b.name == bundle:
+                return b.members
+        owning = self._tool_to_bundles.get(bundle)
+        if owning:
+            return frozenset().union(*(b.members for b in owning))
+        raise KeyError(bundle)
+
     def _admit(self, name: str, sel: _Selection) -> None:
         """Add *name* to the loaded set with fresh bookkeeping."""
         self._loaded[name] = _ToolState(loaded_at=time.time(), load_turn=self._turn)
@@ -404,6 +511,28 @@ def _log_selection(
             ),
         )
 
+    def _log_session_summary(self) -> None:
+        """Emit one ``TOOL_LOADER_SESSION`` INFO line — the τ-tuning signal.
+
+        ``escape_hatch_rate`` is per turn over both recovery paths (free
+        non-tool-calling recovery + native ``load_tools``); the two component
+        counts are reported separately so the tuner can see which path fired.
+        """
+        logger.info(
+            "TOOL_LOADER_SESSION %s",
+            json.dumps(
+                {
+                    "turns": self._turn,
+                    "escape_hatch_count": self._escape_hatch_count,
+                    "load_tools_count": self._load_tools_count,
+                    "escape_hatch_rate": (
+                        self._escape_hatch_count + self._load_tools_count
+                    )
+                    / max(self._turn, 1),
+                }
+            ),
+        )
+
 
 def _sha256(text: str) -> str:
     """Hex SHA-256 of *text* (UTF-8)."""
diff --git a/src/gaia/agents/chat/agent.py b/src/gaia/agents/chat/agent.py
index 1704faf31..f9754162b 100644
--- a/src/gaia/agents/chat/agent.py
+++ b/src/gaia/agents/chat/agent.py
@@ -342,7 +342,6 @@ def __init__(self, config: Optional[ChatAgentConfig] = None):
         # None → full registry / legacy prompt. Embedding fns are injected so the
         # loader never imports MemoryMixin; they resolve lazily on first select(),
         # by which point init_memory() has probed the embedder.
-        self._dynamic_tools_native_warned = False
         self._dynamic_tools_validated = False
         self.tool_loader = self._maybe_build_tool_loader()
 
@@ -521,7 +520,6 @@ def _select_tools_for_turn(self, user_input: str) -> Optional[List[str]]:
         """Return this turn's sorted tool subset, or ``None`` for the full registry."""
         if not self._dynamic_tools_active():
             return None
-        self._maybe_warn_native_tool_gap()
         if not self._dynamic_tools_validated:
             # Fail loudly on first activation if a CORE/bundle name doesn't exist
             # in the live registry (drift). The reverse direction is the CI test.
@@ -553,22 +551,6 @@ def _build_tool_selection_query(self, user_input: str) -> str:
         combined = f"{prev}\n{user_input}" if prev else user_input
         return combined[-4000:]
 
-    def _maybe_warn_native_tool_gap(self) -> None:
-        """Log the Amendment-2 known gap once, on first activation for a native model.
-
-        Native tool-calling models have no escape hatch until Part 2's
-        ``load_tools`` lands, so a semantic miss can't self-recover. We log it as
-        a known gap rather than padding the loaded set.
-        """
-        if self._dynamic_tools_native_warned:
-            return
-        self._dynamic_tools_native_warned = True
-        if is_tool_calling_model(getattr(self, "model_id", None)):
-            logger.warning(
-                "tool_loader: native tool-calling model — no escape hatch until "
-                "Part 2; semantic misses are a known gap"
-            )
-
     def _post_process_tool_result(
         self,
         tool_name: str,
@@ -891,7 +873,22 @@ def _get_system_prompt(self) -> str:
             return base_prompt + extras
 
         if profile == "doc":
-            # Document Q&A: RAG tools + hallucination prevention
+            # Document Q&A: RAG tools + hallucination prevention.
+            # Native-only escape-hatch menu (#1450): non-native models already
+            # self-recover via the free full-registry path and are the
+            # TTFT-sensitive case, so we don't tax them with the menu. Lives in
+            # this stable prefix (before the volatile tools tail) → no KV thrash.
+            load_tools_menu = ""
+            if self.tool_loader is not None and is_tool_calling_model(
+                getattr(self, "model_id", None)
+            ):
+                load_tools_menu = (
+                    "\n\n==== LOADABLE TOOL BUNDLES ====\n"
+                    "Your visible tools are trimmed to what this turn needs. If a "
+                    "capability you need is missing, call load_tools(bundle) with "
+                    "one of these names; its tools become available on your next "
+                    "step:\n" + self.tool_loader.format_bundle_menu()
+                )
             return (
                 base_prompt
                 + indexed_docs_section
@@ -899,6 +896,7 @@ def _get_system_prompt(self) -> str:
                 + discovery_rules
                 + discovery_rules_tail
                 + rag_query_rules
+                + load_tools_menu
             )
 
         if profile == "file":
@@ -1192,6 +1190,53 @@ def _register_tools(self) -> None:
         self._register_external_tools_conditional()
         self._register_loop_control_tools()  # set_loop_state, request_user_input
 
+        # load_tools escape hatch (#1450, Part 2) — registered ONLY when the
+        # dynamic loader is active, so the default-off doc path stays
+        # byte-identical. It is in DOC_CORE_TOOLS, so once registered it renders
+        # in both prompt paths every active turn (cap- and eviction-exempt).
+        if self.tool_loader is not None:
+
+            @tool
+            def load_tools(bundle: str) -> dict:
+                """Load a bundle of tools so you can call them on your next step.
+
+                Call this when the capability you need is not in your current
+                tool list — pick a bundle name from the "Loadable tool bundles"
+                menu in your instructions (a bare tool name also works; it loads
+                that tool's whole bundle). The bundle's tools become available on
+                your **next** step; then call the one you need.
+
+                Args:
+                    bundle: A bundle name from the menu (e.g. "file_search",
+                        "rag_index"), or a specific tool name to load its bundle.
+
+                Returns:
+                    Dictionary with status, the resolved bundle, and the full
+                    loaded_tools list now available to call.
+                """
+                loader = self.tool_loader
+                if loader is None:
+                    return {
+                        "status": "error",
+                        "error": "Dynamic tool loading is not active; all tools "
+                        "are already available.",
+                    }
+                try:
+                    loaded = loader.load_bundle(bundle, self._tools_registry)
+                except KeyError:
+                    return {
+                        "status": "error",
+                        "error": f"Unknown bundle '{bundle}'. Choose one of: "
+                        f"{', '.join(loader.bundle_names())}",
+                    }
+                # Make the expansion visible to the next model step in this query.
+                self._apply_tool_filter(loaded)
+                return {
+                    "status": "success",
+                    "bundle": bundle,
+                    "loaded_tools": loaded,
+                }
+
         # Inline list_files — only for profiles that need file operations
         if profile in ("file", "data", "full"):
 
diff --git a/src/gaia/agents/chat/tool_bundles.py b/src/gaia/agents/chat/tool_bundles.py
index eac7ed975..afc0ef516 100644
--- a/src/gaia/agents/chat/tool_bundles.py
+++ b/src/gaia/agents/chat/tool_bundles.py
@@ -20,9 +20,13 @@
 
 from gaia.agents.base.tool_loader import ToolBundle
 
-# Always-on set (10 tools): memory v2, file-read + RAG-query entry points, and
-# loop control. The design sketch listed a "finish" tool, dropped here — turn
-# completion is protocol-level in GAIA, there is no such registry tool.
+# Always-on set (11 tools): memory v2, file-read + RAG-query entry points, loop
+# control, and the Part-2 escape hatch. The design sketch listed a "finish" tool,
+# dropped here — turn completion is protocol-level in GAIA, there is no such
+# registry tool. ``load_tools`` (#1450) is CORE-only — never in a bundle — so it
+# renders in both the text prompt and the native ``tools=`` schema every active
+# turn, cap- and eviction-exempt, giving native models a way back to any tool a
+# semantic miss didn't surface.
 DOC_CORE_TOOLS = frozenset(
     {
         # memory v2 — persistent recall is always relevant
@@ -38,6 +42,8 @@
         # loop control — autonomous-turn signalling
         "set_loop_state",
         "request_user_input",
+        # escape hatch (#1450) — always-on explicit tool loader for native models
+        "load_tools",
     }
 )
 
diff --git a/src/gaia/eval/tool_cost.py b/src/gaia/eval/tool_cost.py
index 20977adad..05e97b6d1 100644
--- a/src/gaia/eval/tool_cost.py
+++ b/src/gaia/eval/tool_cost.py
@@ -147,8 +147,30 @@ def _isolated_registry():
         tools_mod._TOOL_REGISTRY.update(saved)
 
 
+def _build_skeleton_tool_loader(dynamic_tools: bool):
+    """Return a real ToolLoader over the doc config, or ``None`` when off.
+
+    Registration only consults ``self.tool_loader is not None``; it never embeds
+    or selects, so a trivial zero-vector embedder is enough to attach a loader.
+    """
+    if not dynamic_tools:
+        return None
+    import numpy as np
+
+    from gaia.agents.base.tool_loader import ToolLoader
+    from gaia.agents.chat.tool_bundles import DOC_BUNDLES, DOC_CORE_TOOLS
+
+    return ToolLoader(
+        core_tools=DOC_CORE_TOOLS,
+        bundles=DOC_BUNDLES,
+        embed_fn=lambda text: np.zeros(1, dtype=np.float32),
+    )
+
+
 def build_doc_agent_skeleton(
-    profile: str = DEFAULT_PROFILE, deterministic: bool = True
+    profile: str = DEFAULT_PROFILE,
+    deterministic: bool = True,
+    dynamic_tools: bool = False,
 ) -> "ChatAgent":
     """Build a ChatAgent skeleton with the *profile* tools registered.
 
@@ -158,6 +180,12 @@ def build_doc_agent_skeleton(
     (memory store, rag) are enough to let ``_register_tools`` populate the
     registry.
 
+    With ``dynamic_tools=True`` a real :class:`ToolLoader` is attached **before**
+    ``_register_tools`` runs, so the ``load_tools`` meta-tool (#1450) registers
+    (registry +1, ``load_tools``). A trivial embedder suffices — registration
+    never embeds or selects; only the loader's presence is consulted. Default
+    ``False`` keeps the unfiltered baseline path unchanged (no ``load_tools``).
+
     With ``deterministic=True`` the environment-conditional external tools
     (``search_documentation`` / ``search_web``, gated on npx and
     ``PERPLEXITY_API_KEY``) are forced off so the tool set — and therefore the
@@ -213,6 +241,7 @@ def build_doc_agent_skeleton(
                 agent._web_client = None
                 agent._fs_index = None
                 agent._scratchpad = None
+                agent.tool_loader = _build_skeleton_tool_loader(dynamic_tools)
                 agent._register_tools()
                 agent._instance_tools = dict(tools_mod._TOOL_REGISTRY)
 
diff --git a/src/gaia/eval/tool_recall.py b/src/gaia/eval/tool_recall.py
index 91daf9c92..8d08e2402 100644
--- a/src/gaia/eval/tool_recall.py
+++ b/src/gaia/eval/tool_recall.py
@@ -23,9 +23,21 @@
 * **called sets** — ``scorecard.json`` in the eval run dir
   (``scenarios[].turns[].agent_tools``).
 
-Amendment 2: on native tool-calling models a semantic miss can't self-recover
-until Part 2, so misses there are reported as a *known gap* and do not fail the
-gate. On non-native models recall below ``--min-recall`` exits non-zero.
+Part 2 (#1450): native tool-calling models recover a semantically-missed tool
+via the always-on ``load_tools`` meta-tool, so the Amendment-2 native exemption
+is **removed** — a miss fails the gate on every model. Two parser changes make
+that correct: a mid-loop ``load_tools`` line (same ``turn``, ``"event":
+"load_tools"``) is **unioned** into that turn's loaded set so a successful
+recovery shows the tool as loaded; and ``load_tools`` itself counts as
+always-satisfied. Recall below ``--min-recall`` exits non-zero.
+
+Escape-hatch activation rate (the τ-tuning signal, rising ⇒ τ too strict) is
+derived from the **raw per-turn log events** — explicit ``load_tools`` lines and
+free-recovery ``TOOL_LOADER_ESCAPE_HATCH`` lines over the turn count — because
+those appear on every run. The per-session ``TOOL_LOADER_SESSION`` summary
+(emitted only on ``reset_session``, i.e. the ``gaia chat``/CLI path, **not** the
+UI-server/eval path) is a convenience for CLI logs; the recall gate does not
+depend on it, so the rate is reported for eval runs too. (#1450)
 """
 
 from __future__ import annotations
@@ -39,6 +51,12 @@
 from typing import Dict, List, Optional
 
 _TOOL_LOADER_RE = re.compile(r"TOOL_LOADER (\{.*\})\s*$")
+_SESSION_RE = re.compile(r"TOOL_LOADER_SESSION (\{.*\})\s*$")
+_ESCAPE_HATCH_RE = re.compile(r'"event"\s*:\s*"TOOL_LOADER_ESCAPE_HATCH"')
+
+# Tools that never count as a recall miss: ``load_tools`` is the always-on
+# escape hatch (CORE), so calling it is always satisfiable by construction.
+_ALWAYS_SATISFIED = frozenset({"load_tools"})
 
 
 # ── pure join logic (unit-tested) ─────────────────────────────────────────
@@ -110,7 +128,9 @@ def compute_recall(
             loaded = list(loaded_turns[t])
             called = list(called_turns[t])
             loaded_set = set(loaded)
-            missing = sorted(c for c in called if c not in loaded_set)
+            missing = sorted(
+                c for c in called if c not in loaded_set and c not in _ALWAYS_SATISFIED
+            )
             turns.append(
                 TurnRecall(
                     scenario_idx=s,
@@ -130,8 +150,13 @@ def compute_recall(
 def parse_loaded_sets_from_log(text: str) -> List[List[List[str]]]:
     """Extract per-scenario, per-turn loaded sets from server-log TOOL_LOADER lines.
 
-    A new scenario begins at each ``"turn": 1`` selection line (the loader resets
-    its turn counter per conversation).
+    A new scenario begins at each ``"turn": 1`` *selection* line (the loader
+    resets its turn counter per conversation). A mid-loop ``load_tools`` line
+    (Part 2) shares its turn's number but carries ``"event": "load_tools"``; it
+    is **unioned** into that turn's loaded set rather than opening a new turn, so
+    a within-turn recovery sees everything loaded before *or* after the load.
+    Only ``event``-less selection lines move the turn/scenario cursor, so two
+    consecutive single-turn scenarios still split correctly.
 
     Assumption: every scenario emits a ``turn == 1`` line. A turn-1 *embedder
     failure* session-disables the loader before ``_log_selection`` runs, so that
@@ -150,10 +175,20 @@ def parse_loaded_sets_from_log(text: str) -> List[List[List[str]]]:
         payload = json.loads(m.group(1))
         if "loaded" not in payload or "turn" not in payload:
             continue  # not a selection line (e.g. escape-hatch event)
+        loaded = list(payload["loaded"])
+        if payload.get("event") == "load_tools":
+            # Mid-loop expansion: union into the current turn's loaded set. A
+            # load_tools line always follows its turn's selection line, so
+            # ``current`` is non-empty in a well-formed log; else seed a turn.
+            if current:
+                current[-1] = sorted(set(current[-1]) | set(loaded))
+            else:
+                current.append(loaded)
+            continue
         if payload["turn"] == 1 and current:
             scenarios.append(current)
             current = []
-        current.append(list(payload["loaded"]))
+        current.append(loaded)
     if current:
         scenarios.append(current)
     return scenarios
@@ -168,16 +203,81 @@ def parse_called_sets_from_scorecard(scorecard: Dict) -> List[List[List[str]]]:
     return out
 
 
-def _model_is_native(scorecard: Dict) -> bool:
-    """Whether the scorecard's model uses native tool-calling (Amendment-2 gate)."""
-    model = (scorecard.get("config") or {}).get("model")
-    try:
-        from gaia.llm.lemonade_client import is_tool_calling_model
+def parse_session_summaries_from_log(text: str) -> List[Dict]:
+    """Extract ``TOOL_LOADER_SESSION`` payloads (one per finished conversation)."""
+    out: List[Dict] = []
+    for line in text.splitlines():
+        m = _SESSION_RE.search(line)
+        if not m:
+            continue
+        out.append(json.loads(m.group(1)))
+    return out
 
-        return is_tool_calling_model(model)
-    except ImportError:
-        # Can't import the classifier — treat as native so misses don't hard-fail.
-        return True
+
+def aggregate_escape_hatch(summaries: List[Dict]) -> Dict:
+    """Aggregate per-session ``TOOL_LOADER_SESSION`` summaries into a per-turn rate.
+
+    Only ``gaia chat``/CLI logs carry these summaries (they emit on
+    ``reset_session()``). For the canonical τ-tuning path — eval logs — use
+    :func:`escape_hatch_rate_from_log`, which derives the same rate from the raw
+    per-turn lines that are always present. The rate (free non-tool-calling
+    recovery + native ``load_tools``, per turn) is the tuning signal: rising ⇒ τ
+    too strict; the two component counts are kept separate so the tuner sees
+    which path fired.
+    """
+    turns = sum(int(s.get("turns", 0)) for s in summaries)
+    escape = sum(int(s.get("escape_hatch_count", 0)) for s in summaries)
+    loads = sum(int(s.get("load_tools_count", 0)) for s in summaries)
+    return {
+        "sessions": len(summaries),
+        "turns": turns,
+        "escape_hatch_count": escape,
+        "load_tools_count": loads,
+        "escape_hatch_rate": (escape + loads) / max(turns, 1),
+    }
+
+
+def count_recovery_events_from_log(text: str) -> tuple:
+    """Count the two escape-hatch recovery paths from raw per-turn log lines.
+
+    Returns ``(free_recovery_count, load_tools_count)`` — free non-tool-calling
+    recovery (``TOOL_LOADER_ESCAPE_HATCH`` lines) and explicit native recovery
+    (``TOOL_LOADER {… "event": "load_tools" …}`` lines). These per-turn lines are
+    emitted on **every** run (eval and CLI), independent of ``reset_session()``,
+    so they — not the per-session ``TOOL_LOADER_SESSION`` summary — are the
+    source of truth for the activation rate. (The UI-server/eval path never calls
+    ``reset_session()``, so eval logs carry no summary; #1450.)
+    """
+    free = loads = 0
+    for line in text.splitlines():
+        if _ESCAPE_HATCH_RE.search(line):
+            free += 1
+            continue
+        m = _TOOL_LOADER_RE.search(line)
+        if m and json.loads(m.group(1)).get("event") == "load_tools":
+            loads += 1
+    return free, loads
+
+
+def escape_hatch_rate_from_log(
+    text: str, loaded_per_scenario: List[List[List[str]]]
+) -> Dict:
+    """Per-turn escape-hatch activation rate derived from the raw log.
+
+    ``rate = (free recoveries + explicit load_tools) / total turns``, where total
+    turns is the number of per-turn selection lines across all scenarios. Works
+    on eval logs (which lack ``TOOL_LOADER_SESSION``) — this is the τ-tuning
+    signal the recall gate reports.
+    """
+    free, loads = count_recovery_events_from_log(text)
+    turns = sum(len(scenario) for scenario in loaded_per_scenario)
+    return {
+        "turns": turns,
+        "free_recovery_count": free,
+        "load_tools_count": loads,
+        "escape_hatch_rate": (free + loads) / max(turns, 1),
+        "session_summaries": len(parse_session_summaries_from_log(text)),
+    }
 
 
 # ── CLI ───────────────────────────────────────────────────────────────────
@@ -197,9 +297,9 @@ def _discover_log(run_dir: Path) -> Optional[Path]:
     return None
 
 
-def _format_report(report: RecallReport, native: bool) -> str:
+def _format_report(report: RecallReport, escape_hatch: Optional[Dict] = None) -> str:
     lines = [
-        "# Tool-recall gate (#1449)",
+        "# Tool-recall gate (#1449, #1450)",
         "",
         f"Turns scored: {len(report.turns)}  |  recall: {report.recall:.1%}",
         "",
@@ -210,8 +310,9 @@ def _format_report(report: RecallReport, native: bool) -> str:
         lines.append("")
     misses = [t for t in report.turns if not t.ok]
     if misses:
-        label = "known gap (native model, Part 2)" if native else "RECALL MISS"
-        lines.append(f"## {label}")
+        # Part 2 removed the native exemption: a miss is a miss on every model
+        # (native models recover via load_tools, which the parser unions in).
+        lines.append("## RECALL MISS")
         for t in misses:
             lines.append(
                 f"- scenario {t.scenario_idx} turn {t.turn_idx}: called "
@@ -219,6 +320,21 @@ def _format_report(report: RecallReport, native: bool) -> str:
             )
     else:
         lines.append("All called tools were loaded when called. ✅")
+    if escape_hatch is not None:
+        lines.extend(
+            [
+                "",
+                "## Escape-hatch activation (τ-tuning signal)",
+                f"turns: {escape_hatch['turns']}  |  rate/turn: "
+                f"{escape_hatch['escape_hatch_rate']:.3f} "
+                f"(free recovery: {escape_hatch['free_recovery_count']}, "
+                f"load_tools: {escape_hatch['load_tools_count']})  — "
+                "rising ⇒ τ too strict.",
+                f"(derived from per-turn log events; "
+                f"{escape_hatch['session_summaries']} TOOL_LOADER_SESSION "
+                "summaries present)",
+            ]
+        )
     return "\n".join(lines)
 
 
@@ -242,7 +358,8 @@ def main(argv: Optional[List[str]] = None) -> int:
         "--min-recall",
         type=float,
         default=1.0,
-        help="Minimum recall for a PASS on non-native models (default: 1.0).",
+        help="Minimum recall for a PASS (default: 1.0). Applies to every model "
+        "— Part 2 removed the native exemption.",
     )
     args = parser.parse_args(argv)
 
@@ -264,7 +381,8 @@ def main(argv: Optional[List[str]] = None) -> int:
             "(`... 2>&1 | tee server.log`, NOT `2> server.log`) and pass it via "
             "--log. The loader emits one `TOOL_LOADER {json}` line per turn."
         )
-    loaded = parse_loaded_sets_from_log(log_path.read_text(encoding="utf-8"))
+    log_text = log_path.read_text(encoding="utf-8")
+    loaded = parse_loaded_sets_from_log(log_text)
     if not loaded:
         raise SystemExit(
             f"{log_path} contained no TOOL_LOADER selection lines — was the loader "
@@ -273,13 +391,14 @@ def main(argv: Optional[List[str]] = None) -> int:
     called = parse_called_sets_from_scorecard(scorecard)
 
     report = compute_recall(loaded, called)
-    native = _model_is_native(scorecard)
-    print(_format_report(report, native))
+    # Derive the τ-tuning rate from the raw per-turn events (present in eval logs);
+    # the per-session TOOL_LOADER_SESSION summary only exists on the CLI path.
+    escape_hatch = escape_hatch_rate_from_log(log_text, loaded)
+    print(_format_report(report, escape_hatch))
 
-    if not native and report.recall < args.min_recall:
+    if report.recall < args.min_recall:
         print(
-            f"\nFAIL: recall {report.recall:.1%} < {args.min_recall:.1%} "
-            "on a non-native model.",
+            f"\nFAIL: recall {report.recall:.1%} < {args.min_recall:.1%}.",
             file=sys.stderr,
         )
         return 1
diff --git a/tests/unit/test_chat_dynamic_tools.py b/tests/unit/test_chat_dynamic_tools.py
index fce83f558..cd4edb362 100644
--- a/tests/unit/test_chat_dynamic_tools.py
+++ b/tests/unit/test_chat_dynamic_tools.py
@@ -1,12 +1,13 @@
 # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
-"""ChatAgent wiring for the dynamic tool loader (#1449).
+"""ChatAgent wiring for the dynamic tool loader (#1449, Part 2 #1450).
 
 Covers the ChatAgent-level glue without a Lemonade backend: loader construction
 gating (profile + toggle + env), the three off-states reverting to the full
 registry (``None`` filter), the selection-query builder, the LRU record hook,
-env-override parsing (incl. loud failure on malformed values), and the
-native-model known-gap warning.
+env-override parsing (incl. loud failure on malformed values), the ``load_tools``
+escape hatch + native-only menu, and that the Part-1 native known-gap warning is
+gone now that Part 2 closes the gap.
 
 ChatAgent is built via ``__new__`` with only the attributes each method needs —
 ``Agent.__init__`` (Lemonade) is never run.
@@ -32,6 +33,7 @@
 
 from gaia.agents.base.tool_loader import ToolLoader  # noqa: E402
 from gaia.agents.chat.agent import ChatAgent, ChatAgentConfig  # noqa: E402
+from gaia.eval.tool_cost import build_doc_agent_skeleton  # noqa: E402
 
 for _mod in _stubbed:
     sys.modules.pop(_mod, None)
@@ -44,7 +46,6 @@ def _bare_agent(**attrs) -> ChatAgent:
     a.conversation_history = []
     a.tool_loader = None
     a._memory_store = object()
-    a._dynamic_tools_native_warned = False
     a._dynamic_tools_validated = False
     a.model_id = None
     for k, v in attrs.items():
@@ -183,7 +184,7 @@ def test_query_builder_excludes_assistant_and_truncates():
     assert q.endswith("C" * 100)  # current turn always fully included
 
 
-# ── record hook + known gap ───────────────────────────────────────────────
+# ── record hook ────────────────────────────────────────────────────────────
 
 
 def test_on_tool_invoked_forwards_to_loader():
@@ -198,29 +199,119 @@ def test_on_tool_invoked_noop_when_no_loader():
     a._on_tool_invoked("read_file")  # must not raise
 
 
-def test_native_model_known_gap_warned_once(caplog):
+def test_native_model_no_longer_warns_known_gap(caplog):
+    """Part 2 (#1450) closed the native gap via load_tools — the warning is gone."""
     loader = MagicMock()
     loader.session_disabled = False
     loader.select.return_value = ["c1"]
-    a = _bare_agent(tool_loader=loader, model_id="Gemma-4-E4B-it-GGUF")
+    a = _bare_agent(tool_loader=loader, model_id="Gemma-4-E4B-it-GGUF")  # native
     with patch.object(
         ChatAgent, "_tools_registry", new_callable=lambda: property(lambda self: {})
     ):
         with caplog.at_level(logging.WARNING):
             a._select_tools_for_turn("q1")
             a._select_tools_for_turn("q2")
-    gap_logs = [r for r in caplog.records if "known gap" in r.getMessage()]
-    assert len(gap_logs) == 1  # logged exactly once
+    assert not any(
+        "known gap" in r.getMessage() or "no escape hatch" in r.getMessage()
+        for r in caplog.records
+    )
 
 
-def test_non_native_model_no_known_gap_warning(caplog):
-    loader = MagicMock()
-    loader.session_disabled = False
-    loader.select.return_value = ["c1"]
-    a = _bare_agent(tool_loader=loader, model_id=None)  # non-tool-calling
-    with patch.object(
-        ChatAgent, "_tools_registry", new_callable=lambda: property(lambda self: {})
-    ):
-        with caplog.at_level(logging.WARNING):
-            a._select_tools_for_turn("q1")
-    assert not any("known gap" in r.getMessage() for r in caplog.records)
+# ── _apply_tool_filter invariant (Part 2 mid-loop recovery) ────────────────
+
+
+def test_apply_tool_filter_swaps_filter_and_recomputes_prompt():
+    """The base helper moves the filter and the cached prompt together."""
+    a = ChatAgent.__new__(ChatAgent)
+    a.observers = []  # quiet __del__ during GC
+    a._active_tool_filter = None
+    a._system_prompt_cache = "OLD"
+    a._compose_system_prompt = lambda: f"PROMPT::{a._active_tool_filter}"
+    a._apply_tool_filter(["load_tools", "search_file"])
+    assert a._active_tool_filter == ["load_tools", "search_file"]
+    assert a._system_prompt_cache == "PROMPT::['load_tools', 'search_file']"
+
+
+# ── load_tools registration + handler (Part 2, #1450) ──────────────────────
+
+
+def test_load_tools_registered_only_when_loader_active():
+    on = build_doc_agent_skeleton(profile="doc", deterministic=True, dynamic_tools=True)
+    off = build_doc_agent_skeleton(
+        profile="doc", deterministic=True, dynamic_tools=False
+    )
+    assert "load_tools" in on._tools_registry
+    assert "load_tools" not in off._tools_registry
+
+
+def test_load_tools_handler_admits_bundle_and_applies_filter():
+    agent = build_doc_agent_skeleton(
+        profile="doc", deterministic=True, dynamic_tools=True
+    )
+    applied: dict = {}
+    agent._apply_tool_filter = lambda f: applied.__setitem__("filter", f)
+    load_tools = agent._tools_registry["load_tools"]["function"]
+
+    result = load_tools("file_search")
+    assert result["status"] == "success"
+    assert result["bundle"] == "file_search"
+    # The bundle's tools are now in the loaded set, and that set was applied as
+    # the active filter so the next model step sees them.
+    assert "search_file" in result["loaded_tools"]
+    assert applied["filter"] == result["loaded_tools"]
+
+
+def test_load_tools_handler_resolves_bare_tool_name():
+    agent = build_doc_agent_skeleton(
+        profile="doc", deterministic=True, dynamic_tools=True
+    )
+    agent._apply_tool_filter = lambda f: None
+    load_tools = agent._tools_registry["load_tools"]["function"]
+    result = load_tools("search_file")  # bare tool name → its bundle
+    assert result["status"] == "success"
+    assert "search_file" in result["loaded_tools"]
+
+
+def test_load_tools_handler_unknown_bundle_returns_actionable_error():
+    agent = build_doc_agent_skeleton(
+        profile="doc", deterministic=True, dynamic_tools=True
+    )
+    agent._apply_tool_filter = lambda f: None
+    load_tools = agent._tools_registry["load_tools"]["function"]
+    result = load_tools("does_not_exist")
+    assert result["status"] == "error"
+    assert "Unknown bundle 'does_not_exist'" in result["error"]
+    assert "file_search" in result["error"]  # lists valid bundle names
+
+
+# ── native-only escape-hatch menu ──────────────────────────────────────────
+
+
+def test_native_doc_prompt_includes_load_tools_menu():
+    agent = build_doc_agent_skeleton(
+        profile="doc", deterministic=True, dynamic_tools=True
+    )
+    agent.rag = None  # no-docs branch keeps _get_system_prompt light
+    prompt = agent._get_system_prompt()
+    assert "LOADABLE TOOL BUNDLES" in prompt
+    assert "load_tools(bundle)" in prompt
+    assert "- file_search:" in prompt  # a real bundle line from the menu
+
+
+def test_non_native_doc_prompt_omits_load_tools_menu():
+    agent = build_doc_agent_skeleton(
+        profile="doc", deterministic=True, dynamic_tools=True
+    )
+    agent.rag = None
+    agent.model_id = None  # non-tool-calling → free recovery, no menu
+    prompt = agent._get_system_prompt()
+    assert "LOADABLE TOOL BUNDLES" not in prompt
+
+
+def test_loader_off_doc_prompt_omits_load_tools_menu():
+    agent = build_doc_agent_skeleton(
+        profile="doc", deterministic=True, dynamic_tools=False
+    )
+    agent.rag = None
+    prompt = agent._get_system_prompt()
+    assert "LOADABLE TOOL BUNDLES" not in prompt
diff --git a/tests/unit/test_chat_tool_bundles.py b/tests/unit/test_chat_tool_bundles.py
index d274d9d8e..6782115d6 100644
--- a/tests/unit/test_chat_tool_bundles.py
+++ b/tests/unit/test_chat_tool_bundles.py
@@ -28,7 +28,11 @@ def _bundle_union() -> set[str]:
 
 
 def test_core_and_bundles_cover_doc_registry_exactly():
-    agent = build_doc_agent_skeleton(profile="doc", deterministic=True)
+    # Loader-on skeleton so the CORE-only load_tools meta-tool (#1450) is
+    # registered — the doc registry must balance against CORE∪bundles with it.
+    agent = build_doc_agent_skeleton(
+        profile="doc", deterministic=True, dynamic_tools=True
+    )
     registry = set(agent._tools_registry)
     covered = _bundle_union()
 
@@ -44,12 +48,17 @@ def test_core_and_bundles_cover_doc_registry_exactly():
         f"CORE/bundle names absent from the doc registry: {dangling}. "
         "Remove them or fix the name — validate_registry rejects these at runtime."
     )
+    # The escape hatch is present in both CORE and the live registry (#1450).
+    assert "load_tools" in DOC_CORE_TOOLS
+    assert "load_tools" in registry
 
 
 def test_core_is_subset_of_bundle_union():
-    """CORE tools are also covered by bundles (the union is the registry)."""
-    covered = _bundle_union()
-    assert DOC_CORE_TOOLS <= covered
+    """Every CORE tool is in a bundle too, except the CORE-only load_tools (#1450)."""
+    bundle_members: set[str] = set()
+    for bundle in DOC_BUNDLES:
+        bundle_members |= set(bundle.members)
+    assert DOC_CORE_TOOLS - bundle_members == {"load_tools"}
 
 
 def test_bundles_have_unique_names():
diff --git a/tests/unit/test_tool_loader_selection.py b/tests/unit/test_tool_loader_selection.py
index 06ab635a3..43cfc29df 100644
--- a/tests/unit/test_tool_loader_selection.py
+++ b/tests/unit/test_tool_loader_selection.py
@@ -255,6 +255,124 @@ def test_record_tool_use_logs_escape_hatch_for_unloaded():
     assert any("TOOL_LOADER_ESCAPE_HATCH" in r.getMessage() for r in records)
 
 
+# ── load_bundle / menu / counters (Part 2, #1450) ──────────────────────────
+
+
+def _loader_with_bundles(max_tools: int = 14):
+    """A loader over a tiny CORE + two bundles, with a never-matching embedder."""
+    tools = ["c1", "a1", "a2", "b1"]
+    embed = _make_embed_fn(tools, {"q": {"c1": 0.0, "a1": 0.0, "a2": 0.0, "b1": 0.0}})
+    bundles = [
+        ToolBundle(name="A", members=frozenset({"a1", "a2"}), description="A tools"),
+        ToolBundle(name="B", members=frozenset({"b1"}), description="B tools"),
+    ]
+    loader = ToolLoader(
+        frozenset({"c1"}), bundles, embed, threshold=0.55, max_tools=max_tools
+    )
+    return loader, _registry(tools)
+
+
+def test_bundle_names_are_sorted():
+    loader, _ = _loader_with_bundles()
+    assert loader.bundle_names() == ["A", "B"]
+
+
+def test_format_bundle_menu_lists_name_and_description():
+    loader, _ = _loader_with_bundles()
+    menu = loader.format_bundle_menu()
+    assert "- A: A tools" in menu
+    assert "- B: B tools" in menu
+
+
+def test_load_bundle_by_bundle_name_admits_members():
+    loader, reg = _loader_with_bundles()
+    loader.select("q", reg)  # turn 1: CORE only (c1)
+    loaded = loader.load_bundle("A", reg)
+    assert {"c1", "a1", "a2"} <= set(loaded)
+    assert loader._load_tools_count == 1
+
+
+def test_load_bundle_by_tool_name_resolves_to_owning_bundle():
+    loader, reg = _loader_with_bundles()
+    loader.select("q", reg)
+    loaded = loader.load_bundle("a1", reg)  # bare tool name → bundle A
+    assert {"a1", "a2"} <= set(loaded)
+
+
+def test_load_bundle_unknown_name_raises_keyerror():
+    loader, reg = _loader_with_bundles()
+    loader.select("q", reg)
+    with pytest.raises(KeyError):
+        loader.load_bundle("does_not_exist", reg)
+
+
+def test_load_bundle_skips_members_absent_from_registry():
+    tools = ["c1", "a1"]
+    embed = _make_embed_fn(tools, {"q": {"c1": 0.0, "a1": 0.0}})
+    bundles = [ToolBundle(name="A", members=frozenset({"a1", "ghost"}))]
+    loader = ToolLoader(frozenset({"c1"}), bundles, embed, threshold=0.55, max_tools=14)
+    reg = _registry(tools)
+    loader.select("q", reg)
+    loaded = loader.load_bundle("A", reg)
+    assert "a1" in loaded and "ghost" not in loaded
+
+
+def test_load_bundle_is_cap_aware_and_protects_just_loaded():
+    """At cap, load_bundle evicts an LRU non-CORE tool, never CORE or just-loaded."""
+    tools = ["c1", "d1", "a1", "a2"]
+    embed = _make_embed_fn(tools, {"q": {"c1": 0.0, "d1": 0.9, "a1": 0.0, "a2": 0.0}})
+    bundles = [ToolBundle(name="A", members=frozenset({"a1", "a2"}), description="A")]
+    loader = ToolLoader(frozenset({"c1"}), bundles, embed, threshold=0.55, max_tools=3)
+    reg = _registry(tools)
+    assert loader.select("q", reg) == ["c1", "d1"]  # CORE + matched d1 (2 of 3)
+    loaded = loader.load_bundle("A", reg)  # wants a1,a2 with 1 slot free → evict
+    assert set(loaded) == {"c1", "a1", "a2"}  # cap held; d1 evicted
+    assert "d1" not in loaded
+
+
+def test_load_bundle_emits_same_turn_loaded_superset_line():
+    loader, reg = _loader_with_bundles()
+    loader.select("q", reg)
+    with _capture("gaia.agents.base.tool_loader") as records:
+        loader.load_bundle("A", reg)
+    events = [p for p in _loader_payloads(records) if p.get("event") == "load_tools"]
+    assert events, "no load_tools TOOL_LOADER line captured"
+    assert events[0]["turn"] == loader._turn
+    assert {"a1", "a2"} <= set(events[0]["loaded"])
+
+
+def test_escape_hatch_and_load_counters_increment():
+    loader, reg = _loader_with_bundles()
+    loader.select("q", reg)
+    loader.record_tool_use("never_loaded")  # free recovery
+    loader.load_bundle("A", reg)  # explicit recovery
+    assert loader._escape_hatch_count == 1
+    assert loader._load_tools_count == 1
+
+
+def test_reset_session_emits_summary_then_zeroes_counters():
+    loader, reg = _loader_with_bundles()
+    loader.select("q", reg)
+    loader.record_tool_use("never_loaded")
+    loader.load_bundle("A", reg)
+    with _capture("gaia.agents.base.tool_loader") as records:
+        loader.reset_session()
+    summary = _session_payload(records)
+    assert summary["turns"] == 1
+    assert summary["escape_hatch_count"] == 1
+    assert summary["load_tools_count"] == 1
+    assert summary["escape_hatch_rate"] == pytest.approx(2.0)  # (1+1)/1
+    assert loader._escape_hatch_count == 0
+    assert loader._load_tools_count == 0
+
+
+def test_reset_session_emits_no_summary_when_no_turns():
+    loader, _ = _loader_with_bundles()
+    with _capture("gaia.agents.base.tool_loader") as records:
+        loader.reset_session()  # turn == 0 → nothing to summarize
+    assert not any("TOOL_LOADER_SESSION" in r.getMessage() for r in records)
+
+
 # ── embedder failure ─────────────────────────────────────────────────────
 
 
@@ -343,10 +461,28 @@ def __exit__(self, *exc) -> None:
         self._logger.propagate = self._prev_propagate
 
 
-def _selection_payload(records: list[logging.LogRecord]) -> dict:
-    """Extract the JSON payload from the TOOL_LOADER selection log line."""
+def _loader_payloads(records: list[logging.LogRecord]) -> list[dict]:
+    """All JSON payloads from ``TOOL_LOADER {...}`` lines (selection + load_tools)."""
+    out: list[dict] = []
     for r in records:
         msg = r.getMessage()
         if msg.startswith("TOOL_LOADER {"):
-            return json.loads(msg[len("TOOL_LOADER ") :])
+            out.append(json.loads(msg[len("TOOL_LOADER ") :]))
+    return out
+
+
+def _selection_payload(records: list[logging.LogRecord]) -> dict:
+    """Extract the JSON payload from the TOOL_LOADER selection log line."""
+    for payload in _loader_payloads(records):
+        if "event" not in payload:  # the per-turn select line (not load_tools)
+            return payload
     raise AssertionError("no TOOL_LOADER selection line captured")
+
+
+def _session_payload(records: list[logging.LogRecord]) -> dict:
+    """Extract the JSON payload from the TOOL_LOADER_SESSION summary line."""
+    for r in records:
+        msg = r.getMessage()
+        if msg.startswith("TOOL_LOADER_SESSION {"):
+            return json.loads(msg[len("TOOL_LOADER_SESSION ") :])
+    raise AssertionError("no TOOL_LOADER_SESSION line captured")
diff --git a/tests/unit/test_tool_recall.py b/tests/unit/test_tool_recall.py
index 16e53ac80..d0d2e9cbf 100644
--- a/tests/unit/test_tool_recall.py
+++ b/tests/unit/test_tool_recall.py
@@ -8,9 +8,13 @@
 from __future__ import annotations
 
 from gaia.eval.tool_recall import (
+    aggregate_escape_hatch,
     compute_recall,
+    count_recovery_events_from_log,
+    escape_hatch_rate_from_log,
     parse_called_sets_from_scorecard,
     parse_loaded_sets_from_log,
+    parse_session_summaries_from_log,
 )
 
 
@@ -107,6 +111,149 @@ def test_parse_called_sets_from_scorecard():
     assert called == [[["read_file"], []], [["remember"]]]
 
 
+# ── Part 2 (#1450): load_tools coalesce + gate flip ────────────────────────
+
+
+def test_parse_loaded_sets_unions_load_tools_lines_within_a_turn():
+    """A mid-loop load_tools line unions into its turn (not a new turn/scenario)."""
+    log = "\n".join(
+        [
+            'TOOL_LOADER {"turn": 1, "loaded": ["read_file", "load_tools"]}',
+            'TOOL_LOADER {"turn": 1, "event": "load_tools", "bundle": '
+            '"file_search", "loaded": ["read_file", "load_tools", "search_file"]}',
+            'TOOL_LOADER {"turn": 2, "loaded": ["read_file", "load_tools", '
+            '"search_file"]}',
+        ]
+    )
+    scenarios = parse_loaded_sets_from_log(log)
+    assert len(scenarios) == 1
+    assert len(scenarios[0]) == 2  # two turns, not three log lines
+    assert scenarios[0][0] == ["load_tools", "read_file", "search_file"]  # unioned
+
+
+def test_parse_loaded_sets_splits_consecutive_single_turn_scenarios():
+    """Two single-turn scenarios still split — only event-less lines move cursor."""
+    log = "\n".join(
+        [
+            'TOOL_LOADER {"turn": 1, "loaded": ["read_file"]}',
+            'TOOL_LOADER {"turn": 1, "loaded": ["remember"]}',
+        ]
+    )
+    assert parse_loaded_sets_from_log(log) == [[["read_file"]], [["remember"]]]
+
+
+def test_load_tools_call_is_always_satisfied():
+    """Calling load_tools never counts as a recall miss (it is always-on CORE)."""
+    loaded = [[["read_file"]]]
+    called = [[["read_file", "load_tools"]]]
+    report = compute_recall(loaded, called)
+    assert report.recall == 1.0
+    assert report.all_missing == []
+
+
+def test_native_recovery_within_turn_passes_gate():
+    """A tool surfaced mid-turn via load_tools is in the loaded set when called."""
+    log = "\n".join(
+        [
+            'TOOL_LOADER {"turn": 1, "loaded": ["read_file", "load_tools"]}',
+            'TOOL_LOADER {"turn": 1, "event": "load_tools", "bundle": '
+            '"file_search", "loaded": ["read_file", "load_tools", "search_file"]}',
+        ]
+    )
+    scorecard = {
+        "scenarios": [{"turns": [{"agent_tools": ["load_tools", "search_file"]}]}]
+    }
+    report = compute_recall(
+        parse_loaded_sets_from_log(log),
+        parse_called_sets_from_scorecard(scorecard),
+    )
+    assert report.recall == 1.0
+
+
+def test_unrecovered_miss_still_counts_against_recall():
+    """A semantic miss with no load_tools recovery fails the gate (exemption gone)."""
+    log = 'TOOL_LOADER {"turn": 1, "loaded": ["read_file", "load_tools"]}'
+    scorecard = {"scenarios": [{"turns": [{"agent_tools": ["search_file"]}]}]}
+    report = compute_recall(
+        parse_loaded_sets_from_log(log),
+        parse_called_sets_from_scorecard(scorecard),
+    )
+    assert report.recall == 0.0
+    assert report.all_missing == ["search_file"]
+
+
+# ── escape-hatch session summaries (τ-tuning signal) ───────────────────────
+
+
+def test_parse_and_aggregate_session_summaries():
+    log = "\n".join(
+        [
+            'TOOL_LOADER_SESSION {"turns": 4, "escape_hatch_count": 1, '
+            '"load_tools_count": 1, "escape_hatch_rate": 0.5}',
+            'TOOL_LOADER_SESSION {"turns": 6, "escape_hatch_count": 0, '
+            '"load_tools_count": 2, "escape_hatch_rate": 0.333}',
+        ]
+    )
+    summaries = parse_session_summaries_from_log(log)
+    assert len(summaries) == 2
+    agg = aggregate_escape_hatch(summaries)
+    assert agg["sessions"] == 2
+    assert agg["turns"] == 10  # 4 + 6
+    assert agg["escape_hatch_count"] == 1  # 1 + 0
+    assert agg["load_tools_count"] == 3  # 1 + 2
+    assert agg["escape_hatch_rate"] == (1 + 3) / 10
+
+
+def test_count_recovery_events_from_log():
+    """Both escape-hatch paths counted from raw per-turn lines (no summary needed)."""
+    log = "\n".join(
+        [
+            'TOOL_LOADER {"turn": 1, "loaded": ["read_file", "load_tools"]}',
+            'TOOL_LOADER {"turn": 1, "event": "load_tools", "bundle": '
+            '"file_search", "loaded": ["read_file", "load_tools", "search_file"]}',
+            '{"event": "TOOL_LOADER_ESCAPE_HATCH", "tool": "write_file", "turn": 2}',
+            'TOOL_LOADER {"turn": 2, "loaded": ["read_file", "load_tools", '
+            '"search_file"]}',
+        ]
+    )
+    free, loads = count_recovery_events_from_log(log)
+    assert free == 1  # the ESCAPE_HATCH line
+    assert loads == 1  # the load_tools event line
+
+
+def test_escape_hatch_rate_from_log_works_without_session_summary():
+    """Eval case: no TOOL_LOADER_SESSION line, rate derived from per-turn events."""
+    log = "\n".join(
+        [
+            'TOOL_LOADER {"turn": 1, "loaded": ["load_tools"]}',
+            'TOOL_LOADER {"turn": 1, "event": "load_tools", "bundle": '
+            '"file_search", "loaded": ["load_tools", "search_file"]}',
+            'TOOL_LOADER {"turn": 2, "loaded": ["load_tools", "search_file"]}',
+            'TOOL_LOADER {"turn": 2, "event": "load_tools", "bundle": '
+            '"rag_index", "loaded": ["load_tools", "search_file", "index_document"]}',
+        ]
+    )
+    loaded = parse_loaded_sets_from_log(log)  # 1 scenario, 2 turns
+    eh = escape_hatch_rate_from_log(log, loaded)
+    assert eh["turns"] == 2
+    assert eh["load_tools_count"] == 2
+    assert eh["free_recovery_count"] == 0
+    assert eh["escape_hatch_rate"] == 1.0  # (0 + 2) / 2 — high ⇒ τ too strict here
+    assert eh["session_summaries"] == 0  # eval logs carry none
+
+
+def test_session_and_selection_parsers_do_not_cross_contaminate():
+    log = "\n".join(
+        [
+            'TOOL_LOADER {"turn": 1, "loaded": ["read_file"]}',
+            'TOOL_LOADER_SESSION {"turns": 1, "escape_hatch_count": 0, '
+            '"load_tools_count": 0, "escape_hatch_rate": 0.0}',
+        ]
+    )
+    assert parse_loaded_sets_from_log(log) == [[["read_file"]]]
+    assert len(parse_session_summaries_from_log(log)) == 1
+
+
 def test_end_to_end_log_and_scorecard_join():
     log = "\n".join(
         [

From ecf62c44cecc879bcc9bc9e4a002c7901a154179 Mon Sep 17 00:00:00 2001
From: Alexey Tyurin <>
Date: Thu, 18 Jun 2026 21:11:18 -0500
Subject: [PATCH 2/3] fix(tool-loader): correct stale cap comments and tighten
 escape-hatch tests

---
 src/gaia/agents/base/tool_loader.py         |  5 ++--
 src/gaia/agents/chat/agent.py               | 11 +++----
 tests/unit/test_tool_loader_token_budget.py | 33 +++++++++++++++++----
 3 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/src/gaia/agents/base/tool_loader.py b/src/gaia/agents/base/tool_loader.py
index d7003d4fb..f1d9f14c1 100644
--- a/src/gaia/agents/base/tool_loader.py
+++ b/src/gaia/agents/base/tool_loader.py
@@ -57,8 +57,9 @@
 # tools (index/summarize/RAG) for doc-oriented turns while excluding lower-
 # scoring noise; plain content questions fall back to the CORE set. Overridable.
 DEFAULT_THRESHOLD = 0.20
-# Default cap: 10 CORE + 4 dynamic slots = 14 (≈62% shrink on the 37-tool doc
-# profile, clears the ≥60% Part-0 TTFT-reduction gate). See the plan deviations.
+# Default cap: 11 CORE (doc profile, incl. the load_tools escape hatch) + 3
+# dynamic slots = 14 (≈62% shrink on the 37-tool doc profile, clears the
+# ≥60% Part-0 TTFT-reduction gate). See the plan deviations.
 DEFAULT_MAX_TOOLS = 14
 
 
diff --git a/src/gaia/agents/chat/agent.py b/src/gaia/agents/chat/agent.py
index f9754162b..a3014a1fd 100644
--- a/src/gaia/agents/chat/agent.py
+++ b/src/gaia/agents/chat/agent.py
@@ -136,7 +136,7 @@ class ChatAgentConfig:
     # __init__: GAIA_DYNAMIC_TOOLS / GAIA_DYNAMIC_TOOLS_TAU / GAIA_DYNAMIC_TOOLS_MAX.
     dynamic_tools: bool = False
     dynamic_tools_threshold: float = 0.20  # inclusive cosine; calibrated #1449
-    dynamic_tools_max: int = 14  # cap (10 CORE + 4 dynamic slots)
+    dynamic_tools_max: int = 14  # cap (11 CORE + 3 dynamic slots)
 
     # Per-agent identity for the connectors activation filter (#1005).
     # Must be set BEFORE ``Agent.__init__`` runs ``_register_tools``, because
@@ -1214,13 +1214,10 @@ def load_tools(bundle: str) -> dict:
                     Dictionary with status, the resolved bundle, and the full
                     loaded_tools list now available to call.
                 """
+                # load_tools is registered only inside ``if self.tool_loader is
+                # not None`` and the loader is never re-nulled after construction,
+                # so the loader is always live here.
                 loader = self.tool_loader
-                if loader is None:
-                    return {
-                        "status": "error",
-                        "error": "Dynamic tool loading is not active; all tools "
-                        "are already available.",
-                    }
                 try:
                     loaded = loader.load_bundle(bundle, self._tools_registry)
                 except KeyError:
diff --git a/tests/unit/test_tool_loader_token_budget.py b/tests/unit/test_tool_loader_token_budget.py
index 4a581662d..5c9cb433f 100644
--- a/tests/unit/test_tool_loader_token_budget.py
+++ b/tests/unit/test_tool_loader_token_budget.py
@@ -68,10 +68,29 @@ def _within(value: float, baseline: float, tol: float = TOLERANCE) -> bool:
 
 @pytest.fixture(scope="module")
 def doc_agent():
-    """A deterministic doc-profile skeleton (built once for the module)."""
+    """A deterministic doc-profile skeleton (built once for the module).
+
+    Loader-off, so the registry is the pinned 37-tool unfiltered baseline
+    (``load_tools`` is *not* registered) — keep it that way for the baseline
+    and slope/distribution pins below.
+    """
     return build_doc_agent_skeleton(profile="doc", deterministic=True)
 
 
+@pytest.fixture(scope="module")
+def doc_agent_loader_on():
+    """Doc skeleton with the loader active, so ``load_tools`` is registered.
+
+    The CORE-floor guard must measure the set that actually ships every active
+    turn, which includes the always-on ``load_tools`` escape hatch (#1450). The
+    loader-off ``doc_agent`` fixture omits it, and a filtered render silently
+    drops any name absent from the registry — so the floor would under-count.
+    """
+    return build_doc_agent_skeleton(
+        profile="doc", deterministic=True, dynamic_tools=True
+    )
+
+
 def test_harness_runs_and_pins_baseline(doc_agent):
     """The harness runs and the measured cost matches the pinned baseline."""
     cost = measure_tool_prompt_cost(doc_agent)
@@ -191,14 +210,18 @@ def _filtered_text_tokens(agent, names, tok) -> int:
     return len(tok.encode(agent._format_tools_for_prompt(filter_to=names)))
 
 
-def test_core_only_is_the_reduction_best_case(doc_agent):
-    """CORE-only (the always-on floor) renders well under half the baseline cost."""
+def test_core_only_is_the_reduction_best_case(doc_agent_loader_on):
+    """CORE-only (the always-on floor) renders well under half the baseline cost.
+
+    Uses the loader-on skeleton so ``load_tools`` — a CORE member that ships
+    every active turn — is in the registry and counted in the floor.
+    """
     tok = get_tokenizer()
     if tok is None:
         pytest.skip("tiktoken not installed — token proxy unavailable")
     core = sorted(DOC_CORE_TOOLS)
-    native = _filtered_native_tokens(doc_agent, core, tok)
-    text = _filtered_text_tokens(doc_agent, core, tok)
+    native = _filtered_native_tokens(doc_agent_loader_on, core, tok)
+    text = _filtered_text_tokens(doc_agent_loader_on, core, tok)
     # Headroom over the measured ~40% native / ~37% text so an incidental
     # docstring edit doesn't flip the gate, but real CORE bloat is caught.
     assert native <= 0.45 * BASELINE_NATIVE_TOKENS, (

From 6c05962dade27a077c5913e2cbdff78d395ca027 Mon Sep 17 00:00:00 2001
From: Alexey Tyurin <>
Date: Thu, 18 Jun 2026 22:44:06 -0500
Subject: [PATCH 3/3] fix(tool-loader): repair full-suite unit tests + review
 nits (#1450)

---
 src/gaia/agents/base/tool_loader.py       | 15 +++++++++------
 src/gaia/agents/chat/agent.py             | 19 +++++++++++--------
 src/gaia/eval/tool_recall.py              |  4 ++--
 tests/unit/test_dynamic_tool_filtering.py |  5 ++++-
 tests/unit/test_tool_loader_selection.py  |  6 +++++-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/src/gaia/agents/base/tool_loader.py b/src/gaia/agents/base/tool_loader.py
index f1d9f14c1..c4480cade 100644
--- a/src/gaia/agents/base/tool_loader.py
+++ b/src/gaia/agents/base/tool_loader.py
@@ -371,8 +371,7 @@ def load_bundle(self, bundle: str, registry: Dict[str, dict]) -> List[str]:
                 name — the caller turns this into an actionable error listing the
                 valid bundle names.
         """
-        members = self._resolve_bundle_members(bundle)
-        resolved_name = bundle
+        members, resolved_name = self._resolve_bundle_members(bundle)
 
         protected = set(self._core) | set(members)
         sel = _Selection()
@@ -409,18 +408,22 @@ def load_bundle(self, bundle: str, registry: Dict[str, dict]) -> List[str]:
 
     # ── internals ────────────────────────────────────────────────────────
 
-    def _resolve_bundle_members(self, bundle: str) -> FrozenSet[str]:
-        """Resolve *bundle* to its member set, or raise ``KeyError``.
+    def _resolve_bundle_members(self, bundle: str) -> tuple[FrozenSet[str], str]:
+        """Resolve *bundle* to ``(members, resolved_name)``, or raise ``KeyError``.
 
         Exact bundle-name match first; else a bare tool name resolved to the
         union of its owning bundles' members via the reverse index.
+        ``resolved_name`` is the matched bundle name (exact match) or the owning
+        bundle name(s) joined with ``+`` (tool-name match), so the ``load_tools``
+        log line records the bundle actually pulled, not the bare tool name.
         """
         for b in self._bundles:
             if b.name == bundle:
-                return b.members
+                return b.members, b.name
         owning = self._tool_to_bundles.get(bundle)
         if owning:
-            return frozenset().union(*(b.members for b in owning))
+            members = frozenset().union(*(b.members for b in owning))
+            return members, "+".join(b.name for b in owning)
         raise KeyError(bundle)
 
     def _admit(self, name: str, sel: _Selection) -> None:
diff --git a/src/gaia/agents/chat/agent.py b/src/gaia/agents/chat/agent.py
index a3014a1fd..3f82b79d5 100644
--- a/src/gaia/agents/chat/agent.py
+++ b/src/gaia/agents/chat/agent.py
@@ -879,7 +879,8 @@ def _get_system_prompt(self) -> str:
             # TTFT-sensitive case, so we don't tax them with the menu. Lives in
             # this stable prefix (before the volatile tools tail) → no KV thrash.
             load_tools_menu = ""
-            if self.tool_loader is not None and is_tool_calling_model(
+            loader = getattr(self, "tool_loader", None)
+            if loader is not None and is_tool_calling_model(
                 getattr(self, "model_id", None)
             ):
                 load_tools_menu = (
@@ -887,7 +888,7 @@ def _get_system_prompt(self) -> str:
                     "Your visible tools are trimmed to what this turn needs. If a "
                     "capability you need is missing, call load_tools(bundle) with "
                     "one of these names; its tools become available on your next "
-                    "step:\n" + self.tool_loader.format_bundle_menu()
+                    "step:\n" + loader.format_bundle_menu()
                 )
             return (
                 base_prompt
@@ -1201,14 +1202,16 @@ def load_tools(bundle: str) -> dict:
                 """Load a bundle of tools so you can call them on your next step.
 
                 Call this when the capability you need is not in your current
-                tool list — pick a bundle name from the "Loadable tool bundles"
-                menu in your instructions (a bare tool name also works; it loads
-                that tool's whole bundle). The bundle's tools become available on
-                your **next** step; then call the one you need.
+                tool list. If a "Loadable tool bundles" menu is shown in your
+                instructions, pick a bundle name from it; otherwise pass the name
+                of the specific tool you need and its bundle is loaded. The
+                bundle's tools become available on your **next** step; then call
+                the one you need.
 
                 Args:
-                    bundle: A bundle name from the menu (e.g. "file_search",
-                        "rag_index"), or a specific tool name to load its bundle.
+                    bundle: A bundle name (e.g. "file_search", "rag_index") — from
+                        the menu when one is shown — or a specific tool name to
+                        load its owning bundle.
 
                 Returns:
                     Dictionary with status, the resolved bundle, and the full
diff --git a/src/gaia/eval/tool_recall.py b/src/gaia/eval/tool_recall.py
index 8d08e2402..a9d822cd7 100644
--- a/src/gaia/eval/tool_recall.py
+++ b/src/gaia/eval/tool_recall.py
@@ -48,7 +48,7 @@
 import sys
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 _TOOL_LOADER_RE = re.compile(r"TOOL_LOADER (\{.*\})\s*$")
 _SESSION_RE = re.compile(r"TOOL_LOADER_SESSION (\{.*\})\s*$")
@@ -237,7 +237,7 @@ def aggregate_escape_hatch(summaries: List[Dict]) -> Dict:
     }
 
 
-def count_recovery_events_from_log(text: str) -> tuple:
+def count_recovery_events_from_log(text: str) -> Tuple[int, int]:
     """Count the two escape-hatch recovery paths from raw per-turn log lines.
 
     Returns ``(free_recovery_count, load_tools_count)`` — free non-tool-calling
diff --git a/tests/unit/test_dynamic_tool_filtering.py b/tests/unit/test_dynamic_tool_filtering.py
index b7854e15f..c3ef676bc 100644
--- a/tests/unit/test_dynamic_tool_filtering.py
+++ b/tests/unit/test_dynamic_tool_filtering.py
@@ -128,10 +128,13 @@ def _compose_system_prompt(self):
         self.compose_calls += 1
         return f"PROMPT::{self._active_tool_filter}"
 
-    # Bind the real method under test.
+    # Bind the real methods under test. ``_refresh_active_tool_filter`` now
+    # delegates the filter+prompt swap to ``_apply_tool_filter`` (#1450), so the
+    # spy must borrow both to exercise the real recompute-on-change path.
     from gaia.agents.base.agent import Agent
 
     _refresh_active_tool_filter = Agent._refresh_active_tool_filter
+    _apply_tool_filter = Agent._apply_tool_filter
 
 
 def test_recompute_only_on_change():
diff --git a/tests/unit/test_tool_loader_selection.py b/tests/unit/test_tool_loader_selection.py
index 43cfc29df..e8b0061dd 100644
--- a/tests/unit/test_tool_loader_selection.py
+++ b/tests/unit/test_tool_loader_selection.py
@@ -295,8 +295,12 @@ def test_load_bundle_by_bundle_name_admits_members():
 def test_load_bundle_by_tool_name_resolves_to_owning_bundle():
     loader, reg = _loader_with_bundles()
     loader.select("q", reg)
-    loaded = loader.load_bundle("a1", reg)  # bare tool name → bundle A
+    with _capture("gaia.agents.base.tool_loader") as records:
+        loaded = loader.load_bundle("a1", reg)  # bare tool name → bundle A
     assert {"a1", "a2"} <= set(loaded)
+    # The log records the resolved bundle ("A"), not the bare tool name ("a1").
+    events = [p for p in _loader_payloads(records) if p.get("event") == "load_tools"]
+    assert events and events[0]["bundle"] == "A"
 
 
 def test_load_bundle_unknown_name_raises_keyerror():