From 11c7144d5c0045412034da12292c39cd3908b5e8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 10:46:13 +0000
Subject: [PATCH 01/13] feat(code_mode): expose host-backed OS access to the
 sandbox

Sandboxed `run_code` had no way to reach the filesystem, environment, or
wall clock. Monty supports that access via an OS callback / `AbstractOS`
and directory mounts, but `CodeMode` never threaded `os`/`mount` into
`feed_start` or the snapshot resume loop, so callers couldn't enable it.

Add `os` and `mount` options on `CodeMode`/`CodeModeToolset`, thread them
through `feed_start` and every `resume` site (OS auto-dispatch stops the
moment a resume omits them), and make the `run_code` description reflect
whether host-backed access is configured.
---
 pydantic_ai_harness/code_mode/README.md      |  50 ++++++++-
 pydantic_ai_harness/code_mode/__init__.py    |   4 +-
 pydantic_ai_harness/code_mode/_capability.py |  33 +++++-
 pydantic_ai_harness/code_mode/_toolset.py    | 100 ++++++++++++++---
 tests/code_mode/test_code_mode.py            | 106 +++++++++++++++++++
 5 files changed, 270 insertions(+), 23 deletions(-)

diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md
index 62fdda9..22dedc4 100644
--- a/pydantic_ai_harness/code_mode/README.md
+++ b/pydantic_ai_harness/code_mode/README.md
@@ -140,22 +140,66 @@ for msg in result.all_messages():
             tool_returns = part.metadata['tool_returns'] # dict[str, ToolReturnPart]
 ```
 
+## Host-backed OS access
+
+By default the sandbox has no filesystem or clock: `os`/`pathlib` are importable but their I/O
+operations and `datetime.datetime.now()`/`datetime.date.today()` are unavailable. Pass `os` and/or
+`mount` to route those operations to a host-controlled implementation.
+
+```python
+from pydantic_monty import MountDir
+from pydantic_ai_harness import CodeMode
+
+# Expose a host directory inside the sandbox (read/write under /work):
+CodeMode(mount=MountDir('/work', '/tmp/agent-workspace'))
+
+# Or supply a custom OS implementation (an `AbstractOS` instance):
+from pydantic_monty import OSAccess
+CodeMode(os=OSAccess(environ={'STAGE': 'prod'}))
+
+# Or a raw callback `(function_name, args, kwargs) -> result`
+# (return `pydantic_monty.NOT_HANDLED` to fall back to Monty's default):
+from pydantic_monty import NOT_HANDLED
+
+def my_os(fn, args, kwargs):
+    if fn == 'os.getenv':
+        return lookup_secret(args[0])
+    return NOT_HANDLED
+
+CodeMode(os=my_os)
+```
+
+`os` accepts a `pydantic_monty.AbstractOS` instance or a raw callback; both are exposed as the
+`MontyOS` type alias. `mount` accepts one or more `pydantic_monty.MountDir`. To scope access per
+request (per user/session), pass a stateful `AbstractOS` -- for example one rooted at a
+caller-specific directory.
+
+When `os` or `mount` is set, the `run_code` description tells the model that `pathlib`, `os`,
+`datetime.now()`, and `date.today()` are routed to the host. `asyncio.sleep` and the `time` module
+remain unavailable regardless.
+
+> These options are Monty-specific: `CodeMode` is built directly on the Monty sandbox, so its OS
+> hooks use Monty's `AbstractOS`/`MountDir` types.
+
 ## Sandbox restrictions
 
 Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python subset. Key restrictions:
 
 - No class definitions
 - No third-party imports (allowed stdlib: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`)
-- No wall-clock or timing primitives: `asyncio.sleep`, `datetime.datetime.now()`/`datetime.date.today()`, and the `time` module are unavailable
+- No wall-clock or timing primitives: `asyncio.sleep`, `datetime.datetime.now()`/`datetime.date.today()`, and the `time` module are unavailable -- unless you wire up host-backed OS access (see above), which enables `datetime.now()`/`date.today()` (but not `asyncio.sleep`/`time`)
 - No `import *`
+- Filesystem and `os` I/O are unavailable unless an `os`/`mount` is configured
 - Tools requiring approval or with deferred execution are excluded from the sandbox
 
 ## API
 
 ```python
 CodeMode(
-    tools: ToolSelector = 'all',   # 'all', list[str], callable, or dict
-    max_retries: int = 3,          # retries on sandbox execution errors
+    tools: ToolSelector = 'all',        # 'all', list[str], callable, or dict
+    max_retries: int = 3,               # retries on sandbox execution errors
+    os: MontyOS | None = None,          # AbstractOS instance or (fn, args, kwargs) callback
+    mount: MontyMount | None = None,    # MountDir | list[MountDir] of host directories
 )
 ```
 
diff --git a/pydantic_ai_harness/code_mode/__init__.py b/pydantic_ai_harness/code_mode/__init__.py
index 42304fa..b06d27b 100644
--- a/pydantic_ai_harness/code_mode/__init__.py
+++ b/pydantic_ai_harness/code_mode/__init__.py
@@ -1,6 +1,6 @@
 """Code mode capability: route tool calls through a sandboxed Python environment."""
 
 from pydantic_ai_harness.code_mode._capability import CodeMode
-from pydantic_ai_harness.code_mode._toolset import CodeModeToolset
+from pydantic_ai_harness.code_mode._toolset import CodeModeToolset, MontyMount, MontyOS, MontyOSCallback
 
-__all__ = ['CodeMode', 'CodeModeToolset']
+__all__ = ['CodeMode', 'CodeModeToolset', 'MontyMount', 'MontyOS', 'MontyOSCallback']
diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py
index 57eeaab..69df3eb 100644
--- a/pydantic_ai_harness/code_mode/_capability.py
+++ b/pydantic_ai_harness/code_mode/_capability.py
@@ -9,7 +9,7 @@
 from pydantic_ai.capabilities._tool_search import ToolSearch as _ToolSearch
 from pydantic_ai.tools import AgentDepsT, ToolSelector
 
-from pydantic_ai_harness.code_mode._toolset import CodeModeToolset
+from pydantic_ai_harness.code_mode._toolset import CodeModeToolset, MontyMount, MontyOS
 
 
 @dataclass
@@ -34,6 +34,16 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     # Sandbox only specific tools
     agent = Agent('openai:gpt-5', capabilities=[CodeMode(tools=['search', 'fetch'])])
     ```
+
+    Pass `os` (and/or `mount`) to give sandboxed code host-backed filesystem and
+    OS access -- without it, `pathlib`/`os` I/O and `datetime.now()` are
+    unavailable inside `run_code`:
+
+    ```python
+    from pydantic_monty import MountDir
+
+    agent = Agent('openai:gpt-5', capabilities=[CodeMode(mount=MountDir('/work', '/tmp/agent-work'))])
+    ```
     """
 
     tools: ToolSelector[AgentDepsT] = field(default='all')
@@ -48,10 +58,29 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     max_retries: int = 3
     """Maximum number of retries for the `run_code` tool (syntax errors count as retries)."""
 
+    os: MontyOS | None = None
+    """Host-backed OS access for sandboxed code.
+
+    Pass a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback
+    `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`,
+    `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code`
+    are routed to it instead of being unavailable. Scope it per request by giving
+    a stateful `AbstractOS` (e.g. one rooted at a per-user directory).
+    """
+
+    mount: MontyMount | None = None
+    """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`."""
+
     def get_ordering(self) -> CapabilityOrdering:
         """CodeMode wraps around ToolSearch so that search_tools stays native."""
         return CapabilityOrdering(position='outermost', wraps=[_ToolSearch])
 
     def get_wrapper_toolset(self, toolset: AbstractToolset[AgentDepsT]) -> AbstractToolset[AgentDepsT] | None:
         """Wrap the agent's assembled toolset, splitting it into native + sandboxed subsets if needed."""
-        return CodeModeToolset(wrapped=toolset, tool_selector=self.tools, max_retries=self.max_retries)
+        return CodeModeToolset(
+            wrapped=toolset,
+            tool_selector=self.tools,
+            max_retries=self.max_retries,
+            os=self.os,
+            mount=self.mount,
+        )
diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py
index da503a9..a8f6832 100644
--- a/pydantic_ai_harness/code_mode/_toolset.py
+++ b/pydantic_ai_harness/code_mode/_toolset.py
@@ -26,6 +26,7 @@
 
 try:
     from pydantic_monty import (
+        AbstractOS,
         ExternalException,
         ExternalResult,
         ExternalReturnValue,
@@ -37,7 +38,9 @@
         MontyRuntimeError,
         MontySyntaxError,
         MontyTypingError,
+        MountDir,
         NameLookupSnapshot,
+        OsFunction,
     )
 except ImportError as _import_error:  # pragma: no cover
     raise ImportError(
@@ -48,6 +51,16 @@
 # Type alias for the dispatch callback passed to _execution_loop.
 _DispatchFn = Callable[[str, dict[str, Any]], Coroutine[Any, Any, Any]]
 
+# A raw Monty OS callback: `(function_name, args, kwargs) -> result`. Return
+# `pydantic_monty.NOT_HANDLED` to fall back to Monty's default handling.
+MontyOSCallback = Callable[[OsFunction, tuple[Any, ...], dict[str, Any]], Any]
+# What `CodeMode.os` accepts: either an `AbstractOS` instance or a raw callback.
+# Monty's `feed_start`/`resume` accept both interchangeably, so no normalization.
+MontyOS = AbstractOS | MontyOSCallback
+# What `CodeMode.mount` accepts: one or more host-directory mounts (matches Monty's
+# `feed_start`/`resume` `mount=` parameter type exactly).
+MontyMount = MountDir | list[MountDir]
+
 
 class _RunCodeArguments(TypedDict):
     code: Annotated[str, Field(description='The Python code to execute in the sandbox.')]
@@ -69,14 +82,28 @@ class _RunCodeArguments(TypedDict):
 # and to reconstruct multimodal types (e.g. BinaryContent) from Monty results (validate_python).
 _TOOL_RETURN_CONTENT_TA: TypeAdapter[Any] = TypeAdapter(ToolReturnContent)
 
-_RUN_CODE_BASE_DESCRIPTION = """\
+_RUN_CODE_DESCRIPTION_HEAD = """\
 Write and run Python code in a sandboxed environment.
 
 The sandbox uses Monty, a subset of Python. Key restrictions:
 - **No classes**: class definitions are not supported
 - **No third-party libraries**: only the standard library modules listed below can be used
-- **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported at the top of your snippet before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`.
-- **No wall-clock or timing primitives**: `asyncio.sleep`, `datetime.datetime.now()`, `datetime.date.today()`, and the `time` module are unavailable.
+- **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported at the top of your snippet before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`."""
+
+# Timing/OS restriction line, swapped depending on whether the agent configured
+# host-backed OS access (`CodeMode(os=...)` / `mount=...`).
+_NO_OS_RESTRICTION = (
+    '- **No wall-clock or timing primitives**: `asyncio.sleep`, `datetime.datetime.now()`, '
+    '`datetime.date.today()`, and the `time` module are unavailable.'
+)
+_OS_ENABLED_NOTE = (
+    '- **Host-backed OS access**: `pathlib.Path` operations, `os.getenv`/`os.environ`, '
+    '`datetime.datetime.now()`, and `datetime.date.today()` are routed to the host environment '
+    'configured for this agent (availability depends on that configuration). `asyncio.sleep` and '
+    'the `time` module remain unavailable.'
+)
+
+_RUN_CODE_DESCRIPTION_TAIL = """\
 - **No `import *`**: wildcard imports are not supported
 
 State is preserved between calls (REPL-style). Set `restart: true` to reset state.
@@ -90,6 +117,17 @@ class _RunCodeArguments(TypedDict):
 """
 
 
+def _base_description(*, os_enabled: bool) -> str:
+    """Assemble the `run_code` base description, swapping the OS-access line.
+
+    When the agent configured host-backed OS access (`CodeMode(os=...)` or
+    `mount=...`), the static "no wall-clock" restriction is replaced with a note
+    that filesystem/clock operations route to the host.
+    """
+    restriction = _OS_ENABLED_NOTE if os_enabled else _NO_OS_RESTRICTION
+    return f'{_RUN_CODE_DESCRIPTION_HEAD}\n{restriction}\n{_RUN_CODE_DESCRIPTION_TAIL}'
+
+
 def _functions_header(*, has_sync: bool, has_async: bool) -> str:
     """Build the functions-header paragraph for the `run_code` tool description."""
     base = (
@@ -184,6 +222,17 @@ class CodeModeToolset(WrapperToolset[AgentDepsT]):
     max_retries: int = 3
     """Maximum number of retries for the `run_code` tool (syntax errors count as retries)."""
 
+    os: MontyOS | None = None
+    """Host-backed OS access exposed to sandboxed code.
+
+    Either a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback
+    `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`,
+    `datetime.datetime.now()`, and `datetime.date.today()` calls inside the
+    sandbox are routed to it instead of being unavailable."""
+
+    mount: MontyMount | None = None
+    """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`."""
+
     # init=False so `replace()` in `for_run` produces a fresh instance with _repl=None,
     # giving each agent run isolated REPL state. Lazy-initialized on first call_tool.
     _repl: MontyRepl | None = field(default=None, init=False, repr=False)
@@ -236,7 +285,8 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[
 
         callable_defs, sanitized_to_original = self._partition_callable_tools(sandboxed_tools)
 
-        description = self._build_description(callable_defs)
+        os_enabled = self.os is not None or self.mount is not None
+        description = self._build_description(callable_defs, os_enabled=os_enabled)
 
         if _RUN_CODE_TOOL_NAME in native_tools:
             raise UserError(
@@ -399,7 +449,7 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any:
         capture = _PrintCapture()
 
         try:
-            monty_state = self._repl.feed_start(code, print_callback=capture)
+            monty_state = self._repl.feed_start(code, print_callback=capture, os=self.os, mount=self.mount)
             completed = await _execution_loop(
                 monty_state,
                 dispatch=dispatch_tool_call,
@@ -407,6 +457,8 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any:
                 sanitized_to_original=sanitized_to_original,
                 sequential_tools=sequential_tools,
                 global_sequential=global_sequential,
+                os=self.os,
+                mount=self.mount,
             )
         except MontySyntaxError as e:
             raise ModelRetry(f'Syntax error in code:\n{_prepend_prints(e.display(), capture)}') from e
@@ -504,10 +556,11 @@ def _partition_callable_tools(
         return callable_defs, sanitized_to_original
 
     @staticmethod
-    def _build_description(callable_defs: dict[str, ToolDefinition]) -> str:
+    def _build_description(callable_defs: dict[str, ToolDefinition], *, os_enabled: bool) -> str:
         """Render the `run_code` description: base prose + TypedDicts + function signatures."""
+        base = _base_description(os_enabled=os_enabled)
         if not callable_defs:
-            return _RUN_CODE_BASE_DESCRIPTION
+            return base
 
         sigs, conflicting = _get_sigs_and_conflicting(callable_defs)
         type_blocks = FunctionSignature.render_type_definitions(sigs, conflicting)
@@ -520,7 +573,7 @@ def _build_description(callable_defs: dict[str, ToolDefinition]) -> str:
         has_async = any(not td.sequential for td in callable_defs.values())
         header = _functions_header(has_sync=has_sync, has_async=has_async)
 
-        sections = [_RUN_CODE_BASE_DESCRIPTION, header]
+        sections = [base, header]
         if type_blocks:
             sections.append('```python\n' + '\n\n'.join(type_blocks) + '\n```')
         sections.append('```python\n' + '\n\n'.join(function_blocks) + '\n```')
@@ -579,6 +632,8 @@ async def _execution_loop(
     sanitized_to_original: dict[str, str],
     sequential_tools: set[str],
     global_sequential: bool,
+    os: MontyOS | None,
+    mount: MontyMount | None,
 ) -> MontyComplete:
     """Drive the Monty REPL via the synchronous snapshot API until completion.
 
@@ -597,6 +652,9 @@ async def _execution_loop(
     - **Global sequential mode** (DBOS/Temporal): all tools are deferred via
       `resume({'future': ...})` but stored as bare coroutines and awaited
       one-at-a-time at `FutureSnapshot` to prevent interleaving.
+
+    `os`/`mount` must be passed to every `resume` call (not just `feed_start`):
+    Monty's auto-dispatch of OS calls stops the moment a resume omits them.
     """
     pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]] = {}
     # Results from parallel tasks that were awaited early (at a sequential-tool
@@ -605,7 +663,7 @@ async def _execution_loop(
     try:
         while not isinstance(monty_state, MontyComplete):
             if isinstance(monty_state, NameLookupSnapshot):
-                monty_state = monty_state.resume()
+                monty_state = monty_state.resume(os=os, mount=mount)
             elif isinstance(monty_state, FunctionSnapshot):
                 monty_state = await _handle_function_snapshot(
                     monty_state,
@@ -616,6 +674,8 @@ async def _execution_loop(
                     global_sequential=global_sequential,
                     pending=pending,
                     pre_resolved=pre_resolved,
+                    os=os,
+                    mount=mount,
                 )
             else:
                 monty_state = await _resolve_future_snapshot(
@@ -623,6 +683,8 @@ async def _execution_loop(
                     pending=pending,
                     pre_resolved=pre_resolved,
                     global_sequential=global_sequential,
+                    os=os,
+                    mount=mount,
                 )
     finally:
         for item in pending.values():  # pragma: no cover
@@ -644,16 +706,20 @@ async def _handle_function_snapshot(
     global_sequential: bool,
     pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]],
     pre_resolved: dict[int, ExternalResult],
+    os: MontyOS | None,
+    mount: MontyMount | None,
 ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete:
     """Handle a single FunctionSnapshot from the Monty execution loop."""
     fn_name = snapshot.function_name
 
     if fn_name not in callable_defs:
-        return snapshot.resume({'exception': NameError(f'Unknown function: {fn_name}')})
+        return snapshot.resume({'exception': NameError(f'Unknown function: {fn_name}')}, os=os, mount=mount)
 
     if snapshot.args:
         return snapshot.resume(
-            {'exception': TypeError(f'{fn_name}() does not accept positional arguments; use keyword arguments')}
+            {'exception': TypeError(f'{fn_name}() does not accept positional arguments; use keyword arguments')},
+            os=os,
+            mount=mount,
         )
 
     original_name = sanitized_to_original.get(fn_name, fn_name)
@@ -666,8 +732,8 @@ async def _handle_function_snapshot(
             pre_resolved[cid] = await _resolve_coro(pending.pop(cid))
         outcome = await _resolve_coro(dispatch(original_name, snapshot.kwargs))
         if 'return_value' in outcome:
-            return snapshot.resume({'return_value': outcome['return_value']})
-        return snapshot.resume({'exception': outcome['exception']})
+            return snapshot.resume({'return_value': outcome['return_value']}, os=os, mount=mount)
+        return snapshot.resume({'exception': outcome['exception']}, os=os, mount=mount)
 
     # Deferred execution — store for later resolution at FutureSnapshot.
     if global_sequential:
@@ -676,7 +742,7 @@ async def _handle_function_snapshot(
     else:
         # Eagerly schedule as a Task for concurrent execution.
         pending[snapshot.call_id] = asyncio.ensure_future(dispatch(original_name, snapshot.kwargs))
-    return snapshot.resume({'future': ...})
+    return snapshot.resume({'future': ...}, os=os, mount=mount)
 
 
 async def _resolve_future_snapshot(
@@ -685,11 +751,13 @@ async def _resolve_future_snapshot(
     pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]],
     pre_resolved: dict[int, ExternalResult],
     global_sequential: bool,
+    os: MontyOS | None,
+    mount: MontyMount | None,
 ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete:
     """Resolve pending tool calls at a FutureSnapshot."""
     pending_ids = snapshot.pending_call_ids
     if not pending_ids:  # pragma: no cover
-        return snapshot.resume(results={})
+        return snapshot.resume(results={}, os=os, mount=mount)
 
     results: dict[int, ExternalResult] = {}
     for cid in pending_ids:
@@ -708,7 +776,7 @@ async def _resolve_future_snapshot(
         for cid, outcome in zip(gather_ids, settled):
             results[cid] = _settle_outcome(outcome)
 
-    return snapshot.resume(results=results)
+    return snapshot.resume(results=results, os=os, mount=mount)
 
 
 async def _resolve_coro(
diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py
index 1ffb084..ef5c6ae 100644
--- a/tests/code_mode/test_code_mode.py
+++ b/tests/code_mode/test_code_mode.py
@@ -24,6 +24,7 @@
 from pydantic_ai.toolsets.function import FunctionToolset
 from pydantic_ai.usage import RunUsage
 from pydantic_core import SchemaValidator, core_schema
+from pydantic_monty import NOT_HANDLED, MountDir, OSAccess, OsFunction
 from typing_extensions import TypedDict
 
 from pydantic_ai_harness import CodeMode
@@ -1854,6 +1855,111 @@ def test_code_mode_ordering(self) -> None:
         assert ToolSearch in ordering.wraps
 
 
+class TestCodeModeOSAccess:
+    """`CodeMode(os=...)` / `mount=...` give sandboxed code host-backed OS access."""
+
+    async def test_description_default_keeps_no_wallclock_restriction(self) -> None:
+        """Without `os`/`mount`, the description keeps the no-wall-clock restriction."""
+        wrapper = CodeMode[None]().get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
+        assert description is not None
+        assert 'No wall-clock or timing primitives' in description
+        assert 'Host-backed OS access' not in description
+
+    async def test_description_with_os_callback_notes_host_access(self) -> None:
+        """An `os` callback swaps the restriction line for the host-access note."""
+
+        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+            return NOT_HANDLED  # pragma: no cover - not invoked; this test only checks the description
+
+        wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
+        assert description is not None
+        assert 'Host-backed OS access' in description
+        assert 'No wall-clock or timing primitives' not in description
+
+    async def test_description_with_mount_notes_host_access(self, tmp_path: Any) -> None:
+        """A `mount` (without `os`) also enables the host-access note."""
+        wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset(
+            _build_function_toolset(add)
+        )
+        assert isinstance(wrapper, CodeModeToolset)
+        description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
+        assert description is not None
+        assert 'Host-backed OS access' in description
+
+    async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None:
+        """The host-access note appears even when no tools are sandboxed (base description)."""
+
+        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+            return NOT_HANDLED  # pragma: no cover - not invoked; this test only checks the description
+
+        # `tools=[]` leaves every tool native, so `run_code` exposes no callable functions.
+        wrapper = CodeMode[None](os=os_cb, tools=[]).get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
+        assert description is not None
+        assert 'Host-backed OS access' in description
+        assert 'functions are available inside the sandbox' not in description
+
+    async def test_os_callback_dispatches_inside_run_code(self) -> None:
+        """An `os` callback is threaded through `feed_start` and every `resume`, so OS calls
+        keep dispatching even after a tool call suspends and resumes the sandbox."""
+
+        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+            if fn == 'os.getenv':
+                return 'envval'
+            return NOT_HANDLED  # pragma: no cover - sandbox only calls os.getenv here
+
+        wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        ctx = await build_ctx(None, wrapper)
+        tools = await wrapper.get_tools(ctx)
+        # The tool call forces a FunctionSnapshot -> FutureSnapshot round-trip; the os.getenv
+        # afterwards only resolves if `os` survived those resumes.
+        code = "import os\nx = await add(a=2, b=3)\nhome = os.getenv('THING')\n{'sum': x, 'home': home}"
+        result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code'])
+        assert result.return_value == {'sum': 5, 'home': 'envval'}
+
+    async def test_abstract_os_instance_dispatches_inside_run_code(self) -> None:
+        """An `AbstractOS` instance is accepted as the `os` value and dispatches OS calls."""
+        wrapper = CodeMode[None](os=OSAccess(environ={'THING': 'fromabs'})).get_wrapper_toolset(
+            _build_function_toolset(add)
+        )
+        assert isinstance(wrapper, CodeModeToolset)
+        ctx = await build_ctx(None, wrapper)
+        tools = await wrapper.get_tools(ctx)
+        result = await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('THING')"}, ctx, tools['run_code'])
+        assert result.return_value == 'fromabs'
+
+    async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None:
+        """A `mount` exposes a host directory inside the sandbox, threaded through resumes."""
+        (tmp_path / 'data.txt').write_text('hello-from-host')
+        wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset(
+            _build_function_toolset(add)
+        )
+        assert isinstance(wrapper, CodeModeToolset)
+        ctx = await build_ctx(None, wrapper)
+        tools = await wrapper.get_tools(ctx)
+        code = "from pathlib import Path\nawait add(a=1, b=1)\nPath('/work/data.txt').read_text()"
+        result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code'])
+        assert result.return_value == 'hello-from-host'
+
+    def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Any) -> None:
+        """`CodeMode` forwards `os`/`mount` onto the `CodeModeToolset` it builds."""
+
+        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+            return NOT_HANDLED  # pragma: no cover - never invoked; only identity is asserted
+
+        mount = MountDir('/work', str(tmp_path))
+        wrapper = CodeMode[None](os=os_cb, mount=mount).get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        assert wrapper.os is os_cb
+        assert wrapper.mount is mount
+
+
 def _search_tool_def(description: str = 'Search for tools.') -> ToolDefinition:
     """Create a ToolDefinition mimicking the search_tools tool from ToolSearchToolset."""
     from pydantic_ai.toolsets._tool_search import _SEARCH_TOOLS_NAME

From b0cb11cccffb48e9993a45d86c06b35eacbcc96b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 12:09:09 +0000
Subject: [PATCH 02/13] test(code_mode): harden OS-access tests around the
 threading invariants

Add edge cases that pin the behaviours most likely to regress: OS access
surviving across REPL-persisted `run_code` calls, a raising `os` callback
degrading to `ModelRetry` instead of crashing the loop, and `mount`
accepting a `list[MountDir]`. Hoist the never-invoked callback used by the
description/forwarding assertions into one shared helper.
---
 tests/code_mode/test_code_mode.py | 73 ++++++++++++++++++++++++-------
 1 file changed, 57 insertions(+), 16 deletions(-)

diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py
index ef5c6ae..ca5531c 100644
--- a/tests/code_mode/test_code_mode.py
+++ b/tests/code_mode/test_code_mode.py
@@ -1855,6 +1855,11 @@ def test_code_mode_ordering(self) -> None:
         assert ToolSearch in ordering.wraps
 
 
+def _unused_os_callback(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+    """An `os` callback for tests that only assert description/forwarding, never run code."""
+    return NOT_HANDLED  # pragma: no cover - never invoked by these tests
+
+
 class TestCodeModeOSAccess:
     """`CodeMode(os=...)` / `mount=...` give sandboxed code host-backed OS access."""
 
@@ -1869,11 +1874,7 @@ async def test_description_default_keeps_no_wallclock_restriction(self) -> None:
 
     async def test_description_with_os_callback_notes_host_access(self) -> None:
         """An `os` callback swaps the restriction line for the host-access note."""
-
-        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
-            return NOT_HANDLED  # pragma: no cover - not invoked; this test only checks the description
-
-        wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        wrapper = CodeMode[None](os=_unused_os_callback).get_wrapper_toolset(_build_function_toolset(add))
         assert isinstance(wrapper, CodeModeToolset)
         description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
         assert description is not None
@@ -1892,12 +1893,8 @@ async def test_description_with_mount_notes_host_access(self, tmp_path: Any) ->
 
     async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None:
         """The host-access note appears even when no tools are sandboxed (base description)."""
-
-        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
-            return NOT_HANDLED  # pragma: no cover - not invoked; this test only checks the description
-
         # `tools=[]` leaves every tool native, so `run_code` exposes no callable functions.
-        wrapper = CodeMode[None](os=os_cb, tools=[]).get_wrapper_toolset(_build_function_toolset(add))
+        wrapper = CodeMode[None](os=_unused_os_callback, tools=[]).get_wrapper_toolset(_build_function_toolset(add))
         assert isinstance(wrapper, CodeModeToolset)
         description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
         assert description is not None
@@ -1923,6 +1920,25 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
         result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code'])
         assert result.return_value == {'sum': 5, 'home': 'envval'}
 
+    async def test_os_access_persists_across_run_code_calls(self) -> None:
+        """`os` is supplied on every `feed_start`, so OS access still works on a later
+        `run_code` call that reuses the persisted (non-fresh) REPL."""
+
+        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+            if fn == 'os.getenv':
+                return 'persisted'
+            return NOT_HANDLED  # pragma: no cover - sandbox only calls os.getenv here
+
+        wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        ctx = await build_ctx(None, wrapper)
+        tools = await wrapper.get_tools(ctx)
+        first = await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('A')"}, ctx, tools['run_code'])
+        assert first.return_value == 'persisted'
+        # Second call reuses the REPL (so `import os` carries over) and must still dispatch.
+        second = await wrapper.call_tool('run_code', {'code': "os.getenv('B')"}, ctx, tools['run_code'])
+        assert second.return_value == 'persisted'
+
     async def test_abstract_os_instance_dispatches_inside_run_code(self) -> None:
         """An `AbstractOS` instance is accepted as the `os` value and dispatches OS calls."""
         wrapper = CodeMode[None](os=OSAccess(environ={'THING': 'fromabs'})).get_wrapper_toolset(
@@ -1934,6 +1950,20 @@ async def test_abstract_os_instance_dispatches_inside_run_code(self) -> None:
         result = await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('THING')"}, ctx, tools['run_code'])
         assert result.return_value == 'fromabs'
 
+    async def test_os_callback_exception_becomes_model_retry(self) -> None:
+        """A raising `os` callback surfaces as a `ModelRetry`, like any other sandbox runtime
+        error -- it must not crash the agent loop."""
+
+        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+            raise ValueError('boom from os')
+
+        wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        ctx = await build_ctx(None, wrapper)
+        tools = await wrapper.get_tools(ctx)
+        with pytest.raises(ModelRetry, match='boom from os'):
+            await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('X')"}, ctx, tools['run_code'])
+
     async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None:
         """A `mount` exposes a host directory inside the sandbox, threaded through resumes."""
         (tmp_path / 'data.txt').write_text('hello-from-host')
@@ -1947,16 +1977,27 @@ async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None:
         result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code'])
         assert result.return_value == 'hello-from-host'
 
+    async def test_mount_accepts_list_of_directories(self, tmp_path: Any) -> None:
+        """`mount` accepts a `list[MountDir]`; each directory is exposed at its virtual path."""
+        (tmp_path / 'a').mkdir()
+        (tmp_path / 'b').mkdir()
+        (tmp_path / 'a' / 'f.txt').write_text('AA')
+        (tmp_path / 'b' / 'f.txt').write_text('BB')
+        mounts = [MountDir('/a', str(tmp_path / 'a')), MountDir('/b', str(tmp_path / 'b'))]
+        wrapper = CodeMode[None](mount=mounts).get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        ctx = await build_ctx(None, wrapper)
+        tools = await wrapper.get_tools(ctx)
+        code = "from pathlib import Path\nPath('/a/f.txt').read_text() + Path('/b/f.txt').read_text()"
+        result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code'])
+        assert result.return_value == 'AABB'
+
     def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Any) -> None:
         """`CodeMode` forwards `os`/`mount` onto the `CodeModeToolset` it builds."""
-
-        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
-            return NOT_HANDLED  # pragma: no cover - never invoked; only identity is asserted
-
         mount = MountDir('/work', str(tmp_path))
-        wrapper = CodeMode[None](os=os_cb, mount=mount).get_wrapper_toolset(_build_function_toolset(add))
+        wrapper = CodeMode[None](os=_unused_os_callback, mount=mount).get_wrapper_toolset(_build_function_toolset(add))
         assert isinstance(wrapper, CodeModeToolset)
-        assert wrapper.os is os_cb
+        assert wrapper.os is _unused_os_callback
         assert wrapper.mount is mount
 
 

From 93e7b0a763ca26a5f3a637adb986652cf95801fd Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 30 May 2026 06:15:56 +0000
Subject: [PATCH 03/13] docs(code_mode): tighten and verify the filesystem/OS
 access section

Trim the host-access docs to the essentials and make the example
self-contained (drop the undefined `lookup_secret` helper). The snippet
and the documented `mount`/callback constructions were run end-to-end to
confirm they work.
---
 pydantic_ai_harness/code_mode/README.md | 42 ++++++++++---------------
 1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md
index 22dedc4..cba8718 100644
--- a/pydantic_ai_harness/code_mode/README.md
+++ b/pydantic_ai_harness/code_mode/README.md
@@ -140,46 +140,38 @@ for msg in result.all_messages():
             tool_returns = part.metadata['tool_returns'] # dict[str, ToolReturnPart]
 ```
 
-## Host-backed OS access
+## Filesystem and OS access
 
-By default the sandbox has no filesystem or clock: `os`/`pathlib` are importable but their I/O
-operations and `datetime.datetime.now()`/`datetime.date.today()` are unavailable. Pass `os` and/or
-`mount` to route those operations to a host-controlled implementation.
+The sandbox has no filesystem or clock by default: `os`/`pathlib` import, but their I/O,
+`datetime.now()`, and `date.today()` are unavailable. Pass `os` and/or `mount` to back them with a
+host-controlled implementation.
 
 ```python
-from pydantic_monty import MountDir
+from pydantic_monty import NOT_HANDLED, MountDir, OSAccess
+
 from pydantic_ai_harness import CodeMode
 
-# Expose a host directory inside the sandbox (read/write under /work):
+# Expose a host directory at /work inside the sandbox:
 CodeMode(mount=MountDir('/work', '/tmp/agent-workspace'))
 
-# Or supply a custom OS implementation (an `AbstractOS` instance):
-from pydantic_monty import OSAccess
+# Supply environment/clock via an AbstractOS instance:
 CodeMode(os=OSAccess(environ={'STAGE': 'prod'}))
 
-# Or a raw callback `(function_name, args, kwargs) -> result`
-# (return `pydantic_monty.NOT_HANDLED` to fall back to Monty's default):
-from pydantic_monty import NOT_HANDLED
 
+# ...or a raw `(function_name, args, kwargs)` callback; return NOT_HANDLED to defer to Monty:
 def my_os(fn, args, kwargs):
-    if fn == 'os.getenv':
-        return lookup_secret(args[0])
-    return NOT_HANDLED
+    return 'secret-value' if fn == 'os.getenv' else NOT_HANDLED
+
 
 CodeMode(os=my_os)
 ```
 
-`os` accepts a `pydantic_monty.AbstractOS` instance or a raw callback; both are exposed as the
-`MontyOS` type alias. `mount` accepts one or more `pydantic_monty.MountDir`. To scope access per
-request (per user/session), pass a stateful `AbstractOS` -- for example one rooted at a
-caller-specific directory.
-
-When `os` or `mount` is set, the `run_code` description tells the model that `pathlib`, `os`,
-`datetime.now()`, and `date.today()` are routed to the host. `asyncio.sleep` and the `time` module
-remain unavailable regardless.
+`os` takes a `pydantic_monty.AbstractOS` or that callback; `mount` takes one or more `MountDir`.
+Scope access per request with a stateful `AbstractOS` (e.g. rooted at a per-user directory). When
+set, `run_code`'s description tells the model these operations are host-backed; `asyncio.sleep` and
+`time` stay unavailable.
 
-> These options are Monty-specific: `CodeMode` is built directly on the Monty sandbox, so its OS
-> hooks use Monty's `AbstractOS`/`MountDir` types.
+> Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types.
 
 ## Sandbox restrictions
 
@@ -187,7 +179,7 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python
 
 - No class definitions
 - No third-party imports (allowed stdlib: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`)
-- No wall-clock or timing primitives: `asyncio.sleep`, `datetime.datetime.now()`/`datetime.date.today()`, and the `time` module are unavailable -- unless you wire up host-backed OS access (see above), which enables `datetime.now()`/`date.today()` (but not `asyncio.sleep`/`time`)
+- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with host-backed OS access (above)
 - No `import *`
 - Filesystem and `os` I/O are unavailable unless an `os`/`mount` is configured
 - Tools requiring approval or with deferred execution are excluded from the sandbox

From aac81624c6eccc06963519665eb8ff69cdc79736 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 30 May 2026 06:28:21 +0000
Subject: [PATCH 04/13] docs(code_mode): correct per-request scoping wording

`os`/`mount` are static capability fields (no per-run resolver), so the
"stateful AbstractOS rooted at a per-user directory" guidance over-claimed.
Reword to: build CodeMode per request to scope access. Every other doc line
was re-checked empirically against pydantic-monty 0.0.17.
---
 pydantic_ai_harness/code_mode/README.md      | 6 +++---
 pydantic_ai_harness/code_mode/_capability.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md
index cba8718..a62b49d 100644
--- a/pydantic_ai_harness/code_mode/README.md
+++ b/pydantic_ai_harness/code_mode/README.md
@@ -167,9 +167,9 @@ CodeMode(os=my_os)
 ```
 
 `os` takes a `pydantic_monty.AbstractOS` or that callback; `mount` takes one or more `MountDir`.
-Scope access per request with a stateful `AbstractOS` (e.g. rooted at a per-user directory). When
-set, `run_code`'s description tells the model these operations are host-backed; `asyncio.sleep` and
-`time` stay unavailable.
+`os`/`mount` are fixed when the capability is built, so construct `CodeMode` per request to scope
+access. When set, `run_code`'s description tells the model these operations are host-backed;
+`asyncio.sleep` and `time` stay unavailable.
 
 > Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types.
 
diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py
index 69df3eb..d447b33 100644
--- a/pydantic_ai_harness/code_mode/_capability.py
+++ b/pydantic_ai_harness/code_mode/_capability.py
@@ -64,8 +64,8 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     Pass a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback
     `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`,
     `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code`
-    are routed to it instead of being unavailable. Scope it per request by giving
-    a stateful `AbstractOS` (e.g. one rooted at a per-user directory).
+    are routed to it instead of being unavailable. Fixed at construction, so build
+    `CodeMode` per request to scope access per request.
     """
 
     mount: MontyMount | None = None

From c363fd6a73ce75e0f3d25fa8abd03f13690b7a68 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 30 May 2026 06:50:30 +0000
Subject: [PATCH 05/13] fix(code_mode): don't advertise env/clock for
 mount-only sandboxes

A `mount` only exposes filesystem paths; `os.getenv`/`os.environ` and
`datetime.now()`/`date.today()` still require an `os` handler. The
description used one host-access note for both, so mount-only agents were
told env/clock were routed to the host and would emit calls that fail and
burn run_code retries (verified against pydantic-monty 0.0.17).

Split the description into three states (none / mount-only filesystem / os),
and correct the README and docstrings that conflated the two.
---
 pydantic_ai_harness/code_mode/README.md      | 13 ++++---
 pydantic_ai_harness/code_mode/_capability.py |  6 +--
 pydantic_ai_harness/code_mode/_toolset.py    | 41 ++++++++++++++------
 tests/code_mode/test_code_mode.py            | 11 ++++--
 4 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md
index a62b49d..9eea1eb 100644
--- a/pydantic_ai_harness/code_mode/README.md
+++ b/pydantic_ai_harness/code_mode/README.md
@@ -166,10 +166,11 @@ def my_os(fn, args, kwargs):
 CodeMode(os=my_os)
 ```
 
-`os` takes a `pydantic_monty.AbstractOS` or that callback; `mount` takes one or more `MountDir`.
-`os`/`mount` are fixed when the capability is built, so construct `CodeMode` per request to scope
-access. When set, `run_code`'s description tells the model these operations are host-backed;
-`asyncio.sleep` and `time` stay unavailable.
+`os` takes a `pydantic_monty.AbstractOS` or that callback and routes environment, clock, and
+filesystem calls; `mount` takes one or more `MountDir` and exposes host filesystem paths only (a
+mount alone does **not** enable `os.getenv` or `datetime.now()`). Both are fixed when the capability
+is built, so construct `CodeMode` per request to scope access. `run_code`'s description reflects
+exactly what's enabled; `asyncio.sleep` and `time` stay unavailable either way.
 
 > Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types.
 
@@ -179,9 +180,9 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python
 
 - No class definitions
 - No third-party imports (allowed stdlib: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`)
-- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with host-backed OS access (above)
+- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with an `os` handler (above); `asyncio.sleep`/`time` never do
 - No `import *`
-- Filesystem and `os` I/O are unavailable unless an `os`/`mount` is configured
+- Filesystem I/O needs an `os` handler or a `mount`; `os.getenv`/`os.environ` need an `os` handler
 - Tools requiring approval or with deferred execution are excluded from the sandbox
 
 ## API
diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py
index d447b33..f4c30b2 100644
--- a/pydantic_ai_harness/code_mode/_capability.py
+++ b/pydantic_ai_harness/code_mode/_capability.py
@@ -35,9 +35,9 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     agent = Agent('openai:gpt-5', capabilities=[CodeMode(tools=['search', 'fetch'])])
     ```
 
-    Pass `os` (and/or `mount`) to give sandboxed code host-backed filesystem and
-    OS access -- without it, `pathlib`/`os` I/O and `datetime.now()` are
-    unavailable inside `run_code`:
+    Pass `mount` for host filesystem access and/or `os` for environment/clock
+    (plus filesystem) access -- without them, `pathlib`/`os` I/O and
+    `datetime.now()` are unavailable inside `run_code`:
 
     ```python
     from pydantic_monty import MountDir
diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py
index a8f6832..cc3009a 100644
--- a/pydantic_ai_harness/code_mode/_toolset.py
+++ b/pydantic_ai_harness/code_mode/_toolset.py
@@ -90,12 +90,19 @@ class _RunCodeArguments(TypedDict):
 - **No third-party libraries**: only the standard library modules listed below can be used
 - **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported at the top of your snippet before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`."""
 
-# Timing/OS restriction line, swapped depending on whether the agent configured
-# host-backed OS access (`CodeMode(os=...)` / `mount=...`).
+# Timing/OS restriction line, swapped depending on what host access the agent
+# configured. Three states, because `mount` and `os` enable different things:
+# a `mount` only exposes filesystem paths, while environment and clock calls
+# require an `os` handler.
 _NO_OS_RESTRICTION = (
     '- **No wall-clock or timing primitives**: `asyncio.sleep`, `datetime.datetime.now()`, '
     '`datetime.date.today()`, and the `time` module are unavailable.'
 )
+_MOUNT_ONLY_NOTE = (
+    '- **Mounted filesystem access**: `pathlib.Path` operations under the configured mount '
+    'point(s) are routed to the host. `os.getenv`/`os.environ`, `datetime.datetime.now()`, '
+    '`datetime.date.today()`, `asyncio.sleep`, and the `time` module remain unavailable.'
+)
 _OS_ENABLED_NOTE = (
     '- **Host-backed OS access**: `pathlib.Path` operations, `os.getenv`/`os.environ`, '
     '`datetime.datetime.now()`, and `datetime.date.today()` are routed to the host environment '
@@ -117,14 +124,23 @@ class _RunCodeArguments(TypedDict):
 """
 
 
-def _base_description(*, os_enabled: bool) -> str:
-    """Assemble the `run_code` base description, swapping the OS-access line.
+def _os_access_restriction(*, has_os: bool, has_mount: bool) -> str:
+    """Pick the OS/filesystem restriction line for the `run_code` description.
 
-    When the agent configured host-backed OS access (`CodeMode(os=...)` or
-    `mount=...`), the static "no wall-clock" restriction is replaced with a note
-    that filesystem/clock operations route to the host.
+    `os` routes environment, clock, and filesystem calls; a `mount` alone only
+    exposes filesystem paths, so a mount-only sandbox must not advertise env or
+    clock access (the model would generate calls that fail and burn retries).
     """
-    restriction = _OS_ENABLED_NOTE if os_enabled else _NO_OS_RESTRICTION
+    if has_os:
+        return _OS_ENABLED_NOTE
+    if has_mount:
+        return _MOUNT_ONLY_NOTE
+    return _NO_OS_RESTRICTION
+
+
+def _base_description(*, has_os: bool, has_mount: bool) -> str:
+    """Assemble the `run_code` base description with the right OS-access line."""
+    restriction = _os_access_restriction(has_os=has_os, has_mount=has_mount)
     return f'{_RUN_CODE_DESCRIPTION_HEAD}\n{restriction}\n{_RUN_CODE_DESCRIPTION_TAIL}'
 
 
@@ -285,8 +301,9 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[
 
         callable_defs, sanitized_to_original = self._partition_callable_tools(sandboxed_tools)
 
-        os_enabled = self.os is not None or self.mount is not None
-        description = self._build_description(callable_defs, os_enabled=os_enabled)
+        description = self._build_description(
+            callable_defs, has_os=self.os is not None, has_mount=self.mount is not None
+        )
 
         if _RUN_CODE_TOOL_NAME in native_tools:
             raise UserError(
@@ -556,9 +573,9 @@ def _partition_callable_tools(
         return callable_defs, sanitized_to_original
 
     @staticmethod
-    def _build_description(callable_defs: dict[str, ToolDefinition], *, os_enabled: bool) -> str:
+    def _build_description(callable_defs: dict[str, ToolDefinition], *, has_os: bool, has_mount: bool) -> str:
         """Render the `run_code` description: base prose + TypedDicts + function signatures."""
-        base = _base_description(os_enabled=os_enabled)
+        base = _base_description(has_os=has_os, has_mount=has_mount)
         if not callable_defs:
             return base
 
diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py
index ca5531c..766dec3 100644
--- a/tests/code_mode/test_code_mode.py
+++ b/tests/code_mode/test_code_mode.py
@@ -1881,15 +1881,20 @@ async def test_description_with_os_callback_notes_host_access(self) -> None:
         assert 'Host-backed OS access' in description
         assert 'No wall-clock or timing primitives' not in description
 
-    async def test_description_with_mount_notes_host_access(self, tmp_path: Any) -> None:
-        """A `mount` (without `os`) also enables the host-access note."""
+    async def test_description_mount_only_advertises_filesystem_not_env_or_clock(self, tmp_path: Any) -> None:
+        """A `mount` without `os` advertises filesystem access only -- it must not tell the model
+        that env/clock are host-backed, since a mount cannot route `os.getenv`/`datetime.now()`."""
         wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset(
             _build_function_toolset(add)
         )
         assert isinstance(wrapper, CodeModeToolset)
         description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
         assert description is not None
-        assert 'Host-backed OS access' in description
+        assert 'Mounted filesystem access' in description
+        assert 'Host-backed OS access' not in description
+        # env/clock are explicitly called out as still unavailable, not advertised as routed.
+        assert '`os.getenv`/`os.environ`, `datetime.datetime.now()`, `datetime.date.today()`' in description
+        assert 'remain unavailable' in description
 
     async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None:
         """The host-access note appears even when no tools are sandboxed (base description)."""

From f5c8e0ed829c1eb3d9fc0d1a2dfa9d12139fb544 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 30 May 2026 07:01:48 +0000
Subject: [PATCH 06/13] docs(code_mode): correct two run_code description
 claims verified against monty

Audited every statement in the run_code description, docstrings, and README
against pydantic-monty 0.0.17. Two were imprecise:
- "imported at the top of your snippet" -- mid-snippet imports work, so the
  rule is just "before use".
- OS-enabled note said calls route "to the host environment", but an
  in-memory AbstractOS (e.g. OSAccess) handles them too -- it's the
  configured OS handler, not necessarily the host.
---
 pydantic_ai_harness/code_mode/_toolset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py
index cc3009a..bbfe9d1 100644
--- a/pydantic_ai_harness/code_mode/_toolset.py
+++ b/pydantic_ai_harness/code_mode/_toolset.py
@@ -88,7 +88,7 @@ class _RunCodeArguments(TypedDict):
 The sandbox uses Monty, a subset of Python. Key restrictions:
 - **No classes**: class definitions are not supported
 - **No third-party libraries**: only the standard library modules listed below can be used
-- **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported at the top of your snippet before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`."""
+- **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`."""
 
 # Timing/OS restriction line, swapped depending on what host access the agent
 # configured. Three states, because `mount` and `os` enable different things:
@@ -105,7 +105,7 @@ class _RunCodeArguments(TypedDict):
 )
 _OS_ENABLED_NOTE = (
     '- **Host-backed OS access**: `pathlib.Path` operations, `os.getenv`/`os.environ`, '
-    '`datetime.datetime.now()`, and `datetime.date.today()` are routed to the host environment '
+    '`datetime.datetime.now()`, and `datetime.date.today()` are routed to the OS handler '
     'configured for this agent (availability depends on that configuration). `asyncio.sleep` and '
     'the `time` module remain unavailable.'
 )

From 28073594e5015a08bb5ebc5af6ded1d0434bd3b4 Mon Sep 17 00:00:00 2001
From: Aditya Vardhan <adtyavrdhn@gmail.com>
Date: Tue, 2 Jun 2026 18:49:03 +0530
Subject: [PATCH 07/13] docs(code_mode): clarify overlay-mode write persistence
 and fix wording

The mount docs implied writes reach the host, but MountDir defaults to
copy-on-write overlay mode, so writes stay in the sandbox unless mode is
'read-write'. Also tighten two awkward/redundant doc lines.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pydantic_ai_harness/code_mode/README.md      | 10 +++++++---
 pydantic_ai_harness/code_mode/_capability.py |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md
index 9eea1eb..5e77be8 100644
--- a/pydantic_ai_harness/code_mode/README.md
+++ b/pydantic_ai_harness/code_mode/README.md
@@ -142,9 +142,9 @@ for msg in result.all_messages():
 
 ## Filesystem and OS access
 
-The sandbox has no filesystem or clock by default: `os`/`pathlib` import, but their I/O,
-`datetime.now()`, and `date.today()` are unavailable. Pass `os` and/or `mount` to back them with a
-host-controlled implementation.
+The sandbox has no filesystem or clock by default: the `os` and `pathlib` modules import, but their
+I/O, `datetime.now()`, and `date.today()` are unavailable. Pass `os` and/or `mount` to back them with
+a host-controlled implementation.
 
 ```python
 from pydantic_monty import NOT_HANDLED, MountDir, OSAccess
@@ -172,6 +172,10 @@ mount alone does **not** enable `os.getenv` or `datetime.now()`). Both are fixed
 is built, so construct `CodeMode` per request to scope access. `run_code`'s description reflects
 exactly what's enabled; `asyncio.sleep` and `time` stay unavailable either way.
 
+A `MountDir` defaults to copy-on-write `mode='overlay'`: the sandbox reads host files and sees its
+own writes, but those writes do **not** reach the host directory. Pass `MountDir(..., mode='read-write')`
+to persist writes to the host, or `mode='read-only'` to forbid them.
+
 > Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types.
 
 ## Sandbox restrictions
diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py
index f4c30b2..67a2f16 100644
--- a/pydantic_ai_harness/code_mode/_capability.py
+++ b/pydantic_ai_harness/code_mode/_capability.py
@@ -65,7 +65,7 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`,
     `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code`
     are routed to it instead of being unavailable. Fixed at construction, so build
-    `CodeMode` per request to scope access per request.
+    `CodeMode` per request to scope access.
     """
 
     mount: MontyMount | None = None

From c87ed3c7c0ee590ccd7161e4dc133499a404d64d Mon Sep 17 00:00:00 2001
From: Aditya Vardhan <adtyavrdhn@gmail.com>
Date: Tue, 2 Jun 2026 19:03:56 +0530
Subject: [PATCH 08/13] refactor(code_mode): rename public OS/mount surface to
 be backend-neutral

The public type aliases leaked the Monty backend name into a surface we
can't rename later. Rename them to match the existing CodeMode/CodeModeToolset
convention, and rename the os= parameter to os_access= so it stops shadowing
the stdlib os module that sandboxed code itself uses.

- MontyOS -> CodeModeOS, MontyOSCallback -> CodeModeOSCallback, MontyMount -> CodeModeMount
- CodeMode/CodeModeToolset param os= -> os_access= (mount unchanged)
- internal resume()/feed_start() forwarding keeps Monty's literal os= kwarg

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pydantic_ai_harness/code_mode/README.md      | 14 +++----
 pydantic_ai_harness/code_mode/__init__.py    |  4 +-
 pydantic_ai_harness/code_mode/_capability.py | 12 +++---
 pydantic_ai_harness/code_mode/_toolset.py    | 42 ++++++++++----------
 tests/code_mode/test_code_mode.py            | 24 ++++++-----
 5 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md
index 5e77be8..fccb518 100644
--- a/pydantic_ai_harness/code_mode/README.md
+++ b/pydantic_ai_harness/code_mode/README.md
@@ -155,7 +155,7 @@ from pydantic_ai_harness import CodeMode
 CodeMode(mount=MountDir('/work', '/tmp/agent-workspace'))
 
 # Supply environment/clock via an AbstractOS instance:
-CodeMode(os=OSAccess(environ={'STAGE': 'prod'}))
+CodeMode(os_access=OSAccess(environ={'STAGE': 'prod'}))
 
 
 # ...or a raw `(function_name, args, kwargs)` callback; return NOT_HANDLED to defer to Monty:
@@ -163,10 +163,10 @@ def my_os(fn, args, kwargs):
     return 'secret-value' if fn == 'os.getenv' else NOT_HANDLED
 
 
-CodeMode(os=my_os)
+CodeMode(os_access=my_os)
 ```
 
-`os` takes a `pydantic_monty.AbstractOS` or that callback and routes environment, clock, and
+`os_access` takes a `pydantic_monty.AbstractOS` or that callback and routes environment, clock, and
 filesystem calls; `mount` takes one or more `MountDir` and exposes host filesystem paths only (a
 mount alone does **not** enable `os.getenv` or `datetime.now()`). Both are fixed when the capability
 is built, so construct `CodeMode` per request to scope access. `run_code`'s description reflects
@@ -184,9 +184,9 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python
 
 - No class definitions
 - No third-party imports (allowed stdlib: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`)
-- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with an `os` handler (above); `asyncio.sleep`/`time` never do
+- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with an `os_access` handler (above); `asyncio.sleep`/`time` never do
 - No `import *`
-- Filesystem I/O needs an `os` handler or a `mount`; `os.getenv`/`os.environ` need an `os` handler
+- Filesystem I/O needs an `os_access` handler or a `mount`; `os.getenv`/`os.environ` need an `os_access` handler
 - Tools requiring approval or with deferred execution are excluded from the sandbox
 
 ## API
@@ -195,8 +195,8 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python
 CodeMode(
     tools: ToolSelector = 'all',        # 'all', list[str], callable, or dict
     max_retries: int = 3,               # retries on sandbox execution errors
-    os: MontyOS | None = None,          # AbstractOS instance or (fn, args, kwargs) callback
-    mount: MontyMount | None = None,    # MountDir | list[MountDir] of host directories
+    os_access: CodeModeOS | None = None,   # AbstractOS instance or (fn, args, kwargs) callback
+    mount: CodeModeMount | None = None,    # MountDir | list[MountDir] of host directories
 )
 ```
 
diff --git a/pydantic_ai_harness/code_mode/__init__.py b/pydantic_ai_harness/code_mode/__init__.py
index b06d27b..234438c 100644
--- a/pydantic_ai_harness/code_mode/__init__.py
+++ b/pydantic_ai_harness/code_mode/__init__.py
@@ -1,6 +1,6 @@
 """Code mode capability: route tool calls through a sandboxed Python environment."""
 
 from pydantic_ai_harness.code_mode._capability import CodeMode
-from pydantic_ai_harness.code_mode._toolset import CodeModeToolset, MontyMount, MontyOS, MontyOSCallback
+from pydantic_ai_harness.code_mode._toolset import CodeModeMount, CodeModeOS, CodeModeOSCallback, CodeModeToolset
 
-__all__ = ['CodeMode', 'CodeModeToolset', 'MontyMount', 'MontyOS', 'MontyOSCallback']
+__all__ = ['CodeMode', 'CodeModeMount', 'CodeModeOS', 'CodeModeOSCallback', 'CodeModeToolset']
diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py
index 67a2f16..cbab615 100644
--- a/pydantic_ai_harness/code_mode/_capability.py
+++ b/pydantic_ai_harness/code_mode/_capability.py
@@ -9,7 +9,7 @@
 from pydantic_ai.capabilities._tool_search import ToolSearch as _ToolSearch
 from pydantic_ai.tools import AgentDepsT, ToolSelector
 
-from pydantic_ai_harness.code_mode._toolset import CodeModeToolset, MontyMount, MontyOS
+from pydantic_ai_harness.code_mode._toolset import CodeModeMount, CodeModeOS, CodeModeToolset
 
 
 @dataclass
@@ -35,7 +35,7 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     agent = Agent('openai:gpt-5', capabilities=[CodeMode(tools=['search', 'fetch'])])
     ```
 
-    Pass `mount` for host filesystem access and/or `os` for environment/clock
+    Pass `mount` for host filesystem access and/or `os_access` for environment/clock
     (plus filesystem) access -- without them, `pathlib`/`os` I/O and
     `datetime.now()` are unavailable inside `run_code`:
 
@@ -58,17 +58,17 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     max_retries: int = 3
     """Maximum number of retries for the `run_code` tool (syntax errors count as retries)."""
 
-    os: MontyOS | None = None
+    os_access: CodeModeOS | None = None
     """Host-backed OS access for sandboxed code.
 
-    Pass a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback
+    Pass a `pydantic_monty.AbstractOS` instance or a raw OS callback
     `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`,
     `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code`
     are routed to it instead of being unavailable. Fixed at construction, so build
     `CodeMode` per request to scope access.
     """
 
-    mount: MontyMount | None = None
+    mount: CodeModeMount | None = None
     """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`."""
 
     def get_ordering(self) -> CapabilityOrdering:
@@ -81,6 +81,6 @@ def get_wrapper_toolset(self, toolset: AbstractToolset[AgentDepsT]) -> AbstractT
             wrapped=toolset,
             tool_selector=self.tools,
             max_retries=self.max_retries,
-            os=self.os,
+            os_access=self.os_access,
             mount=self.mount,
         )
diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py
index bbfe9d1..c884aec 100644
--- a/pydantic_ai_harness/code_mode/_toolset.py
+++ b/pydantic_ai_harness/code_mode/_toolset.py
@@ -51,15 +51,15 @@
 # Type alias for the dispatch callback passed to _execution_loop.
 _DispatchFn = Callable[[str, dict[str, Any]], Coroutine[Any, Any, Any]]
 
-# A raw Monty OS callback: `(function_name, args, kwargs) -> result`. Return
-# `pydantic_monty.NOT_HANDLED` to fall back to Monty's default handling.
-MontyOSCallback = Callable[[OsFunction, tuple[Any, ...], dict[str, Any]], Any]
-# What `CodeMode.os` accepts: either an `AbstractOS` instance or a raw callback.
-# Monty's `feed_start`/`resume` accept both interchangeably, so no normalization.
-MontyOS = AbstractOS | MontyOSCallback
-# What `CodeMode.mount` accepts: one or more host-directory mounts (matches Monty's
-# `feed_start`/`resume` `mount=` parameter type exactly).
-MontyMount = MountDir | list[MountDir]
+# A raw OS callback: `(function_name, args, kwargs) -> result`. Return
+# `pydantic_monty.NOT_HANDLED` to fall back to the sandbox's default handling.
+CodeModeOSCallback = Callable[[OsFunction, tuple[Any, ...], dict[str, Any]], Any]
+# What `CodeMode.os_access` accepts: either an `AbstractOS` instance or a raw callback.
+# The sandbox's `feed_start`/`resume` accept both interchangeably, so no normalization.
+CodeModeOS = AbstractOS | CodeModeOSCallback
+# What `CodeMode.mount` accepts: one or more host-directory mounts (matches the
+# sandbox's `feed_start`/`resume` `mount=` parameter type exactly).
+CodeModeMount = MountDir | list[MountDir]
 
 
 class _RunCodeArguments(TypedDict):
@@ -238,15 +238,15 @@ class CodeModeToolset(WrapperToolset[AgentDepsT]):
     max_retries: int = 3
     """Maximum number of retries for the `run_code` tool (syntax errors count as retries)."""
 
-    os: MontyOS | None = None
+    os_access: CodeModeOS | None = None
     """Host-backed OS access exposed to sandboxed code.
 
-    Either a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback
+    Either a `pydantic_monty.AbstractOS` instance or a raw OS callback
     `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`,
     `datetime.datetime.now()`, and `datetime.date.today()` calls inside the
     sandbox are routed to it instead of being unavailable."""
 
-    mount: MontyMount | None = None
+    mount: CodeModeMount | None = None
     """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`."""
 
     # init=False so `replace()` in `for_run` produces a fresh instance with _repl=None,
@@ -302,7 +302,7 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[
         callable_defs, sanitized_to_original = self._partition_callable_tools(sandboxed_tools)
 
         description = self._build_description(
-            callable_defs, has_os=self.os is not None, has_mount=self.mount is not None
+            callable_defs, has_os=self.os_access is not None, has_mount=self.mount is not None
         )
 
         if _RUN_CODE_TOOL_NAME in native_tools:
@@ -466,7 +466,7 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any:
         capture = _PrintCapture()
 
         try:
-            monty_state = self._repl.feed_start(code, print_callback=capture, os=self.os, mount=self.mount)
+            monty_state = self._repl.feed_start(code, print_callback=capture, os=self.os_access, mount=self.mount)
             completed = await _execution_loop(
                 monty_state,
                 dispatch=dispatch_tool_call,
@@ -474,7 +474,7 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any:
                 sanitized_to_original=sanitized_to_original,
                 sequential_tools=sequential_tools,
                 global_sequential=global_sequential,
-                os=self.os,
+                os=self.os_access,
                 mount=self.mount,
             )
         except MontySyntaxError as e:
@@ -649,8 +649,8 @@ async def _execution_loop(
     sanitized_to_original: dict[str, str],
     sequential_tools: set[str],
     global_sequential: bool,
-    os: MontyOS | None,
-    mount: MontyMount | None,
+    os: CodeModeOS | None,
+    mount: CodeModeMount | None,
 ) -> MontyComplete:
     """Drive the Monty REPL via the synchronous snapshot API until completion.
 
@@ -723,8 +723,8 @@ async def _handle_function_snapshot(
     global_sequential: bool,
     pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]],
     pre_resolved: dict[int, ExternalResult],
-    os: MontyOS | None,
-    mount: MontyMount | None,
+    os: CodeModeOS | None,
+    mount: CodeModeMount | None,
 ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete:
     """Handle a single FunctionSnapshot from the Monty execution loop."""
     fn_name = snapshot.function_name
@@ -768,8 +768,8 @@ async def _resolve_future_snapshot(
     pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]],
     pre_resolved: dict[int, ExternalResult],
     global_sequential: bool,
-    os: MontyOS | None,
-    mount: MontyMount | None,
+    os: CodeModeOS | None,
+    mount: CodeModeMount | None,
 ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete:
     """Resolve pending tool calls at a FutureSnapshot."""
     pending_ids = snapshot.pending_call_ids
diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py
index 766dec3..67e8d5b 100644
--- a/tests/code_mode/test_code_mode.py
+++ b/tests/code_mode/test_code_mode.py
@@ -1861,7 +1861,7 @@ def _unused_os_callback(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str,
 
 
 class TestCodeModeOSAccess:
-    """`CodeMode(os=...)` / `mount=...` give sandboxed code host-backed OS access."""
+    """`CodeMode(os_access=...)` / `mount=...` give sandboxed code host-backed OS access."""
 
     async def test_description_default_keeps_no_wallclock_restriction(self) -> None:
         """Without `os`/`mount`, the description keeps the no-wall-clock restriction."""
@@ -1874,7 +1874,7 @@ async def test_description_default_keeps_no_wallclock_restriction(self) -> None:
 
     async def test_description_with_os_callback_notes_host_access(self) -> None:
         """An `os` callback swaps the restriction line for the host-access note."""
-        wrapper = CodeMode[None](os=_unused_os_callback).get_wrapper_toolset(_build_function_toolset(add))
+        wrapper = CodeMode[None](os_access=_unused_os_callback).get_wrapper_toolset(_build_function_toolset(add))
         assert isinstance(wrapper, CodeModeToolset)
         description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
         assert description is not None
@@ -1899,7 +1899,9 @@ async def test_description_mount_only_advertises_filesystem_not_env_or_clock(sel
     async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None:
         """The host-access note appears even when no tools are sandboxed (base description)."""
         # `tools=[]` leaves every tool native, so `run_code` exposes no callable functions.
-        wrapper = CodeMode[None](os=_unused_os_callback, tools=[]).get_wrapper_toolset(_build_function_toolset(add))
+        wrapper = CodeMode[None](os_access=_unused_os_callback, tools=[]).get_wrapper_toolset(
+            _build_function_toolset(add)
+        )
         assert isinstance(wrapper, CodeModeToolset)
         description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
         assert description is not None
@@ -1915,7 +1917,7 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
                 return 'envval'
             return NOT_HANDLED  # pragma: no cover - sandbox only calls os.getenv here
 
-        wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add))
         assert isinstance(wrapper, CodeModeToolset)
         ctx = await build_ctx(None, wrapper)
         tools = await wrapper.get_tools(ctx)
@@ -1934,7 +1936,7 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
                 return 'persisted'
             return NOT_HANDLED  # pragma: no cover - sandbox only calls os.getenv here
 
-        wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add))
         assert isinstance(wrapper, CodeModeToolset)
         ctx = await build_ctx(None, wrapper)
         tools = await wrapper.get_tools(ctx)
@@ -1946,7 +1948,7 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
 
     async def test_abstract_os_instance_dispatches_inside_run_code(self) -> None:
         """An `AbstractOS` instance is accepted as the `os` value and dispatches OS calls."""
-        wrapper = CodeMode[None](os=OSAccess(environ={'THING': 'fromabs'})).get_wrapper_toolset(
+        wrapper = CodeMode[None](os_access=OSAccess(environ={'THING': 'fromabs'})).get_wrapper_toolset(
             _build_function_toolset(add)
         )
         assert isinstance(wrapper, CodeModeToolset)
@@ -1962,7 +1964,7 @@ async def test_os_callback_exception_becomes_model_retry(self) -> None:
         def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
             raise ValueError('boom from os')
 
-        wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add))
         assert isinstance(wrapper, CodeModeToolset)
         ctx = await build_ctx(None, wrapper)
         tools = await wrapper.get_tools(ctx)
@@ -1998,11 +2000,13 @@ async def test_mount_accepts_list_of_directories(self, tmp_path: Any) -> None:
         assert result.return_value == 'AABB'
 
     def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Any) -> None:
-        """`CodeMode` forwards `os`/`mount` onto the `CodeModeToolset` it builds."""
+        """`CodeMode` forwards `os_access`/`mount` onto the `CodeModeToolset` it builds."""
         mount = MountDir('/work', str(tmp_path))
-        wrapper = CodeMode[None](os=_unused_os_callback, mount=mount).get_wrapper_toolset(_build_function_toolset(add))
+        wrapper = CodeMode[None](os_access=_unused_os_callback, mount=mount).get_wrapper_toolset(
+            _build_function_toolset(add)
+        )
         assert isinstance(wrapper, CodeModeToolset)
-        assert wrapper.os is _unused_os_callback
+        assert wrapper.os_access is _unused_os_callback
         assert wrapper.mount is mount
 
 

From f420a9f9fe1790e35df2115ffcfab947b0473dd7 Mon Sep 17 00:00:00 2001
From: Aditya Vardhan <adtyavrdhn@gmail.com>
Date: Tue, 2 Jun 2026 19:58:13 +0530
Subject: [PATCH 09/13] refactor(code_mode): stop shadowing the os module in
 the execution loop

The helpers that pass the OS/mount options through the execution loop named
their parameter `os`, shadowing the stdlib module inside those helpers.
Rename the variable to `os_access` (matching the public field), keeping
Monty's required `os=` keyword only at the resume/feed_start call sites. Also
inline the single-use restriction-line helper into `_base_description`.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pydantic_ai_harness/code_mode/_toolset.py | 46 +++++++++++------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py
index c884aec..7f5317e 100644
--- a/pydantic_ai_harness/code_mode/_toolset.py
+++ b/pydantic_ai_harness/code_mode/_toolset.py
@@ -124,23 +124,19 @@ class _RunCodeArguments(TypedDict):
 """
 
 
-def _os_access_restriction(*, has_os: bool, has_mount: bool) -> str:
-    """Pick the OS/filesystem restriction line for the `run_code` description.
+def _base_description(*, has_os: bool, has_mount: bool) -> str:
+    """Assemble the `run_code` base description with the right OS-access restriction line.
 
     `os` routes environment, clock, and filesystem calls; a `mount` alone only
     exposes filesystem paths, so a mount-only sandbox must not advertise env or
     clock access (the model would generate calls that fail and burn retries).
     """
     if has_os:
-        return _OS_ENABLED_NOTE
-    if has_mount:
-        return _MOUNT_ONLY_NOTE
-    return _NO_OS_RESTRICTION
-
-
-def _base_description(*, has_os: bool, has_mount: bool) -> str:
-    """Assemble the `run_code` base description with the right OS-access line."""
-    restriction = _os_access_restriction(has_os=has_os, has_mount=has_mount)
+        restriction = _OS_ENABLED_NOTE
+    elif has_mount:
+        restriction = _MOUNT_ONLY_NOTE
+    else:
+        restriction = _NO_OS_RESTRICTION
     return f'{_RUN_CODE_DESCRIPTION_HEAD}\n{restriction}\n{_RUN_CODE_DESCRIPTION_TAIL}'
 
 
@@ -474,7 +470,7 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any:
                 sanitized_to_original=sanitized_to_original,
                 sequential_tools=sequential_tools,
                 global_sequential=global_sequential,
-                os=self.os_access,
+                os_access=self.os_access,
                 mount=self.mount,
             )
         except MontySyntaxError as e:
@@ -649,7 +645,7 @@ async def _execution_loop(
     sanitized_to_original: dict[str, str],
     sequential_tools: set[str],
     global_sequential: bool,
-    os: CodeModeOS | None,
+    os_access: CodeModeOS | None,
     mount: CodeModeMount | None,
 ) -> MontyComplete:
     """Drive the Monty REPL via the synchronous snapshot API until completion.
@@ -680,7 +676,7 @@ async def _execution_loop(
     try:
         while not isinstance(monty_state, MontyComplete):
             if isinstance(monty_state, NameLookupSnapshot):
-                monty_state = monty_state.resume(os=os, mount=mount)
+                monty_state = monty_state.resume(os=os_access, mount=mount)
             elif isinstance(monty_state, FunctionSnapshot):
                 monty_state = await _handle_function_snapshot(
                     monty_state,
@@ -691,7 +687,7 @@ async def _execution_loop(
                     global_sequential=global_sequential,
                     pending=pending,
                     pre_resolved=pre_resolved,
-                    os=os,
+                    os_access=os_access,
                     mount=mount,
                 )
             else:
@@ -700,7 +696,7 @@ async def _execution_loop(
                     pending=pending,
                     pre_resolved=pre_resolved,
                     global_sequential=global_sequential,
-                    os=os,
+                    os_access=os_access,
                     mount=mount,
                 )
     finally:
@@ -723,19 +719,19 @@ async def _handle_function_snapshot(
     global_sequential: bool,
     pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]],
     pre_resolved: dict[int, ExternalResult],
-    os: CodeModeOS | None,
+    os_access: CodeModeOS | None,
     mount: CodeModeMount | None,
 ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete:
     """Handle a single FunctionSnapshot from the Monty execution loop."""
     fn_name = snapshot.function_name
 
     if fn_name not in callable_defs:
-        return snapshot.resume({'exception': NameError(f'Unknown function: {fn_name}')}, os=os, mount=mount)
+        return snapshot.resume({'exception': NameError(f'Unknown function: {fn_name}')}, os=os_access, mount=mount)
 
     if snapshot.args:
         return snapshot.resume(
             {'exception': TypeError(f'{fn_name}() does not accept positional arguments; use keyword arguments')},
-            os=os,
+            os=os_access,
             mount=mount,
         )
 
@@ -749,8 +745,8 @@ async def _handle_function_snapshot(
             pre_resolved[cid] = await _resolve_coro(pending.pop(cid))
         outcome = await _resolve_coro(dispatch(original_name, snapshot.kwargs))
         if 'return_value' in outcome:
-            return snapshot.resume({'return_value': outcome['return_value']}, os=os, mount=mount)
-        return snapshot.resume({'exception': outcome['exception']}, os=os, mount=mount)
+            return snapshot.resume({'return_value': outcome['return_value']}, os=os_access, mount=mount)
+        return snapshot.resume({'exception': outcome['exception']}, os=os_access, mount=mount)
 
     # Deferred execution — store for later resolution at FutureSnapshot.
     if global_sequential:
@@ -759,7 +755,7 @@ async def _handle_function_snapshot(
     else:
         # Eagerly schedule as a Task for concurrent execution.
         pending[snapshot.call_id] = asyncio.ensure_future(dispatch(original_name, snapshot.kwargs))
-    return snapshot.resume({'future': ...}, os=os, mount=mount)
+    return snapshot.resume({'future': ...}, os=os_access, mount=mount)
 
 
 async def _resolve_future_snapshot(
@@ -768,13 +764,13 @@ async def _resolve_future_snapshot(
     pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]],
     pre_resolved: dict[int, ExternalResult],
     global_sequential: bool,
-    os: CodeModeOS | None,
+    os_access: CodeModeOS | None,
     mount: CodeModeMount | None,
 ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete:
     """Resolve pending tool calls at a FutureSnapshot."""
     pending_ids = snapshot.pending_call_ids
     if not pending_ids:  # pragma: no cover
-        return snapshot.resume(results={}, os=os, mount=mount)
+        return snapshot.resume(results={}, os=os_access, mount=mount)
 
     results: dict[int, ExternalResult] = {}
     for cid in pending_ids:
@@ -793,7 +789,7 @@ async def _resolve_future_snapshot(
         for cid, outcome in zip(gather_ids, settled):
             results[cid] = _settle_outcome(outcome)
 
-    return snapshot.resume(results=results, os=os, mount=mount)
+    return snapshot.resume(results=results, os=os_access, mount=mount)
 
 
 async def _resolve_coro(

From 007aa05662e5a99bf01d9818a45587383e68a545 Mon Sep 17 00:00:00 2001
From: Aditya Vardhan <adtyavrdhn@gmail.com>
Date: Tue, 2 Jun 2026 20:05:24 +0530
Subject: [PATCH 10/13] refactor(code_mode): make CodeMode config fields
 keyword-only

The option list keeps growing; pin tools/max_retries as the only
positional args and force os_access/mount (and future config) to be
passed by name via a KW_ONLY sentinel, so adding options can't silently
shift positional meaning.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pydantic_ai_harness/code_mode/_capability.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py
index cbab615..f824e34 100644
--- a/pydantic_ai_harness/code_mode/_capability.py
+++ b/pydantic_ai_harness/code_mode/_capability.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from dataclasses import dataclass, field
+from dataclasses import KW_ONLY, dataclass, field
 
 from pydantic_ai import AbstractToolset
 from pydantic_ai.capabilities import AbstractCapability, CapabilityOrdering
@@ -58,6 +58,10 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     max_retries: int = 3
     """Maximum number of retries for the `run_code` tool (syntax errors count as retries)."""
 
+    _: KW_ONLY
+    # Everything below is keyword-only: the option list keeps growing, so new
+    # config must be passed by name rather than relying on positional order.
+
     os_access: CodeModeOS | None = None
     """Host-backed OS access for sandboxed code.
 

From 465e8d65f65367435e86c1da81195335003bd27e Mon Sep 17 00:00:00 2001
From: Aditya Vardhan <adtyavrdhn@gmail.com>
Date: Tue, 2 Jun 2026 20:57:33 +0530
Subject: [PATCH 11/13] docs(code_mode): make os_access/mount docs clear on
 first read

Public docs should let a reader grasp the host-access surface without
reverse-engineering it. Reframe the docstrings and README around when to
reach for each primitive instead of what is switched off, drop the
type-restating prose the annotations already carry, and lead with concrete
tasks (share a dataset; inject just the secrets the agent needs).

Tighten the os-access tests so each one asserts exactly its own invariant:
drop redundant negative description asserts (exactly one note is interpolated,
so the positive phrase alone proves which note was selected), drop an assertion
already covered by another test, and add type annotations to the tmp_path
fixtures.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pydantic_ai_harness/code_mode/README.md      | 55 +++++++++++++-------
 pydantic_ai_harness/code_mode/_capability.py | 26 +++++----
 pydantic_ai_harness/code_mode/_toolset.py    | 19 +++----
 tests/code_mode/test_code_mode.py            | 20 +++----
 4 files changed, 62 insertions(+), 58 deletions(-)

diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md
index fccb518..c05b92c 100644
--- a/pydantic_ai_harness/code_mode/README.md
+++ b/pydantic_ai_harness/code_mode/README.md
@@ -142,39 +142,56 @@ for msg in result.all_messages():
 
 ## Filesystem and OS access
 
-The sandbox has no filesystem or clock by default: the `os` and `pathlib` modules import, but their
-I/O, `datetime.now()`, and `date.today()` are unavailable. Pass `os` and/or `mount` to back them with
-a host-controlled implementation.
+Sandboxed code runs with no access to the host's files, environment, or clock. Two parameters grant
+it -- reach for them when the agent's task genuinely needs the host.
+
+**`mount` -- share host directories.** Reach for this when the agent works with real files: analyzing
+a dataset you've dropped in a folder and writing a report back, editing a checkout, or processing a
+batch of documents. Sandboxed `pathlib` code reads and writes under the mounted path. (For
+environment variables or the clock, use `os_access` instead.)
 
 ```python
-from pydantic_monty import NOT_HANDLED, MountDir, OSAccess
+from pydantic_monty import MountDir
 
 from pydantic_ai_harness import CodeMode
 
-# Expose a host directory at /work inside the sandbox:
-CodeMode(mount=MountDir('/work', '/tmp/agent-workspace'))
+# The agent can read /work/data.csv and write /work/summary.md back to the host:
+CodeMode(mount=MountDir('/work', '/tmp/agent-workspace', mode='read-write'))
+```
+
+**`os_access` -- answer the sandbox's OS calls yourself.** Reach for this when the agent needs
+environment variables, the current date and time, or filesystem behavior you control. Hand it a
+ready-made OS implementation, or a callback that decides each call -- so you can inject just the
+secrets it needs, pin "now" for reproducible runs, or route file access to your own store.
+
+```python
+from pydantic_monty import NOT_HANDLED, OSAccess
+
+from pydantic_ai_harness import CodeMode
+
+# Give the agent a fixed set of environment values:
+CodeMode(os_access=OSAccess(environ={'API_BASE': 'https://api.example.com'}))
+
 
-# Supply environment/clock via an AbstractOS instance:
-CodeMode(os_access=OSAccess(environ={'STAGE': 'prod'}))
+# ...or intercept each call to decide what the agent may see:
+allowed_env = {'API_KEY': 'sk-...'}
 
 
-# ...or a raw `(function_name, args, kwargs)` callback; return NOT_HANDLED to defer to Monty:
 def my_os(fn, args, kwargs):
-    return 'secret-value' if fn == 'os.getenv' else NOT_HANDLED
+    if fn == 'os.getenv':
+        return allowed_env.get(args[0], NOT_HANDLED)  # only allow-listed keys; the rest stay hidden
+    return NOT_HANDLED
 
 
 CodeMode(os_access=my_os)
 ```
 
-`os_access` takes a `pydantic_monty.AbstractOS` or that callback and routes environment, clock, and
-filesystem calls; `mount` takes one or more `MountDir` and exposes host filesystem paths only (a
-mount alone does **not** enable `os.getenv` or `datetime.now()`). Both are fixed when the capability
-is built, so construct `CodeMode` per request to scope access. `run_code`'s description reflects
-exactly what's enabled; `asyncio.sleep` and `time` stay unavailable either way.
+Both expose the real host to model-written code, so grant only what the task needs. Access is fixed
+when the capability is built, so construct `CodeMode` per request to scope it.
 
 A `MountDir` defaults to copy-on-write `mode='overlay'`: the sandbox reads host files and sees its
-own writes, but those writes do **not** reach the host directory. Pass `MountDir(..., mode='read-write')`
-to persist writes to the host, or `mode='read-only'` to forbid them.
+own writes, but those writes do **not** reach the host. Pass `mode='read-write'` to persist them, or
+`mode='read-only'` to forbid writes.
 
 > Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types.
 
@@ -195,8 +212,8 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python
 CodeMode(
     tools: ToolSelector = 'all',        # 'all', list[str], callable, or dict
     max_retries: int = 3,               # retries on sandbox execution errors
-    os_access: CodeModeOS | None = None,   # AbstractOS instance or (fn, args, kwargs) callback
-    mount: CodeModeMount | None = None,    # MountDir | list[MountDir] of host directories
+    os_access: CodeModeOS | None = None,   # host handler for env vars, clock, and file I/O
+    mount: CodeModeMount | None = None,    # host directories to share with the sandbox
 )
 ```
 
diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py
index f824e34..2dc8702 100644
--- a/pydantic_ai_harness/code_mode/_capability.py
+++ b/pydantic_ai_harness/code_mode/_capability.py
@@ -35,9 +35,16 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     agent = Agent('openai:gpt-5', capabilities=[CodeMode(tools=['search', 'fetch'])])
     ```
 
-    Pass `mount` for host filesystem access and/or `os_access` for environment/clock
-    (plus filesystem) access -- without them, `pathlib`/`os` I/O and
-    `datetime.now()` are unavailable inside `run_code`:
+    By default, sandboxed code cannot touch the host -- no filesystem, environment
+    variables, or clock. Two parameters open it up:
+
+    - `mount` shares specific host directories: reach for it when the agent reads or
+      writes real files.
+    - `os_access` routes the sandbox's OS calls to a handler you provide: reach for it
+      when the agent needs environment variables, the clock, or filesystem behavior you
+      control.
+
+    Both expose the real host to model-written code, so grant only what the task needs.
 
     ```python
     from pydantic_monty import MountDir
@@ -59,21 +66,12 @@ class CodeMode(AbstractCapability[AgentDepsT]):
     """Maximum number of retries for the `run_code` tool (syntax errors count as retries)."""
 
     _: KW_ONLY
-    # Everything below is keyword-only: the option list keeps growing, so new
-    # config must be passed by name rather than relying on positional order.
 
     os_access: CodeModeOS | None = None
-    """Host-backed OS access for sandboxed code.
-
-    Pass a `pydantic_monty.AbstractOS` instance or a raw OS callback
-    `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`,
-    `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code`
-    are routed to it instead of being unavailable. Fixed at construction, so build
-    `CodeMode` per request to scope access.
-    """
+    """Give sandboxed code environment variables, the clock, and file I/O through a handler you provide; unset, they are unavailable."""
 
     mount: CodeModeMount | None = None
-    """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`."""
+    """Host directories to expose to sandboxed `pathlib` code; each mount's `mode` controls whether writes reach the host."""
 
     def get_ordering(self) -> CapabilityOrdering:
         """CodeMode wraps around ToolSearch so that search_tools stays native."""
diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py
index 7f5317e..cc6f1bc 100644
--- a/pydantic_ai_harness/code_mode/_toolset.py
+++ b/pydantic_ai_harness/code_mode/_toolset.py
@@ -51,14 +51,12 @@
 # Type alias for the dispatch callback passed to _execution_loop.
 _DispatchFn = Callable[[str, dict[str, Any]], Coroutine[Any, Any, Any]]
 
-# A raw OS callback: `(function_name, args, kwargs) -> result`. Return
-# `pydantic_monty.NOT_HANDLED` to fall back to the sandbox's default handling.
+# A raw OS callback. Return `pydantic_monty.NOT_HANDLED` to refuse the call: the
+# sandbox treats it as unsupported and raises. Return a value (even `None`) to answer it.
 CodeModeOSCallback = Callable[[OsFunction, tuple[Any, ...], dict[str, Any]], Any]
-# What `CodeMode.os_access` accepts: either an `AbstractOS` instance or a raw callback.
-# The sandbox's `feed_start`/`resume` accept both interchangeably, so no normalization.
+# Accepted by `CodeMode.os_access`: a ready-made OS implementation or a raw callback.
 CodeModeOS = AbstractOS | CodeModeOSCallback
-# What `CodeMode.mount` accepts: one or more host-directory mounts (matches the
-# sandbox's `feed_start`/`resume` `mount=` parameter type exactly).
+# Accepted by `CodeMode.mount`: one or more host-directory mounts.
 CodeModeMount = MountDir | list[MountDir]
 
 
@@ -235,15 +233,10 @@ class CodeModeToolset(WrapperToolset[AgentDepsT]):
     """Maximum number of retries for the `run_code` tool (syntax errors count as retries)."""
 
     os_access: CodeModeOS | None = None
-    """Host-backed OS access exposed to sandboxed code.
-
-    Either a `pydantic_monty.AbstractOS` instance or a raw OS callback
-    `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`,
-    `datetime.datetime.now()`, and `datetime.date.today()` calls inside the
-    sandbox are routed to it instead of being unavailable."""
+    """Give sandboxed code environment variables, the clock, and file I/O through a handler you provide; unset, they are unavailable."""
 
     mount: CodeModeMount | None = None
-    """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`."""
+    """Host directories to expose to sandboxed `pathlib` code; each mount's `mode` controls whether writes reach the host."""
 
     # init=False so `replace()` in `for_run` produces a fresh instance with _repl=None,
     # giving each agent run isolated REPL state. Lazy-initialized on first call_tool.
diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py
index 67e8d5b..3c820eb 100644
--- a/tests/code_mode/test_code_mode.py
+++ b/tests/code_mode/test_code_mode.py
@@ -8,6 +8,7 @@
 
 from __future__ import annotations
 
+from pathlib import Path
 from typing import Any, TypeVar
 
 import pytest
@@ -1870,7 +1871,6 @@ async def test_description_default_keeps_no_wallclock_restriction(self) -> None:
         description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
         assert description is not None
         assert 'No wall-clock or timing primitives' in description
-        assert 'Host-backed OS access' not in description
 
     async def test_description_with_os_callback_notes_host_access(self) -> None:
         """An `os` callback swaps the restriction line for the host-access note."""
@@ -1879,9 +1879,8 @@ async def test_description_with_os_callback_notes_host_access(self) -> None:
         description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
         assert description is not None
         assert 'Host-backed OS access' in description
-        assert 'No wall-clock or timing primitives' not in description
 
-    async def test_description_mount_only_advertises_filesystem_not_env_or_clock(self, tmp_path: Any) -> None:
+    async def test_description_mount_only_advertises_filesystem_not_env_or_clock(self, tmp_path: Path) -> None:
         """A `mount` without `os` advertises filesystem access only -- it must not tell the model
         that env/clock are host-backed, since a mount cannot route `os.getenv`/`datetime.now()`."""
         wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset(
@@ -1890,15 +1889,13 @@ async def test_description_mount_only_advertises_filesystem_not_env_or_clock(sel
         assert isinstance(wrapper, CodeModeToolset)
         description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
         assert description is not None
+        # The regression guard: a mount must select the filesystem note, not the OS note that would
+        # (wrongly) advertise env/clock as host-routed -- this assert fails if the OS note is picked.
         assert 'Mounted filesystem access' in description
-        assert 'Host-backed OS access' not in description
-        # env/clock are explicitly called out as still unavailable, not advertised as routed.
-        assert '`os.getenv`/`os.environ`, `datetime.datetime.now()`, `datetime.date.today()`' in description
-        assert 'remain unavailable' in description
 
     async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None:
         """The host-access note appears even when no tools are sandboxed (base description)."""
-        # `tools=[]` leaves every tool native, so `run_code` exposes no callable functions.
+        # `tools=[]` sandboxes nothing, so `run_code` renders the base description path.
         wrapper = CodeMode[None](os_access=_unused_os_callback, tools=[]).get_wrapper_toolset(
             _build_function_toolset(add)
         )
@@ -1906,7 +1903,6 @@ async def test_description_host_access_note_shows_with_no_sandboxed_tools(self)
         description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description
         assert description is not None
         assert 'Host-backed OS access' in description
-        assert 'functions are available inside the sandbox' not in description
 
     async def test_os_callback_dispatches_inside_run_code(self) -> None:
         """An `os` callback is threaded through `feed_start` and every `resume`, so OS calls
@@ -1971,7 +1967,7 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
         with pytest.raises(ModelRetry, match='boom from os'):
             await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('X')"}, ctx, tools['run_code'])
 
-    async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None:
+    async def test_mount_exposes_host_directory(self, tmp_path: Path) -> None:
         """A `mount` exposes a host directory inside the sandbox, threaded through resumes."""
         (tmp_path / 'data.txt').write_text('hello-from-host')
         wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset(
@@ -1984,7 +1980,7 @@ async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None:
         result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code'])
         assert result.return_value == 'hello-from-host'
 
-    async def test_mount_accepts_list_of_directories(self, tmp_path: Any) -> None:
+    async def test_mount_accepts_list_of_directories(self, tmp_path: Path) -> None:
         """`mount` accepts a `list[MountDir]`; each directory is exposed at its virtual path."""
         (tmp_path / 'a').mkdir()
         (tmp_path / 'b').mkdir()
@@ -1999,7 +1995,7 @@ async def test_mount_accepts_list_of_directories(self, tmp_path: Any) -> None:
         result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code'])
         assert result.return_value == 'AABB'
 
-    def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Any) -> None:
+    def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Path) -> None:
         """`CodeMode` forwards `os_access`/`mount` onto the `CodeModeToolset` it builds."""
         mount = MountDir('/work', str(tmp_path))
         wrapper = CodeMode[None](os_access=_unused_os_callback, mount=mount).get_wrapper_toolset(

From 5f36134d7686bc12b880bd00ea5d6dc6a586ed2d Mon Sep 17 00:00:00 2001
From: Aditya Vardhan <adtyavrdhn@gmail.com>
Date: Wed, 3 Jun 2026 12:53:52 +0530
Subject: [PATCH 12/13] docs(code_mode): clarify os_access callback return
 semantics

The raw-callback example claimed non-allow-listed keys "stay hidden" by
returning NOT_HANDLED. Verified against Monty: NOT_HANDLED *refuses* the
call (raises in the sandbox -> model retry); it does not make the call
return None. Model-written code probing for an optional secret would
raise on the lookup and burn retries.

Distinguish the two return modes explicitly so users don't pick the
wrong one: return a value (incl. None) to answer/hide, NOT_HANDLED to
refuse a capability outright.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pydantic_ai_harness/code_mode/README.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md
index c05b92c..963bd58 100644
--- a/pydantic_ai_harness/code_mode/README.md
+++ b/pydantic_ai_harness/code_mode/README.md
@@ -179,13 +179,25 @@ allowed_env = {'API_KEY': 'sk-...'}
 
 def my_os(fn, args, kwargs):
     if fn == 'os.getenv':
-        return allowed_env.get(args[0], NOT_HANDLED)  # only allow-listed keys; the rest stay hidden
+        # Answer the call: allow-listed keys resolve, every other key reads back
+        # as None -- absent, exactly like a real unset variable.
+        return allowed_env.get(args[0])
+    # Refuse everything else: NOT_HANDLED makes the call fail in the sandbox.
     return NOT_HANDLED
 
 
 CodeMode(os_access=my_os)
 ```
 
+Your callback's return value decides the call's fate, and the two outcomes are easy to confuse:
+
+- **Return any value** -- including `None`, `''`, or `0` -- and that becomes the result the sandbox
+  sees. `os.getenv` returning `None` looks exactly like a normal unset variable, so the agent's code
+  keeps running. This is how you *hide* something: answer with an empty value.
+- **Return `NOT_HANDLED`** and the call is treated as unsupported: it raises inside the sandbox and
+  the model gets a retry. This *refuses* a capability outright -- use it to block, not to say "no
+  value". Returning `NOT_HANDLED` for a key the agent reasonably expects will burn retries.
+
 Both expose the real host to model-written code, so grant only what the task needs. Access is fixed
 when the capability is built, so construct `CodeMode` per request to scope it.
 

From 760abeec08c02e155bda0c358dabc5811b2fb20a Mon Sep 17 00:00:00 2001
From: Aditya Vardhan <adtyavrdhn@gmail.com>
Date: Wed, 3 Jun 2026 12:56:31 +0530
Subject: [PATCH 13/13] test(code_mode): lock in os_access value-vs-NOT_HANDLED
 semantics

Returning a value (including None) from an os_access callback answers
the call -- a None reads back like an unset env var, so the sandbox
keeps running. Returning NOT_HANDLED refuses the call, raising in the
sandbox and surfacing as ModelRetry. These two paths are easy to
confuse and silently regress, so pin both.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/code_mode/test_code_mode.py | 40 +++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py
index 3c820eb..e6f3f1a 100644
--- a/tests/code_mode/test_code_mode.py
+++ b/tests/code_mode/test_code_mode.py
@@ -1967,6 +1967,46 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
         with pytest.raises(ModelRetry, match='boom from os'):
             await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('X')"}, ctx, tools['run_code'])
 
+    async def test_os_callback_returning_value_answers_call_including_none(self) -> None:
+        """Returning a value from the `os` callback -- even `None` -- *answers* the call.
+
+        Allow-listed keys resolve; every other key reads back as `None`, exactly like a real
+        unset env var, so the sandbox keeps running with no retry. This is how a callback hides
+        a secret: by answering with an empty value, not by refusing the call.
+        """
+        allowed = {'API_KEY': 'sk-xxx'}
+
+        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+            if fn == 'os.getenv':
+                return allowed.get(args[0])
+            return NOT_HANDLED  # pragma: no cover - sandbox only calls os.getenv here
+
+        wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        ctx = await build_ctx(None, wrapper)
+        tools = await wrapper.get_tools(ctx)
+        code = "import os\n{'allowed': os.getenv('API_KEY'), 'hidden': os.getenv('SECRET')}"
+        result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code'])
+        assert result.return_value == {'allowed': 'sk-xxx', 'hidden': None}
+
+    async def test_os_callback_not_handled_refuses_call_as_model_retry(self) -> None:
+        """Returning `NOT_HANDLED` *refuses* the call rather than answering it.
+
+        The OS function is treated as unsupported, so it raises in the sandbox and surfaces as
+        `ModelRetry`. This is the counterpart to returning a value: refusing is not the same as
+        answering `None`, and returning `NOT_HANDLED` for a key the model expects will burn retries.
+        """
+
+        def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+            return NOT_HANDLED
+
+        wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add))
+        assert isinstance(wrapper, CodeModeToolset)
+        ctx = await build_ctx(None, wrapper)
+        tools = await wrapper.get_tools(ctx)
+        with pytest.raises(ModelRetry, match='not supported in this environment'):
+            await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('X')"}, ctx, tools['run_code'])
+
     async def test_mount_exposes_host_directory(self, tmp_path: Path) -> None:
         """A `mount` exposes a host directory inside the sandbox, threaded through resumes."""
         (tmp_path / 'data.txt').write_text('hello-from-host')