From 11c7144d5c0045412034da12292c39cd3908b5e8 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 10:46:13 +0000 Subject: [PATCH 01/13] feat(code_mode): expose host-backed OS access to the sandbox Sandboxed `run_code` had no way to reach the filesystem, environment, or wall clock: Monty supports it through an OS callback / `AbstractOS` and directory mounts, but `CodeMode` never threaded `os`/`mount` into `feed_start` or the snapshot resume loop, so callers couldn't enable it. Add `os` and `mount` options on `CodeMode`/`CodeModeToolset`, thread them through `feed_start` and every `resume` site (OS auto-dispatch stops the moment a resume omits them), and make the `run_code` description reflect whether host-backed access is configured. --- pydantic_ai_harness/code_mode/README.md | 50 ++++++++- pydantic_ai_harness/code_mode/__init__.py | 4 +- pydantic_ai_harness/code_mode/_capability.py | 33 +++++- pydantic_ai_harness/code_mode/_toolset.py | 100 ++++++++++++++--- tests/code_mode/test_code_mode.py | 106 +++++++++++++++++++ 5 files changed, 270 insertions(+), 23 deletions(-) diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md index 62fdda9..22dedc4 100644 --- a/pydantic_ai_harness/code_mode/README.md +++ b/pydantic_ai_harness/code_mode/README.md @@ -140,22 +140,66 @@ for msg in result.all_messages(): tool_returns = part.metadata['tool_returns'] # dict[str, ToolReturnPart] ``` +## Host-backed OS access + +By default the sandbox has no filesystem or clock: `os`/`pathlib` are importable but their I/O +operations and `datetime.datetime.now()`/`datetime.date.today()` are unavailable. Pass `os` and/or +`mount` to route those operations to a host-controlled implementation. + +```python +from pydantic_monty import MountDir +from pydantic_ai_harness import CodeMode + +# Expose a host directory inside the sandbox (read/write under /work): +CodeMode(mount=MountDir('/work', '/tmp/agent-workspace')) + +# Or supply a custom OS implementation (an `AbstractOS` instance): +from pydantic_monty import OSAccess +CodeMode(os=OSAccess(environ={'STAGE': 'prod'})) + +# Or a raw callback `(function_name, args, kwargs) -> result` +# (return `pydantic_monty.NOT_HANDLED` to fall back to Monty's default): +from pydantic_monty import NOT_HANDLED + +def my_os(fn, args, kwargs): + if fn == 'os.getenv': + return lookup_secret(args[0]) + return NOT_HANDLED + +CodeMode(os=my_os) +``` + +`os` accepts a `pydantic_monty.AbstractOS` instance or a raw callback; both are exposed as the +`MontyOS` type alias. `mount` accepts one or more `pydantic_monty.MountDir`. To scope access per +request (per user/session), pass a stateful `AbstractOS` -- for example one rooted at a +caller-specific directory. + +When `os` or `mount` is set, the `run_code` description tells the model that `pathlib`, `os`, +`datetime.now()`, and `date.today()` are routed to the host. `asyncio.sleep` and the `time` module +remain unavailable regardless. + +> These options are Monty-specific: `CodeMode` is built directly on the Monty sandbox, so its OS +> hooks use Monty's `AbstractOS`/`MountDir` types. + ## Sandbox restrictions Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python subset. Key restrictions: - No class definitions - No third-party imports (allowed stdlib: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`) -- No wall-clock or timing primitives: `asyncio.sleep`, `datetime.datetime.now()`/`datetime.date.today()`, and the `time` module are unavailable +- No wall-clock or timing primitives: `asyncio.sleep`, `datetime.datetime.now()`/`datetime.date.today()`, and the `time` module are unavailable -- unless you wire up host-backed OS access (see above), which enables `datetime.now()`/`date.today()` (but not `asyncio.sleep`/`time`) - No `import *` +- Filesystem and `os` I/O are unavailable unless an `os`/`mount` is configured - Tools requiring approval or with deferred execution are excluded from the sandbox ## API ```python CodeMode( - tools: ToolSelector = 'all', # 'all', list[str], callable, or dict - max_retries: int = 3, # retries on sandbox execution errors + tools: ToolSelector = 'all', # 'all', list[str], callable, or dict + max_retries: int = 3, # retries on sandbox execution errors + os: MontyOS | None = None, # AbstractOS instance or (fn, args, kwargs) callback + mount: MontyMount | None = None, # MountDir | list[MountDir] of host directories ) ``` diff --git a/pydantic_ai_harness/code_mode/__init__.py b/pydantic_ai_harness/code_mode/__init__.py index 42304fa..b06d27b 100644 --- a/pydantic_ai_harness/code_mode/__init__.py +++ b/pydantic_ai_harness/code_mode/__init__.py @@ -1,6 +1,6 @@ """Code mode capability: route tool calls through a sandboxed Python environment.""" from pydantic_ai_harness.code_mode._capability import CodeMode -from pydantic_ai_harness.code_mode._toolset import CodeModeToolset +from pydantic_ai_harness.code_mode._toolset import CodeModeToolset, MontyMount, MontyOS, MontyOSCallback -__all__ = ['CodeMode', 'CodeModeToolset'] +__all__ = ['CodeMode', 'CodeModeToolset', 'MontyMount', 'MontyOS', 'MontyOSCallback'] diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py index 57eeaab..69df3eb 100644 --- a/pydantic_ai_harness/code_mode/_capability.py +++ b/pydantic_ai_harness/code_mode/_capability.py @@ -9,7 +9,7 @@ from pydantic_ai.capabilities._tool_search import ToolSearch as _ToolSearch from pydantic_ai.tools import AgentDepsT, ToolSelector -from pydantic_ai_harness.code_mode._toolset import CodeModeToolset +from pydantic_ai_harness.code_mode._toolset import CodeModeToolset, MontyMount, MontyOS @dataclass @@ -34,6 +34,16 @@ class CodeMode(AbstractCapability[AgentDepsT]): # Sandbox only specific tools agent = Agent('openai:gpt-5', capabilities=[CodeMode(tools=['search', 'fetch'])]) ``` + + Pass `os` (and/or `mount`) to give sandboxed code host-backed filesystem and + OS access -- without it, `pathlib`/`os` I/O and `datetime.now()` are + unavailable inside `run_code`: + + ```python + from pydantic_monty import MountDir + + agent = Agent('openai:gpt-5', capabilities=[CodeMode(mount=MountDir('/work', '/tmp/agent-work'))]) + ``` """ tools: ToolSelector[AgentDepsT] = field(default='all') @@ -48,10 +58,29 @@ class CodeMode(AbstractCapability[AgentDepsT]): max_retries: int = 3 """Maximum number of retries for the `run_code` tool (syntax errors count as retries).""" + os: MontyOS | None = None + """Host-backed OS access for sandboxed code. + + Pass a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback + `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`, + `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code` + are routed to it instead of being unavailable. Scope it per request by giving + a stateful `AbstractOS` (e.g. one rooted at a per-user directory). + """ + + mount: MontyMount | None = None + """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`.""" + def get_ordering(self) -> CapabilityOrdering: """CodeMode wraps around ToolSearch so that search_tools stays native.""" return CapabilityOrdering(position='outermost', wraps=[_ToolSearch]) def get_wrapper_toolset(self, toolset: AbstractToolset[AgentDepsT]) -> AbstractToolset[AgentDepsT] | None: """Wrap the agent's assembled toolset, splitting it into native + sandboxed subsets if needed.""" - return CodeModeToolset(wrapped=toolset, tool_selector=self.tools, max_retries=self.max_retries) + return CodeModeToolset( + wrapped=toolset, + tool_selector=self.tools, + max_retries=self.max_retries, + os=self.os, + mount=self.mount, + ) diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py index da503a9..a8f6832 100644 --- a/pydantic_ai_harness/code_mode/_toolset.py +++ b/pydantic_ai_harness/code_mode/_toolset.py @@ -26,6 +26,7 @@ try: from pydantic_monty import ( + AbstractOS, ExternalException, ExternalResult, ExternalReturnValue, @@ -37,7 +38,9 @@ MontyRuntimeError, MontySyntaxError, MontyTypingError, + MountDir, NameLookupSnapshot, + OsFunction, ) except ImportError as _import_error: # pragma: no cover raise ImportError( @@ -48,6 +51,16 @@ # Type alias for the dispatch callback passed to _execution_loop. _DispatchFn = Callable[[str, dict[str, Any]], Coroutine[Any, Any, Any]] +# A raw Monty OS callback: `(function_name, args, kwargs) -> result`. Return +# `pydantic_monty.NOT_HANDLED` to fall back to Monty's default handling. +MontyOSCallback = Callable[[OsFunction, tuple[Any, ...], dict[str, Any]], Any] +# What `CodeMode.os` accepts: either an `AbstractOS` instance or a raw callback. +# Monty's `feed_start`/`resume` accept both interchangeably, so no normalization. +MontyOS = AbstractOS | MontyOSCallback +# What `CodeMode.mount` accepts: one or more host-directory mounts (matches Monty's +# `feed_start`/`resume` `mount=` parameter type exactly). +MontyMount = MountDir | list[MountDir] + class _RunCodeArguments(TypedDict): code: Annotated[str, Field(description='The Python code to execute in the sandbox.')] @@ -69,14 +82,28 @@ class _RunCodeArguments(TypedDict): # and to reconstruct multimodal types (e.g. BinaryContent) from Monty results (validate_python). _TOOL_RETURN_CONTENT_TA: TypeAdapter[Any] = TypeAdapter(ToolReturnContent) -_RUN_CODE_BASE_DESCRIPTION = """\ +_RUN_CODE_DESCRIPTION_HEAD = """\ Write and run Python code in a sandboxed environment. The sandbox uses Monty, a subset of Python. Key restrictions: - **No classes**: class definitions are not supported - **No third-party libraries**: only the standard library modules listed below can be used -- **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported at the top of your snippet before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`. -- **No wall-clock or timing primitives**: `asyncio.sleep`, `datetime.datetime.now()`, `datetime.date.today()`, and the `time` module are unavailable. +- **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported at the top of your snippet before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`.""" + +# Timing/OS restriction line, swapped depending on whether the agent configured +# host-backed OS access (`CodeMode(os=...)` / `mount=...`). +_NO_OS_RESTRICTION = ( + '- **No wall-clock or timing primitives**: `asyncio.sleep`, `datetime.datetime.now()`, ' + '`datetime.date.today()`, and the `time` module are unavailable.' +) +_OS_ENABLED_NOTE = ( + '- **Host-backed OS access**: `pathlib.Path` operations, `os.getenv`/`os.environ`, ' + '`datetime.datetime.now()`, and `datetime.date.today()` are routed to the host environment ' + 'configured for this agent (availability depends on that configuration). `asyncio.sleep` and ' + 'the `time` module remain unavailable.' +) + +_RUN_CODE_DESCRIPTION_TAIL = """\ - **No `import *`**: wildcard imports are not supported State is preserved between calls (REPL-style). Set `restart: true` to reset state. @@ -90,6 +117,17 @@ class _RunCodeArguments(TypedDict): """ +def _base_description(*, os_enabled: bool) -> str: + """Assemble the `run_code` base description, swapping the OS-access line. + + When the agent configured host-backed OS access (`CodeMode(os=...)` or + `mount=...`), the static "no wall-clock" restriction is replaced with a note + that filesystem/clock operations route to the host. + """ + restriction = _OS_ENABLED_NOTE if os_enabled else _NO_OS_RESTRICTION + return f'{_RUN_CODE_DESCRIPTION_HEAD}\n{restriction}\n{_RUN_CODE_DESCRIPTION_TAIL}' + + def _functions_header(*, has_sync: bool, has_async: bool) -> str: """Build the functions-header paragraph for the `run_code` tool description.""" base = ( @@ -184,6 +222,17 @@ class CodeModeToolset(WrapperToolset[AgentDepsT]): max_retries: int = 3 """Maximum number of retries for the `run_code` tool (syntax errors count as retries).""" + os: MontyOS | None = None + """Host-backed OS access exposed to sandboxed code. + + Either a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback + `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`, + `datetime.datetime.now()`, and `datetime.date.today()` calls inside the + sandbox are routed to it instead of being unavailable.""" + + mount: MontyMount | None = None + """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`.""" + # init=False so `replace()` in `for_run` produces a fresh instance with _repl=None, # giving each agent run isolated REPL state. Lazy-initialized on first call_tool. _repl: MontyRepl | None = field(default=None, init=False, repr=False) @@ -236,7 +285,8 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[ callable_defs, sanitized_to_original = self._partition_callable_tools(sandboxed_tools) - description = self._build_description(callable_defs) + os_enabled = self.os is not None or self.mount is not None + description = self._build_description(callable_defs, os_enabled=os_enabled) if _RUN_CODE_TOOL_NAME in native_tools: raise UserError( @@ -399,7 +449,7 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any: capture = _PrintCapture() try: - monty_state = self._repl.feed_start(code, print_callback=capture) + monty_state = self._repl.feed_start(code, print_callback=capture, os=self.os, mount=self.mount) completed = await _execution_loop( monty_state, dispatch=dispatch_tool_call, @@ -407,6 +457,8 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any: sanitized_to_original=sanitized_to_original, sequential_tools=sequential_tools, global_sequential=global_sequential, + os=self.os, + mount=self.mount, ) except MontySyntaxError as e: raise ModelRetry(f'Syntax error in code:\n{_prepend_prints(e.display(), capture)}') from e @@ -504,10 +556,11 @@ def _partition_callable_tools( return callable_defs, sanitized_to_original @staticmethod - def _build_description(callable_defs: dict[str, ToolDefinition]) -> str: + def _build_description(callable_defs: dict[str, ToolDefinition], *, os_enabled: bool) -> str: """Render the `run_code` description: base prose + TypedDicts + function signatures.""" + base = _base_description(os_enabled=os_enabled) if not callable_defs: - return _RUN_CODE_BASE_DESCRIPTION + return base sigs, conflicting = _get_sigs_and_conflicting(callable_defs) type_blocks = FunctionSignature.render_type_definitions(sigs, conflicting) @@ -520,7 +573,7 @@ def _build_description(callable_defs: dict[str, ToolDefinition]) -> str: has_async = any(not td.sequential for td in callable_defs.values()) header = _functions_header(has_sync=has_sync, has_async=has_async) - sections = [_RUN_CODE_BASE_DESCRIPTION, header] + sections = [base, header] if type_blocks: sections.append('```python\n' + '\n\n'.join(type_blocks) + '\n```') sections.append('```python\n' + '\n\n'.join(function_blocks) + '\n```') @@ -579,6 +632,8 @@ async def _execution_loop( sanitized_to_original: dict[str, str], sequential_tools: set[str], global_sequential: bool, + os: MontyOS | None, + mount: MontyMount | None, ) -> MontyComplete: """Drive the Monty REPL via the synchronous snapshot API until completion. @@ -597,6 +652,9 @@ async def _execution_loop( - **Global sequential mode** (DBOS/Temporal): all tools are deferred via `resume({'future': ...})` but stored as bare coroutines and awaited one-at-a-time at `FutureSnapshot` to prevent interleaving. + + `os`/`mount` must be passed to every `resume` call (not just `feed_start`): + Monty's auto-dispatch of OS calls stops the moment a resume omits them. """ pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]] = {} # Results from parallel tasks that were awaited early (at a sequential-tool @@ -605,7 +663,7 @@ async def _execution_loop( try: while not isinstance(monty_state, MontyComplete): if isinstance(monty_state, NameLookupSnapshot): - monty_state = monty_state.resume() + monty_state = monty_state.resume(os=os, mount=mount) elif isinstance(monty_state, FunctionSnapshot): monty_state = await _handle_function_snapshot( monty_state, @@ -616,6 +674,8 @@ async def _execution_loop( global_sequential=global_sequential, pending=pending, pre_resolved=pre_resolved, + os=os, + mount=mount, ) else: monty_state = await _resolve_future_snapshot( @@ -623,6 +683,8 @@ async def _execution_loop( pending=pending, pre_resolved=pre_resolved, global_sequential=global_sequential, + os=os, + mount=mount, ) finally: for item in pending.values(): # pragma: no cover @@ -644,16 +706,20 @@ async def _handle_function_snapshot( global_sequential: bool, pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]], pre_resolved: dict[int, ExternalResult], + os: MontyOS | None, + mount: MontyMount | None, ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete: """Handle a single FunctionSnapshot from the Monty execution loop.""" fn_name = snapshot.function_name if fn_name not in callable_defs: - return snapshot.resume({'exception': NameError(f'Unknown function: {fn_name}')}) + return snapshot.resume({'exception': NameError(f'Unknown function: {fn_name}')}, os=os, mount=mount) if snapshot.args: return snapshot.resume( - {'exception': TypeError(f'{fn_name}() does not accept positional arguments; use keyword arguments')} + {'exception': TypeError(f'{fn_name}() does not accept positional arguments; use keyword arguments')}, + os=os, + mount=mount, ) original_name = sanitized_to_original.get(fn_name, fn_name) @@ -666,8 +732,8 @@ async def _handle_function_snapshot( pre_resolved[cid] = await _resolve_coro(pending.pop(cid)) outcome = await _resolve_coro(dispatch(original_name, snapshot.kwargs)) if 'return_value' in outcome: - return snapshot.resume({'return_value': outcome['return_value']}) - return snapshot.resume({'exception': outcome['exception']}) + return snapshot.resume({'return_value': outcome['return_value']}, os=os, mount=mount) + return snapshot.resume({'exception': outcome['exception']}, os=os, mount=mount) # Deferred execution — store for later resolution at FutureSnapshot. if global_sequential: @@ -676,7 +742,7 @@ async def _handle_function_snapshot( else: # Eagerly schedule as a Task for concurrent execution. pending[snapshot.call_id] = asyncio.ensure_future(dispatch(original_name, snapshot.kwargs)) - return snapshot.resume({'future': ...}) + return snapshot.resume({'future': ...}, os=os, mount=mount) async def _resolve_future_snapshot( @@ -685,11 +751,13 @@ async def _resolve_future_snapshot( pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]], pre_resolved: dict[int, ExternalResult], global_sequential: bool, + os: MontyOS | None, + mount: MontyMount | None, ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete: """Resolve pending tool calls at a FutureSnapshot.""" pending_ids = snapshot.pending_call_ids if not pending_ids: # pragma: no cover - return snapshot.resume(results={}) + return snapshot.resume(results={}, os=os, mount=mount) results: dict[int, ExternalResult] = {} for cid in pending_ids: @@ -708,7 +776,7 @@ async def _resolve_future_snapshot( for cid, outcome in zip(gather_ids, settled): results[cid] = _settle_outcome(outcome) - return snapshot.resume(results=results) + return snapshot.resume(results=results, os=os, mount=mount) async def _resolve_coro( diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py index 1ffb084..ef5c6ae 100644 --- a/tests/code_mode/test_code_mode.py +++ b/tests/code_mode/test_code_mode.py @@ -24,6 +24,7 @@ from pydantic_ai.toolsets.function import FunctionToolset from pydantic_ai.usage import RunUsage from pydantic_core import SchemaValidator, core_schema +from pydantic_monty import NOT_HANDLED, MountDir, OSAccess, OsFunction from typing_extensions import TypedDict from pydantic_ai_harness import CodeMode @@ -1854,6 +1855,111 @@ def test_code_mode_ordering(self) -> None: assert ToolSearch in ordering.wraps +class TestCodeModeOSAccess: + """`CodeMode(os=...)` / `mount=...` give sandboxed code host-backed OS access.""" + + async def test_description_default_keeps_no_wallclock_restriction(self) -> None: + """Without `os`/`mount`, the description keeps the no-wall-clock restriction.""" + wrapper = CodeMode[None]().get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description + assert description is not None + assert 'No wall-clock or timing primitives' in description + assert 'Host-backed OS access' not in description + + async def test_description_with_os_callback_notes_host_access(self) -> None: + """An `os` callback swaps the restriction line for the host-access note.""" + + def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: + return NOT_HANDLED # pragma: no cover - not invoked; this test only checks the description + + wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description + assert description is not None + assert 'Host-backed OS access' in description + assert 'No wall-clock or timing primitives' not in description + + async def test_description_with_mount_notes_host_access(self, tmp_path: Any) -> None: + """A `mount` (without `os`) also enables the host-access note.""" + wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset( + _build_function_toolset(add) + ) + assert isinstance(wrapper, CodeModeToolset) + description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description + assert description is not None + assert 'Host-backed OS access' in description + + async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None: + """The host-access note appears even when no tools are sandboxed (base description).""" + + def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: + return NOT_HANDLED # pragma: no cover - not invoked; this test only checks the description + + # `tools=[]` leaves every tool native, so `run_code` exposes no callable functions. + wrapper = CodeMode[None](os=os_cb, tools=[]).get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description + assert description is not None + assert 'Host-backed OS access' in description + assert 'functions are available inside the sandbox' not in description + + async def test_os_callback_dispatches_inside_run_code(self) -> None: + """An `os` callback is threaded through `feed_start` and every `resume`, so OS calls + keep dispatching even after a tool call suspends and resumes the sandbox.""" + + def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: + if fn == 'os.getenv': + return 'envval' + return NOT_HANDLED # pragma: no cover - sandbox only calls os.getenv here + + wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + ctx = await build_ctx(None, wrapper) + tools = await wrapper.get_tools(ctx) + # The tool call forces a FunctionSnapshot -> FutureSnapshot round-trip; the os.getenv + # afterwards only resolves if `os` survived those resumes. + code = "import os\nx = await add(a=2, b=3)\nhome = os.getenv('THING')\n{'sum': x, 'home': home}" + result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code']) + assert result.return_value == {'sum': 5, 'home': 'envval'} + + async def test_abstract_os_instance_dispatches_inside_run_code(self) -> None: + """An `AbstractOS` instance is accepted as the `os` value and dispatches OS calls.""" + wrapper = CodeMode[None](os=OSAccess(environ={'THING': 'fromabs'})).get_wrapper_toolset( + _build_function_toolset(add) + ) + assert isinstance(wrapper, CodeModeToolset) + ctx = await build_ctx(None, wrapper) + tools = await wrapper.get_tools(ctx) + result = await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('THING')"}, ctx, tools['run_code']) + assert result.return_value == 'fromabs' + + async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None: + """A `mount` exposes a host directory inside the sandbox, threaded through resumes.""" + (tmp_path / 'data.txt').write_text('hello-from-host') + wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset( + _build_function_toolset(add) + ) + assert isinstance(wrapper, CodeModeToolset) + ctx = await build_ctx(None, wrapper) + tools = await wrapper.get_tools(ctx) + code = "from pathlib import Path\nawait add(a=1, b=1)\nPath('/work/data.txt').read_text()" + result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code']) + assert result.return_value == 'hello-from-host' + + def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Any) -> None: + """`CodeMode` forwards `os`/`mount` onto the `CodeModeToolset` it builds.""" + + def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: + return NOT_HANDLED # pragma: no cover - never invoked; only identity is asserted + + mount = MountDir('/work', str(tmp_path)) + wrapper = CodeMode[None](os=os_cb, mount=mount).get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + assert wrapper.os is os_cb + assert wrapper.mount is mount + + def _search_tool_def(description: str = 'Search for tools.') -> ToolDefinition: """Create a ToolDefinition mimicking the search_tools tool from ToolSearchToolset.""" from pydantic_ai.toolsets._tool_search import _SEARCH_TOOLS_NAME From b0cb11cccffb48e9993a45d86c06b35eacbcc96b Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 12:09:09 +0000 Subject: [PATCH 02/13] test(code_mode): harden OS-access tests around the threading invariants Add edge cases that pin the behaviours most likely to regress: OS access surviving across REPL-persisted `run_code` calls, a raising `os` callback degrading to `ModelRetry` instead of crashing the loop, and `mount` accepting a `list[MountDir]`. Hoist the never-invoked callback used by the description/forwarding assertions into one shared helper. --- tests/code_mode/test_code_mode.py | 73 ++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py index ef5c6ae..ca5531c 100644 --- a/tests/code_mode/test_code_mode.py +++ b/tests/code_mode/test_code_mode.py @@ -1855,6 +1855,11 @@ def test_code_mode_ordering(self) -> None: assert ToolSearch in ordering.wraps +def _unused_os_callback(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: + """An `os` callback for tests that only assert description/forwarding, never run code.""" + return NOT_HANDLED # pragma: no cover - never invoked by these tests + + class TestCodeModeOSAccess: """`CodeMode(os=...)` / `mount=...` give sandboxed code host-backed OS access.""" @@ -1869,11 +1874,7 @@ async def test_description_default_keeps_no_wallclock_restriction(self) -> None: async def test_description_with_os_callback_notes_host_access(self) -> None: """An `os` callback swaps the restriction line for the host-access note.""" - - def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: - return NOT_HANDLED # pragma: no cover - not invoked; this test only checks the description - - wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + wrapper = CodeMode[None](os=_unused_os_callback).get_wrapper_toolset(_build_function_toolset(add)) assert isinstance(wrapper, CodeModeToolset) description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description assert description is not None @@ -1892,12 +1893,8 @@ async def test_description_with_mount_notes_host_access(self, tmp_path: Any) -> async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None: """The host-access note appears even when no tools are sandboxed (base description).""" - - def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: - return NOT_HANDLED # pragma: no cover - not invoked; this test only checks the description - # `tools=[]` leaves every tool native, so `run_code` exposes no callable functions. - wrapper = CodeMode[None](os=os_cb, tools=[]).get_wrapper_toolset(_build_function_toolset(add)) + wrapper = CodeMode[None](os=_unused_os_callback, tools=[]).get_wrapper_toolset(_build_function_toolset(add)) assert isinstance(wrapper, CodeModeToolset) description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description assert description is not None @@ -1923,6 +1920,25 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code']) assert result.return_value == {'sum': 5, 'home': 'envval'} + async def test_os_access_persists_across_run_code_calls(self) -> None: + """`os` is supplied on every `feed_start`, so OS access still works on a later + `run_code` call that reuses the persisted (non-fresh) REPL.""" + + def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: + if fn == 'os.getenv': + return 'persisted' + return NOT_HANDLED # pragma: no cover - sandbox only calls os.getenv here + + wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + ctx = await build_ctx(None, wrapper) + tools = await wrapper.get_tools(ctx) + first = await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('A')"}, ctx, tools['run_code']) + assert first.return_value == 'persisted' + # Second call reuses the REPL (so `import os` carries over) and must still dispatch. + second = await wrapper.call_tool('run_code', {'code': "os.getenv('B')"}, ctx, tools['run_code']) + assert second.return_value == 'persisted' + async def test_abstract_os_instance_dispatches_inside_run_code(self) -> None: """An `AbstractOS` instance is accepted as the `os` value and dispatches OS calls.""" wrapper = CodeMode[None](os=OSAccess(environ={'THING': 'fromabs'})).get_wrapper_toolset( @@ -1934,6 +1950,20 @@ async def test_abstract_os_instance_dispatches_inside_run_code(self) -> None: result = await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('THING')"}, ctx, tools['run_code']) assert result.return_value == 'fromabs' + async def test_os_callback_exception_becomes_model_retry(self) -> None: + """A raising `os` callback surfaces as a `ModelRetry`, like any other sandbox runtime + error -- it must not crash the agent loop.""" + + def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: + raise ValueError('boom from os') + + wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + ctx = await build_ctx(None, wrapper) + tools = await wrapper.get_tools(ctx) + with pytest.raises(ModelRetry, match='boom from os'): + await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('X')"}, ctx, tools['run_code']) + async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None: """A `mount` exposes a host directory inside the sandbox, threaded through resumes.""" (tmp_path / 'data.txt').write_text('hello-from-host') @@ -1947,16 +1977,27 @@ async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None: result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code']) assert result.return_value == 'hello-from-host' + async def test_mount_accepts_list_of_directories(self, tmp_path: Any) -> None: + """`mount` accepts a `list[MountDir]`; each directory is exposed at its virtual path.""" + (tmp_path / 'a').mkdir() + (tmp_path / 'b').mkdir() + (tmp_path / 'a' / 'f.txt').write_text('AA') + (tmp_path / 'b' / 'f.txt').write_text('BB') + mounts = [MountDir('/a', str(tmp_path / 'a')), MountDir('/b', str(tmp_path / 'b'))] + wrapper = CodeMode[None](mount=mounts).get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + ctx = await build_ctx(None, wrapper) + tools = await wrapper.get_tools(ctx) + code = "from pathlib import Path\nPath('/a/f.txt').read_text() + Path('/b/f.txt').read_text()" + result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code']) + assert result.return_value == 'AABB' + def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Any) -> None: """`CodeMode` forwards `os`/`mount` onto the `CodeModeToolset` it builds.""" - - def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: - return NOT_HANDLED # pragma: no cover - never invoked; only identity is asserted - mount = MountDir('/work', str(tmp_path)) - wrapper = CodeMode[None](os=os_cb, mount=mount).get_wrapper_toolset(_build_function_toolset(add)) + wrapper = CodeMode[None](os=_unused_os_callback, mount=mount).get_wrapper_toolset(_build_function_toolset(add)) assert isinstance(wrapper, CodeModeToolset) - assert wrapper.os is os_cb + assert wrapper.os is _unused_os_callback assert wrapper.mount is mount From 93e7b0a763ca26a5f3a637adb986652cf95801fd Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 06:15:56 +0000 Subject: [PATCH 03/13] docs(code_mode): tighten and verify the filesystem/OS access section Trim the host-access docs to the essentials and make the example self-contained (drop the undefined helper). The snippet and the documented `mount`/callback constructions are run end-to-end to confirm they work. --- pydantic_ai_harness/code_mode/README.md | 42 ++++++++++--------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md index 22dedc4..cba8718 100644 --- a/pydantic_ai_harness/code_mode/README.md +++ b/pydantic_ai_harness/code_mode/README.md @@ -140,46 +140,38 @@ for msg in result.all_messages(): tool_returns = part.metadata['tool_returns'] # dict[str, ToolReturnPart] ``` -## Host-backed OS access +## Filesystem and OS access -By default the sandbox has no filesystem or clock: `os`/`pathlib` are importable but their I/O -operations and `datetime.datetime.now()`/`datetime.date.today()` are unavailable. Pass `os` and/or -`mount` to route those operations to a host-controlled implementation. +The sandbox has no filesystem or clock by default: `os`/`pathlib` import, but their I/O, +`datetime.now()`, and `date.today()` are unavailable. Pass `os` and/or `mount` to back them with a +host-controlled implementation. ```python -from pydantic_monty import MountDir +from pydantic_monty import NOT_HANDLED, MountDir, OSAccess + from pydantic_ai_harness import CodeMode -# Expose a host directory inside the sandbox (read/write under /work): +# Expose a host directory at /work inside the sandbox: CodeMode(mount=MountDir('/work', '/tmp/agent-workspace')) -# Or supply a custom OS implementation (an `AbstractOS` instance): -from pydantic_monty import OSAccess +# Supply environment/clock via an AbstractOS instance: CodeMode(os=OSAccess(environ={'STAGE': 'prod'})) -# Or a raw callback `(function_name, args, kwargs) -> result` -# (return `pydantic_monty.NOT_HANDLED` to fall back to Monty's default): -from pydantic_monty import NOT_HANDLED +# ...or a raw `(function_name, args, kwargs)` callback; return NOT_HANDLED to defer to Monty: def my_os(fn, args, kwargs): - if fn == 'os.getenv': - return lookup_secret(args[0]) - return NOT_HANDLED + return 'secret-value' if fn == 'os.getenv' else NOT_HANDLED + CodeMode(os=my_os) ``` -`os` accepts a `pydantic_monty.AbstractOS` instance or a raw callback; both are exposed as the -`MontyOS` type alias. `mount` accepts one or more `pydantic_monty.MountDir`. To scope access per -request (per user/session), pass a stateful `AbstractOS` -- for example one rooted at a -caller-specific directory. - -When `os` or `mount` is set, the `run_code` description tells the model that `pathlib`, `os`, -`datetime.now()`, and `date.today()` are routed to the host. `asyncio.sleep` and the `time` module -remain unavailable regardless. +`os` takes a `pydantic_monty.AbstractOS` or that callback; `mount` takes one or more `MountDir`. +Scope access per request with a stateful `AbstractOS` (e.g. rooted at a per-user directory). When +set, `run_code`'s description tells the model these operations are host-backed; `asyncio.sleep` and +`time` stay unavailable. -> These options are Monty-specific: `CodeMode` is built directly on the Monty sandbox, so its OS -> hooks use Monty's `AbstractOS`/`MountDir` types. +> Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types. ## Sandbox restrictions @@ -187,7 +179,7 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python - No class definitions - No third-party imports (allowed stdlib: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`) -- No wall-clock or timing primitives: `asyncio.sleep`, `datetime.datetime.now()`/`datetime.date.today()`, and the `time` module are unavailable -- unless you wire up host-backed OS access (see above), which enables `datetime.now()`/`date.today()` (but not `asyncio.sleep`/`time`) +- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with host-backed OS access (above) - No `import *` - Filesystem and `os` I/O are unavailable unless an `os`/`mount` is configured - Tools requiring approval or with deferred execution are excluded from the sandbox From aac81624c6eccc06963519665eb8ff69cdc79736 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 06:28:21 +0000 Subject: [PATCH 04/13] docs(code_mode): correct per-request scoping wording `os`/`mount` are static capability fields (no per-run resolver), so the "stateful AbstractOS rooted at a per-user directory" guidance over-claimed. Reword to: build CodeMode per request to scope access. Every other doc line was re-checked empirically against pydantic-monty 0.0.17. --- pydantic_ai_harness/code_mode/README.md | 6 +++--- pydantic_ai_harness/code_mode/_capability.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md index cba8718..a62b49d 100644 --- a/pydantic_ai_harness/code_mode/README.md +++ b/pydantic_ai_harness/code_mode/README.md @@ -167,9 +167,9 @@ CodeMode(os=my_os) ``` `os` takes a `pydantic_monty.AbstractOS` or that callback; `mount` takes one or more `MountDir`. -Scope access per request with a stateful `AbstractOS` (e.g. rooted at a per-user directory). When -set, `run_code`'s description tells the model these operations are host-backed; `asyncio.sleep` and -`time` stay unavailable. +`os`/`mount` are fixed when the capability is built, so construct `CodeMode` per request to scope +access. When set, `run_code`'s description tells the model these operations are host-backed; +`asyncio.sleep` and `time` stay unavailable. > Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types. diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py index 69df3eb..d447b33 100644 --- a/pydantic_ai_harness/code_mode/_capability.py +++ b/pydantic_ai_harness/code_mode/_capability.py @@ -64,8 +64,8 @@ class CodeMode(AbstractCapability[AgentDepsT]): Pass a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`, `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code` - are routed to it instead of being unavailable. Scope it per request by giving - a stateful `AbstractOS` (e.g. one rooted at a per-user directory). + are routed to it instead of being unavailable. Fixed at construction, so build + `CodeMode` per request to scope access per request. """ mount: MontyMount | None = None From c363fd6a73ce75e0f3d25fa8abd03f13690b7a68 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 06:50:30 +0000 Subject: [PATCH 05/13] fix(code_mode): don't advertise env/clock for mount-only sandboxes A `mount` only exposes filesystem paths; `os.getenv`/`os.environ` and `datetime.now()`/`date.today()` still require an `os` handler. The description used one host-access note for both, so mount-only agents were told env/clock were routed to the host and would emit calls that fail and burn run_code retries (verified against pydantic-monty 0.0.17). Split the description into three states (none / mount-only filesystem / os), and correct the README and docstrings that conflated the two. --- pydantic_ai_harness/code_mode/README.md | 13 ++++--- pydantic_ai_harness/code_mode/_capability.py | 6 +-- pydantic_ai_harness/code_mode/_toolset.py | 41 ++++++++++++++------ tests/code_mode/test_code_mode.py | 11 ++++-- 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md index a62b49d..9eea1eb 100644 --- a/pydantic_ai_harness/code_mode/README.md +++ b/pydantic_ai_harness/code_mode/README.md @@ -166,10 +166,11 @@ def my_os(fn, args, kwargs): CodeMode(os=my_os) ``` -`os` takes a `pydantic_monty.AbstractOS` or that callback; `mount` takes one or more `MountDir`. -`os`/`mount` are fixed when the capability is built, so construct `CodeMode` per request to scope -access. When set, `run_code`'s description tells the model these operations are host-backed; -`asyncio.sleep` and `time` stay unavailable. +`os` takes a `pydantic_monty.AbstractOS` or that callback and routes environment, clock, and +filesystem calls; `mount` takes one or more `MountDir` and exposes host filesystem paths only (a +mount alone does **not** enable `os.getenv` or `datetime.now()`). Both are fixed when the capability +is built, so construct `CodeMode` per request to scope access. `run_code`'s description reflects +exactly what's enabled; `asyncio.sleep` and `time` stay unavailable either way. > Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types. @@ -179,9 +180,9 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python - No class definitions - No third-party imports (allowed stdlib: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`) -- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with host-backed OS access (above) +- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with an `os` handler (above); `asyncio.sleep`/`time` never do - No `import *` -- Filesystem and `os` I/O are unavailable unless an `os`/`mount` is configured +- Filesystem I/O needs an `os` handler or a `mount`; `os.getenv`/`os.environ` need an `os` handler - Tools requiring approval or with deferred execution are excluded from the sandbox ## API diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py index d447b33..f4c30b2 100644 --- a/pydantic_ai_harness/code_mode/_capability.py +++ b/pydantic_ai_harness/code_mode/_capability.py @@ -35,9 +35,9 @@ class CodeMode(AbstractCapability[AgentDepsT]): agent = Agent('openai:gpt-5', capabilities=[CodeMode(tools=['search', 'fetch'])]) ``` - Pass `os` (and/or `mount`) to give sandboxed code host-backed filesystem and - OS access -- without it, `pathlib`/`os` I/O and `datetime.now()` are - unavailable inside `run_code`: + Pass `mount` for host filesystem access and/or `os` for environment/clock + (plus filesystem) access -- without them, `pathlib`/`os` I/O and + `datetime.now()` are unavailable inside `run_code`: ```python from pydantic_monty import MountDir diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py index a8f6832..cc3009a 100644 --- a/pydantic_ai_harness/code_mode/_toolset.py +++ b/pydantic_ai_harness/code_mode/_toolset.py @@ -90,12 +90,19 @@ class _RunCodeArguments(TypedDict): - **No third-party libraries**: only the standard library modules listed below can be used - **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported at the top of your snippet before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`.""" -# Timing/OS restriction line, swapped depending on whether the agent configured -# host-backed OS access (`CodeMode(os=...)` / `mount=...`). +# Timing/OS restriction line, swapped depending on what host access the agent +# configured. Three states, because `mount` and `os` enable different things: +# a `mount` only exposes filesystem paths, while environment and clock calls +# require an `os` handler. _NO_OS_RESTRICTION = ( '- **No wall-clock or timing primitives**: `asyncio.sleep`, `datetime.datetime.now()`, ' '`datetime.date.today()`, and the `time` module are unavailable.' ) +_MOUNT_ONLY_NOTE = ( + '- **Mounted filesystem access**: `pathlib.Path` operations under the configured mount ' + 'point(s) are routed to the host. `os.getenv`/`os.environ`, `datetime.datetime.now()`, ' + '`datetime.date.today()`, `asyncio.sleep`, and the `time` module remain unavailable.' +) _OS_ENABLED_NOTE = ( '- **Host-backed OS access**: `pathlib.Path` operations, `os.getenv`/`os.environ`, ' '`datetime.datetime.now()`, and `datetime.date.today()` are routed to the host environment ' @@ -117,14 +124,23 @@ class _RunCodeArguments(TypedDict): """ -def _base_description(*, os_enabled: bool) -> str: - """Assemble the `run_code` base description, swapping the OS-access line. +def _os_access_restriction(*, has_os: bool, has_mount: bool) -> str: + """Pick the OS/filesystem restriction line for the `run_code` description. - When the agent configured host-backed OS access (`CodeMode(os=...)` or - `mount=...`), the static "no wall-clock" restriction is replaced with a note - that filesystem/clock operations route to the host. + `os` routes environment, clock, and filesystem calls; a `mount` alone only + exposes filesystem paths, so a mount-only sandbox must not advertise env or + clock access (the model would generate calls that fail and burn retries). """ - restriction = _OS_ENABLED_NOTE if os_enabled else _NO_OS_RESTRICTION + if has_os: + return _OS_ENABLED_NOTE + if has_mount: + return _MOUNT_ONLY_NOTE + return _NO_OS_RESTRICTION + + +def _base_description(*, has_os: bool, has_mount: bool) -> str: + """Assemble the `run_code` base description with the right OS-access line.""" + restriction = _os_access_restriction(has_os=has_os, has_mount=has_mount) return f'{_RUN_CODE_DESCRIPTION_HEAD}\n{restriction}\n{_RUN_CODE_DESCRIPTION_TAIL}' @@ -285,8 +301,9 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[ callable_defs, sanitized_to_original = self._partition_callable_tools(sandboxed_tools) - os_enabled = self.os is not None or self.mount is not None - description = self._build_description(callable_defs, os_enabled=os_enabled) + description = self._build_description( + callable_defs, has_os=self.os is not None, has_mount=self.mount is not None + ) if _RUN_CODE_TOOL_NAME in native_tools: raise UserError( @@ -556,9 +573,9 @@ def _partition_callable_tools( return callable_defs, sanitized_to_original @staticmethod - def _build_description(callable_defs: dict[str, ToolDefinition], *, os_enabled: bool) -> str: + def _build_description(callable_defs: dict[str, ToolDefinition], *, has_os: bool, has_mount: bool) -> str: """Render the `run_code` description: base prose + TypedDicts + function signatures.""" - base = _base_description(os_enabled=os_enabled) + base = _base_description(has_os=has_os, has_mount=has_mount) if not callable_defs: return base diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py index ca5531c..766dec3 100644 --- a/tests/code_mode/test_code_mode.py +++ b/tests/code_mode/test_code_mode.py @@ -1881,15 +1881,20 @@ async def test_description_with_os_callback_notes_host_access(self) -> None: assert 'Host-backed OS access' in description assert 'No wall-clock or timing primitives' not in description - async def test_description_with_mount_notes_host_access(self, tmp_path: Any) -> None: - """A `mount` (without `os`) also enables the host-access note.""" + async def test_description_mount_only_advertises_filesystem_not_env_or_clock(self, tmp_path: Any) -> None: + """A `mount` without `os` advertises filesystem access only -- it must not tell the model + that env/clock are host-backed, since a mount cannot route `os.getenv`/`datetime.now()`.""" wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset( _build_function_toolset(add) ) assert isinstance(wrapper, CodeModeToolset) description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description assert description is not None - assert 'Host-backed OS access' in description + assert 'Mounted filesystem access' in description + assert 'Host-backed OS access' not in description + # env/clock are explicitly called out as still unavailable, not advertised as routed. + assert '`os.getenv`/`os.environ`, `datetime.datetime.now()`, `datetime.date.today()`' in description + assert 'remain unavailable' in description async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None: """The host-access note appears even when no tools are sandboxed (base description).""" From f5c8e0ed829c1eb3d9fc0d1a2dfa9d12139fb544 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 07:01:48 +0000 Subject: [PATCH 06/13] docs(code_mode): correct two run_code description claims verified against monty Audited every statement in the run_code description, docstrings, and README against pydantic-monty 0.0.17. Two were imprecise: - "imported at the top of your snippet" -- mid-snippet imports work, so the rule is just "before use". - OS-enabled note said calls route "to the host environment", but an in-memory AbstractOS (e.g. OSAccess) handles them too -- it's the configured OS handler, not necessarily the host. --- pydantic_ai_harness/code_mode/_toolset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py index cc3009a..bbfe9d1 100644 --- a/pydantic_ai_harness/code_mode/_toolset.py +++ b/pydantic_ai_harness/code_mode/_toolset.py @@ -88,7 +88,7 @@ class _RunCodeArguments(TypedDict): The sandbox uses Monty, a subset of Python. Key restrictions: - **No classes**: class definitions are not supported - **No third-party libraries**: only the standard library modules listed below can be used -- **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported at the top of your snippet before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`.""" +- **Importable standard library modules**: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`. These must be imported before use, just like in regular Python. For example: `import asyncio` then `results = await asyncio.gather(tool_one(...), tool_two(...))`.""" # Timing/OS restriction line, swapped depending on what host access the agent # configured. Three states, because `mount` and `os` enable different things: @@ -105,7 +105,7 @@ class _RunCodeArguments(TypedDict): ) _OS_ENABLED_NOTE = ( '- **Host-backed OS access**: `pathlib.Path` operations, `os.getenv`/`os.environ`, ' - '`datetime.datetime.now()`, and `datetime.date.today()` are routed to the host environment ' + '`datetime.datetime.now()`, and `datetime.date.today()` are routed to the OS handler ' 'configured for this agent (availability depends on that configuration). `asyncio.sleep` and ' 'the `time` module remain unavailable.' ) From 28073594e5015a08bb5ebc5af6ded1d0434bd3b4 Mon Sep 17 00:00:00 2001 From: Aditya Vardhan Date: Tue, 2 Jun 2026 18:49:03 +0530 Subject: [PATCH 07/13] docs(code_mode): clarify overlay-mode write persistence and fix wording The mount docs implied writes reach the host, but MountDir defaults to copy-on-write overlay mode, so writes stay in the sandbox unless mode is 'read-write'. Also tighten two awkward/redundant doc lines. Co-Authored-By: Claude Opus 4.8 (1M context) --- pydantic_ai_harness/code_mode/README.md | 10 +++++++--- pydantic_ai_harness/code_mode/_capability.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md index 9eea1eb..5e77be8 100644 --- a/pydantic_ai_harness/code_mode/README.md +++ b/pydantic_ai_harness/code_mode/README.md @@ -142,9 +142,9 @@ for msg in result.all_messages(): ## Filesystem and OS access -The sandbox has no filesystem or clock by default: `os`/`pathlib` import, but their I/O, -`datetime.now()`, and `date.today()` are unavailable. Pass `os` and/or `mount` to back them with a -host-controlled implementation. +The sandbox has no filesystem or clock by default: the `os` and `pathlib` modules import, but their +I/O, `datetime.now()`, and `date.today()` are unavailable. Pass `os` and/or `mount` to back them with +a host-controlled implementation. ```python from pydantic_monty import NOT_HANDLED, MountDir, OSAccess @@ -172,6 +172,10 @@ mount alone does **not** enable `os.getenv` or `datetime.now()`). Both are fixed is built, so construct `CodeMode` per request to scope access. `run_code`'s description reflects exactly what's enabled; `asyncio.sleep` and `time` stay unavailable either way. +A `MountDir` defaults to copy-on-write `mode='overlay'`: the sandbox reads host files and sees its +own writes, but those writes do **not** reach the host directory. Pass `MountDir(..., mode='read-write')` +to persist writes to the host, or `mode='read-only'` to forbid them. + > Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types. ## Sandbox restrictions diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py index f4c30b2..67a2f16 100644 --- a/pydantic_ai_harness/code_mode/_capability.py +++ b/pydantic_ai_harness/code_mode/_capability.py @@ -65,7 +65,7 @@ class CodeMode(AbstractCapability[AgentDepsT]): `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`, `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code` are routed to it instead of being unavailable. Fixed at construction, so build - `CodeMode` per request to scope access per request. + `CodeMode` per request to scope access. """ mount: MontyMount | None = None From c87ed3c7c0ee590ccd7161e4dc133499a404d64d Mon Sep 17 00:00:00 2001 From: Aditya Vardhan Date: Tue, 2 Jun 2026 19:03:56 +0530 Subject: [PATCH 08/13] refactor(code_mode): rename public OS/mount surface to be backend-neutral The public type aliases leaked the Monty backend name into a surface we can't rename later. Rename them to match the existing CodeMode/CodeModeToolset convention, and rename the os= parameter to os_access= so it stops shadowing the stdlib os module that sandboxed code itself uses. - MontyOS -> CodeModeOS, MontyOSCallback -> CodeModeOSCallback, MontyMount -> CodeModeMount - CodeMode/CodeModeToolset param os= -> os_access= (mount unchanged) - internal resume()/feed_start() forwarding keeps Monty's literal os= kwarg Co-Authored-By: Claude Opus 4.8 (1M context) --- pydantic_ai_harness/code_mode/README.md | 14 +++---- pydantic_ai_harness/code_mode/__init__.py | 4 +- pydantic_ai_harness/code_mode/_capability.py | 12 +++--- pydantic_ai_harness/code_mode/_toolset.py | 42 ++++++++++---------- tests/code_mode/test_code_mode.py | 24 ++++++----- 5 files changed, 50 insertions(+), 46 deletions(-) diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md index 5e77be8..fccb518 100644 --- a/pydantic_ai_harness/code_mode/README.md +++ b/pydantic_ai_harness/code_mode/README.md @@ -155,7 +155,7 @@ from pydantic_ai_harness import CodeMode CodeMode(mount=MountDir('/work', '/tmp/agent-workspace')) # Supply environment/clock via an AbstractOS instance: -CodeMode(os=OSAccess(environ={'STAGE': 'prod'})) +CodeMode(os_access=OSAccess(environ={'STAGE': 'prod'})) # ...or a raw `(function_name, args, kwargs)` callback; return NOT_HANDLED to defer to Monty: @@ -163,10 +163,10 @@ def my_os(fn, args, kwargs): return 'secret-value' if fn == 'os.getenv' else NOT_HANDLED -CodeMode(os=my_os) +CodeMode(os_access=my_os) ``` -`os` takes a `pydantic_monty.AbstractOS` or that callback and routes environment, clock, and +`os_access` takes a `pydantic_monty.AbstractOS` or that callback and routes environment, clock, and filesystem calls; `mount` takes one or more `MountDir` and exposes host filesystem paths only (a mount alone does **not** enable `os.getenv` or `datetime.now()`). Both are fixed when the capability is built, so construct `CodeMode` per request to scope access. `run_code`'s description reflects @@ -184,9 +184,9 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python - No class definitions - No third-party imports (allowed stdlib: `sys`, `typing`, `asyncio`, `math`, `json`, `re`, `datetime`, `os`, `pathlib`) -- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with an `os` handler (above); `asyncio.sleep`/`time` never do +- No wall-clock or timing primitives by default (`asyncio.sleep`, `datetime.now()`, `date.today()`, `time`) -- `datetime.now()`/`date.today()` become available with an `os_access` handler (above); `asyncio.sleep`/`time` never do - No `import *` -- Filesystem I/O needs an `os` handler or a `mount`; `os.getenv`/`os.environ` need an `os` handler +- Filesystem I/O needs an `os_access` handler or a `mount`; `os.getenv`/`os.environ` need an `os_access` handler - Tools requiring approval or with deferred execution are excluded from the sandbox ## API @@ -195,8 +195,8 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python CodeMode( tools: ToolSelector = 'all', # 'all', list[str], callable, or dict max_retries: int = 3, # retries on sandbox execution errors - os: MontyOS | None = None, # AbstractOS instance or (fn, args, kwargs) callback - mount: MontyMount | None = None, # MountDir | list[MountDir] of host directories + os_access: CodeModeOS | None = None, # AbstractOS instance or (fn, args, kwargs) callback + mount: CodeModeMount | None = None, # MountDir | list[MountDir] of host directories ) ``` diff --git a/pydantic_ai_harness/code_mode/__init__.py b/pydantic_ai_harness/code_mode/__init__.py index b06d27b..234438c 100644 --- a/pydantic_ai_harness/code_mode/__init__.py +++ b/pydantic_ai_harness/code_mode/__init__.py @@ -1,6 +1,6 @@ """Code mode capability: route tool calls through a sandboxed Python environment.""" from pydantic_ai_harness.code_mode._capability import CodeMode -from pydantic_ai_harness.code_mode._toolset import CodeModeToolset, MontyMount, MontyOS, MontyOSCallback +from pydantic_ai_harness.code_mode._toolset import CodeModeMount, CodeModeOS, CodeModeOSCallback, CodeModeToolset -__all__ = ['CodeMode', 'CodeModeToolset', 'MontyMount', 'MontyOS', 'MontyOSCallback'] +__all__ = ['CodeMode', 'CodeModeMount', 'CodeModeOS', 'CodeModeOSCallback', 'CodeModeToolset'] diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py index 67a2f16..cbab615 100644 --- a/pydantic_ai_harness/code_mode/_capability.py +++ b/pydantic_ai_harness/code_mode/_capability.py @@ -9,7 +9,7 @@ from pydantic_ai.capabilities._tool_search import ToolSearch as _ToolSearch from pydantic_ai.tools import AgentDepsT, ToolSelector -from pydantic_ai_harness.code_mode._toolset import CodeModeToolset, MontyMount, MontyOS +from pydantic_ai_harness.code_mode._toolset import CodeModeMount, CodeModeOS, CodeModeToolset @dataclass @@ -35,7 +35,7 @@ class CodeMode(AbstractCapability[AgentDepsT]): agent = Agent('openai:gpt-5', capabilities=[CodeMode(tools=['search', 'fetch'])]) ``` - Pass `mount` for host filesystem access and/or `os` for environment/clock + Pass `mount` for host filesystem access and/or `os_access` for environment/clock (plus filesystem) access -- without them, `pathlib`/`os` I/O and `datetime.now()` are unavailable inside `run_code`: @@ -58,17 +58,17 @@ class CodeMode(AbstractCapability[AgentDepsT]): max_retries: int = 3 """Maximum number of retries for the `run_code` tool (syntax errors count as retries).""" - os: MontyOS | None = None + os_access: CodeModeOS | None = None """Host-backed OS access for sandboxed code. - Pass a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback + Pass a `pydantic_monty.AbstractOS` instance or a raw OS callback `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`, `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code` are routed to it instead of being unavailable. Fixed at construction, so build `CodeMode` per request to scope access. """ - mount: MontyMount | None = None + mount: CodeModeMount | None = None """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`.""" def get_ordering(self) -> CapabilityOrdering: @@ -81,6 +81,6 @@ def get_wrapper_toolset(self, toolset: AbstractToolset[AgentDepsT]) -> AbstractT wrapped=toolset, tool_selector=self.tools, max_retries=self.max_retries, - os=self.os, + os_access=self.os_access, mount=self.mount, ) diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py index bbfe9d1..c884aec 100644 --- a/pydantic_ai_harness/code_mode/_toolset.py +++ b/pydantic_ai_harness/code_mode/_toolset.py @@ -51,15 +51,15 @@ # Type alias for the dispatch callback passed to _execution_loop. _DispatchFn = Callable[[str, dict[str, Any]], Coroutine[Any, Any, Any]] -# A raw Monty OS callback: `(function_name, args, kwargs) -> result`. Return -# `pydantic_monty.NOT_HANDLED` to fall back to Monty's default handling. -MontyOSCallback = Callable[[OsFunction, tuple[Any, ...], dict[str, Any]], Any] -# What `CodeMode.os` accepts: either an `AbstractOS` instance or a raw callback. -# Monty's `feed_start`/`resume` accept both interchangeably, so no normalization. -MontyOS = AbstractOS | MontyOSCallback -# What `CodeMode.mount` accepts: one or more host-directory mounts (matches Monty's -# `feed_start`/`resume` `mount=` parameter type exactly). -MontyMount = MountDir | list[MountDir] +# A raw OS callback: `(function_name, args, kwargs) -> result`. Return +# `pydantic_monty.NOT_HANDLED` to fall back to the sandbox's default handling. +CodeModeOSCallback = Callable[[OsFunction, tuple[Any, ...], dict[str, Any]], Any] +# What `CodeMode.os_access` accepts: either an `AbstractOS` instance or a raw callback. +# The sandbox's `feed_start`/`resume` accept both interchangeably, so no normalization. +CodeModeOS = AbstractOS | CodeModeOSCallback +# What `CodeMode.mount` accepts: one or more host-directory mounts (matches the +# sandbox's `feed_start`/`resume` `mount=` parameter type exactly). +CodeModeMount = MountDir | list[MountDir] class _RunCodeArguments(TypedDict): @@ -238,15 +238,15 @@ class CodeModeToolset(WrapperToolset[AgentDepsT]): max_retries: int = 3 """Maximum number of retries for the `run_code` tool (syntax errors count as retries).""" - os: MontyOS | None = None + os_access: CodeModeOS | None = None """Host-backed OS access exposed to sandboxed code. - Either a `pydantic_monty.AbstractOS` instance or a raw Monty OS callback + Either a `pydantic_monty.AbstractOS` instance or a raw OS callback `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`, `datetime.datetime.now()`, and `datetime.date.today()` calls inside the sandbox are routed to it instead of being unavailable.""" - mount: MontyMount | None = None + mount: CodeModeMount | None = None """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`.""" # init=False so `replace()` in `for_run` produces a fresh instance with _repl=None, @@ -302,7 +302,7 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[ callable_defs, sanitized_to_original = self._partition_callable_tools(sandboxed_tools) description = self._build_description( - callable_defs, has_os=self.os is not None, has_mount=self.mount is not None + callable_defs, has_os=self.os_access is not None, has_mount=self.mount is not None ) if _RUN_CODE_TOOL_NAME in native_tools: @@ -466,7 +466,7 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any: capture = _PrintCapture() try: - monty_state = self._repl.feed_start(code, print_callback=capture, os=self.os, mount=self.mount) + monty_state = self._repl.feed_start(code, print_callback=capture, os=self.os_access, mount=self.mount) completed = await _execution_loop( monty_state, dispatch=dispatch_tool_call, @@ -474,7 +474,7 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any: sanitized_to_original=sanitized_to_original, sequential_tools=sequential_tools, global_sequential=global_sequential, - os=self.os, + os=self.os_access, mount=self.mount, ) except MontySyntaxError as e: @@ -649,8 +649,8 @@ async def _execution_loop( sanitized_to_original: dict[str, str], sequential_tools: set[str], global_sequential: bool, - os: MontyOS | None, - mount: MontyMount | None, + os: CodeModeOS | None, + mount: CodeModeMount | None, ) -> MontyComplete: """Drive the Monty REPL via the synchronous snapshot API until completion. @@ -723,8 +723,8 @@ async def _handle_function_snapshot( global_sequential: bool, pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]], pre_resolved: dict[int, ExternalResult], - os: MontyOS | None, - mount: MontyMount | None, + os: CodeModeOS | None, + mount: CodeModeMount | None, ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete: """Handle a single FunctionSnapshot from the Monty execution loop.""" fn_name = snapshot.function_name @@ -768,8 +768,8 @@ async def _resolve_future_snapshot( pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]], pre_resolved: dict[int, ExternalResult], global_sequential: bool, - os: MontyOS | None, - mount: MontyMount | None, + os: CodeModeOS | None, + mount: CodeModeMount | None, ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete: """Resolve pending tool calls at a FutureSnapshot.""" pending_ids = snapshot.pending_call_ids diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py index 766dec3..67e8d5b 100644 --- a/tests/code_mode/test_code_mode.py +++ b/tests/code_mode/test_code_mode.py @@ -1861,7 +1861,7 @@ def _unused_os_callback(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, class TestCodeModeOSAccess: - """`CodeMode(os=...)` / `mount=...` give sandboxed code host-backed OS access.""" + """`CodeMode(os_access=...)` / `mount=...` give sandboxed code host-backed OS access.""" async def test_description_default_keeps_no_wallclock_restriction(self) -> None: """Without `os`/`mount`, the description keeps the no-wall-clock restriction.""" @@ -1874,7 +1874,7 @@ async def test_description_default_keeps_no_wallclock_restriction(self) -> None: async def test_description_with_os_callback_notes_host_access(self) -> None: """An `os` callback swaps the restriction line for the host-access note.""" - wrapper = CodeMode[None](os=_unused_os_callback).get_wrapper_toolset(_build_function_toolset(add)) + wrapper = CodeMode[None](os_access=_unused_os_callback).get_wrapper_toolset(_build_function_toolset(add)) assert isinstance(wrapper, CodeModeToolset) description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description assert description is not None @@ -1899,7 +1899,9 @@ async def test_description_mount_only_advertises_filesystem_not_env_or_clock(sel async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None: """The host-access note appears even when no tools are sandboxed (base description).""" # `tools=[]` leaves every tool native, so `run_code` exposes no callable functions. - wrapper = CodeMode[None](os=_unused_os_callback, tools=[]).get_wrapper_toolset(_build_function_toolset(add)) + wrapper = CodeMode[None](os_access=_unused_os_callback, tools=[]).get_wrapper_toolset( + _build_function_toolset(add) + ) assert isinstance(wrapper, CodeModeToolset) description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description assert description is not None @@ -1915,7 +1917,7 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: return 'envval' return NOT_HANDLED # pragma: no cover - sandbox only calls os.getenv here - wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add)) assert isinstance(wrapper, CodeModeToolset) ctx = await build_ctx(None, wrapper) tools = await wrapper.get_tools(ctx) @@ -1934,7 +1936,7 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: return 'persisted' return NOT_HANDLED # pragma: no cover - sandbox only calls os.getenv here - wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add)) assert isinstance(wrapper, CodeModeToolset) ctx = await build_ctx(None, wrapper) tools = await wrapper.get_tools(ctx) @@ -1946,7 +1948,7 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: async def test_abstract_os_instance_dispatches_inside_run_code(self) -> None: """An `AbstractOS` instance is accepted as the `os` value and dispatches OS calls.""" - wrapper = CodeMode[None](os=OSAccess(environ={'THING': 'fromabs'})).get_wrapper_toolset( + wrapper = CodeMode[None](os_access=OSAccess(environ={'THING': 'fromabs'})).get_wrapper_toolset( _build_function_toolset(add) ) assert isinstance(wrapper, CodeModeToolset) @@ -1962,7 +1964,7 @@ async def test_os_callback_exception_becomes_model_retry(self) -> None: def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: raise ValueError('boom from os') - wrapper = CodeMode[None](os=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add)) assert isinstance(wrapper, CodeModeToolset) ctx = await build_ctx(None, wrapper) tools = await wrapper.get_tools(ctx) @@ -1998,11 +2000,13 @@ async def test_mount_accepts_list_of_directories(self, tmp_path: Any) -> None: assert result.return_value == 'AABB' def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Any) -> None: - """`CodeMode` forwards `os`/`mount` onto the `CodeModeToolset` it builds.""" + """`CodeMode` forwards `os_access`/`mount` onto the `CodeModeToolset` it builds.""" mount = MountDir('/work', str(tmp_path)) - wrapper = CodeMode[None](os=_unused_os_callback, mount=mount).get_wrapper_toolset(_build_function_toolset(add)) + wrapper = CodeMode[None](os_access=_unused_os_callback, mount=mount).get_wrapper_toolset( + _build_function_toolset(add) + ) assert isinstance(wrapper, CodeModeToolset) - assert wrapper.os is _unused_os_callback + assert wrapper.os_access is _unused_os_callback assert wrapper.mount is mount From f420a9f9fe1790e35df2115ffcfab947b0473dd7 Mon Sep 17 00:00:00 2001 From: Aditya Vardhan Date: Tue, 2 Jun 2026 19:58:13 +0530 Subject: [PATCH 09/13] refactor(code_mode): stop shadowing the os module in the execution loop The OS/mount threading named its parameter `os`, shadowing the stdlib module inside the execution-loop helpers. Rename the variable to `os_access` (matching the public field) while keeping Monty's required `os=` keyword only at the resume/feed_start call sites. Also inline the single-use restriction-line helper into `_base_description`. Co-Authored-By: Claude Opus 4.8 (1M context) --- pydantic_ai_harness/code_mode/_toolset.py | 46 +++++++++++------------ 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py index c884aec..7f5317e 100644 --- a/pydantic_ai_harness/code_mode/_toolset.py +++ b/pydantic_ai_harness/code_mode/_toolset.py @@ -124,23 +124,19 @@ class _RunCodeArguments(TypedDict): """ -def _os_access_restriction(*, has_os: bool, has_mount: bool) -> str: - """Pick the OS/filesystem restriction line for the `run_code` description. +def _base_description(*, has_os: bool, has_mount: bool) -> str: + """Assemble the `run_code` base description with the right OS-access restriction line. `os` routes environment, clock, and filesystem calls; a `mount` alone only exposes filesystem paths, so a mount-only sandbox must not advertise env or clock access (the model would generate calls that fail and burn retries). """ if has_os: - return _OS_ENABLED_NOTE - if has_mount: - return _MOUNT_ONLY_NOTE - return _NO_OS_RESTRICTION - - -def _base_description(*, has_os: bool, has_mount: bool) -> str: - """Assemble the `run_code` base description with the right OS-access line.""" - restriction = _os_access_restriction(has_os=has_os, has_mount=has_mount) + restriction = _OS_ENABLED_NOTE + elif has_mount: + restriction = _MOUNT_ONLY_NOTE + else: + restriction = _NO_OS_RESTRICTION return f'{_RUN_CODE_DESCRIPTION_HEAD}\n{restriction}\n{_RUN_CODE_DESCRIPTION_TAIL}' @@ -474,7 +470,7 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any: sanitized_to_original=sanitized_to_original, sequential_tools=sequential_tools, global_sequential=global_sequential, - os=self.os_access, + os_access=self.os_access, mount=self.mount, ) except MontySyntaxError as e: @@ -649,7 +645,7 @@ async def _execution_loop( sanitized_to_original: dict[str, str], sequential_tools: set[str], global_sequential: bool, - os: CodeModeOS | None, + os_access: CodeModeOS | None, mount: CodeModeMount | None, ) -> MontyComplete: """Drive the Monty REPL via the synchronous snapshot API until completion. @@ -680,7 +676,7 @@ async def _execution_loop( try: while not isinstance(monty_state, MontyComplete): if isinstance(monty_state, NameLookupSnapshot): - monty_state = monty_state.resume(os=os, mount=mount) + monty_state = monty_state.resume(os=os_access, mount=mount) elif isinstance(monty_state, FunctionSnapshot): monty_state = await _handle_function_snapshot( monty_state, @@ -691,7 +687,7 @@ async def _execution_loop( global_sequential=global_sequential, pending=pending, pre_resolved=pre_resolved, - os=os, + os_access=os_access, mount=mount, ) else: @@ -700,7 +696,7 @@ async def _execution_loop( pending=pending, pre_resolved=pre_resolved, global_sequential=global_sequential, - os=os, + os_access=os_access, mount=mount, ) finally: @@ -723,19 +719,19 @@ async def _handle_function_snapshot( global_sequential: bool, pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]], pre_resolved: dict[int, ExternalResult], - os: CodeModeOS | None, + os_access: CodeModeOS | None, mount: CodeModeMount | None, ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete: """Handle a single FunctionSnapshot from the Monty execution loop.""" fn_name = snapshot.function_name if fn_name not in callable_defs: - return snapshot.resume({'exception': NameError(f'Unknown function: {fn_name}')}, os=os, mount=mount) + return snapshot.resume({'exception': NameError(f'Unknown function: {fn_name}')}, os=os_access, mount=mount) if snapshot.args: return snapshot.resume( {'exception': TypeError(f'{fn_name}() does not accept positional arguments; use keyword arguments')}, - os=os, + os=os_access, mount=mount, ) @@ -749,8 +745,8 @@ async def _handle_function_snapshot( pre_resolved[cid] = await _resolve_coro(pending.pop(cid)) outcome = await _resolve_coro(dispatch(original_name, snapshot.kwargs)) if 'return_value' in outcome: - return snapshot.resume({'return_value': outcome['return_value']}, os=os, mount=mount) - return snapshot.resume({'exception': outcome['exception']}, os=os, mount=mount) + return snapshot.resume({'return_value': outcome['return_value']}, os=os_access, mount=mount) + return snapshot.resume({'exception': outcome['exception']}, os=os_access, mount=mount) # Deferred execution — store for later resolution at FutureSnapshot. if global_sequential: @@ -759,7 +755,7 @@ async def _handle_function_snapshot( else: # Eagerly schedule as a Task for concurrent execution. pending[snapshot.call_id] = asyncio.ensure_future(dispatch(original_name, snapshot.kwargs)) - return snapshot.resume({'future': ...}, os=os, mount=mount) + return snapshot.resume({'future': ...}, os=os_access, mount=mount) async def _resolve_future_snapshot( @@ -768,13 +764,13 @@ async def _resolve_future_snapshot( pending: dict[int, asyncio.Task[Any] | Coroutine[Any, Any, Any]], pre_resolved: dict[int, ExternalResult], global_sequential: bool, - os: CodeModeOS | None, + os_access: CodeModeOS | None, mount: CodeModeMount | None, ) -> FunctionSnapshot | FutureSnapshot | NameLookupSnapshot | MontyComplete: """Resolve pending tool calls at a FutureSnapshot.""" pending_ids = snapshot.pending_call_ids if not pending_ids: # pragma: no cover - return snapshot.resume(results={}, os=os, mount=mount) + return snapshot.resume(results={}, os=os_access, mount=mount) results: dict[int, ExternalResult] = {} for cid in pending_ids: @@ -793,7 +789,7 @@ async def _resolve_future_snapshot( for cid, outcome in zip(gather_ids, settled): results[cid] = _settle_outcome(outcome) - return snapshot.resume(results=results, os=os, mount=mount) + return snapshot.resume(results=results, os=os_access, mount=mount) async def _resolve_coro( From 007aa05662e5a99bf01d9818a45587383e68a545 Mon Sep 17 00:00:00 2001 From: Aditya Vardhan Date: Tue, 2 Jun 2026 20:05:24 +0530 Subject: [PATCH 10/13] refactor(code_mode): make CodeMode config fields keyword-only The option list keeps growing; pin tools/max_retries as the only positional args and force os_access/mount (and future config) to be passed by name via a KW_ONLY sentinel, so adding options can't silently shift positional meaning. Co-Authored-By: Claude Opus 4.8 (1M context) --- pydantic_ai_harness/code_mode/_capability.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py index cbab615..f824e34 100644 --- a/pydantic_ai_harness/code_mode/_capability.py +++ b/pydantic_ai_harness/code_mode/_capability.py @@ -2,7 +2,7 @@ from __future__ import annotations -from dataclasses import dataclass, field +from dataclasses import KW_ONLY, dataclass, field from pydantic_ai import AbstractToolset from pydantic_ai.capabilities import AbstractCapability, CapabilityOrdering @@ -58,6 +58,10 @@ class CodeMode(AbstractCapability[AgentDepsT]): max_retries: int = 3 """Maximum number of retries for the `run_code` tool (syntax errors count as retries).""" + _: KW_ONLY + # Everything below is keyword-only: the option list keeps growing, so new + # config must be passed by name rather than relying on positional order. + os_access: CodeModeOS | None = None """Host-backed OS access for sandboxed code. From 465e8d65f65367435e86c1da81195335003bd27e Mon Sep 17 00:00:00 2001 From: Aditya Vardhan Date: Tue, 2 Jun 2026 20:57:33 +0530 Subject: [PATCH 11/13] docs(code_mode): make os_access/mount docs clear on first read Public docs should let a reader grasp the host-access surface without reverse-engineering it. Reframe the docstrings and README around when to reach for each primitive instead of what is switched off, drop the type-restating prose the annotations already carry, and lead with concrete tasks (share a dataset; inject just the secrets the agent needs). Tighten the os-access test sweep so each test asserts exactly its invariant: drop redundant negative description asserts (one note is interpolated, so the positive phrase alone proves selection), drop an assertion already owned by another test, and type the tmp_path fixtures. Co-Authored-By: Claude Opus 4.8 (1M context) --- pydantic_ai_harness/code_mode/README.md | 55 +++++++++++++------- pydantic_ai_harness/code_mode/_capability.py | 26 +++++---- pydantic_ai_harness/code_mode/_toolset.py | 19 +++---- tests/code_mode/test_code_mode.py | 20 +++---- 4 files changed, 62 insertions(+), 58 deletions(-) diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md index fccb518..c05b92c 100644 --- a/pydantic_ai_harness/code_mode/README.md +++ b/pydantic_ai_harness/code_mode/README.md @@ -142,39 +142,56 @@ for msg in result.all_messages(): ## Filesystem and OS access -The sandbox has no filesystem or clock by default: the `os` and `pathlib` modules import, but their -I/O, `datetime.now()`, and `date.today()` are unavailable. Pass `os` and/or `mount` to back them with -a host-controlled implementation. +Sandboxed code runs with no access to the host's files, environment, or clock. Two parameters grant +it -- reach for them when the agent's task genuinely needs the host. + +**`mount` -- share host directories.** Reach for this when the agent works with real files: analyzing +a dataset you've dropped in a folder and writing a report back, editing a checkout, or processing a +batch of documents. Sandboxed `pathlib` code reads and writes under the mounted path. (For +environment variables or the clock, use `os_access` instead.) ```python -from pydantic_monty import NOT_HANDLED, MountDir, OSAccess +from pydantic_monty import MountDir from pydantic_ai_harness import CodeMode -# Expose a host directory at /work inside the sandbox: -CodeMode(mount=MountDir('/work', '/tmp/agent-workspace')) +# The agent can read /work/data.csv and write /work/summary.md back to the host: +CodeMode(mount=MountDir('/work', '/tmp/agent-workspace', mode='read-write')) +``` + +**`os_access` -- answer the sandbox's OS calls yourself.** Reach for this when the agent needs +environment variables, the current date and time, or filesystem behavior you control. Hand it a +ready-made OS implementation, or a callback that decides each call -- so you can inject just the +secrets it needs, pin "now" for reproducible runs, or route file access to your own store. + +```python +from pydantic_monty import NOT_HANDLED, OSAccess + +from pydantic_ai_harness import CodeMode + +# Give the agent a fixed set of environment values: +CodeMode(os_access=OSAccess(environ={'API_BASE': 'https://api.example.com'})) + -# Supply environment/clock via an AbstractOS instance: -CodeMode(os_access=OSAccess(environ={'STAGE': 'prod'})) +# ...or intercept each call to decide what the agent may see: +allowed_env = {'API_KEY': 'sk-...'} -# ...or a raw `(function_name, args, kwargs)` callback; return NOT_HANDLED to defer to Monty: def my_os(fn, args, kwargs): - return 'secret-value' if fn == 'os.getenv' else NOT_HANDLED + if fn == 'os.getenv': + return allowed_env.get(args[0], NOT_HANDLED) # only allow-listed keys; the rest stay hidden + return NOT_HANDLED CodeMode(os_access=my_os) ``` -`os_access` takes a `pydantic_monty.AbstractOS` or that callback and routes environment, clock, and -filesystem calls; `mount` takes one or more `MountDir` and exposes host filesystem paths only (a -mount alone does **not** enable `os.getenv` or `datetime.now()`). Both are fixed when the capability -is built, so construct `CodeMode` per request to scope access. `run_code`'s description reflects -exactly what's enabled; `asyncio.sleep` and `time` stay unavailable either way. +Both expose the real host to model-written code, so grant only what the task needs. Access is fixed +when the capability is built, so construct `CodeMode` per request to scope it. A `MountDir` defaults to copy-on-write `mode='overlay'`: the sandbox reads host files and sees its -own writes, but those writes do **not** reach the host directory. Pass `MountDir(..., mode='read-write')` -to persist writes to the host, or `mode='read-only'` to forbid them. +own writes, but those writes do **not** reach the host. Pass `mode='read-write'` to persist them, or +`mode='read-only'` to forbid writes. > Monty-specific: these hooks use Monty's `AbstractOS`/`MountDir` types. @@ -195,8 +212,8 @@ Code runs inside [Monty](https://github.com/pydantic/monty), a sandboxed Python CodeMode( tools: ToolSelector = 'all', # 'all', list[str], callable, or dict max_retries: int = 3, # retries on sandbox execution errors - os_access: CodeModeOS | None = None, # AbstractOS instance or (fn, args, kwargs) callback - mount: CodeModeMount | None = None, # MountDir | list[MountDir] of host directories + os_access: CodeModeOS | None = None, # host handler for env vars, clock, and file I/O + mount: CodeModeMount | None = None, # host directories to share with the sandbox ) ``` diff --git a/pydantic_ai_harness/code_mode/_capability.py b/pydantic_ai_harness/code_mode/_capability.py index f824e34..2dc8702 100644 --- a/pydantic_ai_harness/code_mode/_capability.py +++ b/pydantic_ai_harness/code_mode/_capability.py @@ -35,9 +35,16 @@ class CodeMode(AbstractCapability[AgentDepsT]): agent = Agent('openai:gpt-5', capabilities=[CodeMode(tools=['search', 'fetch'])]) ``` - Pass `mount` for host filesystem access and/or `os_access` for environment/clock - (plus filesystem) access -- without them, `pathlib`/`os` I/O and - `datetime.now()` are unavailable inside `run_code`: + By default, sandboxed code cannot touch the host -- no filesystem, environment + variables, or clock. Two parameters open it up: + + - `mount` shares specific host directories: reach for it when the agent reads or + writes real files. + - `os_access` routes the sandbox's OS calls to a handler you provide: reach for it + when the agent needs environment variables, the clock, or filesystem behavior you + control. + + Both expose the real host to model-written code, so grant only what the task needs. ```python from pydantic_monty import MountDir @@ -59,21 +66,12 @@ class CodeMode(AbstractCapability[AgentDepsT]): """Maximum number of retries for the `run_code` tool (syntax errors count as retries).""" _: KW_ONLY - # Everything below is keyword-only: the option list keeps growing, so new - # config must be passed by name rather than relying on positional order. os_access: CodeModeOS | None = None - """Host-backed OS access for sandboxed code. - - Pass a `pydantic_monty.AbstractOS` instance or a raw OS callback - `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`, - `datetime.datetime.now()`, and `datetime.date.today()` calls inside `run_code` - are routed to it instead of being unavailable. Fixed at construction, so build - `CodeMode` per request to scope access. - """ + """Give sandboxed code environment variables, the clock, and file I/O through a handler you provide; unset, they are unavailable.""" mount: CodeModeMount | None = None - """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`.""" + """Host directories to expose to sandboxed `pathlib` code; each mount's `mode` controls whether writes reach the host.""" def get_ordering(self) -> CapabilityOrdering: """CodeMode wraps around ToolSearch so that search_tools stays native.""" diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py index 7f5317e..cc6f1bc 100644 --- a/pydantic_ai_harness/code_mode/_toolset.py +++ b/pydantic_ai_harness/code_mode/_toolset.py @@ -51,14 +51,12 @@ # Type alias for the dispatch callback passed to _execution_loop. _DispatchFn = Callable[[str, dict[str, Any]], Coroutine[Any, Any, Any]] -# A raw OS callback: `(function_name, args, kwargs) -> result`. Return -# `pydantic_monty.NOT_HANDLED` to fall back to the sandbox's default handling. +# A raw OS callback. Return `pydantic_monty.NOT_HANDLED` to defer the call to the +# sandbox's default, which leaves it unavailable. CodeModeOSCallback = Callable[[OsFunction, tuple[Any, ...], dict[str, Any]], Any] -# What `CodeMode.os_access` accepts: either an `AbstractOS` instance or a raw callback. -# The sandbox's `feed_start`/`resume` accept both interchangeably, so no normalization. +# Accepted by `CodeMode.os_access`: a ready-made OS implementation or a raw callback. CodeModeOS = AbstractOS | CodeModeOSCallback -# What `CodeMode.mount` accepts: one or more host-directory mounts (matches the -# sandbox's `feed_start`/`resume` `mount=` parameter type exactly). +# Accepted by `CodeMode.mount`: one or more host-directory mounts. CodeModeMount = MountDir | list[MountDir] @@ -235,15 +233,10 @@ class CodeModeToolset(WrapperToolset[AgentDepsT]): """Maximum number of retries for the `run_code` tool (syntax errors count as retries).""" os_access: CodeModeOS | None = None - """Host-backed OS access exposed to sandboxed code. - - Either a `pydantic_monty.AbstractOS` instance or a raw OS callback - `(function_name, args, kwargs) -> result`. When set, `pathlib.Path`, `os`, - `datetime.datetime.now()`, and `datetime.date.today()` calls inside the - sandbox are routed to it instead of being unavailable.""" + """Give sandboxed code environment variables, the clock, and file I/O through a handler you provide; unset, they are unavailable.""" mount: CodeModeMount | None = None - """Host directory mount(s) exposed inside the sandbox as `pydantic_monty.MountDir`.""" + """Host directories to expose to sandboxed `pathlib` code; each mount's `mode` controls whether writes reach the host.""" # init=False so `replace()` in `for_run` produces a fresh instance with _repl=None, # giving each agent run isolated REPL state. Lazy-initialized on first call_tool. diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py index 67e8d5b..3c820eb 100644 --- a/tests/code_mode/test_code_mode.py +++ b/tests/code_mode/test_code_mode.py @@ -8,6 +8,7 @@ from __future__ import annotations +from pathlib import Path from typing import Any, TypeVar import pytest @@ -1870,7 +1871,6 @@ async def test_description_default_keeps_no_wallclock_restriction(self) -> None: description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description assert description is not None assert 'No wall-clock or timing primitives' in description - assert 'Host-backed OS access' not in description async def test_description_with_os_callback_notes_host_access(self) -> None: """An `os` callback swaps the restriction line for the host-access note.""" @@ -1879,9 +1879,8 @@ async def test_description_with_os_callback_notes_host_access(self) -> None: description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description assert description is not None assert 'Host-backed OS access' in description - assert 'No wall-clock or timing primitives' not in description - async def test_description_mount_only_advertises_filesystem_not_env_or_clock(self, tmp_path: Any) -> None: + async def test_description_mount_only_advertises_filesystem_not_env_or_clock(self, tmp_path: Path) -> None: """A `mount` without `os` advertises filesystem access only -- it must not tell the model that env/clock are host-backed, since a mount cannot route `os.getenv`/`datetime.now()`.""" wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset( @@ -1890,15 +1889,13 @@ async def test_description_mount_only_advertises_filesystem_not_env_or_clock(sel assert isinstance(wrapper, CodeModeToolset) description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description assert description is not None + # The regression guard: a mount must select the filesystem note, not the OS note that would + # (wrongly) advertise env/clock as host-routed -- this assert fails if the OS note is picked. assert 'Mounted filesystem access' in description - assert 'Host-backed OS access' not in description - # env/clock are explicitly called out as still unavailable, not advertised as routed. - assert '`os.getenv`/`os.environ`, `datetime.datetime.now()`, `datetime.date.today()`' in description - assert 'remain unavailable' in description async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) -> None: """The host-access note appears even when no tools are sandboxed (base description).""" - # `tools=[]` leaves every tool native, so `run_code` exposes no callable functions. + # `tools=[]` sandboxes nothing, so `run_code` renders the base description path. wrapper = CodeMode[None](os_access=_unused_os_callback, tools=[]).get_wrapper_toolset( _build_function_toolset(add) ) @@ -1906,7 +1903,6 @@ async def test_description_host_access_note_shows_with_no_sandboxed_tools(self) description = (await wrapper.get_tools(build_run_context(None)))['run_code'].tool_def.description assert description is not None assert 'Host-backed OS access' in description - assert 'functions are available inside the sandbox' not in description async def test_os_callback_dispatches_inside_run_code(self) -> None: """An `os` callback is threaded through `feed_start` and every `resume`, so OS calls @@ -1971,7 +1967,7 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: with pytest.raises(ModelRetry, match='boom from os'): await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('X')"}, ctx, tools['run_code']) - async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None: + async def test_mount_exposes_host_directory(self, tmp_path: Path) -> None: """A `mount` exposes a host directory inside the sandbox, threaded through resumes.""" (tmp_path / 'data.txt').write_text('hello-from-host') wrapper = CodeMode[None](mount=MountDir('/work', str(tmp_path))).get_wrapper_toolset( @@ -1984,7 +1980,7 @@ async def test_mount_exposes_host_directory(self, tmp_path: Any) -> None: result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code']) assert result.return_value == 'hello-from-host' - async def test_mount_accepts_list_of_directories(self, tmp_path: Any) -> None: + async def test_mount_accepts_list_of_directories(self, tmp_path: Path) -> None: """`mount` accepts a `list[MountDir]`; each directory is exposed at its virtual path.""" (tmp_path / 'a').mkdir() (tmp_path / 'b').mkdir() @@ -1999,7 +1995,7 @@ async def test_mount_accepts_list_of_directories(self, tmp_path: Any) -> None: result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code']) assert result.return_value == 'AABB' - def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Any) -> None: + def test_capability_forwards_os_and_mount_to_toolset(self, tmp_path: Path) -> None: """`CodeMode` forwards `os_access`/`mount` onto the `CodeModeToolset` it builds.""" mount = MountDir('/work', str(tmp_path)) wrapper = CodeMode[None](os_access=_unused_os_callback, mount=mount).get_wrapper_toolset( From 5f36134d7686bc12b880bd00ea5d6dc6a586ed2d Mon Sep 17 00:00:00 2001 From: Aditya Vardhan Date: Wed, 3 Jun 2026 12:53:52 +0530 Subject: [PATCH 12/13] docs(code_mode): clarify os_access callback return semantics The raw-callback example claimed non-allow-listed keys "stay hidden" by returning NOT_HANDLED. Verified against Monty: NOT_HANDLED *refuses* the call (raises in the sandbox -> model retry), it does not return None. A model probing for an optional secret would crash and burn retries. Distinguish the two return modes explicitly so users don't pick the wrong one: return a value (incl. None) to answer/hide, NOT_HANDLED to refuse a capability outright. Co-Authored-By: Claude Opus 4.8 (1M context) --- pydantic_ai_harness/code_mode/README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pydantic_ai_harness/code_mode/README.md b/pydantic_ai_harness/code_mode/README.md index c05b92c..963bd58 100644 --- a/pydantic_ai_harness/code_mode/README.md +++ b/pydantic_ai_harness/code_mode/README.md @@ -179,13 +179,25 @@ allowed_env = {'API_KEY': 'sk-...'} def my_os(fn, args, kwargs): if fn == 'os.getenv': - return allowed_env.get(args[0], NOT_HANDLED) # only allow-listed keys; the rest stay hidden + # Answer the call: allow-listed keys resolve, every other key reads back + # as None -- absent, exactly like a real unset variable. + return allowed_env.get(args[0]) + # Refuse everything else: NOT_HANDLED makes the call fail in the sandbox. return NOT_HANDLED CodeMode(os_access=my_os) ``` +Your callback's return value decides the call's fate, and the two outcomes are easy to confuse: + +- **Return any value** -- including `None`, `''`, or `0` -- and that becomes the result the sandbox + sees. `os.getenv` returning `None` looks exactly like a normal unset variable, so the agent's code + keeps running. This is how you *hide* something: answer with an empty value. +- **Return `NOT_HANDLED`** and the call is treated as unsupported: it raises inside the sandbox and + the model gets a retry. This *refuses* a capability outright -- use it to block, not to say "no + value". Returning `NOT_HANDLED` for a key the agent reasonably expects will burn retries. + Both expose the real host to model-written code, so grant only what the task needs. Access is fixed when the capability is built, so construct `CodeMode` per request to scope it. From 760abeec08c02e155bda0c358dabc5811b2fb20a Mon Sep 17 00:00:00 2001 From: Aditya Vardhan Date: Wed, 3 Jun 2026 12:56:31 +0530 Subject: [PATCH 13/13] test(code_mode): lock in os_access value-vs-NOT_HANDLED semantics Returning a value (including None) from an os_access callback answers the call -- a None reads back like an unset env var, so the sandbox keeps running. Returning NOT_HANDLED refuses the call, raising in the sandbox and surfacing as ModelRetry. These two paths are easy to confuse and silently regress, so pin both. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/code_mode/test_code_mode.py | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/code_mode/test_code_mode.py b/tests/code_mode/test_code_mode.py index 3c820eb..e6f3f1a 100644 --- a/tests/code_mode/test_code_mode.py +++ b/tests/code_mode/test_code_mode.py @@ -1967,6 +1967,46 @@ def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: with pytest.raises(ModelRetry, match='boom from os'): await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('X')"}, ctx, tools['run_code']) + async def test_os_callback_returning_value_answers_call_including_none(self) -> None: + """Returning a value from the `os` callback -- even `None` -- *answers* the call. + + Allow-listed keys resolve; every other key reads back as `None`, exactly like a real + unset env var, so the sandbox keeps running with no retry. This is how a callback hides + a secret: by answering with an empty value, not by refusing the call. + """ + allowed = {'API_KEY': 'sk-xxx'} + + def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: + if fn == 'os.getenv': + return allowed.get(args[0]) + return NOT_HANDLED # pragma: no cover - sandbox only calls os.getenv here + + wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + ctx = await build_ctx(None, wrapper) + tools = await wrapper.get_tools(ctx) + code = "import os\n{'allowed': os.getenv('API_KEY'), 'hidden': os.getenv('SECRET')}" + result = await wrapper.call_tool('run_code', {'code': code}, ctx, tools['run_code']) + assert result.return_value == {'allowed': 'sk-xxx', 'hidden': None} + + async def test_os_callback_not_handled_refuses_call_as_model_retry(self) -> None: + """Returning `NOT_HANDLED` *refuses* the call rather than answering it. + + The OS function is treated as unsupported, so it raises in the sandbox and surfaces as + `ModelRetry`. This is the counterpart to returning a value: refusing is not the same as + answering `None`, and using it for a key the model expects will burn retries. + """ + + def os_cb(fn: OsFunction, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any: + return NOT_HANDLED + + wrapper = CodeMode[None](os_access=os_cb).get_wrapper_toolset(_build_function_toolset(add)) + assert isinstance(wrapper, CodeModeToolset) + ctx = await build_ctx(None, wrapper) + tools = await wrapper.get_tools(ctx) + with pytest.raises(ModelRetry, match='not supported in this environment'): + await wrapper.call_tool('run_code', {'code': "import os\nos.getenv('X')"}, ctx, tools['run_code']) + async def test_mount_exposes_host_directory(self, tmp_path: Path) -> None: """A `mount` exposes a host directory inside the sandbox, threaded through resumes.""" (tmp_path / 'data.txt').write_text('hello-from-host')