diff --git a/src/strands_evals/cli/commands/generate.py b/src/strands_evals/cli/commands/generate.py
new file mode 100644
index 00000000..d09f53ac
--- /dev/null
+++ b/src/strands_evals/cli/commands/generate.py
@@ -0,0 +1,227 @@
+"""`strands-evals generate` — synthesize an Experiment via ExperimentGenerator."""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import sys
+
+from ...evaluators import InteractionsEvaluator, OutputEvaluator, TrajectoryEvaluator
+from ...evaluators.evaluator import Evaluator
+from ...experiment import Experiment
+from ...generators import ExperimentGenerator
+from .._common import EXIT_BAD_INPUT, EXIT_OK, emit_error
+from .._entrypoint import resolve_custom_evaluators
+from .._io import write_text_output
+
+# `ExperimentGenerator.construct_evaluator_async` only supports these three.
+# The CLI restricts `--evaluator` to the same set so users get a fast,
+# CLI-flavored error instead of a deep ValueError from the generator listing
+# raw class objects.
+_GENERATOR_EVALUATORS: dict[str, type[Evaluator]] = {
+    "OutputEvaluator": OutputEvaluator,
+    "TrajectoryEvaluator": TrajectoryEvaluator,
+    "InteractionsEvaluator": InteractionsEvaluator,
+}
+
+
+def _validate_args(args: argparse.Namespace) -> str | None:
+    """Return an error string if argument combinations are invalid, else None.
+
+    The argparse mutually-exclusive group already enforces "exactly one of
+    --context/--experiment". This layer guards mode-specific flag misuse:
+
+    - `--evaluator` and `--num-topics` only apply to context mode; the
+      `from_experiment_async` library API derives evaluators from the source
+      and doesn't take a topic plan, so silently dropping them would mislead
+      users into thinking the flag took effect.
+    - `--extra-information` and `--custom-evaluator` only apply to experiment
+      mode; in context mode the equivalent guidance goes inline into
+      `--context` and there's no source experiment to load custom evaluators
+      for.
+
+    It also rejects non-positive `--num-cases` / `--num-topics` values before
+    any generator work starts, so a typo fails fast with a CLI-flavored error.
+    """
+    if args.num_cases < 1:
+        return "--num-cases must be a positive integer."
+    if args.experiment is not None:
+        if args.evaluator is not None:
+            return "--evaluator is not supported with --experiment (evaluators are inherited from the source)."
+        if args.num_topics is not None:
+            return "--num-topics is not supported with --experiment (topic planning runs only in --context mode)."
+    else:
+        if args.extra_information is not None:
+            return "--extra-information requires --experiment."
+        if args.custom_evaluator:
+            return "--custom-evaluator requires --experiment."
+        if args.num_topics is not None and args.num_topics < 1:
+            return "--num-topics must be a positive integer."
+    return None
+
+
+def _run(args: argparse.Namespace) -> int:
+    """Run the `generate` subcommand and return a process exit code.
+
+    Validates flag combinations, builds an `ExperimentGenerator`, then calls
+    `from_experiment_async` (with `--experiment`) or `from_context_async`
+    (with `--context`). The result is written through `Experiment.to_file`
+    when `--output` is given, otherwise as JSON via `write_text_output`; a
+    one-line summary always goes to stderr.
+
+    Returns `EXIT_BAD_INPUT` when `_validate_args` reports a problem, otherwise
+    `EXIT_OK`. Exceptions from loading the source file or from the generator
+    are not caught here.
+    """
+    error = _validate_args(args)
+    if error is not None:
+        emit_error(error)
+        return EXIT_BAD_INPUT
+
+    generator: ExperimentGenerator[str, str] = ExperimentGenerator(input_type=str, output_type=str, model=args.model)
+
+    if args.experiment is not None:
+        custom_evaluators = resolve_custom_evaluators(args.custom_evaluator or [])
+        source = Experiment.from_file(args.experiment, custom_evaluators=custom_evaluators)
+        experiment = asyncio.run(
+            generator.from_experiment_async(
+                source_experiment=source,
+                task_description=args.task_description or "",
+                num_cases=args.num_cases,
+                extra_information=args.extra_information,
+            )
+        )
+    else:
+        evaluator_cls = _GENERATOR_EVALUATORS[args.evaluator] if args.evaluator else None
+        experiment = asyncio.run(
+            generator.from_context_async(
+                context=args.context,
+                task_description=args.task_description or "",
+                num_cases=args.num_cases,
+                evaluator=evaluator_cls,
+                num_topics=args.num_topics,
+            )
+        )
+
+    if args.output is not None:
+        # `experiment.to_file` enforces a `.json` extension and writes via
+        # `to_dict()`; route through it so the on-disk shape matches what
+        # `Experiment.from_file` round-trips.
+        experiment.to_file(args.output)
+    else:
+        write_text_output(json.dumps(experiment.to_dict(), indent=2, ensure_ascii=False), None)
+
+    print(  # noqa: T201
+        f"generated {len(experiment.cases)} case(s), {len(experiment.evaluators)} evaluator(s)",
+        file=sys.stderr,
+    )
+    return EXIT_OK
+
+
+def add_subparser(
+    subparsers: argparse._SubParsersAction,
+    parent: argparse.ArgumentParser,
+) -> argparse.ArgumentParser:
+    """Register the `generate` subcommand on `subparsers` and return its parser.
+
+    `parent` is attached through `parents=[parent]`, so the new parser inherits
+    its arguments. The parser's `func` default is set to `_run`.
+    """
+    parser = subparsers.add_parser(
+        "generate",
+        parents=[parent],
+        help="generate a starter Experiment via ExperimentGenerator",
+        description=(
+            "Synthesize an Experiment by wrapping ExperimentGenerator. Two "
+            "modes: --context creates cases (and an optional rubric) from a "
+            "free-form description via from_context_async; --experiment "
+            "derives new cases from an existing Experiment file via "
+            "from_experiment_async, inheriting the source's evaluators. "
+            "--context/--experiment are mutually exclusive. With -o the "
+            "experiment is written via Experiment.to_file (.json enforced); "
+            "without -o the JSON document is emitted on stdout."
+        ),
+    )
+
+    source = parser.add_mutually_exclusive_group(required=True)
+    source.add_argument(
+        "--context",
+        metavar="CONTEXT",
+        default=None,
+        help=(
+            "context describing the task, tools, or domain (from_context_async). "
+            'Use shell substitution for file contents, e.g. --context "$(cat tools.txt)".'
+        ),
+    )
+    source.add_argument(
+        "--experiment",
+        metavar="EXPERIMENT_FILE",
+        default=None,
+        help=(
+            "path to an existing Experiment JSON used as a reference "
+            "(from_experiment_async). Cases are inspired by the source; "
+            "evaluators are derived from the source's defaults."
+        ),
+    )
+
+    parser.add_argument(
+        "--num-cases",
+        type=int,
+        default=5,
+        help="number of test cases to generate; must be >= 1 (default: 5)",
+    )
+    parser.add_argument(
+        "--task-description",
+        metavar="STR",
+        default=None,
+        help="short description of the task the agent will perform (default: empty)",
+    )
+    parser.add_argument(
+        "--num-topics",
+        type=int,
+        default=None,
+        help="--context only: expand into N (>= 1) topic-specific prompts for diverse coverage",
+    )
+    parser.add_argument(
+        "--evaluator",
+        metavar="NAME",
+        choices=sorted(_GENERATOR_EVALUATORS),
+        default=None,
+        help=(
+            "--context only: default evaluator to attach with a generated rubric. "
+            f"Choices: {', '.join(sorted(_GENERATOR_EVALUATORS))}. "
+            "Omit to produce an experiment with a placeholder Evaluator."
+        ),
+    )
+    parser.add_argument(
+        "--extra-information",
+        metavar="TEXT",
+        default=None,
+        help="--experiment only: extra context for the new cases/rubric",
+    )
+    parser.add_argument(
+        "--custom-evaluator",
+        metavar="MODULE:CLASS",
+        action="append",
+        default=None,
+        help=(
+            "--experiment only: custom Evaluator subclass to register before "
+            "loading the source experiment file (repeatable)"
+        ),
+    )
+    parser.add_argument(
+        "--model",
+        metavar="MODEL_ID",
+        default=None,
+        help="override the judge model used by the generator",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        metavar="PATH",
+        default=None,
+        help="write the generated experiment JSON to PATH (via Experiment.to_file)",
+    )
+    parser.set_defaults(func=_run)
+    return parser
diff --git a/src/strands_evals/cli/main.py b/src/strands_evals/cli/main.py
index fb2bae18..4aa8bbde 100644
--- a/src/strands_evals/cli/main.py
+++ b/src/strands_evals/cli/main.py
@@ -7,6 +7,7 @@
 
 from . import _common
 from .commands import diagnose as diagnose_cmd
+from .commands import generate as generate_cmd
 from .commands import report as report_cmd
 from .commands import run as run_cmd
 from .commands import validate as validate_cmd
@@ -27,6 +28,7 @@ def _build_parser() -> argparse.ArgumentParser:
     report_cmd.add_subparser(subparsers, parent)
     diagnose_cmd.add_subparser(subparsers, parent)
     run_cmd.add_subparser(subparsers, parent)
+    generate_cmd.add_subparser(subparsers, parent)
     return parser
 
 
diff --git a/tests/strands_evals/cli/test_generate.py b/tests/strands_evals/cli/test_generate.py
new file mode 100644
index 00000000..e4ec1c5a
--- /dev/null
+++ b/tests/strands_evals/cli/test_generate.py
@@ -0,0 +1,275 @@
+"""Tests for ``strands-evals generate``."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from strands_evals.case import Case
+from strands_evals.cli.main import main
+from strands_evals.evaluators.output_evaluator import OutputEvaluator
+from strands_evals.experiment import Experiment
+
+
+def _experiment_with(num_cases: int = 2, evaluator: bool = False) -> Experiment:
+    cases = [Case[str, str](name=f"c{i}", input=f"q{i}", expected_output=f"a{i}") for i in range(num_cases)]
+    evaluators = [OutputEvaluator(rubric="generated rubric")] if evaluator else []
+    return Experiment(cases=cases, evaluators=evaluators)
+
+
+def test_generate_writes_to_stdout(tmp_path: Path, capsys):
+    fake = AsyncMock(return_value=_experiment_with(num_cases=3))
+    with patch(
+        "strands_evals.cli.commands.generate.ExperimentGenerator.from_context_async",
+        new=fake,
+    ):
+        exit_code = main(["generate", "--context", "ctx", "--num-cases", "3"])
+
+    captured = capsys.readouterr()
+    assert exit_code == 0
+    payload = json.loads(captured.out)
+    assert len(payload["cases"]) == 3
+    assert "generated 3 case(s)" in captured.err
+
+
+def test_generate_writes_to_file_via_to_file(tmp_path: Path, capsys):
+    out_path = tmp_path / "experiment.json"
+    fake = AsyncMock(return_value=_experiment_with(num_cases=1, evaluator=True))
+    with patch(
+        "strands_evals.cli.commands.generate.ExperimentGenerator.from_context_async",
+        new=fake,
+    ):
+        exit_code = main(
+            [
+                "generate",
+                "--context",
+                "ctx",
+                "--num-cases",
+                "1",
+                "--evaluator",
+                "OutputEvaluator",
+                "-o",
+                str(out_path),
+            ]
+        )
+
+    captured = capsys.readouterr()
+    assert exit_code == 0
+    payload = json.loads(out_path.read_text())
+    assert len(payload["cases"]) == 1
+    assert payload["evaluators"][0]["evaluator_type"] == "OutputEvaluator"
+    assert "1 case(s), 1 evaluator(s)" in captured.err
+    # Stdout is empty when -o is set.
+    assert captured.out == ""
+
+
+def test_generate_to_file_appends_json_extension(tmp_path: Path):
+    out_path = tmp_path / "experiment"  # no extension
+    fake = AsyncMock(return_value=_experiment_with(num_cases=1))
+    with patch(
+        "strands_evals.cli.commands.generate.ExperimentGenerator.from_context_async",
+        new=fake,
+    ):
+        exit_code = main(["generate", "--context", "ctx", "-o", str(out_path)])
+
+    assert exit_code == 0
+    # Experiment.to_file appends .json automatically.
+    assert (tmp_path / "experiment.json").exists()
+
+
+def test_generate_passes_through_args(capsys):
+    fake = AsyncMock(return_value=_experiment_with(num_cases=2, evaluator=True))
+    with patch(
+        "strands_evals.cli.commands.generate.ExperimentGenerator.from_context_async",
+        new=fake,
+    ):
+        exit_code = main(
+            [
+                "generate",
+                "--context",
+                "ctx",
+                "--num-cases",
+                "2",
+                "--task-description",
+                "answer factual qs",
+                "--num-topics",
+                "3",
+                "--evaluator",
+                "TrajectoryEvaluator",
+                "--model",
+                "anthropic.claude-test",
+            ]
+        )
+
+    assert exit_code == 0
+    kwargs = dict(fake.await_args.kwargs)
+    # The class object is passed; assert by name to avoid importing it again.
+    assert kwargs.pop("evaluator").__name__ == "TrajectoryEvaluator"
+    assert kwargs == {
+        "context": "ctx",
+        "num_cases": 2,
+        "task_description": "answer factual qs",
+        "num_topics": 3,
+    }
+
+
+def test_generate_unknown_evaluator_choice_exits_2(capsys):
+    """argparse rejects --evaluator values outside the supported set."""
+    with pytest.raises(SystemExit) as excinfo:
+        main(["generate", "--context", "ctx", "--evaluator", "Helpfulness"])
+    assert excinfo.value.code == 2
+
+
+def test_generate_missing_source_exits_2(capsys):
+    """argparse requires exactly one of --context / --experiment."""
+    with pytest.raises(SystemExit) as excinfo:
+        main(["generate"])
+    assert excinfo.value.code == 2
+
+
+def test_generate_context_and_experiment_mutually_exclusive(tmp_path: Path):
+    src = tmp_path / "src.json"
+    src.write_text("{}", encoding="utf-8")
+    with pytest.raises(SystemExit) as excinfo:
+        main(["generate", "--context", "ctx", "--experiment", str(src)])
+    assert excinfo.value.code == 2
+
+
+def test_generate_from_experiment_calls_from_experiment_async(experiment_file: Path, capsys):
+    fake = AsyncMock(return_value=_experiment_with(num_cases=4, evaluator=True))
+    with patch(
+        "strands_evals.cli.commands.generate.ExperimentGenerator.from_experiment_async",
+        new=fake,
+    ):
+        exit_code = main(
+            [
+                "generate",
+                "--experiment",
+                str(experiment_file),
+                "--num-cases",
+                "4",
+                "--task-description",
+                "rephrased task",
+            ]
+        )
+
+    captured = capsys.readouterr()
+    assert exit_code == 0
+    fake.assert_awaited_once()
+    kwargs = dict(fake.await_args.kwargs)
+    assert isinstance(kwargs.pop("source_experiment"), Experiment)
+    assert kwargs == {
+        "task_description": "rephrased task",
+        "num_cases": 4,
+        "extra_information": None,
+    }
+    payload = json.loads(captured.out)
+    assert len(payload["cases"]) == 4
+
+
+def test_generate_extra_information_passed_through(experiment_file: Path):
+    fake = AsyncMock(return_value=_experiment_with(num_cases=1))
+    with patch(
+        "strands_evals.cli.commands.generate.ExperimentGenerator.from_experiment_async",
+        new=fake,
+    ):
+        exit_code = main(
+            [
+                "generate",
+                "--experiment",
+                str(experiment_file),
+                "--extra-information",
+                "Use a friendly tone.",
+            ]
+        )
+
+    assert exit_code == 0
+    assert fake.await_args.kwargs["extra_information"] == "Use a friendly tone."
+
+
+def test_generate_experiment_rejects_evaluator(experiment_file: Path, capsys):
+    """--evaluator only applies to --context; rejected with --experiment."""
+    exit_code = main(
+        [
+            "generate",
+            "--experiment",
+            str(experiment_file),
+            "--evaluator",
+            "OutputEvaluator",
+        ]
+    )
+    captured = capsys.readouterr()
+    assert exit_code == 2
+    assert "--evaluator" in captured.err
+
+
+def test_generate_experiment_rejects_num_topics(experiment_file: Path, capsys):
+    exit_code = main(
+        [
+            "generate",
+            "--experiment",
+            str(experiment_file),
+            "--num-topics",
+            "3",
+        ]
+    )
+    captured = capsys.readouterr()
+    assert exit_code == 2
+    assert "--num-topics" in captured.err
+
+
+def test_generate_extra_information_requires_experiment(capsys):
+    exit_code = main(
+        [
+            "generate",
+            "--context",
+            "ctx",
+            "--extra-information",
+            "more",
+        ]
+    )
+    captured = capsys.readouterr()
+    assert exit_code == 2
+    assert "--extra-information" in captured.err
+
+
+def test_generate_custom_evaluator_requires_experiment(capsys):
+    """--custom-evaluator only applies to --experiment; rejected with --context."""
+    exit_code = main(
+        [
+            "generate",
+            "--context",
+            "ctx",
+            "--custom-evaluator",
+            "pkg.mod:MyEvaluator",
+        ]
+    )
+    captured = capsys.readouterr()
+    assert exit_code == 2
+    assert "--custom-evaluator" in captured.err
+
+
+def test_generate_experiment_missing_source_exits_2(tmp_path: Path, capsys):
+    exit_code = main(["generate", "--experiment", str(tmp_path / "missing.json")])
+    captured = capsys.readouterr()
+    assert exit_code == 2
+    assert "error" in captured.err.lower()
+
+
+def test_generate_rejects_non_positive_num_cases(capsys):
+    """--num-cases must be >= 1; rejected by argument validation."""
+    exit_code = main(["generate", "--context", "ctx", "--num-cases", "0"])
+    captured = capsys.readouterr()
+    assert exit_code == 2
+    assert "--num-cases" in captured.err
+
+
+def test_generate_rejects_non_positive_num_topics(capsys):
+    """--num-topics must be >= 1 when supplied in --context mode."""
+    exit_code = main(["generate", "--context", "ctx", "--num-topics", "0"])
+    captured = capsys.readouterr()
+    assert exit_code == 2
+    assert "--num-topics" in captured.err