diff --git a/benchmarks/benchmark_base.py b/benchmarks/benchmark_base.py index 28758b058..ef1bed333 100644 --- a/benchmarks/benchmark_base.py +++ b/benchmarks/benchmark_base.py @@ -1,4 +1,5 @@ import logging +import statistics import subprocess import threading from abc import ABC, abstractmethod @@ -83,6 +84,23 @@ class BenchmarkWorkload(ShapeDtypeWorkload, InputGeneratingWorkload, Protocol): # A single test function may call record() multiple times (tileops + baseline). _bench_results = threading.local() +# Kernel name substrings that identify L2-flush operations (cache.zero_() on +# the dedicated _l2_flush_cache buffer). Filtered out of CUPTI timing so flush +# overhead is never counted as benchmark kernel time. +# +# The flush buffer is a large int32 tensor (sized to L2 cache) whose sole +# purpose is L2 eviction via cache.zero_(). To avoid false-positive exclusion +# of user kernels, we match BOTH the FillFunctor pattern (from Tensor.zero_()) +# AND the vectorized_elementwise pattern (specific to PyTorch's unary kernel +# dispatch). User code that calls fill_() or zero_() on regular tensors will +# still trigger FillFunctor, but typically with different kernel signatures +# (e.g., different element types or lack of vectorization for small tensors). +# +# If false positives persist, consider: (1) tracking the flush buffer's pointer +# address via kineto correlation IDs, or (2) using a uniquely named flush kernel +# via a custom CUDA extension instead of relying on Tensor.zero_(). +_FLUSH_PATTERNS: tuple[str, ...] = ("vectorized_elementwise", "FillFunctor") + def _sum_kernel_time_us(kineto_results): """Extract total CUDA kernel time directly from C++ Kineto events. @@ -90,15 +108,38 @@ def _sum_kernel_time_us(kineto_results): Bypasses ``profiler.key_averages()`` which triggers expensive Python event parsing (~120ms) and tree building (~10ms) for large traces. Direct C++ iteration is ~16x faster for n_repeat=1280. 
+ + L2 flush kernels (``cache.zero_()`` on the flush buffer) are excluded. + Flush events are identified by kernel names containing BOTH + ``vectorized_elementwise`` AND ``FillFunctor`` (the specific pattern + emitted by ``Tensor.zero_()`` on large int32 tensors). Generic patterns + like ``"Memset"``, ``"memset"``, and ``"fill_kernel"`` are intentionally + *not* filtered to avoid silently dropping real benchmark kernels. Matching + both substrings reduces false-positive exclusion of user code that calls + ``fill_()`` on small or non-vectorized tensors. + + Returns: + tuple[float, dict[str, float], dict[str, float]]: (total_us, per_kernel_us, + excluded_us). ``per_kernel_us`` maps each counted kernel name to its total + duration in microseconds across all timed iterations; ``excluded_us`` does + the same for the filtered flush kernels. Use the breakdown to detect helper / + temporary-tensor kernels that inflate the stat (e.g. MHA decode split-path + workspace fills, cuBLAS epilogue kernels, or any unexpected cuDNN helper). """ total_us = 0.0 + per_kernel: dict[str, float] = {} + excluded_kernel: dict[str, float] = {} for evt in kineto_results.events(): if evt.device_type() == DeviceType.CUDA: name = evt.name() - if "vectorized_elementwise" in name and "FillFunctor" in name: + dur = evt.duration_ns() / 1000.0 + # Match flush events by requiring ALL patterns (AND logic) + if all(p in name for p in _FLUSH_PATTERNS): + excluded_kernel[name] = excluded_kernel.get(name, 0.0) + dur continue - total_us += evt.duration_ns() / 1000.0 - return total_us + total_us += dur + per_kernel[name] = per_kernel.get(name, 0.0) + dur + return total_us, per_kernel, excluded_kernel # --------------------------------------------------------------------------- @@ -127,8 +168,8 @@ def bench_kernel( args: tuple[Any, ...] = (), n_warmup: int = 10, n_repeat: int = 50, - n_trials: int = 3, -) -> float: + n_trials: int = 5, +) -> dict: """Benchmark a GPU kernel with pure kernel timing via CUPTI. 
Protocol (adapted from NVIDIA SOL-ExecBench, arxiv.org/abs/2603.19173): @@ -142,7 +183,7 @@ def bench_kernel( Uses CUPTI via torch.profiler for accurate kernel-only timing, with direct Kineto C++ event iteration to avoid Python parsing overhead. - Falls back to CUDA events if CUPTI is unavailable. + Falls back to CUDA event timing if CUPTI is unavailable (with a warning). Args: fn: Callable to benchmark. If *args* is provided, called as @@ -151,10 +192,16 @@ def bench_kernel( values are passed through unchanged. n_warmup: Warmup iterations (default 10). n_repeat: Timed iterations per trial (default 50). - n_trials: Independent trials (default 3). + n_trials: Independent trials (default 5). Returns: - Kernel latency in **milliseconds**. + dict with keys: + - ``latency_ms``: median-of-trials mean kernel latency in milliseconds + - ``stdev_ms``: standard deviation across trial means (0.0 when only + one trial is available) + - ``timing_backend``: ``"cupti"`` (preferred) or ``"cuda_event"`` (fallback) + - ``event_breakdown``: dict mapping CUDA kernel name → total_us across + *all* timed iterations of the median trial (empty for CUDA event fallback). 
""" if not isinstance(args, tuple): raise TypeError( @@ -201,11 +248,23 @@ def _run(i): # Timed trials with CUPTI (single profiler, n_trials cycles) trial_means: list[float] = [] + trial_breakdowns: list[dict[str, float]] = [] + def _on_trace_ready(prof): kr = prof.profiler.kineto_results - kernel_us = _sum_kernel_time_us(kr) / n_repeat - trial_means.append(kernel_us * 1e-3) + total_us, per_kernel, excluded_kernel = _sum_kernel_time_us(kr) + trial_means.append(total_us / n_repeat * 1e-3) + trial_breakdowns.append(per_kernel) + if excluded_kernel: + excluded_us = sum(excluded_kernel.values()) + _logger.debug( + "CUPTI: excluded %.1f µs across %d flush/fill kernel(s): %s", + excluded_us, + len(excluded_kernel), + list(excluded_kernel.keys()), + ) + cupti_ok = True try: with suppress_stdout_stderr(): schedule = torch.profiler.schedule( @@ -229,30 +288,67 @@ def _on_trace_ready(prof): _run(i) profiler.step() except RuntimeError: - pass + cupti_ok = False + finally: + # Free the arg pool and release cached GPU memory to prevent + # accumulation across hundreds of benchmark calls. + if arg_pool is not None: + del arg_pool + torch.cuda.empty_cache() - # Fallback to CUDA events if CUPTI failed if not trial_means: - for _ in range(n_trials): - start_events = [torch.cuda.Event(enable_timing=True) for _ in range(n_repeat)] - end_events = [torch.cuda.Event(enable_timing=True) for _ in range(n_repeat)] - for i in range(n_repeat): - cache.zero_() - start_events[i].record() - _run(i) - end_events[i].record() - torch.cuda.synchronize() - times = [s.elapsed_time(e) for s, e in zip(start_events, end_events, strict=True)] - trial_means.append(sum(times) / len(times)) - - # Free the arg pool and release cached GPU memory to prevent - # accumulation across hundreds of benchmark calls. 
- if arg_pool is not None: - del arg_pool - torch.cuda.empty_cache() - - trial_means.sort() - return trial_means[len(trial_means) // 2] + cupti_ok = False + + if cupti_ok: + # Pick median trial + timing_backend = "cupti" + # Sort by mean latency; pick median trial's breakdown too + paired = sorted(zip(trial_means, trial_breakdowns, strict=True), key=lambda x: x[0]) + median_ms, median_breakdown = paired[len(paired) // 2] + stdev_ms = statistics.stdev(trial_means) if len(trial_means) > 1 else 0.0 + return { + "latency_ms": median_ms, + "stdev_ms": stdev_ms, + "timing_backend": timing_backend, + "event_breakdown": median_breakdown, + } + + # Fall back to CUDA event timing when CUPTI is unavailable. + _logger.warning( + "CUPTI unavailable or produced no results; falling back to CUDA event timing. " + "Ensure libcupti.so is on LD_LIBRARY_PATH for kernel-accurate measurements." + ) + event_trial_means: list[float] = [] + for _ in range(n_trials): + start_events = [torch.cuda.Event(enable_timing=True) for _ in range(n_repeat)] + end_events = [torch.cuda.Event(enable_timing=True) for _ in range(n_repeat)] + # Warmup + for i in range(n_repeat): + cache.zero_() + _run(i) + torch.cuda.synchronize() + # Timed iterations: flush is outside the event window so only _run() is measured. 
+ for i in range(n_repeat): + cache.zero_() + start_events[i].record() + _run(i) + end_events[i].record() + torch.cuda.synchronize() + trial_us = sum( + s.elapsed_time(e) * 1e3 + for s, e in zip(start_events, end_events, strict=True) + ) + event_trial_means.append(trial_us / n_repeat * 1e-3) + + paired_ev = sorted(event_trial_means) + median_ms = paired_ev[len(paired_ev) // 2] + stdev_ms = statistics.stdev(event_trial_means) if len(event_trial_means) > 1 else 0.0 + return { + "latency_ms": median_ms, + "stdev_ms": stdev_ms, + "timing_backend": "cuda_event", + "event_breakdown": {}, + } def _get_env_metadata() -> list[str]: @@ -267,16 +363,30 @@ def _get_env_metadata() -> list[str]: else: lines.append("- **GPU model**: N/A (no CUDA device)") - # Try to get NVIDIA driver version from nvidia-smi + # Try to get NVIDIA driver version and GPU telemetry from nvidia-smi try: result = subprocess.run( - ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"], + [ + "nvidia-smi", + "--query-gpu=driver_version,clocks.mem,clocks.gr,clocks.sm,power.draw,temperature.gpu", + "--format=csv,noheader,nounits", + ], capture_output=True, text=True, timeout=5, ) - driver = result.stdout.strip().split("\n")[0] if result.returncode == 0 else "N/A" + if result.returncode == 0: + parts = [p.strip() for p in result.stdout.strip().split("\n")[0].split(",")] + driver, mem_clock, gr_clock, sm_clock, power_draw, gpu_temp = (parts + ["N/A"] * 6)[:6] + else: + driver = mem_clock = gr_clock = sm_clock = power_draw = gpu_temp = "N/A" except (FileNotFoundError, subprocess.TimeoutExpired): - driver = "N/A" + driver = mem_clock = gr_clock = sm_clock = power_draw = gpu_temp = "N/A" + lines.append(f"- **Driver version**: {driver}") + lines.append(f"- **Memory clock (MHz)**: {mem_clock}") + lines.append(f"- **Graphics clock (MHz)**: {gr_clock}") + lines.append(f"- **SM clock (MHz)**: {sm_clock}") + lines.append(f"- **Power draw (W)**: {power_draw}") + lines.append(f"- **GPU temperature 
(°C)**: {gpu_temp}") return lines @@ -325,8 +435,14 @@ def profile_autograd(self, functor: Any) -> dict: latency = bench_kernel(functor) return self._build_result(latency) - def _build_result(self, latency: float) -> dict: - result = {"latency_ms": latency} + def _build_result(self, bench_result: dict) -> dict: + latency = bench_result["latency_ms"] + result = { + "latency_ms": latency, + "stdev_ms": bench_result.get("stdev_ms", 0.0), + "timing_backend": bench_result.get("timing_backend", "unknown"), + "event_breakdown": bench_result.get("event_breakdown", {}), + } flops = self.calculate_flops() if flops is not None: result["tflops"] = flops / latency * 1e-9 @@ -546,6 +662,14 @@ def _is_serializable(v: Any) -> bool: entry = {"tag": tag, "op": name, **result} if op_module: entry["op_module"] = op_module + if op_config: + entry["config"] = op_config + # Limit event_breakdown to top-10 kernels by total time to keep + # JUnit XML properties compact; full breakdown is in profile_run.log. + breakdown = result.get("event_breakdown", {}) + if breakdown: + top10 = dict(sorted(breakdown.items(), key=lambda x: x[1], reverse=True)[:10]) + entry["event_breakdown_top10"] = top10 _bench_results.entries.append(entry) _logger.info("op=%s module=%s tag=%s latency_ms=%.4f tflops=%.2f", @@ -569,7 +693,7 @@ def dump(path: str) -> None: lines.extend(_get_env_metadata()) lines.append("") - result_keys = ["latency_ms", "tflops", "bandwidth_tbs"] + result_keys = ["latency_ms", "stdev_ms", "timing_backend", "tflops", "bandwidth_tbs"] for name, entries in BenchmarkReport._records.items(): if not entries: @@ -599,12 +723,30 @@ def dump(path: str) -> None: row = [str(entry["params"].get(k, "")) for k in param_keys] for rk in result_keys: val = entry["result"].get(rk) - row.append(f"{val:.4f}" if val is not None else "N/A") + if rk == "timing_backend": + row.append(str(val) if val else "N/A") + elif isinstance(val, (int, float)): + row.append(f"{val:.4f}") + else: + row.append("N/A") if 
has_config: cfg = entry.get("config") row.append(str(cfg) if cfg else "") lines.append("| " + " | ".join(row) + " |") + # Append full event_breakdown for this entry (if present) + breakdown = entry["result"].get("event_breakdown", {}) + if breakdown: + lines.append("") + lines.append(f"**Event breakdown** (params: {entry['params']}):") + lines.append("") + sorted_breakdown = sorted(breakdown.items(), key=lambda x: x[1], reverse=True) + lines.append("| Kernel | Time (µs) |") + lines.append("| --- | --- |") + for kernel_name, time_us in sorted_breakdown: + lines.append(f"| `{kernel_name}` | {time_us:.1f} |") + lines.append("") + lines.append("") with open(path, "w") as f: diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py index 660d4dd4e..5efe29608 100644 --- a/benchmarks/conftest.py +++ b/benchmarks/conftest.py @@ -156,6 +156,20 @@ def pytest_runtest_call(item): bw = tileops_entry.get("bandwidth_tbs") if bw is not None: item.user_properties.append(("tileops_bandwidth_tbs", f"{bw:.2f}")) + # New diagnostics: timing backend, stdev, event breakdown + timing_backend = tileops_entry.get("timing_backend") + if timing_backend: + item.user_properties.append(("timing_backend", timing_backend)) + stdev_ms = tileops_entry.get("stdev_ms") + if stdev_ms is not None: + item.user_properties.append(("stdev_ms", f"{stdev_ms:.4f}")) + event_breakdown_top10 = tileops_entry.get("event_breakdown_top10") + if event_breakdown_top10: + # Serialize as "kernel1:us1,kernel2:us2,..." + breakdown_str = ",".join( + f"{k}:{v:.1f}" for k, v in event_breakdown_top10.items() + ) + item.user_properties.append(("event_breakdown_top10", breakdown_str)) # Write all baselines into JUnit XML properties. # The first baseline uses the legacy unprefixed names (baseline_tag, etc.) 
diff --git a/benchmarks/ops/attention/bench_gqa_fp8.py b/benchmarks/ops/attention/bench_gqa_fp8.py index 215b6a94f..bbf50b03f 100644 --- a/benchmarks/ops/attention/bench_gqa_fp8.py +++ b/benchmarks/ops/attention/bench_gqa_fp8.py @@ -111,7 +111,8 @@ def test_gqa_prefill_fp8_tensor_core_bench(case: GQAFp8TensorCoreBenchCase) -> N inputs = _make_inputs(case) op(*inputs) torch.cuda.synchronize() - latency_ms = bench_kernel(op, args=inputs, n_warmup=1, n_repeat=3, n_trials=1) + bench_result = bench_kernel(op, args=inputs, n_warmup=1, n_repeat=3, n_trials=1) + latency_ms = bench_result["latency_ms"] flops, bytes_moved = op.eval_roofline() result = { "latency_ms": latency_ms, @@ -124,7 +125,8 @@ def test_gqa_prefill_fp8_tensor_core_bench(case: GQAFp8TensorCoreBenchCase) -> N fa3_fn = _fa3_gqa_fp8_fwd() if fa3_fn is not None: - fa3_latency_ms = bench_kernel(fa3_fn, args=inputs, n_warmup=1, n_repeat=3, n_trials=1) + fa3_bench = bench_kernel(fa3_fn, args=inputs, n_warmup=1, n_repeat=3, n_trials=1) + fa3_latency_ms = fa3_bench["latency_ms"] fa3_result = { "latency_ms": fa3_latency_ms, "tflops": flops / fa3_latency_ms * 1e-9 if fa3_latency_ms > 0 else 0.0, diff --git a/benchmarks/ops/bench_mamba.py b/benchmarks/ops/bench_mamba.py index eafc85586..97b1977fd 100644 --- a/benchmarks/ops/bench_mamba.py +++ b/benchmarks/ops/bench_mamba.py @@ -138,6 +138,11 @@ def mamba_fwd(): dt_softplus=dt_softplus, ) + # Pre-warm: run once outside bm.profile so the Triton JIT compiler + # finishes compilation before the CUPTI window opens. + mamba_fwd() + torch.cuda.synchronize() + result_mamba = bm.profile(mamba_fwd) BenchmarkReport.record(op, locals(), result_mamba, tag="mamba") else: @@ -334,6 +339,11 @@ def test_ssd_chunk_scan_fwd_bench( def mamba_fwd(): return _mamba_chunk_scan_fwd(cb, x, dt, dA_cumsum, C, prev_states) + # Triton JIT-compiles on first call; call once before profiling to + # avoid cold-start contaminating the warmup iterations inside bench_kernel. 
+ mamba_fwd() + torch.cuda.synchronize() + result_mamba = bm.profile(mamba_fwd) BenchmarkReport.record(op, locals(), result_mamba, tag="mamba") else: @@ -481,6 +491,11 @@ def mamba_fwd(): seq_idx=seq_idx, ) + # Pre-warm: run once outside bm.profile so the Triton JIT compiler + # finishes compilation before the CUPTI window opens. + mamba_fwd() + torch.cuda.synchronize() + result_mamba = bm.profile(mamba_fwd) BenchmarkReport.record(op, locals(), result_mamba, tag="mamba") else: diff --git a/tests/test_benchmark_timing.py b/tests/test_benchmark_timing.py new file mode 100644 index 000000000..6d0c528ba --- /dev/null +++ b/tests/test_benchmark_timing.py @@ -0,0 +1,236 @@ +"""Unit tests for benchmark timing backend, CUPTI fallback, and diagnostics.""" + +import pytest +import torch + +from benchmarks.benchmark_base import BenchmarkReport, _sum_kernel_time_us, bench_kernel + +pytestmark = pytest.mark.full + + +@pytest.fixture(autouse=True) +def _reset_records(): + """Snapshot and clear BenchmarkReport._records around each test.""" + saved = BenchmarkReport._records + BenchmarkReport._records = {} + try: + yield + finally: + BenchmarkReport._records = saved + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +def test_bench_kernel_returns_structured_dict(): + """bench_kernel() returns dict with latency_ms, stdev_ms, timing_backend, event_breakdown.""" + def simple_kernel(): + x = torch.randn(1024, device="cuda") + return x * 2.0 + + result = bench_kernel(simple_kernel, n_warmup=2, n_repeat=5, n_trials=3) + + assert isinstance(result, dict) + assert "latency_ms" in result + assert "stdev_ms" in result + assert "timing_backend" in result + assert "event_breakdown" in result + + assert isinstance(result["latency_ms"], float) + assert result["latency_ms"] > 0 + + assert isinstance(result["stdev_ms"], float) + assert result["stdev_ms"] >= 0 + + assert result["timing_backend"] in ("cupti", "cuda_event") + + assert 
isinstance(result["event_breakdown"], dict) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +def test_bench_kernel_with_tensor_args(): + """bench_kernel() with tensor args uses arg_pool cloning.""" + x = torch.randn(256, 256, device="cuda") + y = torch.randn(256, 256, device="cuda") + + def matmul_kernel(a, b): + return torch.matmul(a, b) + + result = bench_kernel(matmul_kernel, args=(x, y), n_warmup=2, n_repeat=5, n_trials=2) + + assert result["latency_ms"] > 0 + assert result["timing_backend"] in ("cupti", "cuda_event") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +def test_bench_kernel_cupti_excludes_flush_kernels(): + """CUPTI path should exclude FillFunctor flush kernels from timing.""" + def simple_kernel(): + x = torch.randn(512, device="cuda") + return x + 1.0 + + result = bench_kernel(simple_kernel, n_warmup=2, n_repeat=5, n_trials=2) + + # If CUPTI is available, event_breakdown should not contain flush patterns + if result["timing_backend"] == "cupti" and result["event_breakdown"]: + for kernel_name in result["event_breakdown"].keys(): + # Should not match both vectorized_elementwise AND FillFunctor + has_vectorized = "vectorized_elementwise" in kernel_name + has_fill = "FillFunctor" in kernel_name + # If both are present, this would be a flush kernel (should be excluded) + assert not (has_vectorized and has_fill), \ + f"Flush kernel leaked into breakdown: {kernel_name}" + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +def test_bench_kernel_stdev_with_multiple_trials(): + """stdev_ms should be non-zero when n_trials > 1.""" + def kernel(): + x = torch.randn(128, device="cuda") + return x * 2.0 + + result = bench_kernel(kernel, n_warmup=2, n_repeat=10, n_trials=5) + + # With 5 trials, stdev should typically be > 0 (unless extremely stable) + # We just check it's a valid number + assert isinstance(result["stdev_ms"], float) + assert result["stdev_ms"] >= 0 + + 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +def test_bench_kernel_single_trial_has_zero_stdev(): + """stdev_ms should be 0.0 when n_trials = 1.""" + def kernel(): + x = torch.randn(128, device="cuda") + return x * 2.0 + + result = bench_kernel(kernel, n_warmup=2, n_repeat=5, n_trials=1) + + assert result["stdev_ms"] == 0.0 + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +def test_benchmark_report_propagates_stdev_and_backend(): + """BenchmarkReport.record() should preserve stdev_ms and timing_backend.""" + class _FakeOp: + pass + + result = { + "latency_ms": 1.234, + "stdev_ms": 0.056, + "timing_backend": "cupti", + "event_breakdown": {"kernel_a": 100.0, "kernel_b": 50.0}, + "tflops": 10.5, + } + + BenchmarkReport.record(_FakeOp(), params={"size": 1024}, result=result, tag="test") + + records = BenchmarkReport._records["_FakeOp"] + assert len(records) == 1 + assert records[0]["result"]["stdev_ms"] == 0.056 + assert records[0]["result"]["timing_backend"] == "cupti" + + +def test_sum_kernel_time_us_filters_flush_with_and_logic(): + """_sum_kernel_time_us should only exclude kernels matching ALL flush patterns.""" + # Mock a minimal kineto_results-like object + class MockEvent: + def __init__(self, name, duration_ns, is_cuda=True): + self._name = name + self._duration_ns = duration_ns + self._is_cuda = is_cuda + + def device_type(self): + from torch.autograd.profiler import DeviceType + return DeviceType.CUDA if self._is_cuda else DeviceType.CPU + + def name(self): + return self._name + + def duration_ns(self): + return self._duration_ns + + class MockKinetoResults: + def __init__(self, events): + self._events = events + + def events(self): + return self._events + + events = [ + # Should be excluded (both patterns present) + MockEvent("vectorized_elementwise_kernel<4, FillFunctor<int>>", 10000), + # Should be included (only one pattern) + MockEvent("FillFunctor_custom_kernel", 5000), + 
MockEvent("vectorized_elementwise_add", 8000), + # Should be included (normal kernel) + MockEvent("my_custom_kernel", 15000), + ] + + kr = MockKinetoResults(events) + total_us, per_kernel, excluded = _sum_kernel_time_us(kr) + + # total_us should exclude the first kernel (10000ns = 10us) + # Include: 5000 + 8000 + 15000 = 28000ns = 28us + assert total_us == 28.0 + + # excluded should contain only the flush kernel + assert len(excluded) == 1 + assert "vectorized_elementwise_kernel<4, FillFunctor<int>>" in excluded + + # per_kernel should contain the three non-flush kernels + assert len(per_kernel) == 3 + assert "FillFunctor_custom_kernel" in per_kernel + assert "vectorized_elementwise_add" in per_kernel + assert "my_custom_kernel" in per_kernel + + +def test_benchmark_report_dump_includes_new_fields(tmp_path): + """BenchmarkReport.dump() should include timing_backend, stdev_ms, and event_breakdown.""" + class _TestOp: + pass + + result = { + "latency_ms": 2.5, + "stdev_ms": 0.1, + "timing_backend": "cupti", + "event_breakdown": { + "kernel_main": 1000.0, + "kernel_helper": 500.0, + }, + "tflops": 5.0, + } + + BenchmarkReport.record(_TestOp(), params={"n": 512}, result=result, tag="tileops") + + log_path = tmp_path / "test_report.log" + BenchmarkReport.dump(str(log_path)) + + content = log_path.read_text() + + # Check that new fields appear in the table + assert "timing_backend" in content + assert "stdev_ms" in content + assert "cupti" in content + assert "0.1000" in content # stdev_ms formatted + + # Check that event breakdown section is present + assert "Event breakdown" in content + assert "kernel_main" in content + assert "kernel_helper" in content + assert "1000.0" in content + assert "500.0" in content + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +def test_bench_kernel_arg_pool_survives_fallback(): + """arg_pool should remain accessible in CUDA event fallback path.""" + # This is a regression test for the arg_pool scope bug + x = torch.randn(128, 
128, device="cuda") + + def kernel_with_args(tensor): + return tensor @ tensor.T + + # The test should not crash even if CUPTI fails and fallback is triggered + result = bench_kernel(kernel_with_args, args=(x,), n_warmup=1, n_repeat=3, n_trials=2) + + assert result["latency_ms"] > 0 + assert result["timing_backend"] in ("cupti", "cuda_event")