Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions benchmarks/perturbation/_combine_gpt_claude.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Compute combined-system (GPT-5.5 OR Claude-Opus-4.7) recall on the 24-paper
frontier subset for tab:recall-overall in perturbation.tex.

A perturbation is detected by the combined system if either GPT-5.5 OR
Claude-Opus-4.7 detected it under the progressive (OpenAIReview) method.
Perturbation IDs are namespaced by (results_dir, paper, type) to avoid
cross-paper collisions.
"""
import json
from pathlib import Path

ROOT = Path(__file__).resolve().parent / "results"

# (results_dir, [paper_ids], [ptypes]) — must mirror configs/submission/subset_bigmodels_*.yaml.
# Note: cs_CC uses a different ptype taxonomy (claim_theoretical/logic instead
# of statement_empirical/experimental).
DOMAINS = [
("cs_CC_scaleup_v2", ["paper_001", "paper_003", "paper_004"], ["surface", "claim_theoretical", "logic"]),
("full_cs_LG", ["paper_001", "paper_002", "paper_003"], ["surface", "statement_empirical", "experimental"]),
("full_econ_EM", ["paper_001", "paper_002", "paper_003"], ["surface", "statement_empirical", "experimental"]),
("full_hep_ex", ["paper_001", "paper_002", "paper_003"], ["surface", "statement_empirical", "experimental"]),
("full_math_all", ["paper_001", "paper_002", "paper_004"], ["surface", "statement_empirical", "experimental"]),
("full_physics_atm_clus", ["paper_001", "paper_002", "paper_003"], ["surface", "statement_empirical", "experimental"]),
("full_q_bio_GN", ["paper_001", "paper_002", "paper_003"], ["surface", "statement_empirical", "experimental"]),
("full_stat_AP", ["paper_001", "paper_002", "paper_003"], ["surface", "statement_empirical", "experimental"]),
]
METHOD = "progressive" # = OpenAIReview column


def load_score(domain: str, model: str, ptype: str, paper: str):
p = ROOT / domain / model / ptype / METHOD / paper / "score" / "llm_t4_grounded" / f"{paper}_score.json"
if not p.exists():
return None
return json.loads(p.read_text())


def keyset(score, prefix: str, kind: str) -> set[str]:
"""kind='detected' or 'all' (detected ∪ missed)."""
if kind == "detected":
items = score["detected"]
else:
items = list(score["detected"]) + list(score["missed"])
return {f"{prefix}::{x}" for x in items}


def collect(model: str) -> tuple[set[str], set[str]]:
detected, injected = set(), set()
for domain, papers, ptypes in DOMAINS:
for paper in papers:
for ptype in ptypes:
s = load_score(domain, model, ptype, paper)
if s is None:
continue
prefix = f"{domain}|{ptype}|{paper}"
detected |= keyset(s, prefix, "detected")
injected |= keyset(s, prefix, "all")
return detected, injected


MODELS = [
("GPT-5.5", "gpt-5.5"),
("Claude-Opus-4.7", "claude-opus-4.7"),
("Grok-4.1-Fast", "grok-4.1-fast"),
("DeepSeek-V4-Flash", "deepseek-v4-flash"),
("Qwen3.6-35B-A3B", "qwen3.6-35b-a3b"),
("Gemini-3.1-Flash-Lite", "gemini-3.1-flash-lite-preview"),
]


def main():
per_model = {}
for label, slug in MODELS:
det, inj = collect(slug)
per_model[label] = (det, inj)
print(f"{label:23s}: {len(det):>4} / {len(inj):>4} = {len(det)/len(inj)*100:5.1f}%")

all_det = set().union(*(d for d, _ in per_model.values()))
all_inj = set().union(*(i for _, i in per_model.values()))
print()
print("Combined (ANY of the 6 models, progressive/OpenAIReview):")
print(f" union injected : {len(all_inj)}")
print(f" union detected : {len(all_det)}")
print(f" recall : {len(all_det)/len(all_inj)*100:5.1f}%")
print()

# Cumulative-by-rank: add models in descending single-model recall order
print("Cumulative recall as models are added (best single first):")
order = sorted(per_model.items(), key=lambda kv: -len(kv[1][0]))
cum_det, cum_inj = set(), set()
for label, (d, i) in order:
cum_det |= d
cum_inj |= i
print(f" + {label:23s}: {len(cum_det):>4} / {len(cum_inj):>4} = {len(cum_det)/len(cum_inj)*100:5.1f}%")
print()

# How many perturbations only ONE model caught
print("Coverage by # of detecting models (perturbations in union):")
coverage = {}
for pid in all_inj:
n = sum(1 for d, _ in per_model.values() if pid in d)
coverage[n] = coverage.get(n, 0) + 1
for n in sorted(coverage):
tag = "missed by all" if n == 0 else f"caught by {n} model(s)"
print(f" {tag:25s}: {coverage[n]:>4}")


if __name__ == "__main__":
main()