From 4e6b712df3847e3db74272208d25463bed741631 Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Fri, 5 Jun 2026 16:53:19 -0500 Subject: [PATCH] Add manual validation samples for perturbations and severity labels Two parallel sampling tools with frozen annotation sets committed for reproducibility: perturbation/manual_validation/ samples injected perturbations for human review of substantiveness, and conference_study/severity_validation/ samples review comments for human review of assigned severity tiers. Each dir has the sampling script (_build_sample.py) plus the frozen samples.json/samples.md it produced. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../severity_validation/_build_sample.py | 111 + .../severity_validation/samples.json | 562 +++++ .../severity_validation/samples.md | 373 ++++ .../manual_validation/_build_sample.py | 186 ++ .../manual_validation/samples.json | 602 +++++ .../perturbation/manual_validation/samples.md | 1951 +++++++++++++++++ 6 files changed, 3785 insertions(+) create mode 100644 benchmarks/conference_study/severity_validation/_build_sample.py create mode 100644 benchmarks/conference_study/severity_validation/samples.json create mode 100644 benchmarks/conference_study/severity_validation/samples.md create mode 100644 benchmarks/perturbation/manual_validation/_build_sample.py create mode 100644 benchmarks/perturbation/manual_validation/samples.json create mode 100644 benchmarks/perturbation/manual_validation/samples.md diff --git a/benchmarks/conference_study/severity_validation/_build_sample.py b/benchmarks/conference_study/severity_validation/_build_sample.py new file mode 100644 index 0000000..34db889 --- /dev/null +++ b/benchmarks/conference_study/severity_validation/_build_sample.py @@ -0,0 +1,111 @@ +"""Sample OpenAIReview (GPT-5.5) comments for manual severity validation. 
"""Sample OpenAIReview (GPT-5.5) comments for manual severity validation.

For each quality proxy in the manifest (4 in the frontier subset), pick one
low-quality and one high-quality paper, and sample PER_PAPER comments from each
(spread across severity tiers where possible) -- 40 comments with the default
settings. Writes samples.md (checklist) and samples.json into this directory.

All sampling draws from a single ``random.Random(SEED)`` stream, so the order
of the shuffle calls is part of the output: reordering them changes which
papers and comments are selected.
"""
import collections
import json
import random
from pathlib import Path

BASE = Path(__file__).resolve().parent.parent  # conference_study
MANIFEST = BASE / "manifests" / "v2_frontier" / "combined.json"
RESULTS = BASE / "results" / "frontier_subset_progressive"
MKEY = "progressive__gpt-5.5"
OUT = Path(__file__).resolve().parent
SEED, PER_PAPER = 42, 5
PAIR_NAME = {1: "Community-level", 2: "Conference-level", 3: "Reviewer-level", 4: "Composite"}

# Explicit encoding for every read/write: comment text and the generated
# markdown contain non-ASCII characters, and the locale default is not
# guaranteed to be UTF-8.
_ENCODING = "utf-8"
# (manifest side, markdown heading tag), in the order sections are emitted.
_SIDES = (("low", "WEAK / low-quality"), ("high", "STRONG / high-quality"))
# Per-comment fields copied verbatim into samples.json.
_COMMENT_FIELDS = ("id", "title", "quote", "explanation", "comment_type",
                   "paragraph_index", "severity")


def _severity_of(comment):
    """Return the comment's severity tier as a string; ``"?"`` if missing or null."""
    sev = comment.get("severity")
    return "?" if sev is None else str(sev)


def comments_for(slug):
    """Return the MKEY method's comment list for paper *slug*.

    Reads ``RESULTS/<slug>.json``. Returns ``[]`` when the file does not
    exist, or when the ``methods`` / ``MKEY`` / ``comments`` entries are
    missing or null.
    """
    path = RESULTS / f"{slug}.json"
    if not path.exists():
        return []
    data = json.loads(path.read_text(encoding=_ENCODING))
    method = (data.get("methods") or {}).get(MKEY) or {}
    return method.get("comments") or []


def pick_paper(cands, rng):
    """Pick one paper: the first (after shuffling) with >= PER_PAPER comments.

    If no candidate has enough comments, the most-commented one is returned
    (ties go to the earliest in shuffled order). The caller's list is left
    untouched; a copy is shuffled with *rng*.

    Raises:
        ValueError: if *cands* is empty.
    """
    pool = list(cands)
    if not pool:
        raise ValueError("pick_paper() needs at least one candidate paper")
    rng.shuffle(pool)

    best, best_count = None, -1
    for cand in pool:
        count = len(comments_for(cand["slug"]))
        if count >= PER_PAPER:
            return cand  # first qualifying candidate; no need to read the rest
        if count > best_count:  # strict '>' keeps the earliest on ties
            best, best_count = cand, count
    return best


def pick_comments(cmts, rng):
    """Pick up to PER_PAPER comments, covering each present severity tier first.

    A copy of *cmts* is shuffled with *rng*. The first comment of every tier
    (tiers visited in sorted order) is taken, then the remaining slots are
    filled in shuffled order. Comments are tracked by position rather than by
    their ``id`` field, so entries without an ``id`` are handled.
    """
    shuffled = list(cmts)
    rng.shuffle(shuffled)

    first_pos = {}
    for pos, comment in enumerate(shuffled):
        first_pos.setdefault(_severity_of(comment), pos)

    chosen = [first_pos[sev] for sev in sorted(first_pos)]  # one per tier present
    taken = set(chosen)
    for pos in range(len(shuffled)):  # fill
        if len(chosen) >= PER_PAPER:
            break
        if pos not in taken:
            chosen.append(pos)
            taken.add(pos)
    return [shuffled[pos] for pos in chosen[:PER_PAPER]]


def _group_by_pair(papers):
    """Return ``{pair: {side: [papers]}}`` built from each paper's memberships."""
    groups = collections.defaultdict(lambda: collections.defaultdict(list))
    for paper in papers:
        # Papers without memberships simply belong to no group.
        for member in paper.get("pair_memberships") or ():
            groups[member["pair"]][member["side"]].append(paper)
    return groups


def _comment_markdown(idx, c):
    """Return the markdown checklist lines for comment *c*, numbered *idx*."""
    lines = [
        f"**{idx}. [{_severity_of(c).upper()}] {c.get('title', '')}** "
        f"_(type: {c.get('comment_type')}, ¶{c.get('paragraph_index')})_"
    ]
    if (c.get("quote") or "").strip():
        lines.append(f"> Quote: {c['quote']}")
    lines.append(f"{c.get('explanation', '')}\n")
    lines.append("Your severity: ( ) major ( ) moderate ( ) minor | "
                 "( ) signal ( ) cosmetic")
    lines.append("Notes: \n")
    return lines


def main():
    """Build the sample and write samples.json / samples.md into OUT.

    Raises:
        ValueError: if a pair in the manifest has no papers on one side.
    """
    rng = random.Random(SEED)
    papers = json.loads(MANIFEST.read_text(encoding=_ENCODING))["papers"]
    groups = _group_by_pair(papers)

    flat, sections = [], []
    idx = 0
    for pair in sorted(groups):
        # One label for both outputs, so unknown pairs are not None in the JSON.
        proxy = PAIR_NAME.get(pair, f"Pair {pair}")
        sections.append(f"\n---\n\n## {proxy} proxy\n")
        for side, tag in _SIDES:
            cands = groups[pair].get(side)
            if not cands:
                raise ValueError(f"no {side!r} papers for pair {pair!r} in {MANIFEST}")
            paper = pick_paper(list(cands), rng)
            cmts = pick_comments(comments_for(paper["slug"]), rng)
            sections.append(f"### {tag} — {paper['title']}")
            sections.append(
                f"`{paper['slug']}` · decision: {paper.get('normalized_decision')} · "
                f"review score avg: {paper.get('review_score_avg')} · "
                f"cites/yr: {paper.get('cites_per_year')}\n")
            for c in cmts:
                idx += 1
                record = {"idx": idx, "proxy": proxy, "group": side,
                          "paper_slug": paper["slug"], "paper_title": paper["title"]}
                record.update((k, c.get(k)) for k in _COMMENT_FIELDS)
                flat.append(record)
                sections.extend(_comment_markdown(idx, c))

    # The intro is assembled last so it reports the number actually sampled
    # (a paper may have fewer than PER_PAPER comments).
    md = [
        "# Comment severity validation sample\n",
        f"{len(flat)} OpenAIReview (GPT-5.5) comments: 1 low + 1 high paper per quality proxy, "
        f"{PER_PAPER} comments each (spread across severity tiers where available).\n",
        "For each comment, the model's **LLM severity** is shown; mark **your severity** "
        "and whether it's substantive **signal** or **cosmetic**.\n",
    ]
    md.extend(sections)

    OUT.mkdir(parents=True, exist_ok=True)
    (OUT / "samples.json").write_text(json.dumps(flat, indent=2), encoding=_ENCODING)
    (OUT / "samples.md").write_text("\n".join(md), encoding=_ENCODING)

    # coverage report
    print(f"total comments: {len(flat)}")
    sev = collections.Counter(x["severity"] for x in flat)
    print("LLM severity mix:", dict(sev))
    print("papers:", len({x["paper_slug"] for x in flat}),
          "| proxies:", len({x["proxy"] for x in flat}))
    print(f"wrote {OUT/'samples.md'} and {OUT/'samples.json'}")


if __name__ == "__main__":
    main()
generalize better on ImageNet, with the correlation coefficient of 0.7", + "explanation": "Pearson correlation is a reasonable way to summarize alignment between evaluation protocols, but the conclusions depend on details such as sample size, whether points are independent architectures, whether results are averaged over seeds, whether accuracy or error is used, and sensitivity to outliers. The reported coefficients may support a positive association, but the paper should provide these details to make the correlation and CIFAR-to-ImageNet generalization claim reproducible and interpretable.", + "comment_type": "technical", + "paragraph_index": 45, + "severity": "minor" + }, + { + "idx": 2, + "proxy": "Community-level", + "group": "low", + "paper_slug": "iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator", + "paper_title": "FTSO: Effective NAS via First Topology Second Operator", + "id": "progressive__gpt-5.5_2", + "title": "Skip-only topology search is inconsistent or underspecified about weights and reduction-cell costs", + "quote": "Because the skip connection operator contains no kernel weights, we only need to optimize the architecture parameters \u03b2_i,j ... / Assign each edge e_i,j a skip connection operator o_i,j with kernel weights w_i,j ... Update weights w by descending \u2207_w L_train(w, \u03b2).", + "explanation": "The prose says skip connections have no kernel weights and only \u03b2 is optimized, but Algorithm 1 assigns kernel weights to skip operators and updates w. If w is empty, the update is vacuous and should be removed or explicitly marked as such; if w is nonempty, the zero-kernel-weight claim is false. The zero-parameter/zero-FLOP skip assumption is also underspecified for DARTS-style reduction cells, where skip connections on stride-2 edges are often implemented with factorized reduction or projection to match spatial resolution and channels. 
The method should define exactly how skip edges are implemented in normal and reduction cells.", + "comment_type": "technical", + "paragraph_index": 16, + "severity": "moderate" + }, + { + "idx": 3, + "proxy": "Community-level", + "group": "low", + "paper_slug": "iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator", + "paper_title": "FTSO: Effective NAS via First Topology Second Operator", + "id": "progressive__gpt-5.5_10", + "title": "Description of the NATS-Bench cell topology seems inconsistent with the standard benchmark", + "quote": "In the search space of NATS-Bench, there are one input node, three intermediate nodes and one output node, and each intermediate node connects to all its predecessors.", + "explanation": "The standard NAS-Bench-201/NATS-Bench topology space is usually described as a 4-node cell with 6 directed edges: one input node, two intermediate nodes, and one output node, with each non-input node connected to all previous nodes. The quoted description appears to describe five nodes unless the authors are using a different indexing convention. Because the node and edge counts determine the search space and the adaptation of FTSO, the topology description should be clarified.", + "comment_type": "technical", + "paragraph_index": 37, + "severity": "moderate" + }, + { + "idx": 4, + "proxy": "Community-level", + "group": "low", + "paper_slug": "iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator", + "paper_title": "FTSO: Effective NAS via First Topology Second Operator", + "id": "progressive__gpt-5.5_7", + "title": "The claim that a generalizing subgraph implies supernet overfitting is not justified", + "quote": "If the sub-graph can generalize perfectly on the testing set, the super-net must over-fit.", + "explanation": "A subgraph generalizing well does not imply that the enclosing supernet must overfit. 
Both could generalize, or the supernet could perform worse because of weight-sharing interference, optimization mismatch, or discretization bias rather than classical overfitting. Since this implication is used to motivate direct replacement over gradient-based operator search, it should be phrased as a possible explanation rather than a necessary conclusion.", + "comment_type": "logical", + "paragraph_index": 17, + "severity": "moderate" + }, + { + "idx": 5, + "proxy": "Community-level", + "group": "low", + "paper_slug": "iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator", + "paper_title": "FTSO: Effective NAS via First Topology Second Operator", + "id": "progressive__gpt-5.5_4", + "title": "Topology-search forward pass appears to omit edge-weight normalization", + "quote": "Forward-propagate following n_j = \u2211_{i