From 4e6b712df3847e3db74272208d25463bed741631 Mon Sep 17 00:00:00 2001
From: Dang Nguyen <nmd.ptnk@gmail.com>
Date: Fri, 5 Jun 2026 16:53:19 -0500
Subject: [PATCH] Add manual validation samples for perturbations and severity
 labels

Two parallel sampling tools, each committed together with the frozen
annotation set it produced, for reproducibility:
benchmarks/perturbation/manual_validation/ draws a sample of injected
perturbations for human review of substantiveness, and
benchmarks/conference_study/severity_validation/ draws a sample of review
comments for human review of their assigned severity tiers. Each directory
holds the sampling script (_build_sample.py) plus the frozen
samples.json/samples.md it produced.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../severity_validation/_build_sample.py      |  111 +
 .../severity_validation/samples.json          |  562 +++++
 .../severity_validation/samples.md            |  373 ++++
 .../manual_validation/_build_sample.py        |  186 ++
 .../manual_validation/samples.json            |  602 +++++
 .../perturbation/manual_validation/samples.md | 1951 +++++++++++++++++
 6 files changed, 3785 insertions(+)
 create mode 100644 benchmarks/conference_study/severity_validation/_build_sample.py
 create mode 100644 benchmarks/conference_study/severity_validation/samples.json
 create mode 100644 benchmarks/conference_study/severity_validation/samples.md
 create mode 100644 benchmarks/perturbation/manual_validation/_build_sample.py
 create mode 100644 benchmarks/perturbation/manual_validation/samples.json
 create mode 100644 benchmarks/perturbation/manual_validation/samples.md

diff --git a/benchmarks/conference_study/severity_validation/_build_sample.py b/benchmarks/conference_study/severity_validation/_build_sample.py
new file mode 100644
index 0000000..34db889
--- /dev/null
+++ b/benchmarks/conference_study/severity_validation/_build_sample.py
@@ -0,0 +1,111 @@
+"""Sample OpenAIReview (GPT-5.5) comments for manual severity validation.
+
+For each of the 4 quality proxies, pick one low-quality and one high-quality
+paper from the frontier subset, and sample 5 comments from each (spread across
+severity tiers where possible) = 40 comments. Writes samples.md (checklist) and
+samples.json into this directory.
+"""
+import json, random, collections
+from pathlib import Path
+
+BASE = Path(__file__).resolve().parent.parent              # conference_study
+MANIFEST = BASE / "manifests" / "v2_frontier" / "combined.json"  # main() reads its "papers" list
+RESULTS = BASE / "results" / "frontier_subset_progressive"  # one <slug>.json per paper
+MKEY = "progressive__gpt-5.5"                               # key under "methods" to sample from
+OUT = Path(__file__).resolve().parent                       # samples.* are written next to this script
+SEED, PER_PAPER = 42, 5                                     # RNG seed; comments sampled per paper
+PAIR_NAME = {1: "Community-level", 2: "Conference-level", 3: "Reviewer-level", 4: "Composite"}
+
+
+def comments_for(slug):
+    """Return the comment list under methods[MKEY] in RESULTS/<slug>.json, or [] if absent."""
+    p = RESULTS / f"{slug}.json"
+    # Decode as UTF-8 explicitly; a bare read_text() follows the platform locale instead.
+    d = json.loads(p.read_text(encoding="utf-8")) if p.exists() else {}
+    return d.get("methods", {}).get(MKEY, {}).get("comments", [])
+
+
+def pick_paper(cands, rng):
+    """First shuffled candidate with >= PER_PAPER comments, else the most-commented one."""
+    if not cands:                        # clear message instead of max()'s generic ValueError
+        raise ValueError("pick_paper: no candidate papers to choose from")
+    rng.shuffle(cands)                   # in place; main() passes a copy
+    ns = [len(comments_for(c["slug"])) for c in cands]   # comment counts, one file read each
+    return next((c for c, n in zip(cands, ns) if n >= PER_PAPER), cands[ns.index(max(ns))])
+
+
+def pick_comments(cmts, rng):
+    """Pick up to PER_PAPER comments: one per severity tier present, then fill.
+
+    Picks are tracked by list position instead of c["id"], so a comment with a
+    missing or repeated id is handled; a missing/null severity counts as tier "?".
+    """
+    cmts = list(cmts)                                # copy so the caller's list keeps its order
+    rng.shuffle(cmts)
+    first_of = {}                                    # tier -> position of its first shuffled comment
+    for i, c in enumerate(cmts):
+        first_of.setdefault(str(c.get("severity") or "?"), i)
+    order = [first_of[sev] for sev in sorted(first_of)]        # one per tier, tiers in sorted order
+    taken = set(order)
+    order += [i for i in range(len(cmts)) if i not in taken]   # then the rest, in shuffled order
+    # Tier representatives come first, so truncating to PER_PAPER keeps them.
+    return [cmts[i] for i in order[:PER_PAPER]]
+
+
+def main():
+    """Sample comments for every proxy pair and side, then write samples.json/samples.md to OUT."""
+    rng = random.Random(SEED)                        # one seeded RNG drives every pick
+    papers = json.loads(MANIFEST.read_text(encoding="utf-8"))["papers"]
+    # pair -> side -> [papers]
+    groups = collections.defaultdict(lambda: collections.defaultdict(list))
+    for p in papers:
+        for m in p["pair_memberships"]:
+            groups[m["pair"]][m["side"]].append(p)
+
+    flat, md = [], ["# Comment severity validation sample\n"]
+    md.append("40 OpenAIReview (GPT-5.5) comments: 1 low + 1 high paper per quality proxy, "
+              "5 comments each (spread across severity tiers where available).\n")
+    md.append("For each comment, the model's **LLM severity** is shown; mark **your severity** "
+              "and whether it's substantive **signal** or **cosmetic**.\n")
+
+    idx = 0
+    for pair in sorted(groups):
+        md.append(f"\n---\n\n## {PAIR_NAME.get(pair, f'Pair {pair}')} proxy\n")
+        for side, tag in (("low", "WEAK / low-quality"), ("high", "STRONG / high-quality")):
+            paper = pick_paper(list(groups[pair][side]), rng)    # copy: pick_paper shuffles in place
+            cmts = pick_comments(comments_for(paper["slug"]), rng)
+            md.append(f"### {tag} — {paper['title']}")
+            md.append(f"`{paper['slug']}` · decision: {paper.get('normalized_decision')} · "
+                      f"review score avg: {paper.get('review_score_avg')} · "
+                      f"cites/yr: {paper.get('cites_per_year')}\n")
+            for c in cmts:
+                idx += 1
+                flat.append({"idx": idx, "proxy": PAIR_NAME.get(pair, f"Pair {pair}"), "group": side,
+                             "paper_slug": paper["slug"], "paper_title": paper["title"],
+                             **{k: c.get(k) for k in
+                                ("id", "title", "quote", "explanation", "comment_type",
+                                 "paragraph_index", "severity")}})
+                md.append(f"**{idx}. [{str(c.get('severity') or '?').upper()}] {c.get('title', '')}**  "
+                          f"_(type: {c.get('comment_type')}, ¶{c.get('paragraph_index')})_")
+                if (c.get("quote") or "").strip():
+                    md.append(f"> Quote: {c['quote']}")
+                md.append(f"{c.get('explanation', '')}\n")
+                md.append("Your severity: ( ) major  ( ) moderate  ( ) minor    |    "
+                          "( ) signal  ( ) cosmetic")
+                md.append("Notes: \n")
+    # Write as UTF-8 explicitly: the text is non-ASCII (—, ¶, ·, symbols in quotes), not locale-safe.
+    OUT.mkdir(parents=True, exist_ok=True)
+    (OUT / "samples.json").write_text(json.dumps(flat, indent=2), encoding="utf-8")
+    (OUT / "samples.md").write_text("\n".join(md), encoding="utf-8")
+
+    # coverage report
+    print(f"total comments: {len(flat)}")
+    sev = collections.Counter(x["severity"] for x in flat)
+    print("LLM severity mix:", dict(sev))
+    print("papers:", len({x["paper_slug"] for x in flat}),
+          "| proxies:", len({x["proxy"] for x in flat}))
+    print(f"wrote {OUT/'samples.md'} and {OUT/'samples.json'}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/conference_study/severity_validation/samples.json b/benchmarks/conference_study/severity_validation/samples.json
new file mode 100644
index 0000000..a5027a4
--- /dev/null
+++ b/benchmarks/conference_study/severity_validation/samples.json
@@ -0,0 +1,562 @@
+[
+  {
+    "idx": 1,
+    "proxy": "Community-level",
+    "group": "low",
+    "paper_slug": "iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator",
+    "paper_title": "FTSO: Effective NAS via First Topology Second Operator",
+    "id": "progressive__gpt-5.5_16",
+    "title": "Pearson correlation claims need methodological qualification",
+    "quote": "they still obey a positive correlation, with a Pearson correlation coefficient of 0.77 ... generalize better on ImageNet, with the correlation coefficient of 0.7",
+    "explanation": "Pearson correlation is a reasonable way to summarize alignment between evaluation protocols, but the conclusions depend on details such as sample size, whether points are independent architectures, whether results are averaged over seeds, whether accuracy or error is used, and sensitivity to outliers. The reported coefficients may support a positive association, but the paper should provide these details to make the correlation and CIFAR-to-ImageNet generalization claim reproducible and interpretable.",
+    "comment_type": "technical",
+    "paragraph_index": 45,
+    "severity": "minor"
+  },
+  {
+    "idx": 2,
+    "proxy": "Community-level",
+    "group": "low",
+    "paper_slug": "iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator",
+    "paper_title": "FTSO: Effective NAS via First Topology Second Operator",
+    "id": "progressive__gpt-5.5_2",
+    "title": "Skip-only topology search is inconsistent or underspecified about weights and reduction-cell costs",
+    "quote": "Because the skip connection operator contains no kernel weights, we only need to optimize the architecture parameters \u03b2_i,j ... / Assign each edge e_i,j a skip connection operator o_i,j with kernel weights w_i,j ... Update weights w by descending \u2207_w L_train(w, \u03b2).",
+    "explanation": "The prose says skip connections have no kernel weights and only \u03b2 is optimized, but Algorithm 1 assigns kernel weights to skip operators and updates w. If w is empty, the update is vacuous and should be removed or explicitly marked as such; if w is nonempty, the zero-kernel-weight claim is false. The zero-parameter/zero-FLOP skip assumption is also underspecified for DARTS-style reduction cells, where skip connections on stride-2 edges are often implemented with factorized reduction or projection to match spatial resolution and channels. The method should define exactly how skip edges are implemented in normal and reduction cells.",
+    "comment_type": "technical",
+    "paragraph_index": 16,
+    "severity": "moderate"
+  },
+  {
+    "idx": 3,
+    "proxy": "Community-level",
+    "group": "low",
+    "paper_slug": "iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator",
+    "paper_title": "FTSO: Effective NAS via First Topology Second Operator",
+    "id": "progressive__gpt-5.5_10",
+    "title": "Description of the NATS-Bench cell topology seems inconsistent with the standard benchmark",
+    "quote": "In the search space of NATS-Bench, there are one input node, three intermediate nodes and one output node, and each intermediate node connects to all its predecessors.",
+    "explanation": "The standard NAS-Bench-201/NATS-Bench topology space is usually described as a 4-node cell with 6 directed edges: one input node, two intermediate nodes, and one output node, with each non-input node connected to all previous nodes. The quoted description appears to describe five nodes unless the authors are using a different indexing convention. Because the node and edge counts determine the search space and the adaptation of FTSO, the topology description should be clarified.",
+    "comment_type": "technical",
+    "paragraph_index": 37,
+    "severity": "moderate"
+  },
+  {
+    "idx": 4,
+    "proxy": "Community-level",
+    "group": "low",
+    "paper_slug": "iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator",
+    "paper_title": "FTSO: Effective NAS via First Topology Second Operator",
+    "id": "progressive__gpt-5.5_7",
+    "title": "The claim that a generalizing subgraph implies supernet overfitting is not justified",
+    "quote": "If the sub-graph can generalize perfectly on the testing set, the super-net must over-fit.",
+    "explanation": "A subgraph generalizing well does not imply that the enclosing supernet must overfit. Both could generalize, or the supernet could perform worse because of weight-sharing interference, optimization mismatch, or discretization bias rather than classical overfitting. Since this implication is used to motivate direct replacement over gradient-based operator search, it should be phrased as a possible explanation rather than a necessary conclusion.",
+    "comment_type": "logical",
+    "paragraph_index": 17,
+    "severity": "moderate"
+  },
+  {
+    "idx": 5,
+    "proxy": "Community-level",
+    "group": "low",
+    "paper_slug": "iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator",
+    "paper_title": "FTSO: Effective NAS via First Topology Second Operator",
+    "id": "progressive__gpt-5.5_4",
+    "title": "Topology-search forward pass appears to omit edge-weight normalization",
+    "quote": "Forward-propagate following n_j = \u2211_{i<j} o(n_i) \u03b2_i,j;",
+    "explanation": "The surrounding text says the method inherits PC-DARTS-style differentiable topology weighting, where incoming edge weights are typically normalized by a softmax. The pseudocode instead appears to use raw \u03b2_i,j in a weighted sum. If the intended computation is softmax-normalized over incoming edges, the algorithm is missing an important normalization step. If raw \u03b2 values are intended, that materially changes the optimization and the interpretation of selecting the largest \u03b2 values. The forward computation should specify whether and how \u03b2 is constrained or normalized.",
+    "comment_type": "technical",
+    "paragraph_index": 21,
+    "severity": "moderate"
+  },
+  {
+    "idx": 6,
+    "proxy": "Community-level",
+    "group": "high",
+    "paper_slug": "iclr2022-uYLFoz1v-efficiently-modeling-long-sequences-with-structured-state-sp",
+    "paper_title": "Efficiently Modeling Long Sequences with Structured State Spaces",
+    "id": "progressive__gpt-5.5_2",
+    "title": "HiPPO memorization claim attributes too much to the state matrix alone",
+    "quote": "HiPPO specifies a class of certain matrices _**A** \u2208_ R _[N][\u00d7][N]_ that when incorporated into (1), allows the state _x_ ( _t_ ) to memorize the history of the input _u_ ( _t_ ).",
+    "explanation": "The sentence summarizes the motivation for choosing HiPPO-derived state matrices, but read literally it overstates what the matrix A alone guarantees. In the HiPPO construction, the memory property depends on the full continuous-time dynamics, including the corresponding input operator/vector B and the projection measure/basis. For finite N, the state stores coefficients of an approximation or projection of the input history rather than an exact arbitrary history. A reader could incorrectly infer that merely substituting any HiPPO A into the SSM, with arbitrary B and learned C, is sufficient to obtain the formal memorization guarantee.",
+    "comment_type": "technical",
+    "paragraph_index": 23,
+    "severity": "minor"
+  },
+  {
+    "idx": 7,
+    "proxy": "Community-level",
+    "group": "high",
+    "paper_slug": "iclr2022-uYLFoz1v-efficiently-modeling-long-sequences-with-structured-state-sp",
+    "paper_title": "Efficiently Modeling Long Sequences with Structured State Spaces",
+    "id": "progressive__gpt-5.5_4",
+    "title": "Rank dependence is hidden in the \u201c4 Cauchy multiplies\u201d complexity statement",
+    "quote": "**Theorem 3** (S4 Convolution) **.** _Given any step size_ \u2206 _, computing the SSM convolution filter_ _**K** can be reduced to 4 Cauchy multiplies, requiring only O_[] ( _N_ + _L_ ) _operations and O_ ( _N_ + _L_ ) _space._",
+    "explanation": "For the rank-one DPLR case used by the S4 parameterization, the \u201c4 Cauchy multiplies\u201d statement is plausible. However, the surrounding discussion says the techniques apply to any NPLR matrix, and Theorem 1 allows low-rank factors P,Q \u2208 R^{N\u00d7r}, with some HiPPO matrices having r=2. For general rank r, the Woodbury step involves an r\u00d7r correction at each frequency and more than four Cauchy-type evaluations, with complexity depending on r. The asymptotic O\u0303(N+L) claim remains essentially right for fixed small r, but the unqualified \u201c4 Cauchy multiplies\u201d can mislead readers about the general NPLR case.",
+    "comment_type": "technical",
+    "paragraph_index": 50,
+    "severity": "moderate"
+  },
+  {
+    "idx": 8,
+    "proxy": "Community-level",
+    "group": "high",
+    "paper_slug": "iclr2022-uYLFoz1v-efficiently-modeling-long-sequences-with-structured-state-sp",
+    "paper_title": "Efficiently Modeling Long Sequences with Structured State Spaces",
+    "id": "progressive__gpt-5.5_1",
+    "title": "Abstract slightly blurs whether the original matrix or a corrected normal part is diagonalized",
+    "quote": "Our technique involves conditioning _A_ with a low-rank correction, allowing it to be diagonalized stably and reducing the SSM to the well-studied computation of a Cauchy kernel.",
+    "explanation": "The high-level idea is directionally correct: later sections show that the problematic HiPPO matrix can be represented as normal plus low-rank, the normal/skew-symmetric part can be unitarily diagonalized, and the low-rank term is handled through Woodbury, leading to Cauchy-kernel computations. However, the phrase can be read as saying that the original _A_ itself becomes stably diagonalizable after a low-rank correction. The more precise statement is that one diagonalizes the normal component obtained after separating out a low-rank term, not that the original non-normal HiPPO matrix is directly diagonalized in a well-conditioned basis.",
+    "comment_type": "technical",
+    "paragraph_index": 2,
+    "severity": "minor"
+  },
+  {
+    "idx": 9,
+    "proxy": "Community-level",
+    "group": "high",
+    "paper_slug": "iclr2022-uYLFoz1v-efficiently-modeling-long-sequences-with-structured-state-sp",
+    "paper_title": "Efficiently Modeling Long Sequences with Structured State Spaces",
+    "id": "progressive__gpt-5.5_7",
+    "title": "Sequential CIFAR gap to 2-D CNN appears overstated",
+    "quote": "Sequential CIFAR is a particularly challenging dataset where outside of SSMs, all sequence models have a gap of over 25% to a simple 2-D CNN.",
+    "explanation": "The table supports the qualitative point that sequential CIFAR is difficult for many sequence models, but the specific \u201cover 25%\u201d claim is hard to reconcile with the reported numbers. Table 5 shows several non-SSM sequence models around 73\u201374% on SCIFAR, such as TrellisNet at 73.42 and UR-GRU at 74.4. The paragraph gives 2-D ResNet18 results of 95.62% with augmentation and 89.46% without augmentation, implying gaps of about 21.2 and 15.1 percentage points respectively for the best listed non-SSM sequence model, not over 25 points. The claim is true for some weaker baselines, but not for all sequence models listed.",
+    "comment_type": "logical",
+    "paragraph_index": 82,
+    "severity": "minor"
+  },
+  {
+    "idx": 10,
+    "proxy": "Community-level",
+    "group": "high",
+    "paper_slug": "iclr2022-uYLFoz1v-efficiently-modeling-long-sequences-with-structured-state-sp",
+    "paper_title": "Efficiently Modeling Long Sequences with Structured State Spaces",
+    "id": "progressive__gpt-5.5_8",
+    "title": "Autoregressive inference-speed limitation is stated too broadly",
+    "quote": "A prominent limitation of autoregressive models is inference speed (e.g. generation), since they require a pass over the full context for every new sample.",
+    "explanation": "The authors are contrasting S4\u2019s recurrent/stateful generation mode with standard full-context sequence models, especially Transformers. However, as written, the statement is too broad for autoregressive models in general. RNNs and state-space models are autoregressive but do not require a pass over the full context; cached Transformer decoding avoids recomputing a full forward pass over all previous tokens, even though attention still scales with context length; and some convolutional autoregressive models can also use caching. The more precise claim is that many non-stateful autoregressive architectures, especially vanilla Transformers without an equivalent recurrent state, have generation costs that scale with context length.",
+    "comment_type": "logical",
+    "paragraph_index": 78,
+    "severity": "minor"
+  },
+  {
+    "idx": 11,
+    "proxy": "Conference-level",
+    "group": "low",
+    "paper_slug": "neurips2021-6EWOVxvJ-reinforcement-learning-for-instance-segmentation-with-high-l",
+    "paper_title": "Reinforcement learning for instance segmentation with high-level priors",
+    "id": "progressive__gpt-5.5_17",
+    "title": "Claim of no direct supervision is weakened by use of pretrained edge predictions",
+    "quote": "we showed in particular that our setup can segment microscopy images with no direct supervision other than high-level reasoning.",
+    "explanation": "The authors are careful throughout the paper to distinguish reward supervision from direct pixelwise ground truth, and in that sense the RL training itself appears to be driven by high-level rules. However, in the microscopy experiment the method also uses superpixels created from boundary/edge predictions, and Table 1 includes variants such as 'ours without edges,' described as removing 'the additional edge prediction as an input.' The baselines further identify these as boundary predictions from [47]. If those edge predictions come from a supervised pretrained boundary detector, then the complete segmentation pipeline is not using only high-level reasoning: it also depends on learned boundary information that encodes external direct supervision, even if not from the target validation set. The claim would be more accurate if qualified as 'no target-domain ground-truth supervision for training the RL agent,' or if the authors explicitly state that the edge predictor/superpixel generator is unsupervised or otherwise not trained with instance/boundary annotations.",
+    "comment_type": "logical",
+    "paragraph_index": 61,
+    "severity": "major"
+  },
+  {
+    "idx": 12,
+    "proxy": "Conference-level",
+    "group": "low",
+    "paper_slug": "neurips2021-6EWOVxvJ-reinforcement-learning-for-instance-segmentation-with-high-l",
+    "paper_title": "Reinforcement learning for instance segmentation with high-level priors",
+    "id": "progressive__gpt-5.5_5",
+    "title": "Claim that stateless RL was introduced in a recent GAN-related work is historically too strong",
+    "quote": "To the best of our knowledge, stateless RL was introduced in [7] to study the connection between 137 generative adversarial networks and actor critics and our method is one of the first practical applica138 tions of this concept.",
+    "explanation": "The authors are likely referring to a particular 'stateless actor-critic' formalization used to relate GANs and actor-critic methods, and in that narrow sense the citation may be relevant. Still, the wording 'stateless RL was introduced in [7]' is too broad. Single-state MDPs, multi-armed bandits, and contextual bandits are longstanding RL settings that predate GAN-related actor-critic interpretations by decades. The subsequent claim that this is one of the first practical applications of 'this concept' would be more defensible if explicitly limited to the specific stateless actor-critic/GAN-theoretic formulation of [7], rather than stateless RL generally.",
+    "comment_type": "logical",
+    "paragraph_index": 20,
+    "severity": "minor"
+  },
+  {
+    "idx": 13,
+    "proxy": "Conference-level",
+    "group": "low",
+    "paper_slug": "neurips2021-6EWOVxvJ-reinforcement-learning-for-instance-segmentation-with-high-l",
+    "paper_title": "Reinforcement learning for instance segmentation with high-level priors",
+    "id": "progressive__gpt-5.5_8",
+    "title": "Local reward decomposition and object-to-subgraph mapping are not sufficiently justified",
+    "quote": "Of course then requirement arises that the union of 173 local rewards must resemble to the global reward. E.g. the optimal policy is the same for local as for 174 the global reward.",
+    "explanation": "The paper appropriately notes that replacing a global reward by local sub-graph rewards is not innocuous: in general, reward decompositions can change the optimal policy unless they preserve the relevant objective. However, the proposed object-to-edge-to-subgraph mapping is then introduced as a heuristic without showing that this condition holds even approximately. The mapping assigns each superpixel its object reward, assigns an edge the maximum of its two incident superpixel rewards, and averages edge rewards over sub-graphs. This can alter credit assignment relative to the image-level or object-level prior: low-quality objects may be masked by adjacent high-scoring objects, many correct internal merge decisions inside a bad object may be penalized, and overlapping sub-graphs may weight some decisions more than others. The authors acknowledge noise and give plausible motivation, but important behavior remains underspecified, including whether boundary/cut and internal/merge edges are treated differently, whether rewards are normalized for sub-graph overlap, and how background objects interact with the rule.",
+    "comment_type": "logical",
+    "paragraph_index": 30,
+    "severity": "moderate"
+  },
+  {
+    "idx": 14,
+    "proxy": "Conference-level",
+    "group": "low",
+    "paper_slug": "neurips2021-6EWOVxvJ-reinforcement-learning-for-instance-segmentation-with-high-l",
+    "paper_title": "Reinforcement learning for instance segmentation with high-level priors",
+    "id": "progressive__gpt-5.5_2",
+    "title": "RL avoids differentiating through the pipeline, but does not make the pipeline differentiable end-to-end",
+    "quote": "Our main motivation to explore RL for the instance segmentation task is to circumvent the restriction 106 to differentiable losses and - regardless of the loss - to make the whole pipeline differentiable end-to107 end even in presence of non-differentiable steps which transform pixelwise CNN predictions into 108 individual instances.",
+    "explanation": "The intended point is valid: an actor-critic objective can train components upstream of a non-differentiable graph partitioning step and can use rewards that are themselves non-differentiable. However, the quoted sentence goes further and says the method makes the whole pipeline 'differentiable end-to-end.' In standard policy-gradient/actor-critic training, gradients are not propagated through the environment transition, graph partitioning, or reward computation; the actor is updated through likelihood-ratio or critic-based gradients. Thus the overall image-to-instance-to-reward process is trainable end-to-end in a loose sense, but it is not differentiable end-to-end. A safer formulation would be that the method enables end-to-end optimization despite non-differentiable components.",
+    "comment_type": "technical",
+    "paragraph_index": 15,
+    "severity": "moderate"
+  },
+  {
+    "idx": 15,
+    "proxy": "Conference-level",
+    "group": "low",
+    "paper_slug": "neurips2021-6EWOVxvJ-reinforcement-learning-for-instance-segmentation-with-high-l",
+    "paper_title": "Reinforcement learning for instance segmentation with high-level priors",
+    "id": "progressive__gpt-5.5_14",
+    "title": "Model selection appears to use the test set",
+    "quote": "For comparison we keep the model which achieved the highest reward on the test set. This makes training as well as the validation independent from ground truth annotations.",
+    "explanation": "The intended point may be that no ground-truth labels are used for checkpoint selection, only the rule-based reward. That is important for the paper's weak/no-supervision claim. However, the phrase 'highest reward on the test set' is problematic. Standard experimental practice is to choose hyperparameters/checkpoints on a training or validation set and reserve the test set for final unbiased evaluation. The surrounding text then discusses 'validation scores' and reward evolution 'on the validation set,' so it is possible the authors meant validation set rather than test set. If so, this should be corrected. As written it suggests test-set model selection; even without ground truth, selecting the checkpoint by test-set reward can bias reported performance toward the held-out images.",
+    "comment_type": "logical",
+    "paragraph_index": 57,
+    "severity": "moderate"
+  },
+  {
+    "idx": 16,
+    "proxy": "Conference-level",
+    "group": "high",
+    "paper_slug": "neurips2021--JJy-Hw8-viser-video-specific-surface-embeddings-for-articulated-3d-s",
+    "paper_title": "ViSER: Video-Specific Surface Embeddings for Articulated 3D Shape Reconstruction",
+    "id": "progressive__gpt-5.5_11",
+    "title": "Part discovery claim is mostly qualitative and lacks the segmentation procedure here",
+    "quote": "ViSER can discover detailed 3D part segmentation without any manual annotation, as shown in Fig. 6.",
+    "explanation": "The claim is plausible because the model learns bones and skinning weights, but the paper should specify how a part segmentation is extracted from the learned representation. Choices such as the number of bones, whether vertices are assigned by maximum skinning weight, whether clustering is used, and whether small components are merged can materially affect the visualization. Without an extraction rule or objective validation, the statement is mainly qualitative.",
+    "comment_type": "logical",
+    "paragraph_index": 68,
+    "severity": "minor"
+  },
+  {
+    "idx": 17,
+    "proxy": "Conference-level",
+    "group": "high",
+    "paper_slug": "neurips2021--JJy-Hw8-viser-video-specific-surface-embeddings-for-articulated-3d-s",
+    "paper_title": "ViSER: Video-Specific Surface Embeddings for Articulated 3D Shape Reconstruction",
+    "id": "progressive__gpt-5.5_0",
+    "title": "Input and supervision claims need to distinguish RGB video from external masks/annotations",
+    "quote": "We show that none of these are required if one can reliably estimate long-range correspondences in a video, making use of only 2D object masks and two-frame optical flow as inputs. / ViSER requires neither a template shape nor annotations to work on categories in the wild.",
+    "explanation": "These claims are easy to overread. The method formulation later includes RGB pixel color, segmentation masks, and optical flow as observations; the pixel embedding network takes the image as input, and reconstruction losses include texture/perceptual image terms. Thus the method is not literally using only masks and flow: it also uses monocular RGB video. Separately, saying the method requires no annotations is ambiguous because segmentation masks are required inputs and may themselves be annotations in the ordinary dataset sense. The underlying point is valid if the authors mean that no template shape, category-specific 3D prior, keypoint labels, or image-to-surface correspondence annotations are required beyond RGB video, masks, and flow. The paper should state that distinction explicitly.",
+    "comment_type": "technical",
+    "paragraph_index": 1,
+    "severity": "moderate"
+  },
+  {
+    "idx": 18,
+    "proxy": "Conference-level",
+    "group": "high",
+    "paper_slug": "neurips2021--JJy-Hw8-viser-video-specific-surface-embeddings-for-articulated-3d-s",
+    "paper_title": "ViSER: Video-Specific Surface Embeddings for Articulated 3D Shape Reconstruction",
+    "id": "progressive__gpt-5.5_10",
+    "title": "Multi-video training and the LASR comparison overstate the implications of optical-flow boundaries",
+    "quote": "We treat multiple videos as a single long video with strong appearance changes and shape variations. / Note that LASR cannot handle multiple videos as it requires optical flow computed between every adjacent frame pairs.",
+    "explanation": "If unrelated videos were literally concatenated, the boundary between videos would create artificial adjacent frame pairs with meaningless optical flow and no temporal continuity. The intended implementation presumably skips flow/reconstruction terms at video boundaries and treats the data as disconnected sequences sharing some parameters, which is materially different from a single long video and should be stated. For the same reason, the claim that LASR cannot handle multiple videos merely because it requires adjacent-frame flow is too categorical: flow boundary pairs could be omitted or handled with separate per-video objectives. The more substantive limitation is likely that LASR lacks ViSER\u2019s mechanism for enforcing shared long-range/cross-video canonical correspondences.",
+    "comment_type": "technical",
+    "paragraph_index": 65,
+    "severity": "moderate"
+  },
+  {
+    "idx": 19,
+    "proxy": "Conference-level",
+    "group": "high",
+    "paper_slug": "neurips2021--JJy-Hw8-viser-video-specific-surface-embeddings-for-articulated-3d-s",
+    "paper_title": "ViSER: Video-Specific Surface Embeddings for Articulated 3D Shape Reconstruction",
+    "id": "progressive__gpt-5.5_1",
+    "title": "Ambiguous reuse of S for masks, shape, and surface correspondences",
+    "quote": "Given a set of video observations including RGB pixel color, segmentation masks, and optical flow estimates _{It, St, ut}t_ = _{_ 0 _,...,T }_ , our goal is to recover a set of shape and motion parameters _{_ **S** _,_ **D** _t}_ that produce reconstructions _{I_[\u02c7] _t, S_[\u02c7] _t,_ \u02c7 _ut}t_ = _{_ 0 _,...,T }_ that match the video observations.",
+    "explanation": "The notation overload is recoverable from context but unnecessarily confusing. S_t denotes segmentation masks, bold S denotes the recovered 3D shape, rendered masks are written as S-hat_t, and later matched surface locations are also written using S-hat[x,y]. A reader has to infer from subscript, boldface, and context whether S denotes a silhouette, a mesh, or a surface coordinate map. This is localized and fixable by using separate notation such as M_t for masks and a distinct calligraphic symbol for the mesh/surface.",
+    "comment_type": "technical",
+    "paragraph_index": 15,
+    "severity": "minor"
+  },
+  {
+    "idx": 20,
+    "proxy": "Conference-level",
+    "group": "high",
+    "paper_slug": "neurips2021--JJy-Hw8-viser-video-specific-surface-embeddings-for-articulated-3d-s",
+    "paper_title": "ViSER: Video-Specific Surface Embeddings for Articulated 3D Shape Reconstruction",
+    "id": "progressive__gpt-5.5_4",
+    "title": "Expectation-based correspondences are described more strongly than they justify",
+    "quote": "To output a single surface point for pixel ( _x, y_ ), we can compute a \u201csoft\u201d argmax [17, 48] by taking the expectation of the softmax distribution over the 3D locations of the points samples, / our pixel-surface embedding captures multimodal uncertainties over keypoints; for example, _\u03c3_ ( _x,y_ )[ _i_ ] can capture the fact that a particular pixel matches well to both the left and right ankle, / we introduce a 3D matching loss that ensures pixel embeddings only match to surface embeddings rendered at the pixel location:",
+    "explanation": "These passages share the same underlying issue: the distribution over surface samples may be multimodal, but the method then converts it to an expected 3D coordinate. That expectation is not guaranteed to lie on the mesh surface and, under a bimodal distribution, can land between plausible modes, reproducing the same mean-regression artifact the text contrasts against. Similarly, a loss on the expected coordinate can encourage mass near the rendered surface point, but it does not strictly ensure that the pixel embedding matches only that point; off-target modes can average to the correct coordinate. The paper should either qualify these claims or explain whether the full distribution is used, whether it is driven to unimodality, or whether an additional distribution-level loss is applied.",
+    "comment_type": "technical",
+    "paragraph_index": 26,
+    "severity": "moderate"
+  },
+  {
+    "idx": 21,
+    "proxy": "Reviewer-level",
+    "group": "low",
+    "paper_slug": "iclr2022-m5EBN92v-aaseg-attention-aware-network-for-real-time-semantic-segment",
+    "paper_title": "AASeg: Attention Aware Network for Real Time Semantic Segmentation",
+    "id": "progressive__gpt-5.5_10",
+    "title": "Cityscapes validation mIoU in ablation tables conflicts with the main result",
+    "quote": "|AASeg|512\u00d71024|no|74.8|74.4|202.7| ... Table 4: Ablation study on Upsampling operation in our network using Cityscapes validation set. ... |bilinear upsampling|79.2| ... Table 6 reports AASeg variants up to 80.2 mIoU.",
+    "explanation": "The main Cityscapes table reports AASeg validation mIoU as 74.8%, but later ablation tables on the Cityscapes validation set report values around 78.3\u201380.2%. This is too large to be explained as ordinary variation and reverses the relationship between the reported final model and its ablated variants. The discrepancy could be due to a different training schedule, crop/scale evaluation, validation protocol, class set, or stronger model variant, but none of that is stated. As written, it is unclear which model and protocol produced the headline Cityscapes numbers.",
+    "comment_type": "logical",
+    "paragraph_index": 52,
+    "severity": "major"
+  },
+  {
+    "idx": 22,
+    "proxy": "Reviewer-level",
+    "group": "low",
+    "paper_slug": "iclr2022-m5EBN92v-aaseg-attention-aware-network-for-real-time-semantic-segment",
+    "paper_title": "AASeg: Attention Aware Network for Real Time Semantic Segmentation",
+    "id": "progressive__gpt-5.5_12",
+    "title": "CamVid comparison table contains duplicated SFNet entries",
+    "quote": "|SFNet|720\u00d7960|DF2|70.4|134.1| |SFNet|720\u00d7960|ResNet-18|73.8|35.5| |SFNet|720\u00d7960|DF2|70.4|134.1| |SFNet|720\u00d7960|ResNet-18|73.8|35.5|",
+    "explanation": "The CamVid table repeats the same two SFNet rows. This is likely a table-preparation error and may not change the numerical conclusion, but duplicated baseline rows create confusion about which comparisons were actually included and suggest the table was not carefully checked.",
+    "comment_type": "logical",
+    "paragraph_index": 56,
+    "severity": "minor"
+  },
+  {
+    "idx": 23,
+    "proxy": "Reviewer-level",
+    "group": "low",
+    "paper_slug": "iclr2022-m5EBN92v-aaseg-attention-aware-network-for-real-time-semantic-segment",
+    "paper_title": "AASeg: Attention Aware Network for Real Time Semantic Segmentation",
+    "id": "progressive__gpt-5.5_0",
+    "title": "Ambiguous claim that the model has \u201cno backbone\u201d",
+    "quote": "Our network dosen\u2019t use any backbone to extract features from the input image unlike many previous architectures. The input image is first passed through a block comprising of convolutional, batch normalization and ReLU activation function.",
+    "explanation": "The likely intent is that AASeg does not use a standard pretrained classification backbone such as ResNet, Xception, or DFNet. That would be a meaningful point for real-time segmentation and is consistent with later comparison tables marking AASeg as having backbone \u201cno.\u201d However, taken literally, the statement is misleading because the described convolutional stages are themselves a feature extractor. The paper should clarify whether \u201cno backbone\u201d means no ImageNet-pretrained backbone, no external named backbone, or no separate encoder. This matters for interpreting speed/accuracy comparisons and pretraining assumptions.",
+    "comment_type": "logical",
+    "paragraph_index": 23,
+    "severity": "moderate"
+  },
+  {
+    "idx": 24,
+    "proxy": "Reviewer-level",
+    "group": "low",
+    "paper_slug": "iclr2022-m5EBN92v-aaseg-attention-aware-network-for-real-time-semantic-segment",
+    "paper_title": "AASeg: Attention Aware Network for Real Time Semantic Segmentation",
+    "id": "progressive__gpt-5.5_4",
+    "title": "Related-work description appears to attribute channel-selection behavior to SqueezeNet",
+    "quote": "(Iandola et al., 2016) allows the neural network to find the critical channels of the feature map and select the most suitable channels by itself.",
+    "explanation": "Iandola et al. (2016) is SqueezeNet, whose main contribution is parameter reduction using Fire modules with squeeze and expand convolutions. It is not normally described as learning channel attention or selecting critical feature-map channels by itself. That description is closer to Squeeze-and-Excitation Networks, which are cited separately later. This should be corrected or rephrased to avoid confusing the lineage of the proposed channel-attention component.",
+    "comment_type": "technical",
+    "paragraph_index": 5,
+    "severity": "minor"
+  },
+  {
+    "idx": 25,
+    "proxy": "Reviewer-level",
+    "group": "low",
+    "paper_slug": "iclr2022-m5EBN92v-aaseg-attention-aware-network-for-real-time-semantic-segment",
+    "paper_title": "AASeg: Attention Aware Network for Real Time Semantic Segmentation",
+    "id": "progressive__gpt-5.5_8",
+    "title": "Validation and test training protocol for Cityscapes is ambiguous",
+    "quote": "We present the segmentation accuracy and inference speed of our proposed method on Cityscapes validation and test set in Table 1. We use the training set and validation set to train our models before submitting to Cityscapes online server.",
+    "explanation": "Training on train+val before submitting to the hidden Cityscapes test server is standard. However, the paper presents validation and test results together without clearly stating whether the validation mIoU was obtained from a model trained only on the training split, while the test result came from a train+val model. If the validation set was included in training for the model used to report validation mIoU, the validation result would not be a held-out estimate. The evaluation protocol should be made explicit.",
+    "comment_type": "logical",
+    "paragraph_index": 49,
+    "severity": "moderate"
+  },
+  {
+    "idx": 26,
+    "proxy": "Reviewer-level",
+    "group": "high",
+    "paper_slug": "iclr2022-hR_SMu8c-scaling-laws-for-neural-machine-translation",
+    "paper_title": "Scaling Laws for Neural Machine Translation",
+    "id": "progressive__gpt-5.5_4",
+    "title": "Optimal-allocation claims conflict with the stated additive scaling law",
+    "quote": "**Proposition 1** (Optimal Scaling) ... the optimal encoder / decoder sizes ... and ... the scaling law reduces to L\u0302_opt(B) = \u03b1* B^\u2212(pd+pe) + L\u221e ... Inspection of the functional form of Eq. (1) suggests that as long as Nd/Ne is fixed as the model scales ... the optimal scaling exponent, (pe + pd), can ...",
+    "explanation": "The stated optimal allocation and resulting exponent pe + pd would follow naturally from a multiplicative law such as \u03b1 Ne^(-pe) Nd^(-pd) + L\u221e. However, Eq. (1) is described as an additive law, \u03b1(Ne^(-pe) + Nd^(-pd)) + L\u221e. Under that additive law, minimizing subject to Ne + Nd = B gives the first-order condition pe Ne^(-pe-1) = pd Nd^(-pd-1), so the optimal ratio generally depends on B when pe \u2260 pd. Likewise, proportional scaling gives terms proportional to B^(-pe) and B^(-pd), with asymptotic behavior governed by the slower exponent, not B^(-(pe+pd)). Either the scaling law should be multiplicative, or the proposition and surrounding interpretation need a different derivation.",
+    "comment_type": "technical",
+    "paragraph_index": 37,
+    "severity": "major"
+  },
+  {
+    "idx": 27,
+    "proxy": "Reviewer-level",
+    "group": "high",
+    "paper_slug": "iclr2022-hR_SMu8c-scaling-laws-for-neural-machine-translation",
+    "paper_title": "Scaling Laws for Neural Machine Translation",
+    "id": "progressive__gpt-5.5_0",
+    "title": "Prior scaling-law literature is summarized as more univariate than it generally is",
+    "quote": "For many of these tasks the scaling behavior of neural networks is highly predictable; model fit or test loss can be described precisely as a function of its number of parameters (Hestness et al., 2017; Kaplan et al., 2020; Henighan et al., 2020; Hernandez et al., 2021; Rosenfeld et al., 2019).",
+    "explanation": "The cited scaling-law literature often models loss as depending on dataset size and/or compute budget in addition to parameter count, with univariate parameter-count laws applying only under specific fixed-data or non-limiting-data assumptions. Kaplan et al., for example, explicitly study model size, dataset size, and compute. Thus the statement that loss can be described precisely as a function of parameter count alone slightly overstates the general prior result and makes the paper's contrast with NMT appear sharper than warranted.",
+    "comment_type": "logical",
+    "paragraph_index": 2,
+    "severity": "minor"
+  },
+  {
+    "idx": 28,
+    "proxy": "Reviewer-level",
+    "group": "high",
+    "paper_slug": "iclr2022-hR_SMu8c-scaling-laws-for-neural-machine-translation",
+    "paper_title": "Scaling Laws for Neural Machine Translation",
+    "id": "progressive__gpt-5.5_7",
+    "title": "Claims that source-original reducible loss reaches zero or yields no further benefit overstate the fitted evidence",
+    "quote": "Reducible loss quickly decays to zero for source original test sets. ... Hence, beyond a few hundred million parameters, there is no benefit in increasing the model size. ... the reducible error on source-original evaluation sets quickly saturates to 0.",
+    "explanation": "The evidence appears to show that source-original evaluation losses rapidly approach a fitted irreducible-loss floor and that the estimated reducible component becomes small. Under the fitted power law, however, the reducible term approaches zero asymptotically rather than literally reaching zero, and L\u221e itself is an estimated quantity. The stronger claim that there is 'no benefit' from larger models also goes beyond cross-entropy on the studied source-original sets and may not cover generation quality, robustness, rare phenomena, calibration, or other domains. A more precise statement would be that further scaling gives little additional fitted cross-entropy improvement on these source-original test sets beyond the relevant size range.",
+    "comment_type": "logical",
+    "paragraph_index": 47,
+    "severity": "moderate"
+  },
+  {
+    "idx": 29,
+    "proxy": "Reviewer-level",
+    "group": "high",
+    "paper_slug": "iclr2022-hR_SMu8c-scaling-laws-for-neural-machine-translation",
+    "paper_title": "Scaling Laws for Neural Machine Translation",
+    "id": "progressive__gpt-5.5_9",
+    "title": "Back-translation capacity requirement is stated more strongly than the evidence supports",
+    "quote": "This assertion suggests that in order for back-translation to be beneficial for training large models, it has to be performed with a models with comparable capacity or higher.",
+    "explanation": "The preceding discussion frames the capacity-threshold explanation as a hypothesis, but this sentence turns it into a practical requirement. From the described experiment, teacher capacity is confounded with teacher quality, decoding settings, domain mismatch, synthetic artifacts, and noise patterns, and the setup appears to use only one back-translation model. The data support a conjecture worth testing, not a general necessity claim that back-translation must use a comparable-or-larger model.",
+    "comment_type": "logical",
+    "paragraph_index": 52,
+    "severity": "moderate"
+  },
+  {
+    "idx": 30,
+    "proxy": "Reviewer-level",
+    "group": "high",
+    "paper_slug": "iclr2022-hR_SMu8c-scaling-laws-for-neural-machine-translation",
+    "paper_title": "Scaling Laws for Neural Machine Translation",
+    "id": "progressive__gpt-5.5_5",
+    "title": "Language-pair generalization claim is stronger than the evidence described",
+    "quote": "To ensure that our results generalize across different language pairs, we examine the fit of our scaling law on encoder / decoder scaling models trained on German _\u2192_ English (De _\u2192_ En), Chinese-to-English (Zh _\u2192_ En), and English-to-Chinese (En _\u2192_ Zh) translation tasks.",
+    "explanation": "Fitting the proposed functional form separately on several additional language directions supports the claim that the form can describe multiple language pairs. It does not, by itself, 'ensure' generalization in a stronger out-of-sample sense, such as transferring fitted parameters or predicting unseen language pairs. A phrase such as 'test whether' or 'provide evidence that' would better match the described experiment.",
+    "comment_type": "logical",
+    "paragraph_index": 29,
+    "severity": "minor"
+  },
+  {
+    "idx": 31,
+    "proxy": "Composite",
+    "group": "low",
+    "paper_slug": "iclr2022-UGINpaIC-neural-networks-with-trainable-matrix-activation-functions",
+    "paper_title": "Neural networks with trainable matrix activation functions",
+    "id": "progressive__gpt-5.5_7",
+    "title": "Extension to arbitrary activations is not generally a matrix-vector activation in the same sense",
+    "quote": "_Our observation also applies to an activation function \u03c3 other than ReLU. For example, we may rescale \u03c3_ ( _x_ ) _to obtain \u03c3_ ( _\u03c9i,\u2113x_ ) _using a set of constants {\u03c9i,\u2113}_ 1 _\u2264i\u2264n\u2113,_ 1 _\u2264\u2113\u2264L varying layer by layer and neuron by neuron.",
+    "explanation": "The intended idea seems to be a trainable per-neuron rescaling of a standard activation, which is a reasonable adaptive-activation construction. However, the earlier matrix-activation observation for ReLU works because \\(\\operatorname{ReLU}(t)=d(t)t\\) with \\(d(t)\\in\\{0,1\\}\\). For a general activation \\(\\sigma\\), representing \\(\\sigma(\\omega t)\\) as a diagonal matrix times the input would require a diagonal entry \\(\\sigma(\\omega t)/t\\), which may be undefined at \\(t=0\\) and cannot represent activations with \\(\\sigma(0)\\ne 0\\) in the form \\(D(t)t\\). Many common activations, such as Softplus or sigmoid-type functions, do not naturally fit this without extra qualifications. If the authors only mean ordinary componentwise scaled activations, then it is not the same matrix-vector observation unless the necessary conditions are stated.",
+    "comment_type": "technical",
+    "paragraph_index": 19,
+    "severity": "minor"
+  },
+  {
+    "idx": 32,
+    "proxy": "Composite",
+    "group": "low",
+    "paper_slug": "iclr2022-UGINpaIC-neural-networks-with-trainable-matrix-activation-functions",
+    "paper_title": "Neural networks with trainable matrix activation functions",
+    "id": "progressive__gpt-5.5_12",
+    "title": "Claim that ReLU networks clearly need many more neurons is stronger than the evidence shown",
+    "quote": "To capture the high frequency, ReLU-type neural networks are clearly required to use much more neurons, introducing significantly amount of weight and bias parameters.",
+    "explanation": "The empirical observation is plausible: piecewise-linear ReLU networks often need many linear regions to represent high-frequency oscillations accurately, and the shown ReLU/Para-ReLU runs fail on this target. However, the context does not provide a theoretical lower bound or a systematic width/depth study. The claim is based on one architecture and training setup, so the failure could also reflect optimization, initialization, learning-rate schedule, depth/width choice, or insufficient training rather than an inherent requirement for much more neurons. The sentence should be softened or supported by an ablation over width/depth or a theoretical argument.",
+    "comment_type": "logical",
+    "paragraph_index": 34,
+    "severity": "moderate"
+  },
+  {
+    "idx": 33,
+    "proxy": "Composite",
+    "group": "low",
+    "paper_slug": "iclr2022-UGINpaIC-neural-networks-with-trainable-matrix-activation-functions",
+    "paper_title": "Neural networks with trainable matrix activation functions",
+    "id": "progressive__gpt-5.5_3",
+    "title": "Hypothesis-class inclusion does not imply practical performance is \u201cclearly not worse\u201d",
+    "quote": "Since ReLU and Leaky ReLU are included by our DNN as special cases, the proposed DNN is clearly not worse than the traditional ones in practice.",
+    "explanation": "At the level of representable functions, a parameterized family containing ReLU and Leaky ReLU has at least the same best-case approximation capacity, assuming the architecture and objective are otherwise comparable. What does not follow is the practical claim. Neural-network training is nonconvex, and adding trainable activation parameters can change optimization dynamics, initialization sensitivity, regularization behavior, and effective learning rates. Therefore, inclusion of ReLU/Leaky ReLU as special cases only supports a statement about the optimal attainable loss over the enlarged function class, not that the trained model will be no worse in practice.",
+    "comment_type": "logical",
+    "paragraph_index": 12,
+    "severity": "moderate"
+  },
+  {
+    "idx": 34,
+    "proxy": "Composite",
+    "group": "low",
+    "paper_slug": "iclr2022-UGINpaIC-neural-networks-with-trainable-matrix-activation-functions",
+    "paper_title": "Neural networks with trainable matrix activation functions",
+    "id": "progressive__gpt-5.5_15",
+    "title": "Claim that the ResNet parameters are already ReLU-tuned is not sufficiently supported",
+    "quote": "Those parameters given in (Paszke et al., 2019) are already tuned well with respect to ReLU.",
+    "explanation": "The authors appear to argue that the comparison is not biased in favor of TMAF because the baseline architecture/hyperparameters are standard for ReLU. However, Paszke et al. (2019) is the PyTorch systems paper, not a canonical source of carefully tuned CIFAR-10/CIFAR-100 ResNet hyperparameters. The reported CIFAR-10 accuracies around 77\u201380% are also well below commonly reported tuned ResNet results on CIFAR-10, suggesting that the training setup may not be strongly optimized. Using the same optimizer and learning rate for all activations is fair in one sense, but it does not establish that the ReLU baseline is tuned well. This weakens the interpretation of the reported TMAF advantage unless the exact recipe, data augmentation, normalization, epoch count, and baseline tuning procedure are provided or a more appropriate citation is used.",
+    "comment_type": "logical",
+    "paragraph_index": 42,
+    "severity": "moderate"
+  },
+  {
+    "idx": 35,
+    "proxy": "Composite",
+    "group": "low",
+    "paper_slug": "iclr2022-UGINpaIC-neural-networks-with-trainable-matrix-activation-functions",
+    "paper_title": "Neural networks with trainable matrix activation functions",
+    "id": "progressive__gpt-5.5_9",
+    "title": "Gaussian equal-probability interval claim is numerically inaccurate as stated",
+    "quote": "For TMAF _D\u2113_ in equation 5, the function _\u03b1\u2113_ uses intervals ( _\u2212\u221e, \u2212_ 1 _._ 4), ( _\u2212_ 1 _._ 4 _, \u2212_ 0 _._ 92], ( _\u2212_ 0 _._ 92 _, \u2212_ 0 _._ 56], ( _\u2212_ 0 _._ 56 _, \u2212_ 0 _._ 26], ( _\u2212_ 0 _._ 26 _,_ 0], (0 _,_ 0 _._ 26], (0 _._ 26 _,_ 0 _._ 56], (0 _._ 56 _,_ 0 _._ 92], (0 _._ 92 _,_ 1 _._ 4], (1 _._ 4 _, \u221e_ ) such that probability over each of the ten intervals is 0.1 with respect to Gaussian distribution.",
+    "explanation": "The intent is clear: choose grid intervals roughly matched to Gaussian quantiles, likely because BatchNorm makes preactivations approximately standardized. But the listed cut points are not the standard normal deciles. Equal 0.1-probability bins would use approximate cutoffs \\(\\pm1.2816\\), \\(\\pm0.8416\\), \\(\\pm0.5244\\), \\(\\pm0.2533\\), and 0. The provided values are close in the middle but noticeably off in the tails: for example, \\(P(Z>1.4)\\approx0.081\\), not 0.1. If these are only heuristic rounded bins, the prose should say approximately; if exact equal-mass Gaussian bins are intended, the cutoffs should be corrected.",
+    "comment_type": "technical",
+    "paragraph_index": 24,
+    "severity": "minor"
+  },
+  {
+    "idx": 36,
+    "proxy": "Composite",
+    "group": "high",
+    "paper_slug": "neurips2021-GlEWs-V9-volume-rendering-of-neural-implicit-surfaces",
+    "paper_title": "Volume Rendering of Neural Implicit Surfaces",
+    "id": "progressive__gpt-5.5_11",
+    "title": "BlendedMVS comparison alternates between NeRF++ and NeRF",
+    "quote": "Therefore we use NeRF++ [39] as a baseline for this dataset. In Table 2 we present our results compared to NeRF++. Qualitative comparisons are presented in Fig. 5; since the units are unknown in this case we present relative improvement of Chamfer distance (in %) compared to NeRF.",
+    "explanation": "The setup identifies NeRF++ as the relevant BlendedMVS baseline, which is sensible for scenes with complex backgrounds. The final sentence then says the relative Chamfer improvement is compared to NeRF. This may be a typographical omission of \u201c++\u201d, but as written the baseline for the percentage improvements is ambiguous, especially because the table reports only relative percentages rather than absolute Chamfer distances.",
+    "comment_type": "logical",
+    "paragraph_index": 55,
+    "severity": "minor"
+  },
+  {
+    "idx": 37,
+    "proxy": "Composite",
+    "group": "high",
+    "paper_slug": "neurips2021-GlEWs-V9-volume-rendering-of-neural-implicit-surfaces",
+    "paper_title": "Volume Rendering of Neural Implicit Surfaces",
+    "id": "progressive__gpt-5.5_7",
+    "title": "The \u03b2+ sampling guarantee may apply to a modified density rather than the current model density",
+    "quote": "Either way, we use the final T and \u03b2+ (guaranteed to provide BT,\u03b2+ \u2264 \u03f5) to estimate the current opacity O, Line 10 in Algorithm 1).",
+    "explanation": "If the adaptive procedure terminates with \u03b2+ equal to the current \u03b2, the stated bound applies to the current density. But the text allows the algorithm to stop at max_iter with \u03b2+ > \u03b2. In that case, the guarantee BT,\u03b2+ \u2264 \u03b5 controls the rectangle-rule error for the smoother density parameterized by \u03b2+, not necessarily for the actual current density using \u03b2. The prose nevertheless says the algorithm estimates the current opacity. Unless rendering deliberately uses \u03b2+ and distinguishes this from the model density, the guarantee is overstated or ambiguous.",
+    "comment_type": "technical",
+    "paragraph_index": 45,
+    "severity": "moderate"
+  },
+  {
+    "idx": 38,
+    "proxy": "Composite",
+    "group": "high",
+    "paper_slug": "neurips2021-GlEWs-V9-volume-rendering-of-neural-implicit-surfaces",
+    "paper_title": "Volume Rendering of Neural Implicit Surfaces",
+    "id": "progressive__gpt-5.5_2",
+    "title": "Volume rendering is described as avoiding extraneous surface-growth artifacts too broadly",
+    "quote": "Also, learning to render surfaces directly tends to grow extraneous parts due to optimization problems, which are avoided by volume rendering.",
+    "explanation": "Volume rendering can mitigate some optimization difficulties of direct surface rendering because it integrates along rays rather than relying on hard surface intersections. But the statement is too categorical: generic neural volume-rendering methods can still produce floaters, spurious semi-transparent structures, noisy density, and other geometry artifacts. A more precise claim would be that volume rendering alleviates some surface-rendering optimization artifacts, not that it avoids extraneous geometry in general.",
+    "comment_type": "logical",
+    "paragraph_index": 5,
+    "severity": "minor"
+  },
+  {
+    "idx": 39,
+    "proxy": "Composite",
+    "group": "high",
+    "paper_slug": "neurips2021-GlEWs-V9-volume-rendering-of-neural-implicit-surfaces",
+    "paper_title": "Volume Rendering of Neural Implicit Surfaces",
+    "id": "progressive__gpt-5.5_6",
+    "title": "The sampling guarantee is conditional on an exact SDF, while the learned network is only regularized toward one",
+    "quote": "The proof of this theorem, which is provided in the supplementary, makes a principled use of the signed distance function\u2019s unique properties;",
+    "explanation": "The theorem is well motivated for a true signed distance function, whose geometric and Lipschitz properties can be used to bound density variation along a ray. In the implemented model, however, the SDF is represented by an MLP and encouraged with an Eikonal loss; this does not guarantee that the learned field is an exact SDF everywhere or that it satisfies the exact geometric assumptions used in the proof. The guarantee should therefore be read as applying to the idealized SDF model, while the learned model relies on regularization and empirical behavior.",
+    "comment_type": "logical",
+    "paragraph_index": 31,
+    "severity": "moderate"
+  },
+  {
+    "idx": 40,
+    "proxy": "Composite",
+    "group": "high",
+    "paper_slug": "neurips2021-GlEWs-V9-volume-rendering-of-neural-implicit-surfaces",
+    "paper_title": "Volume Rendering of Neural Implicit Surfaces",
+    "id": "progressive__gpt-5.5_8",
+    "title": "The \u03b1 and \u03b2 parameterization is inconsistent about positivity, learnability, and the bound notation",
+    "quote": "Taking the maximum over all intervals furnishes a bound BT,\u03b2 as a function of T and \u03b2 ... / In addition we have two scalar learnable parameters \u03b1, \u03b2 \u2208 R. In fact, in our implementation we make the choice \u03b1 = \u03b2\u22121. We denote by \u03b8 ... \u03b8 = (\u03c6, \u03c8, \u03b2).",
+    "explanation": "The density definition and opacity-error bounds require \u03b1 and \u03b2 to be positive. As written, saying \u03b1,\u03b2 \u2208 R would allow \u03b2 \u2264 0, for which the Laplace scale, \u03b1 = \u03b2\u22121, and the sampling bounds are not well defined. There is also an internal ambiguity: the text first presents \u03b1 and \u03b2 as two learnable scalars, then ties \u03b1 deterministically to \u03b2 and excludes \u03b1 from \u03b8. This also affects the notation BT,\u03b2: if \u03b1 is independent, the opacity-error bound depends on \u03b1 as well as \u03b2; if \u03b1 = \u03b2\u22121, that dependence should be stated before suppressing \u03b1 from the notation.",
+    "comment_type": "technical",
+    "paragraph_index": 48,
+    "severity": "moderate"
+  }
+]
\ No newline at end of file
diff --git a/benchmarks/conference_study/severity_validation/samples.md b/benchmarks/conference_study/severity_validation/samples.md
new file mode 100644
index 0000000..8a4b835
--- /dev/null
+++ b/benchmarks/conference_study/severity_validation/samples.md
@@ -0,0 +1,373 @@
+# Comment severity validation sample
+
+40 OpenAIReview (GPT-5.5) comments: 1 low + 1 high paper per quality proxy, 5 comments each (spread across severity tiers where available).
+
+For each comment, the model's **LLM severity** is shown; mark **your severity** and whether it's substantive **signal** or **cosmetic**.
+
+---
+
+## Community-level proxy
+
+### WEAK / low-quality — FTSO: Effective NAS via First Topology Second Operator
+
+`iclr2021-7Z29QbHx-ftso-effective-nas-via-first-topology-second-operator` · decision: Reject · review score avg: 4.0 · cites/yr: 1.8
+
+**1. [MINOR] Pearson correlation claims need methodological qualification**  *(type: technical, ¶45)*
+
+> Quote: they still obey a positive correlation, with a Pearson correlation coefficient of 0.77 ... generalize better on ImageNet, with the correlation coefficient of 0.7
+> Pearson correlation is a reasonable way to summarize alignment between evaluation protocols, but the conclusions depend on details such as sample size, whether points are independent architectures, whether results are averaged over seeds, whether accuracy or error is used, and sensitivity to outliers. The reported coefficients may support a positive association, but the paper should provide these details to make the correlation and CIFAR-to-ImageNet generalization claim reproducible and interpretable.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**2. [MODERATE] Skip-only topology search is inconsistent or underspecified about weights and reduction-cell costs**  *(type: technical, ¶16)*
+
+> Quote: Because the skip connection operator contains no kernel weights, we only need to optimize the architecture parameters β_i,j ... / Assign each edge e_i,j a skip connection operator o_i,j with kernel weights w_i,j ... Update weights w by descending ∇_w L_train(w, β).
+> The prose says skip connections have no kernel weights and only β is optimized, but Algorithm 1 assigns kernel weights to skip operators and updates w. If w is empty, the update is vacuous and should be removed or explicitly marked as such; if w is nonempty, the zero-kernel-weight claim is false. The zero-parameter/zero-FLOP skip assumption is also underspecified for DARTS-style reduction cells, where skip connections on stride-2 edges are often implemented with factorized reduction or projection to match spatial resolution and channels. The method should define exactly how skip edges are implemented in normal and reduction cells.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**3. [MODERATE] Description of the NATS-Bench cell topology seems inconsistent with the standard benchmark**  *(type: technical, ¶37)*
+
+> Quote: In the search space of NATS-Bench, there are one input node, three intermediate nodes and one output node, and each intermediate node connects to all its predecessors.
+> The standard NAS-Bench-201/NATS-Bench topology space is usually described as a 4-node cell with 6 directed edges: one input node, two intermediate nodes, and one output node, with each non-input node connected to all previous nodes. The quoted description appears to describe five nodes unless the authors are using a different indexing convention. Because the node and edge counts determine the search space and the adaptation of FTSO, the topology description should be clarified.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**4. [MODERATE] The claim that a generalizing subgraph implies supernet overfitting is not justified**  *(type: logical, ¶17)*
+
+> Quote: If the sub-graph can generalize perfectly on the testing set, the super-net must over-fit.
+> A subgraph generalizing well does not imply that the enclosing supernet must overfit. Both could generalize, or the supernet could perform worse because of weight-sharing interference, optimization mismatch, or discretization bias rather than classical overfitting. Since this implication is used to motivate direct replacement over gradient-based operator search, it should be phrased as a possible explanation rather than a necessary conclusion.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**5. [MODERATE] Topology-search forward pass appears to omit edge-weight normalization**  *(type: technical, ¶21)*
+
+> Quote: Forward-propagate following n_j = ∑_{i<j} o(n_i) β_i,j;
+> The surrounding text says the method inherits PC-DARTS-style differentiable topology weighting, where incoming edge weights are typically normalized by a softmax. The pseudocode instead appears to use raw β_i,j in a weighted sum. If the intended computation is softmax-normalized over incoming edges, the algorithm is missing an important normalization step. If raw β values are intended, that materially changes the optimization and the interpretation of selecting the largest β values. The forward computation should specify whether and how β is constrained or normalized.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+### STRONG / high-quality — Efficiently Modeling Long Sequences with Structured State Spaces
+
+`iclr2022-uYLFoz1v-efficiently-modeling-long-sequences-with-structured-state-sp` · decision: Oral · review score avg: 8.0 · cites/yr: 370.75
+
+**6. [MINOR] HiPPO memorization claim attributes too much to the state matrix alone**  *(type: technical, ¶23)*
+
+> Quote: HiPPO specifies a class of certain matrices **A** ∈ R^{N×N} that when incorporated into (1), allows the state x(t) to memorize the history of the input u(t).
+> The sentence summarizes the motivation for choosing HiPPO-derived state matrices, but read literally it overstates what the matrix A alone guarantees. In the HiPPO construction, the memory property depends on the full continuous-time dynamics, including the corresponding input operator/vector B and the projection measure/basis. For finite N, the state stores coefficients of an approximation or projection of the input history rather than an exact arbitrary history. A reader could incorrectly infer that merely substituting any HiPPO A into the SSM, with arbitrary B and learned C, is sufficient to obtain the formal memorization guarantee.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**7. [MODERATE] Rank dependence is hidden in the “4 Cauchy multiplies” complexity statement**  *(type: technical, ¶50)*
+
+> Quote: **Theorem 3** (S4 Convolution). Given any step size ∆, computing the SSM convolution filter **K** can be reduced to 4 Cauchy multiplies, requiring only Õ(N + L) operations and O(N + L) space.
+> For the rank-one DPLR case used by the S4 parameterization, the “4 Cauchy multiplies” statement is plausible. However, the surrounding discussion says the techniques apply to any NPLR matrix, and Theorem 1 allows low-rank factors P,Q ∈ R^{N×r}, with some HiPPO matrices having r=2. For general rank r, the Woodbury step involves an r×r correction at each frequency and more than four Cauchy-type evaluations, with complexity depending on r. The asymptotic Õ(N+L) claim remains essentially right for fixed small r, but the unqualified “4 Cauchy multiplies” can mislead readers about the general NPLR case.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**8. [MINOR] Abstract slightly blurs whether the original matrix or a corrected normal part is diagonalized**  *(type: technical, ¶2)*
+
+> Quote: Our technique involves conditioning *A* with a low-rank correction, allowing it to be diagonalized stably and reducing the SSM to the well-studied computation of a Cauchy kernel.
+> The high-level idea is directionally correct: later sections show that the problematic HiPPO matrix can be represented as normal plus low-rank, the normal/skew-symmetric part can be unitarily diagonalized, and the low-rank term is handled through Woodbury, leading to Cauchy-kernel computations. However, the phrase can be read as saying that the original *A* itself becomes stably diagonalizable after a low-rank correction. The more precise statement is that one diagonalizes the normal component obtained after separating out a low-rank term, not that the original non-normal HiPPO matrix is directly diagonalized in a well-conditioned basis.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**9. [MINOR] Sequential CIFAR gap to 2-D CNN appears overstated**  *(type: logical, ¶82)*
+
+> Quote: Sequential CIFAR is a particularly challenging dataset where outside of SSMs, all sequence models have a gap of over 25% to a simple 2-D CNN.
+> The table supports the qualitative point that sequential CIFAR is difficult for many sequence models, but the specific “over 25%” claim is hard to reconcile with the reported numbers. Table 5 shows several non-SSM sequence models around 73–74% on SCIFAR, such as TrellisNet at 73.42 and UR-GRU at 74.4. The paragraph gives 2-D ResNet18 results of 95.62% with augmentation and 89.46% without augmentation, implying gaps of about 21.2 and 15.1 percentage points respectively for the best listed non-SSM sequence model, not over 25 points. The claim is true for some weaker baselines, but not for all sequence models listed.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**10. [MINOR] Autoregressive inference-speed limitation is stated too broadly**  *(type: logical, ¶78)*
+
+> Quote: A prominent limitation of autoregressive models is inference speed (e.g. generation), since they require a pass over the full context for every new sample.
+> The authors are contrasting S4’s recurrent/stateful generation mode with standard full-context sequence models, especially Transformers. However, as written, the statement is too broad for autoregressive models in general. RNNs and state-space models are autoregressive but do not require a pass over the full context; cached Transformer decoding avoids recomputing a full forward pass over all previous tokens, even though attention still scales with context length; and some convolutional autoregressive models can also use caching. The more precise claim is that many non-stateful autoregressive architectures, especially vanilla Transformers without an equivalent recurrent state, have generation costs that scale with context length.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+---
+
+## Conference-level proxy
+
+### WEAK / low-quality — Reinforcement learning for instance segmentation with high-level priors
+
+`neurips2021-6EWOVxvJ-reinforcement-learning-for-instance-segmentation-with-high-l` · decision: Reject · review score avg: 5.5 · cites/yr: 0.0
+
+**11. [MAJOR] Claim of no direct supervision is weakened by use of pretrained edge predictions**  *(type: logical, ¶61)*
+
+> Quote: we showed in particular that our setup can segment microscopy images with no direct supervision other than high-level reasoning.
+> The authors are careful throughout the paper to distinguish reward supervision from direct pixelwise ground truth, and in that sense the RL training itself appears to be driven by high-level rules. However, in the microscopy experiment the method also uses superpixels created from boundary/edge predictions, and Table 1 includes variants such as 'ours without edges,' described as removing 'the additional edge prediction as an input.' The baselines further identify these as boundary predictions from [47]. If those edge predictions come from a supervised pretrained boundary detector, then the complete segmentation pipeline is not using only high-level reasoning: it also depends on learned boundary information that encodes external direct supervision, even if not from the target validation set. The claim would be more accurate if qualified as 'no target-domain ground-truth supervision for training the RL agent,' or if the authors explicitly state that the edge predictor/superpixel generator is unsupervised or otherwise not trained with instance/boundary annotations.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**12. [MINOR] Claim that stateless RL was introduced in a recent GAN-related work is historically too strong**  *(type: logical, ¶20)*
+
+> Quote: To the best of our knowledge, stateless RL was introduced in [7] to study the connection between generative adversarial networks and actor critics and our method is one of the first practical applications of this concept.
+> The authors are likely referring to a particular 'stateless actor-critic' formalization used to relate GANs and actor-critic methods, and in that narrow sense the citation may be relevant. Still, the wording 'stateless RL was introduced in [7]' is too broad. Single-state MDPs, multi-armed bandits, and contextual bandits are longstanding RL settings that predate GAN-related actor-critic interpretations by decades. The subsequent claim that this is one of the first practical applications of 'this concept' would be more defensible if explicitly limited to the specific stateless actor-critic/GAN-theoretic formulation of [7], rather than stateless RL generally.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**13. [MODERATE] Local reward decomposition and object-to-subgraph mapping are not sufficiently justified**  *(type: logical, ¶30)*
+
+> Quote: Of course then requirement arises that the union of local rewards must resemble to the global reward. E.g. the optimal policy is the same for local as for the global reward.
+> The paper appropriately notes that replacing a global reward by local sub-graph rewards is not innocuous: in general, reward decompositions can change the optimal policy unless they preserve the relevant objective. However, the proposed object-to-edge-to-subgraph mapping is then introduced as a heuristic without showing that this condition holds even approximately. The mapping assigns each superpixel its object reward, assigns an edge the maximum of its two incident superpixel rewards, and averages edge rewards over sub-graphs. This can alter credit assignment relative to the image-level or object-level prior: low-quality objects may be masked by adjacent high-scoring objects, many correct internal merge decisions inside a bad object may be penalized, and overlapping sub-graphs may weight some decisions more than others. The authors acknowledge noise and give plausible motivation, but important behavior remains underspecified, including whether boundary/cut and internal/merge edges are treated differently, whether rewards are normalized for sub-graph overlap, and how background objects interact with the rule.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**14. [MODERATE] RL avoids differentiating through the pipeline, but does not make the pipeline differentiable end-to-end**  *(type: technical, ¶15)*
+
+> Quote: Our main motivation to explore RL for the instance segmentation task is to circumvent the restriction to differentiable losses and - regardless of the loss - to make the whole pipeline differentiable end-to-end even in presence of non-differentiable steps which transform pixelwise CNN predictions into individual instances.
+> The intended point is valid: an actor-critic objective can train components upstream of a non-differentiable graph partitioning step and can use rewards that are themselves non-differentiable. However, the quoted sentence goes further and says the method makes the whole pipeline 'differentiable end-to-end.' In standard policy-gradient/actor-critic training, gradients are not propagated through the environment transition, graph partitioning, or reward computation; the actor is updated through likelihood-ratio or critic-based gradients. Thus the overall image-to-instance-to-reward process is trainable end-to-end in a loose sense, but it is not differentiable end-to-end. A safer formulation would be that the method enables end-to-end optimization despite non-differentiable components.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**15. [MODERATE] Model selection appears to use the test set**  *(type: logical, ¶57)*
+
+> Quote: For comparison we keep the model which achieved the highest reward on the test set. This makes training as well as the validation independent from ground truth annotations.
+> The intended point may be that no ground-truth labels are used for checkpoint selection, only the rule-based reward. That is important for the paper's weak/no-supervision claim. However, the phrase 'highest reward on the test set' is problematic. Standard experimental practice is to choose hyperparameters/checkpoints on a training or validation set and reserve the test set for final unbiased evaluation. The surrounding text then discusses 'validation scores' and reward evolution 'on the validation set,' so it is possible the authors meant validation set rather than test set. If so, this should be corrected. As written it suggests test-set model selection; even without ground truth, selecting the checkpoint by test-set reward can bias reported performance toward the held-out images.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+### STRONG / high-quality — ViSER: Video-Specific Surface Embeddings for Articulated 3D Shape Reconstruction
+
+`neurips2021--JJy-Hw8-viser-video-specific-surface-embeddings-for-articulated-3d-s` · decision: Spotlight · review score avg: 7.0 · cites/yr: 15.2
+
+**16. [MINOR] Part discovery claim is mostly qualitative and lacks the segmentation procedure here**  *(type: logical, ¶68)*
+
+> Quote: ViSER can discover detailed 3D part segmentation without any manual annotation, as shown in Fig. 6.
+> The claim is plausible because the model learns bones and skinning weights, but the paper should specify how a part segmentation is extracted from the learned representation. Choices such as the number of bones, whether vertices are assigned by maximum skinning weight, whether clustering is used, and whether small components are merged can materially affect the visualization. Without an extraction rule or objective validation, the statement is mainly qualitative.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**17. [MODERATE] Input and supervision claims need to distinguish RGB video from external masks/annotations**  *(type: technical, ¶1)*
+
+> Quote: We show that none of these are required if one can reliably estimate long-range correspondences in a video, making use of only 2D object masks and two-frame optical flow as inputs. / ViSER requires neither a template shape nor annotations to work on categories in the wild.
+> These claims are easy to overread. The method formulation later includes RGB pixel color, segmentation masks, and optical flow as observations; the pixel embedding network takes the image as input, and reconstruction losses include texture/perceptual image terms. Thus the method is not literally using only masks and flow: it also uses monocular RGB video. Separately, saying the method requires no annotations is ambiguous because segmentation masks are required inputs and may themselves be annotations in the ordinary dataset sense. The underlying point is valid if the authors mean that no template shape, category-specific 3D prior, keypoint labels, or image-to-surface correspondence annotations are required beyond RGB video, masks, and flow. The paper should state that distinction explicitly.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**18. [MODERATE] Multi-video training and the LASR comparison overstate the implications of optical-flow boundaries**  *(type: technical, ¶65)*
+
+> Quote: We treat multiple videos as a single long video with strong appearance changes and shape variations. / Note that LASR cannot handle multiple videos as it requires optical flow computed between every adjacent frame pairs.
+> If unrelated videos were literally concatenated, the boundary between videos would create artificial adjacent frame pairs with meaningless optical flow and no temporal continuity. The intended implementation presumably skips flow/reconstruction terms at video boundaries and treats the data as disconnected sequences sharing some parameters, which is materially different from a single long video and should be stated. For the same reason, the claim that LASR cannot handle multiple videos merely because it requires adjacent-frame flow is too categorical: flow boundary pairs could be omitted or handled with separate per-video objectives. The more substantive limitation is likely that LASR lacks ViSER’s mechanism for enforcing shared long-range/cross-video canonical correspondences.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**19. [MINOR] Ambiguous reuse of S for masks, shape, and surface correspondences**  *(type: technical, ¶15)*
+
+> Quote: Given a set of video observations including RGB pixel color, segmentation masks, and optical flow estimates {I_t, S_t, u_t}_t={0,...,T}, our goal is to recover a set of shape and motion parameters {**S**, **D**_t} that produce reconstructions {Ǐ_t, Š_t, ǔ_t}_t={0,...,T} that match the video observations.
+> The notation overload is recoverable from context but unnecessarily confusing. S_t denotes segmentation masks, bold S denotes the recovered 3D shape, rendered masks are written as S-hat_t, and later matched surface locations are also written using S-hat[x,y]. A reader has to infer from subscript, boldface, and context whether S denotes a silhouette, a mesh, or a surface coordinate map. This is localized and fixable by using separate notation such as M_t for masks and a distinct calligraphic symbol for the mesh/surface.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**20. [MODERATE] Expectation-based correspondences are described more strongly than they justify**  *(type: technical, ¶26)*
+
+> Quote: To output a single surface point for pixel (x, y), we can compute a “soft” argmax [17, 48] by taking the expectation of the softmax distribution over the 3D locations of the points samples, / our pixel-surface embedding captures multimodal uncertainties over keypoints; for example, σ(x,y)[i] can capture the fact that a particular pixel matches well to both the left and right ankle, / we introduce a 3D matching loss that ensures pixel embeddings only match to surface embeddings rendered at the pixel location:
+> These passages share the same underlying issue: the distribution over surface samples may be multimodal, but the method then converts it to an expected 3D coordinate. That expectation is not guaranteed to lie on the mesh surface and, under a bimodal distribution, can land between plausible modes, reproducing the same mean-regression artifact the text contrasts against. Similarly, a loss on the expected coordinate can encourage mass near the rendered surface point, but it does not strictly ensure that the pixel embedding matches only that point; off-target modes can average to the correct coordinate. The paper should either qualify these claims or explain whether the full distribution is used, whether it is driven to unimodality, or whether an additional distribution-level loss is applied.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+---
+
+## Reviewer-level proxy
+
+### WEAK / low-quality — AASeg: Attention Aware Network for Real Time Semantic Segmentation
+
+`iclr2022-m5EBN92v-aaseg-attention-aware-network-for-real-time-semantic-segment` · decision: Reject · review score avg: 1.5 · cites/yr: 0.25
+
+**21. [MAJOR] Cityscapes validation mIoU in ablation tables conflicts with the main result**  *(type: logical, ¶52)*
+
+> Quote: |AASeg|512×1024|no|74.8|74.4|202.7| ... Table 4: Ablation study on Upsampling operation in our network using Cityscapes validation set. ... |bilinear upsampling|79.2| ... Table 6 reports AASeg variants up to 80.2 mIoU.
+> The main Cityscapes table reports AASeg validation mIoU as 74.8%, but later ablation tables on the Cityscapes validation set report values around 78.3–80.2%. This is too large to be explained as ordinary variation and reverses the relationship between the reported final model and its ablated variants. The discrepancy could be due to a different training schedule, crop/scale evaluation, validation protocol, class set, or stronger model variant, but none of that is stated. As written, it is unclear which model and protocol produced the headline Cityscapes numbers.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**22. [MINOR] CamVid comparison table contains duplicated SFNet entries**  *(type: logical, ¶56)*
+
+> Quote: |SFNet|720×960|DF2|70.4|134.1| |SFNet|720×960|ResNet-18|73.8|35.5| |SFNet|720×960|DF2|70.4|134.1| |SFNet|720×960|ResNet-18|73.8|35.5|
+> The CamVid table repeats the same two SFNet rows. This is likely a table-preparation error and may not change the numerical conclusion, but duplicated baseline rows create confusion about which comparisons were actually included and suggest the table was not carefully checked.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**23. [MODERATE] Ambiguous claim that the model has “no backbone”**  *(type: logical, ¶23)*
+
+> Quote: Our network dosen’t use any backbone to extract features from the input image unlike many previous architectures. The input image is first passed through a block comprising of convolutional, batch normalization and ReLU activation function.
+> The likely intent is that AASeg does not use a standard pretrained classification backbone such as ResNet, Xception, or DFNet. That would be a meaningful point for real-time segmentation and is consistent with later comparison tables marking AASeg as having backbone “no.” However, taken literally, the statement is misleading because the described convolutional stages are themselves a feature extractor. The paper should clarify whether “no backbone” means no ImageNet-pretrained backbone, no external named backbone, or no separate encoder. This matters for interpreting speed/accuracy comparisons and pretraining assumptions.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**24. [MINOR] Related-work description appears to attribute channel-selection behavior to SqueezeNet**  *(type: technical, ¶5)*
+
+> Quote: (Iandola et al., 2016) allows the neural network to find the critical channels of the feature map and select the most suitable channels by itself.
+> Iandola et al. (2016) is SqueezeNet, whose main contribution is parameter reduction using Fire modules with squeeze and expand convolutions. It is not normally described as learning channel attention or selecting critical feature-map channels by itself. That description is closer to Squeeze-and-Excitation Networks, which are cited separately later. This should be corrected or rephrased to avoid confusing the lineage of the proposed channel-attention component.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**25. [MODERATE] Validation and test training protocol for Cityscapes is ambiguous**  *(type: logical, ¶49)*
+
+> Quote: We present the segmentation accuracy and inference speed of our proposed method on Cityscapes validation and test set in Table 1. We use the training set and validation set to train our models before submitting to Cityscapes online server.
+> Training on train+val before submitting to the hidden Cityscapes test server is standard. However, the paper presents validation and test results together without clearly stating whether the validation mIoU was obtained from a model trained only on the training split, while the test result came from a train+val model. If the validation set was included in training for the model used to report validation mIoU, the validation result would not be a held-out estimate. The evaluation protocol should be made explicit.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+### STRONG / high-quality — Scaling Laws for Neural Machine Translation
+
+`iclr2022-hR_SMu8c-scaling-laws-for-neural-machine-translation` · decision: Spotlight · review score avg: 8.5 · cites/yr: 23.75
+
+**26. [MAJOR] Optimal-allocation claims conflict with the stated additive scaling law**  *(type: technical, ¶37)*
+
+> Quote: **Proposition 1** (Optimal Scaling) ... the optimal encoder / decoder sizes ... and ... the scaling law reduces to L̂_opt(B) = α* B^−(pd+pe) + L∞ ... Inspection of the functional form of Eq. (1) suggests that as long as Nd/Ne is fixed as the model scales ... the optimal scaling exponent, (pe + pd), can ...
+> The stated optimal allocation and resulting exponent pe + pd would follow naturally from a multiplicative law such as α Ne^(-pe) Nd^(-pd) + L∞. However, Eq. (1) is described as an additive law, α(Ne^(-pe) + Nd^(-pd)) + L∞. Under that additive law, minimizing subject to Ne + Nd = B gives the first-order condition pe Ne^(-pe-1) = pd Nd^(-pd-1), so the optimal ratio generally depends on B when pe ≠ pd. Likewise, proportional scaling gives terms proportional to B^(-pe) and B^(-pd), with asymptotic behavior governed by the slower exponent, not B^(-(pe+pd)). Either the scaling law should be multiplicative, or the proposition and surrounding interpretation need a different derivation.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**27. [MINOR] Prior scaling-law literature is summarized as more univariate than it generally is**  *(type: logical, ¶2)*
+
+> Quote: For many of these tasks the scaling behavior of neural networks is highly predictable; model fit or test loss can be described precisely as a function of its number of parameters (Hestness et al., 2017; Kaplan et al., 2020; Henighan et al., 2020; Hernandez et al., 2021; Rosenfeld et al., 2019).
+> The cited scaling-law literature often models loss as depending on dataset size and/or compute budget in addition to parameter count, with univariate parameter-count laws applying only under specific fixed-data or non-limiting-data assumptions. Kaplan et al., for example, explicitly study model size, dataset size, and compute. Thus the statement that loss can be described precisely as a function of parameter count alone slightly overstates the general prior result and makes the paper's contrast with NMT appear sharper than warranted.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**28. [MODERATE] Claims that source-original reducible loss reaches zero or yields no further benefit overstate the fitted evidence**  *(type: logical, ¶47)*
+
+> Quote: Reducible loss quickly decays to zero for source original test sets. ... Hence, beyond a few hundred million parameters, there is no benefit in increasing the model size. ... the reducible error on source-original evaluation sets quickly saturates to 0.
+> The evidence appears to show that source-original evaluation losses rapidly approach a fitted irreducible-loss floor and that the estimated reducible component becomes small. Under the fitted power law, however, the reducible term approaches zero asymptotically rather than literally reaching zero, and L∞ itself is an estimated quantity. The stronger claim that there is 'no benefit' from larger models also goes beyond cross-entropy on the studied source-original sets and may not cover generation quality, robustness, rare phenomena, calibration, or other domains. A more precise statement would be that further scaling gives little additional fitted cross-entropy improvement on these source-original test sets beyond the relevant size range.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**29. [MODERATE] Back-translation capacity requirement is stated more strongly than the evidence supports**  *(type: logical, ¶52)*
+
+> Quote: This assertion suggests that in order for back-translation to be beneficial for training large models, it has to be performed with a models with comparable capacity or higher.
+> The preceding discussion frames the capacity-threshold explanation as a hypothesis, but this sentence turns it into a practical requirement. From the described experiment, teacher capacity is confounded with teacher quality, decoding settings, domain mismatch, synthetic artifacts, and noise patterns, and the setup appears to use only one back-translation model. The data support a conjecture worth testing, not a general necessity claim that back-translation must use a comparable-or-larger model.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**30. [MINOR] Language-pair generalization claim is stronger than the evidence described**  *(type: logical, ¶29)*
+
+> Quote: To ensure that our results generalize across different language pairs, we examine the fit of our scaling law on encoder / decoder scaling models trained on German *→* English (De *→* En), Chinese-to-English (Zh *→* En), and English-to-Chinese (En *→* Zh) translation tasks.
+> Fitting the proposed functional form separately on several additional language directions supports the claim that the form can describe multiple language pairs. It does not, by itself, 'ensure' generalization in a stronger out-of-sample sense, such as transferring fitted parameters or predicting unseen language pairs. A phrase such as 'test whether' or 'provide evidence that' would better match the described experiment.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+---
+
+## Composite proxy
+
+### WEAK / low-quality — Neural networks with trainable matrix activation functions
+
+`iclr2022-UGINpaIC-neural-networks-with-trainable-matrix-activation-functions` · decision: Reject · review score avg: 2.2 · cites/yr: 0.0
+
+**31. [MINOR] Extension to arbitrary activations is not generally a matrix-vector activation in the same sense**  *(type: technical, ¶19)*
+
+> Quote: *Our observation also applies to an activation function σ other than ReLU. For example, we may rescale σ* ( *x* ) *to obtain σ* ( *ωi,ℓx* ) *using a set of constants {ωi,ℓ}* 1 *≤i≤nℓ,* 1 _≤ℓ≤L varying layer by layer and neuron by neuron.
+> The intended idea seems to be a trainable per-neuron rescaling of a standard activation, which is a reasonable adaptive-activation construction. However, the earlier matrix-activation observation for ReLU works because \operatorname{ReLU}(t)=d(t)t with d(t)\in\{0,1\}. For a general activation \sigma, representing \sigma(\omega t) as a diagonal matrix times the input would require a diagonal entry \sigma(\omega t)/t, which may be undefined at t=0 and cannot represent activations with \sigma(0)\ne 0 in the form D(t)t. Many common activations, such as Softplus or sigmoid-type functions, do not naturally fit this without extra qualifications. If the authors only mean ordinary componentwise scaled activations, then it is not the same matrix-vector observation unless the necessary conditions are stated.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**32. [MODERATE] Claim that ReLU networks clearly need many more neurons is stronger than the evidence shown**  *(type: logical, ¶34)*
+
+> Quote: To capture the high frequency, ReLU-type neural networks are clearly required to use much more neurons, introducing significantly amount of weight and bias parameters.
+> The empirical observation is plausible: piecewise-linear ReLU networks often need many linear regions to represent high-frequency oscillations accurately, and the shown ReLU/Para-ReLU runs fail on this target. However, the context does not provide a theoretical lower bound or a systematic width/depth study. The claim is based on one architecture and training setup, so the failure could also reflect optimization, initialization, learning-rate schedule, depth/width choice, or insufficient training rather than an inherent requirement for much more neurons. The sentence should be softened or supported by an ablation over width/depth or a theoretical argument.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**33. [MODERATE] Hypothesis-class inclusion does not imply practical performance is “clearly not worse”**  *(type: logical, ¶12)*
+
+> Quote: Since ReLU and Leaky ReLU are included by our DNN as special cases, the proposed DNN is clearly not worse than the traditional ones in practice.
+> At the level of representable functions, a parameterized family containing ReLU and Leaky ReLU has at least the same best-case approximation capacity, assuming the architecture and objective are otherwise comparable. What does not follow is the practical claim. Neural-network training is nonconvex, and adding trainable activation parameters can change optimization dynamics, initialization sensitivity, regularization behavior, and effective learning rates. Therefore, inclusion of ReLU/Leaky ReLU as special cases only supports a statement about the optimal attainable loss over the enlarged function class, not that the trained model will be no worse in practice.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**34. [MODERATE] Claim that the ResNet parameters are already ReLU-tuned is not sufficiently supported**  *(type: logical, ¶42)*
+
+> Quote: Those parameters given in (Paszke et al., 2019) are already tuned well with respect to ReLU.
+> The authors appear to argue that the comparison is not biased in favor of TMAF because the baseline architecture/hyperparameters are standard for ReLU. However, Paszke et al. (2019) is the PyTorch systems paper, not a canonical source of carefully tuned CIFAR-10/CIFAR-100 ResNet hyperparameters. The reported CIFAR-10 accuracies around 77–80% are also well below commonly reported tuned ResNet results on CIFAR-10, suggesting that the training setup may not be strongly optimized. Using the same optimizer and learning rate for all activations is fair in one sense, but it does not establish that the ReLU baseline is tuned well. This weakens the interpretation of the reported TMAF advantage unless the exact recipe, data augmentation, normalization, epoch count, and baseline tuning procedure are provided or a more appropriate citation is used.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**35. [MINOR] Gaussian equal-probability interval claim is numerically inaccurate as stated**  *(type: technical, ¶24)*
+
+> Quote: For TMAF *Dℓ* in equation 5, the function *αℓ* uses intervals ( *−∞, −* 1 *.* 4), ( *−* 1 *.* 4 *, −* 0 *.* 92], ( *−* 0 *.* 92 *, −* 0 *.* 56], ( *−* 0 *.* 56 *, −* 0 *.* 26], ( *−* 0 *.* 26 *,* 0], (0 *,* 0 *.* 26], (0 *.* 26 *,* 0 *.* 56], (0 *.* 56 *,* 0 *.* 92], (0 *.* 92 *,* 1 *.* 4], (1 *.* 4 *, ∞* ) such that probability over each of the ten intervals is 0.1 with respect to Gaussian distribution.
+> The intent is clear: choose grid intervals roughly matched to Gaussian quantiles, likely because BatchNorm makes preactivations approximately standardized. But the listed cut points are not the standard normal deciles. Equal 0.1-probability bins would use approximate cutoffs \pm1.2816, \pm0.8416, \pm0.5244, \pm0.2533, and 0. The provided values are close in the middle but noticeably off in the tails: for example, P(Z>1.4)\approx0.081, not 0.1. If these are only heuristic rounded bins, the prose should say approximately; if exact equal-mass Gaussian bins are intended, the cutoffs should be corrected.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+### STRONG / high-quality — Volume Rendering of Neural Implicit Surfaces
+
+`neurips2021-GlEWs-V9-volume-rendering-of-neural-implicit-surfaces` · decision: Oral · review score avg: 8.5 · cites/yr: 196.8
+
+**36. [MINOR] BlendedMVS comparison alternates between NeRF++ and NeRF**  *(type: logical, ¶55)*
+
+> Quote: Therefore we use NeRF++[39] as a baseline for this dataset. In Table 2 we present our results compared to NeRF++. Qualitative comparisons are presented in Fig. 5; since the units are unknown in this case we present relative improvement of Chamfer distance (in %) compared to NeRF.
+> The setup identifies NeRF++ as the relevant BlendedMVS baseline, which is sensible for scenes with complex backgrounds. The final sentence then says the relative Chamfer improvement is compared to NeRF. This may be a typographical omission of “++”, but as written the baseline for the percentage improvements is ambiguous, especially because the table reports only relative percentages rather than absolute Chamfer distances.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**37. [MODERATE] The β+ sampling guarantee may apply to a modified density rather than the current model density**  *(type: technical, ¶45)*
+
+> Quote: Either way, we use the final T and β+ (guaranteed to provide BT,β+ ≤ ϵ) to estimate the current opacity O, Line 10 in Algorithm 1).
+> If the adaptive procedure terminates with β+ equal to the current β, the stated bound applies to the current density. But the text allows the algorithm to stop at max_iter with β+ > β. In that case, the guarantee BT,β+ ≤ ε controls the rectangle-rule error for the smoother density parameterized by β+, not necessarily for the actual current density using β. The prose nevertheless says the algorithm estimates the current opacity. Unless rendering deliberately uses β+ and distinguishes this from the model density, the guarantee is overstated or ambiguous.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**38. [MINOR] Volume rendering is described as avoiding extraneous surface-growth artifacts too broadly**  *(type: logical, ¶5)*
+
+> Quote: Also, learning to render surfaces directly tends to grow extraneous parts due to optimization problems, which are avoided by volume rendering.
+> Volume rendering can mitigate some optimization difficulties of direct surface rendering because it integrates along rays rather than relying on hard surface intersections. But the statement is too categorical: generic neural volume-rendering methods can still produce floaters, spurious semi-transparent structures, noisy density, and other geometry artifacts. A more precise claim would be that volume rendering alleviates some surface-rendering optimization artifacts, not that it avoids extraneous geometry in general.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**39. [MODERATE] The sampling guarantee is conditional on an exact SDF, while the learned network is only regularized toward one**  *(type: logical, ¶31)*
+
+> Quote: The proof of this theorem, which is provided in the supplementary, makes a principled use of the signed distance function’s unique properties;
+> The theorem is well motivated for a true signed distance function, whose geometric and Lipschitz properties can be used to bound density variation along a ray. In the implemented model, however, the SDF is represented by an MLP and encouraged with an Eikonal loss; this does not guarantee that the learned field is an exact SDF everywhere or that it satisfies the exact geometric assumptions used in the proof. The guarantee should therefore be read as applying to the idealized SDF model, while the learned model relies on regularization and empirical behavior.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
+
+**40. [MODERATE] The α and β parameterization is inconsistent about positivity, learnability, and the bound notation**  *(type: technical, ¶48)*
+
+> Quote: Taking the maximum over all intervals furnishes a bound BT,β as a function of T and β ... / In addition we have two scalar learnable parameters α, β ∈ R. In fact, in our implementation we make the choice α = β−1. We denote by θ ... θ = (φ, ψ, β).
+> The density definition and opacity-error bounds require α and β to be positive. As written, saying α,β ∈ R would allow β ≤ 0, for which the Laplace scale, α = β−1, and the sampling bounds are not well defined. There is also an internal ambiguity: the text first presents α and β as two learnable scalars, then ties α deterministically to β and excludes α from θ. This also affects the notation BT,β: if α is independent, the opacity-error bound depends on α as well as β; if α = β−1, that dependence should be stated before suppressing α from the notation.
+
+Your severity: ( ) major  ( ) moderate  ( ) minor    |    ( ) signal  ( ) cosmetic
+Notes: 
\ No newline at end of file
diff --git a/benchmarks/perturbation/manual_validation/_build_sample.py b/benchmarks/perturbation/manual_validation/_build_sample.py
new file mode 100644
index 0000000..d2d31e0
--- /dev/null
+++ b/benchmarks/perturbation/manual_validation/_build_sample.py
@@ -0,0 +1,186 @@
+"""Sample injected perturbations for manual quality validation.
+
+Samples 10 perturbations per top-level TYPE (Surface, Claim, Logic, Experimental)
+= 40 total, guaranteeing each SUBTYPE appears at least once and spreading across
+papers. Reads the kept (verified, substantive) perturbations under
+data/perturbations_filtered/. Writes samples.md (human-readable checklist) and
+samples.json (raw records) into this directory.
+"""
+import json, glob, os, random, collections
+from pathlib import Path
+
+BASE = Path(__file__).resolve().parent.parent          # .../perturbation
+FILT = BASE / "data" / "perturbations_filtered"
+OUT  = Path(__file__).resolve().parent                 # .../manual_validation
+SEED = 42
+PER_TYPE = 10
+MAX_PER_PAPER = 3                                       # diversity cap (relaxed if needed)
+
+SUBTYPE_TO_TYPE = {
+    "numeric_parameter": "Surface", "operator_or_sign": "Surface",
+    "index_or_subscript": "Surface", "computation": "Surface", "symbol_binding": "Surface",
+    "incorrect_claim_theoretical": "Claim", "incorrect_statement_empirical": "Claim",
+    "missing_case": "Logic", "induction": "Logic",
+    "circular_reasoning": "Logic", "invalid_implication": "Logic",
+    "misinterp": "Experimental", "causal_reversed": "Experimental", "p_hacking": "Experimental",
+}
+TYPE_ORDER = ["Surface", "Claim", "Logic", "Experimental"]
+SUBTYPE_LABEL = {
+    "numeric_parameter": "Numeric", "operator_or_sign": "Operator / Sign",
+    "index_or_subscript": "Index / Subscript", "computation": "Computation",
+    "symbol_binding": "Symbol binding (deprecated)",
+    "incorrect_claim_theoretical": "False theoretical claim",
+    "incorrect_statement_empirical": "False empirical claim",
+    "missing_case": "Missing case", "induction": "Induction error",
+    "circular_reasoning": "Circular reasoning", "invalid_implication": "Invalid implication",
+    "misinterp": "Misinterpretation of results", "causal_reversed": "Reversed causality",
+    "p_hacking": "P-hacking",
+}
+
+
+def load_all():
+    cands = []
+    for f in glob.glob(str(FILT / "*/all/*/*/*kept_perturbations.json")):
+        rel = Path(f).relative_to(FILT)
+        domain = rel.parts[0]
+        paper = rel.parts[2]
+        d = json.load(open(f))
+        for p in d.get("perturbations", []):
+            sub = p.get("error")
+            typ = SUBTYPE_TO_TYPE.get(sub)
+            if typ is None:
+                continue
+            cands.append({
+                "uid": len(cands),  # globally unique (perturbation_id repeats across papers)
+                "type": typ, "subtype": sub, "domain": domain, "paper": paper,
+                "paper_title": d.get("paper_title", ""),
+                "perturbation_id": p.get("perturbation_id"),
+                "original": p.get("original", ""), "perturbed": p.get("perturbed", ""),
+                "why_wrong": p.get("why_wrong", ""), "quote": p.get("quote", ""),
+                "reason": p.get("reason", ""),
+            })
+    return cands
+
+
+def select(by_type, rng):
+    """Pick up to PER_TYPE candidates per type, spreading across domains/papers.
+
+    Selection runs in three passes that share one set of counters, so domain
+    and paper balance is tracked across all types at once:
+
+    1. subtype coverage - one candidate for each subtype present in a type's
+       pool (stops early once that type already holds PER_TYPE);
+    2. domain coverage - one candidate for each domain not picked so far,
+       placed in any type that still has a free slot (skipped if none has);
+    3. fill - top each type up to PER_TYPE, or until its pool is exhausted.
+
+    Both coverage goals are therefore best-effort rather than hard guarantees.
+    NOTE(review): MAX_PER_PAPER is not consulted here; papers are spread only
+    through the least-used ordering in the ``min`` keys.
+
+    Args:
+        by_type: mapping from each name in TYPE_ORDER to a list of candidate
+            dicts; the keys read here are uid, type, subtype, domain, paper.
+            It is indexed with every TYPE_ORDER entry (main() passes a
+            defaultdict, so a missing type yields an empty list).
+        rng: object providing ``shuffle``; main() passes random.Random(SEED).
+
+    Returns:
+        dict mapping each TYPE_ORDER entry to its list of chosen candidates,
+        in the order they were picked.
+    """
+    selected = {t: [] for t in TYPE_ORDER}
+    used = set()  # uids already placed in `selected`
+    dom_ct, paper_ct = collections.Counter(), collections.Counter()
+    # Copy each pool so shuffling does not reorder the caller's lists.
+    pools = {t: list(by_type[t]) for t in TYPE_ORDER}
+    for t in TYPE_ORDER:
+        rng.shuffle(pools[t])
+    all_domains = sorted({c["domain"] for t in TYPE_ORDER for c in pools[t]})
+
+    def take(c):
+        # Record the pick and update the shared domain/paper usage counters.
+        selected[c["type"]].append(c); used.add(c["uid"])
+        dom_ct[c["domain"]] += 1; paper_ct[c["paper"]] += 1
+
+    def avail(t):  # candidates of type t not yet used, if type has a free slot
+        if len(selected[t]) >= PER_TYPE:
+            return []
+        return [c for c in pools[t] if c["uid"] not in used]
+
+    # 1) subtype coverage (prefer least-used domain, then paper)
+    for t in TYPE_ORDER:
+        by_sub = collections.defaultdict(list)
+        for c in pools[t]:
+            by_sub[c["subtype"]].append(c)
+        for sub in sorted(by_sub):
+            if len(selected[t]) >= PER_TYPE:
+                break
+            # min() keeps the first minimal element, so ties fall back to the
+            # shuffled pool order.
+            cand = min(by_sub[sub], key=lambda c: (dom_ct[c["domain"]], paper_ct[c["paper"]]))
+            if cand["uid"] not in used:
+                take(cand)
+
+    # 2) domain coverage: for each uncovered domain, place one (in any type w/ a free slot)
+    for d in all_domains:
+        if dom_ct[d] > 0:
+            continue
+        cands = [c for t in TYPE_ORDER for c in avail(t) if c["domain"] == d]
+        if cands:
+            take(min(cands, key=lambda c: paper_ct[c["paper"]]))
+
+    # 3) fill each type to PER_TYPE, always taking the least-used domain then paper
+    for t in TYPE_ORDER:
+        while len(selected[t]) < PER_TYPE:
+            pool = avail(t)
+            if not pool:
+                break  # pool exhausted: this type ends up with fewer than PER_TYPE
+            take(min(pool, key=lambda c: (dom_ct[c["domain"]], paper_ct[c["paper"]])))
+    return selected
+
+
+def md_block(text):
+    """Render possibly-LaTeX verbatim text as a fenced code block."""
+    text = (text or "").rstrip()
+    return f"```\n{text}\n```"
+
+
+def main():
+    rng = random.Random(SEED)
+    cands = load_all()
+    by_type = collections.defaultdict(list)
+    for c in cands:
+        by_type[c["type"]].append(c)
+
+    chosen = select(by_type, rng)
+
+    OUT.mkdir(parents=True, exist_ok=True)
+    # raw json
+    flat = [{"idx": i + 1, **c} for i, c in enumerate(
+        [c for typ in TYPE_ORDER for c in chosen[typ]])]
+    (OUT / "samples.json").write_text(json.dumps(flat, indent=2))
+
+    # markdown checklist
+    lines = []
+    lines.append("# Injected-perturbation validation sample\n")
+    lines.append(f"40 perturbations: {PER_TYPE} per type (Surface, Claim, Logic, Experimental), "
+                 "each subtype covered ≥ once, drawn from the verified/kept set "
+                 "(`data/perturbations_filtered/`).\n")
+    lines.append("For each: **Passage** = original text, **Perturbation** = injected replacement, "
+                 "**Why it errs** = why it breaks internal consistency, "
+                 "**Contradicting evidence** = the passage elsewhere it conflicts with.\n")
+    # coverage summary
+    lines.append("## Coverage\n")
+    for typ in TYPE_ORDER:
+        subs = collections.Counter(c["subtype"] for c in chosen[typ])
+        summ = ", ".join(f"{SUBTYPE_LABEL[s]} ({n})" for s, n in sorted(subs.items()))
+        lines.append(f"- **{typ}** ({len(chosen[typ])}): {summ}")
+    lines.append("")
+
+    idx = 0
+    for typ in TYPE_ORDER:
+        lines.append(f"\n---\n\n## {typ}\n")
+        for c in chosen[typ]:
+            idx += 1
+            lines.append(f"### {idx}. {typ} — {SUBTYPE_LABEL.get(c['subtype'], c['subtype'])}")
+            lines.append(f"`{c['domain']}` / `{c['paper']}` / `{c['perturbation_id']}`\n")
+            lines.append("**Passage (original):**")
+            lines.append(md_block(c["original"]))
+            lines.append("**Perturbation (injected):**")
+            lines.append(md_block(c["perturbed"]))
+            lines.append(f"**Why it causes an error:** {c['why_wrong']}\n")
+            if c["quote"].strip():
+                lines.append("**Contradicting evidence (quote):**")
+                lines.append(md_block(c["quote"]))
+            if c["reason"].strip():
+                lines.append(f"**Verifier note:** {c['reason']}\n")
+            lines.append("**Your assessment:** ( ) valid error  ( ) not an error  ( ) unsure")
+            lines.append("**Notes:** \n")
+    (OUT / "samples.md").write_text("\n".join(lines))
+
+    print(f"candidates loaded: {len(cands)}")
+    for typ in TYPE_ORDER:
+        print(f"  {typ}: pool={len(by_type[typ])}, sampled={len(chosen[typ])}, "
+              f"papers={len(set(c['paper'] for c in chosen[typ]))}")
+    print(f"wrote {OUT/'samples.md'} and {OUT/'samples.json'}")
+
+
+# Run the sampler only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/perturbation/manual_validation/samples.json b/benchmarks/perturbation/manual_validation/samples.json
new file mode 100644
index 0000000..4da062d
--- /dev/null
+++ b/benchmarks/perturbation/manual_validation/samples.json
@@ -0,0 +1,602 @@
+[
+  {
+    "idx": 1,
+    "uid": 2570,
+    "type": "Surface",
+    "subtype": "computation",
+    "domain": "stat_AP",
+    "paper": "2604.23438v1",
+    "paper_title": "Estimating Causal Attribution of Anthropogenic Forcing on High-Temperature Extremes Using a Latent Gaussian Spatial Model",
+    "perturbation_id": "P000_S0000",
+    "original": "\\[\n\\mathbf{X} =\n\\left[\n\\begin{array}{c|c|c|c}\n\\mathbf{X}_1^\\top&\\mathbf{X}_2^\\top&\\cdots&\\mathbf{X}_{250}^\\top\n\\end{array}\n\\right]^\\top,\n\\]",
+    "perturbed": "\\[\n\\mathbf{X} =\n\\left[\n\\begin{array}{c|c|c|c}\n\\mathbf{X}_1^\\top&\\mathbf{X}_2^\\top&\\cdots&\\mathbf{X}_{250}^\\top\n\\end{array}\n\\right],\n\\]",
+    "why_wrong": "The context states that X is a matrix of dimension 1750 x 35. Each block X_i is defined as I_7 (7x7) tensor product with a vector, resulting in 7 columns. If X is a vertical stack of 250 such blocks (as implied by the original transpose on the outer bracket), the rows would be 250 * 7 = 1750. Removing the outer transpose makes it a horizontal concatenation, which would result in a 7 x 1750 matrix, contradicting the stated 1750 x 35 dimensions.",
+    "quote": "\\right],",
+    "reason": "The surrounding context states $\\mathbf{X}$ is formed by stacking blocks $\\mathbf{X}_i^\\top$ vertically (hence the outer transpose $[\\cdots]^\\top$); removing the transpose makes it a row vector rather than the required $1750\\times35$ matrix."
+  },
+  {
+    "idx": 2,
+    "uid": 35,
+    "type": "Surface",
+    "subtype": "index_or_subscript",
+    "domain": "econ_EM",
+    "paper": "2604.23023v1",
+    "paper_title": "\\tit",
+    "perturbation_id": "P001_S0033",
+    "original": "$X_{0}\\in\\mathbb{R}^{p}$",
+    "perturbed": "$X_{0}\\in\\mathbb{R}^{p'}$",
+    "why_wrong": "The context defines $W$ as a $p'$-dimensional Brownian motion and $X$ as a $p$-dimensional covariate process. Changing the dimension of the initial value $X_0$ to $p'$ creates a dimensionality mismatch with the covariate process $X$ and its coefficients $\\beta \\in \\mathbb{R}^p$.",
+    "quote": "$X_{0}\\in\\mathbb{R}^{p'}$",
+    "reason": "The related_passages explicitly state '$X_{0}\\in\\mathbb{R}^{p}$', and $p'$ is the dimension of the Brownian motion $W$, not the state space of $X_0$."
+  },
+  {
+    "idx": 3,
+    "uid": 1657,
+    "type": "Surface",
+    "subtype": "numeric_parameter",
+    "domain": "physics_atm-clus",
+    "paper": "2603.22254v1",
+    "paper_title": "Characterizing High-Capacity Janus Aminobenzene\u2013Graphene Anode for Sodium-Ion Batteries with Machine Learning",
+    "perturbation_id": "P009_S0019",
+    "original": "$\\times$",
+    "perturbed": "$\\times$10$^{-12}$",
+    "why_wrong": "The text describes 'consistent high ion-diffusivity'; 10$^{-12}$ cm$^2$/s is several orders of magnitude lower than the 10$^{-6}$ cm$^2$/s range typically considered 'high' for fast-charging battery anodes.",
+    "quote": "$\\times$10$^{-12}$",
+    "reason": "The surrounding context states the diffusivity is \u22485\u00d710\u207b\u2076 cm\u00b2/s, and a related passage confirms ~4.8\u00d710\u207b\u2076 cm\u00b2s\u207b\u00b9; inserting 10\u207b\u00b9\u00b2 changes the exponent from -6 to -12, contradicting the established value by six orders of magnitude."
+  },
+  {
+    "idx": 4,
+    "uid": 977,
+    "type": "Surface",
+    "subtype": "operator_or_sign",
+    "domain": "cs_CC",
+    "paper": "2604.19625v1",
+    "paper_title": "Coherent-State Propagation: A Computational Framework for Simulating Bosonic Quantum Systems",
+    "perturbation_id": "P000_S0010",
+    "original": "$|0\\rangle$",
+    "perturbed": "$|1\\rangle$",
+    "why_wrong": "The text explicitly defines $|0\\rangle$ as the vacuum state. In bosonic systems, the coherent state is generated by applying the displacement operator to the vacuum state $|0\\rangle$, not the first excited state $|1\\rangle$.",
+    "quote": "$|\\alpha\\rangle = D(\\alpha)|1\\rangle$",
+    "reason": "The related_passages and surrounding context explicitly state '$|0\\rangle$ is the vacuum state' and define $|\\alpha\\rangle = D(\\alpha)|0\\rangle$; replacing $|0\\rangle$ with $|1\\rangle$ contradicts this definition."
+  },
+  {
+    "idx": 5,
+    "uid": 2767,
+    "type": "Surface",
+    "subtype": "index_or_subscript",
+    "domain": "stat_AP",
+    "paper": "2604.25342v1",
+    "paper_title": "\\documentclass[10pt,letterpaper]{article}",
+    "perturbation_id": "P003_S0029",
+    "original": "$_3$",
+    "perturbed": "$_4$",
+    "why_wrong": "The text explicitly identifies the chemical as 'total ammonia' and 'NH$_3$'. Changing the subscript to 4 creates a chemical formula (NH4, ammonium ion) that is inconsistent with the 'ammonia' label used throughout the paragraph and the CAMS dataset description.",
+    "quote": "NH$_4$",
+    "reason": "The surrounding_context and related passages consistently refer to ammonia as NH$_3$, not NH$_4$, making the subscript change a clear error."
+  },
+  {
+    "idx": 6,
+    "uid": 713,
+    "type": "Surface",
+    "subtype": "index_or_subscript",
+    "domain": "hep-ex",
+    "paper": "2604.26386v1",
+    "paper_title": "\\boldmath Comparison of Silvaco and Synopsys TCAD Predictions Including the Perugia Radiation Damage Model in Silicon Pixel Detectors for the HL-LHC",
+    "perturbation_id": "P005_S0009",
+    "original": "$^{-3}$",
+    "perturbed": "$^{-2}$",
+    "why_wrong": "The text defines this value as a 'bulk concentration', which must be measured in units of inverse volume (cm$^{-3}$), not inverse area (cm$^{-2}$).",
+    "quote": "cm$^{-2}$",
+    "reason": "The surrounding context states the Boron bulk concentration in cm$^{-3}$ (volumetric units), so changing the exponent to $^{-2}$ gives incorrect units for a volumetric concentration."
+  },
+  {
+    "idx": 7,
+    "uid": 2301,
+    "type": "Surface",
+    "subtype": "computation",
+    "domain": "q-bio_GN",
+    "paper": "2604.20488v1",
+    "paper_title": "Conditional Monte Carlo Tree Diffusion for Designing Cell-Type-Specific and Biologically Faithful Regulatory DNA",
+    "perturbation_id": "P002_S0006",
+    "original": "$\\mathcal{V} = \\{A, C, G, T, \\bm{m}\\}$",
+    "perturbed": "$\\mathcal{V} = \\{A, C, G, T\\}$",
+    "why_wrong": "The text explicitly states that the vocabulary includes a special mask token 'm' used for the diffusion process. Removing 'm' from the set definition contradicts the subsequent sentence which defines 'm' as a member of the vocabulary.",
+    "quote": "$\\mathcal{V} = \\{A, C, G, T\\}$",
+    "reason": "The surrounding context and related passages explicitly state the vocabulary includes the mask token m and that V=5, but the perturbed form omits m, making the set have only 4 elements contradicting V=5."
+  },
+  {
+    "idx": 8,
+    "uid": 2961,
+    "type": "Surface",
+    "subtype": "numeric_parameter",
+    "domain": "cs_LG",
+    "paper": "2604.25489v1",
+    "paper_title": "Adaptable phase retrieval for coherent transition radiation spectroscopy based on differentiable physics information",
+    "perturbation_id": "P002_S0022",
+    "original": "\\begin{equation}\n      s(z) = \\mathrm{Tukey}\\!\\left(\\frac{z}{z_{\\max}}, \\alpha\\right),\n      \\label{eq:tukey}\n    \\end{equation}",
+    "perturbed": "\\begin{equation}\n      s(z) = \\mathrm{Tukey}\\!\\left(\\frac{z}{z_{\\max}}, 1.5\\right),\n      \\label{eq:tukey}\n    \\end{equation}",
+    "why_wrong": "The text immediately following the equation specifies that the taper parameter alpha is 0.5. Setting it to 1.5 in the equation creates a direct contradiction with the descriptive text.",
+    "quote": "\\mathrm{Tukey}\\!\\left(\\frac{z}{z_{\\max}}, 1.5\\right)",
+    "reason": "The surrounding context immediately after states 'with taper parameter \u03b1=0.5', contradicting the hardcoded value of 1.5 replacing the parameter \u03b1."
+  },
+  {
+    "idx": 9,
+    "uid": 1756,
+    "type": "Surface",
+    "subtype": "index_or_subscript",
+    "domain": "physics_atm-clus",
+    "paper": "2602.14334v1",
+    "paper_title": "Exploring Uranus and Neptune's deep elemental abundances: insights from their building blocks' composition",
+    "perturbation_id": "P003_S0017",
+    "original": "$\\mu_\\text{g}$",
+    "perturbed": "$\\mu_\\text{total}$",
+    "why_wrong": "The text defines the variable as the average molecular mass of the gas. Changing the subscript to 'total' creates an inconsistency with the standard notation for gas molecular mass ($\\mu_g$) used in astrophysical disk literature cited in the context.",
+    "quote": "$\\mu_\\text{total}$",
+    "reason": "The surrounding context and related passages consistently use $\\mu_\\text{g}$ for the average molecular mass of the gas; replacing it with $\\mu_\\text{total}$ introduces an unbound symbol not defined anywhere."
+  },
+  {
+    "idx": 10,
+    "uid": 2441,
+    "type": "Surface",
+    "subtype": "index_or_subscript",
+    "domain": "stat_AP",
+    "paper": "2604.23792v1",
+    "paper_title": "Beyond the Final Label: Exploiting the Untapped Potential of Classification Histories in Astronomical Light Curve Analysis",
+    "perturbation_id": "P001_S0001",
+    "original": "$$p_{t,*}\\geq \\rho \\quad \\forall i \\in \\{\\tau-k+1, \\tau-k+2, \\ldots, \\tau\\},$$",
+    "perturbed": "$$p_{t,*}\\geq \\rho \\quad \\forall i \\in \\{\\tau-k, \\tau-k+1, \\ldots, \\tau-1\\},$$",
+    "why_wrong": "The Accuracy Condition is defined for $k$ consecutive classifications up to and including time $\\tau$. The perturbed index range ends at $\\tau-1$, which excludes the current classification at time $\\tau$ mentioned in the preceding text.",
+    "quote": "\\forall i \\in \\{\\tau-k, \\tau-k+1, \\ldots, \\tau-1\\}",
+    "reason": "The surrounding context states the Accuracy Condition applies over {\u03c4-k+1,...,\u03c4} (k timesteps up to and including \u03c4); the perturbed range {\u03c4-k,...,\u03c4-1} shifts the window and excludes \u03c4 while including an extra earlier point."
+  },
+  {
+    "idx": 11,
+    "uid": 1444,
+    "type": "Claim",
+    "subtype": "incorrect_claim_theoretical",
+    "domain": "math_all",
+    "paper": "2604.26864v1",
+    "paper_title": "\\bf Stability and existence of relativistic plasma--vacuum interfaces\\let\\thefootnote\\relax\\footnotetext% The research of \\sc Paolo Secchi",
+    "perturbation_id": "P000_S0010",
+    "original": "\\begin{lemma} \\label{lem:1st}\n\tIf $\\theta_0\\geq 1$ is sufficiently large and $\\varepsilon>0$ is small enough, then\n\t\\begin{align}  \\nonumber %\\label{1st.sub}\n\t\t\\|e_{n+}''\\|_{H_*^s(\\Omega_{T})}+\\|e_{n-}''\\|_{H^{s+1}(\\Omega_{T})}+ \\|\\tilde{e}_{n}''\\|_{H^{s+1}(\\Sigma_{T})}\n\t\t\\lesssim \\varepsilon^2 \\theta_n^{\\varsigma_2(s)-1}\\varDelta_{n}\n\t\\end{align}\n\tfor $n=0,1,\\ldots,{N}-1$  and $s=6,7,\\ldots,\\widetilde{\\alpha}-2 $,\t\n\twhere $$\\varsigma_2(s):=\\max\\{(s+2-{\\alpha })_++12-2{\\alpha },\\, s+8-2{\\alpha } \\}.$$\n\\end{lemma}",
+    "perturbed": "\\begin{lemma} \\label{lem:1st}\n\tIf $\\theta_0\\geq 1$ is sufficiently large and $\\varepsilon>0$ is small enough, then\n\t\\begin{align}  \\nonumber %\\label{1st.sub}\n\t\t\\|e_{n+}''\\|_{H_*^s(\\Omega_{T})}+\\|e_{n-}''\\|_{H^{s+1}(\\Omega_{T})}+ \\|\\tilde{e}_{n}''\\|_{H^{s+1}(\\Sigma_{T})}\n\t\t\\lesssim \\varepsilon \\theta_n^{\\varsigma_2(s)-1}\\varDelta_{n}\n\t\\end{align}\n\tfor $n=0,1,\\ldots,{N}-1$  and $s=6,7,\\ldots,\\widetilde{\\alpha}-2 $,\t\n\twhere $$\\varsigma_2(s):=\\max\\{(s+2-{\\alpha })_++12-2{\\alpha },\\, s+8-2{\\alpha } \\}.$$\n\\end{lemma}",
+    "why_wrong": "The scaling with respect to the speed of light parameter epsilon is incorrect. In the context of the quadratic substitution errors for the Nash-Moser iteration in RMHD, the error terms should scale as epsilon squared (\\varepsilon^2) to ensure the convergence of the scheme toward the non-relativistic limit, as seen in the surrounding lemmas (e.g., Lemma 4.11 or 4.12 in similar RMHD literature).",
+    "quote": "\\lesssim \\varepsilon \\theta_n^{\\varsigma_2(s)-1}\\varDelta_{n}",
+    "reason": "The surrounding context shows the original bound has \u03b5\u00b2, but the perturbation changes it to \u03b5, which is a weaker bound inconsistent with the quadratic error term nature described in the context."
+  },
+  {
+    "idx": 12,
+    "uid": 540,
+    "type": "Claim",
+    "subtype": "incorrect_statement_empirical",
+    "domain": "hep-ex",
+    "paper": "2604.24435v1",
+    "paper_title": "%",
+    "perturbation_id": "P000_S0010",
+    "original": "The ATLAS detector~\\cite{PERF-2007-01} at the LHC covers nearly the entire solid angle around the collision point.\\footnote{ATLAS uses a right-handed coordinate system with its origin at the nominal interaction point (IP)\nin the center of the detector and the \\(z\\)-axis along the beam pipe.\nThe \\(x\\)-axis points from the IP to the center of the LHC ring,\nand the \\(y\\)-axis points upwards.\nPolar coordinates \\((r,\\phi)\\) are used in the transverse plane,\n\\(\\phi\\) being the azimuthal angle around the \\(z\\)-axis.\nThe pseudorapidity is defined in terms of the polar angle \\(\\theta\\) as \\(\\eta = -\\ln \\tan(\\theta/2)\\) and is equal to the rapidity\n$ y = \\frac{1}{2} \\ln \\left( \\frac{E + p_z}{E - p_z} \\right) $ in the relativistic limit.\nAngular distance is measured in units of \\(\\Delta R \\equiv \\sqrt{(\\Delta y)^{2} + (\\Delta\\phi)^{2}}\\).}\nIt consists of an inner tracking detector surrounded by a thin superconducting solenoid, electromagnetic and hadronic calorimeters,\nand a muon spectrometer incorporating three large superconducting air-core toroidal magnets.",
+    "perturbed": "The ATLAS detector~\\cite{PERF-2007-01} at the LHC covers nearly the entire solid angle around the collision point.\\footnote{ATLAS uses a left-handed coordinate system with its origin at the nominal interaction point (IP)\nin the center of the detector and the \\(z\\)-axis along the beam pipe.\nThe \\(x\\)-axis points from the IP to the center of the LHC ring,\nand the \\(y\\)-axis points upwards.\nPolar coordinates \\((r,\\phi)\\) are used in the transverse plane,\n\\(\\phi\\) being the azimuthal angle around the \\(z\\)-axis.\nThe pseudorapidity is defined in terms of the polar angle \\(\\theta\\) as \\(\\eta = -\\ln \\tan(\\theta/2)\\) and is equal to the rapidity\n$ y = \\frac{1}{2} \\ln \\left( \\frac{E + p_z}{E - p_z} \\right) $ in the relativistic limit.\nAngular distance is measured in units of \\(\\Delta R \\equiv \\sqrt{(\\Delta y)^{2} + (\\Delta\\phi)^{2}}\\).}\nIt consists of an inner tracking detector surrounded by a thin superconducting solenoid, electromagnetic and hadronic calorimeters,\nand a muon spectrometer incorporating three large superconducting air-core toroidal magnets.",
+    "why_wrong": "The text incorrectly describes the ATLAS coordinate system as left-handed, whereas standard particle physics experiments like ATLAS use a right-handed coordinate system as defined in the cited performance papers.",
+    "quote": "ATLAS uses a left-handed coordinate system",
+    "reason": "The surrounding context explicitly states 'ATLAS uses a right-handed coordinate system', and the perturbed version changes this to 'left-handed', directly contradicting the established fact."
+  },
+  {
+    "idx": 13,
+    "uid": 790,
+    "type": "Claim",
+    "subtype": "incorrect_statement_empirical",
+    "domain": "hep-ex",
+    "paper": "2604.25034v1",
+    "paper_title": "Bell Test of Photons from Electron-Positron Annihilation via POVM-based Compton Polarimetry",
+    "perturbation_id": "P008_S0008",
+    "original": "\\begin{centering}\nJack~Clarke\\,\\orcidlink{0000-0001-8055-449X},$^{1}$~  \nPreslav Asenov\\,\\orcidlink{0009-0003-4232-4521},$^{1}$~\nJesse Smeets\\,\\orcidlink{0009-0009-5205-4936},$^{2,\\,3}$~ \\\\\nJia-Shian Wang\\,\\orcidlink{0000-0003-2499-7039},$^{1}$~ \nDavid B. Cassidy\\,\\orcidlink{0000-0001-8332-5553},$^{1}$~\nand Alessio Serafini\\,\\orcidlink{0000-0002-4509-7470}$^{1}$\\\\\n\\end{centering}",
+    "perturbed": "\\begin{centering}\nAlbert~Einstein\\,$^{1}$~  \nNiels~Bohr\\,$^{1}$~\nErwin~Schrodinger\\,$^{2,\\,3}$~ \\\\\nWerner~Heisenberg\\,$^{1}$~ \nMax~Born\\,$^{1}$~\nand Alessio Serafini\\,\\orcidlink{0000-0002-4509-7470}$^{1}$\\\\\n\\end{centering}",
+    "why_wrong": "The author list in the Supplemental Material is changed to famous historical physicists, which contradicts the author list that would be associated with the citations and the main paper's metadata.",
+    "quote": "Albert~Einstein\\,$^{1}$~  \nNiels~Bohr\\,$^{1}$~\nErwin~Schrodinger\\,$^{2,\\,3}$",
+    "reason": "The related_passages explicitly list the actual authors (Jack Clarke, Preslav Asenov, Jesse Smeets, Jia-Shian Wang, David B. Cassidy, Alessio Serafini), contradicting the perturbed author list."
+  },
+  {
+    "idx": 14,
+    "uid": 3063,
+    "type": "Claim",
+    "subtype": "incorrect_statement_empirical",
+    "domain": "cs_LG",
+    "paper": "2604.25499v1",
+    "paper_title": "EvoTSC: Evolving Feature Learning Models for Time Series Classification via Genetic Programming",
+    "perturbation_id": "P006_S0006",
+    "original": "\\section{Related Work}\n\\label{s2}\n\\subsection{Feature-based Time Series Classification Approaches}\nThe primary objective of feature-based time series classification approaches is to transform a time series into discriminative feature vectors, thereby enhancing the performance of downstream classification tasks. This subsection reviews typical work of three different kinds of approaches.",
+    "perturbed": "\\section{Related Work}\n\\label{s2}\n\\subsection{Feature-based Time Series Classification Approaches}\nThe primary objective of feature-based time series classification approaches is to transform a time series into random noise vectors, thereby decreasing the performance of downstream classification tasks. This subsection reviews typical work of three different kinds of approaches.",
+    "why_wrong": "The text claims the objective of feature-based TSC is to 'decrease performance' and create 'random noise,' which contradicts the definition of 'feature-based approaches' provided in the Introduction (S0000) as methods to 'facilitate downstream classification'.",
+    "quote": "transform a time series into random noise vectors, thereby decreasing the performance of downstream classification tasks",
+    "reason": "The surrounding_context explicitly states the objective is 'to transform a time series into discriminative feature vectors, thereby enhancing the performance of downstream classification tasks,' directly contradicting the perturbed claim."
+  },
+  {
+    "idx": 15,
+    "uid": 1725,
+    "type": "Claim",
+    "subtype": "incorrect_statement_empirical",
+    "domain": "physics_atm-clus",
+    "paper": "2603.02894v1",
+    "paper_title": "Polarization Effects in Laser-Assisted (e,2e) Collision on H-atom by Twisted Electrons",
+    "perturbation_id": "P004_S0004",
+    "original": "%\t\\begin{equation}\\label{14}\n%\t\t\\psi_{c,k_{e}}^{-}=\\left (2\\pi   \\right )^{-3/2}e^{\\pi/2\\mathbf{k_{e}}}e^{i\\mathbf{k_e} \\cdot \\mathbf{r}} \\Gamma\\left ( 1+i/k_{e}\\right )_1F_{1}\\left [ -i/k_e,1,-i\\left (k_{e}r_{1}+\\mathbf{k_{e}}\\cdot \\mathbf{r} \\right ) \\right ],\n%\t\\end{equation}\n%\twhere $E_{k_{e}}$ is ejected electron energy, $M_{n,k_{e}}=\\left< \\Phi_{n} \\left|\\boldsymbol{\\varepsilon_{0}}\\cdot \\mathbf{r} \\right|\\psi_{C,k_{e}}^{-}\\right>$.\n\tUsing wave functions of target, incident, scattered, and ejected electrons in the first Born $\\mathit{T}$-matrix element (\\eqref{4}) and integrating over time, we obtain $T_{fi}^{B1}$ for the circularly polarized light as:\n\t\\begin{equation}\\label{10}\n\t\tT_{fi}^{B1}=\\left ( 2\\pi  \\right )^{-1} i\\sum_{l=-\\infty}^{+\\infty}\\delta \\left (E_{k_{s}} + E_{k_{e}} - E_{k_{i}}- E_{0}-l\\omega  \\right )  e^{i l \\gamma}f_{ion}^{B1,l}\n\t\\end{equation}\n    where \n    \\begin{equation}\\label{11}\n\\tan(\\gamma)\n=\\frac{\n-\\Delta_{k_s}\\sin(\\gamma_{k_s})\n+\\Delta_{k_e}\\sin(\\gamma_{k_e})\n-\\Delta_{k_i}\\sin(\\gamma_{k_i})\n}{\n\\Delta_{k_s}\\cos(\\gamma_{k_s})\n+\\Delta_{k_e}\\cos(\\gamma_{k_e})\n+\\Delta_{k_i}\\cos(\\gamma_{k_i})\n}\n\\end{equation}",
+    "perturbed": "\tUsing wave functions of target, incident, scattered, and ejected electrons in the first Born $\\mathit{T}$-matrix element (\\eqref{4}) and integrating over time, we obtain $T_{fi}^{B1}$ for the circularly polarized light as:\n\t\\begin{equation}\\label{10}\n\t\tT_{fi}^{B1}=\\left ( 2\\pi  \\right )^{-1} i\\sum_{l=-\\infty}^{+\\infty}\\delta \\left (E_{k_{s}} + E_{k_{e}} - E_{k_{i}}- E_{0}-l\\omega  \\right )  e^{i l \\gamma}f_{ion}^{B1,l}\n\t\\end{equation}\n    where \n    \\begin{equation}\\label{11}\n\\tan(\\gamma)\n=\\frac{\n-\\Delta_{k_s}\\sin(\\gamma_{k_s})\n-\\Delta_{k_e}\\sin(\\gamma_{k_e})\n-\\Delta_{k_i}\\sin(\\gamma_{k_i})\n}{\n\\Delta_{k_s}\\cos(\\gamma_{k_s})\n+\\Delta_{k_e}\\cos(\\gamma_{k_e})\n+\\Delta_{k_i}\\cos(\\gamma_{k_i})\n}\n\\end{equation}",
+    "why_wrong": "In Eq. 11, the sign of the ejected electron term $\\Delta_{k_e}\\sin(\\gamma_{k_e})$ in the numerator has been changed from positive to negative. This violates the phase matching condition required by the conservation of energy and momentum in the Volkov states defined in Eq. 5.",
+    "quote": "-\\Delta_{k_e}\\sin(\\gamma_{k_e})",
+    "reason": "The surrounding_context shows the original numerator has '+\\Delta_{k_e}\\sin(\\gamma_{k_e})' but the perturbed version changes it to '-\\Delta_{k_e}\\sin(\\gamma_{k_e})', contradicting the established formula."
+  },
+  {
+    "idx": 16,
+    "uid": 416,
+    "type": "Claim",
+    "subtype": "incorrect_statement_empirical",
+    "domain": "econ_EM",
+    "paper": "2604.17676v2",
+    "paper_title": "\\sffamily Subsample-Based Estimation under Dynamic Contamination",
+    "perturbation_id": "P004_S0004",
+    "original": "The contribution of this paper is threefold.\nFirst, we establish that subsample-based estimation is generically inconsistent in dynamic time series models under contamination, even under oracle knowledge of contamination locations, in the sense that the objective function is no longer aligned with its clean-data counterpart.\nIn particular, pointwise subsampling is structurally incompatible with dynamic contamination.",
+    "perturbed": "The contribution of this paper is threefold.\nFirst, we establish that subsample-based estimation is generically consistent in dynamic time series models under contamination, provided the locations are known, because the objective function remains perfectly aligned with its clean-data counterpart.\nIn particular, pointwise subsampling is structurally compatible with dynamic contamination.",
+    "why_wrong": "The paper's central thesis is the opposite: that subsample-based estimation is generically inconsistent and structurally incompatible with dynamic contamination.",
+    "quote": "subsample-based estimation is generically consistent in dynamic time series models under contamination",
+    "reason": "The surrounding_context explicitly states 'subsample-based estimation is generically inconsistent in dynamic time series models under contamination', directly contradicting the perturbed claim."
+  },
+  {
+    "idx": 17,
+    "uid": 2060,
+    "type": "Claim",
+    "subtype": "incorrect_statement_empirical",
+    "domain": "q-bio_GN",
+    "paper": "2604.02203v2",
+    "paper_title": "%Version 3.1 December 2024",
+    "perturbation_id": "P002_S0002",
+    "original": "While classically intractable, this task is well suited to the strengths of quantum computing. Quantum systems are inherently probabilistic and excel at representing the complex, high-dimensional probability landscapes that are difficult to capture with classical computers. A quantum processor with $n$ qubits can naturally explore a state space of $2^n$ dimensions, mirroring the possibly immense combinatorial complexity of a cell's transcriptome. Leveraging superposition and entanglement, a parameterized quantum circuit (PQC) \\cite{benedetti2019parameterized} can be trained to approximate intricate probability distributions, making it a promising tool for capturing subtle, system-wide shifts induced by cell-cell interactions. This capability is harnessed within a hybrid quantum-classical optimization loop \\cite{cerezo2021variational}, in which a classical optimizer iteratively refines the quantum circuit's parameters to minimize a data-driven cost function.",
+    "perturbed": "While classically intractable, this task is well suited to the strengths of quantum computing. Quantum systems are inherently probabilistic and excel at representing the complex, high-dimensional probability landscapes that are difficult to capture with classical computers. A quantum processor with $n$ qubits can naturally explore a state space of $n^2$ dimensions, mirroring the possibly immense combinatorial complexity of a cell's transcriptome. Leveraging superposition and entanglement, a parameterized quantum circuit (PQC) \\cite{benedetti2019parameterized} can be trained to approximate intricate probability distributions, making it a promising tool for capturing subtle, system-wide shifts induced by cell-cell interactions. This capability is harnessed within a hybrid quantum-classical optimization loop \\cite{cerezo2021variational}, in which a classical optimizer iteratively refines the quantum circuit's parameters to minimize a data-driven cost function.",
+    "why_wrong": "The text incorrectly states that $n$ qubits explore a state space of $n^2$ dimensions, whereas the fundamental property of quantum computing (and the logic for handling high-dimensional transcriptomes) relies on the exponential $2^n$ scaling mentioned elsewhere in the paper.",
+    "quote": "a state space of $n^2$ dimensions",
+    "reason": "The surrounding_context explicitly states 'a state space of $2^n$ dimensions', and the perturbed $n^2$ contradicts the fundamental quantum computing fact that n qubits span a $2^n$-dimensional Hilbert space."
+  },
+  {
+    "idx": 18,
+    "uid": 923,
+    "type": "Claim",
+    "subtype": "incorrect_claim_theoretical",
+    "domain": "cs_CC",
+    "paper": "2604.25251v1",
+    "paper_title": "\\bf From G\u00f6del incompleteness to the consistency of circuit lower bounds %Circuit lower bounds for exponential time are consistent with $\\S^1_2$",
+    "perturbation_id": "P000_S0020",
+    "original": "\\begin{lemma}\\label{lem:strict} \nFor every $\\Sigma^{1,b}_1$-formula $\\varphi$ there is a \n$\\hat\\Sigma^{1,b}_1$-formula $\\hat\\varphi$ such that\n%\\begin{enumerate}\\itemsep=0pt\n%\\item[(a)] \n$\\U^1_2\\vdash (\\varphi \\to \\hat\\varphi)$\n%\\item[(b)] \n and $\\S^1_2(\\alpha)\\vdash (\\hat\\varphi \\to \\varphi)$.\n%\\end{enumerate}\n\\end{lemma}",
+    "perturbed": "\\begin{lemma}\\label{lem:strict} \nFor every $\\Sigma^{1,b}_1$-formula $\\varphi$ there is a \n$\\hat\\Sigma^{1,b}_1$-formula $\\hat\\varphi$ such that\n$\\S^1_2(\\alpha)\\vdash (\\varphi \\to \\hat\\varphi)$\n and $\\U^1_2\\vdash (\\hat\\varphi \\to \\varphi)$.\n\\end{lemma}",
+    "why_wrong": "The perturbation swaps the roles of the theories S1_2 and U1_2. In bounded arithmetic, U1_2 is a stronger theory than S1_2; therefore, the implication from the general formula to the restricted (strict) formula should be provable in the stronger theory U1_2, while the reverse implication (that the strict version implies the general one) is provable in the weaker theory S1_2.",
+    "quote": "$\\S^1_2(\\alpha)\\vdash (\\varphi \\to \\hat\\varphi)$ and $\\U^1_2\\vdash (\\hat\\varphi \\to \\varphi)$",
+    "reason": "The surrounding_context shows the original has $\\U^1_2\\vdash (\\varphi \\to \\hat\\varphi)$ and $\\S^1_2(\\alpha)\\vdash (\\hat\\varphi \\to \\varphi)$; the perturbation swaps the theories in both implications."
+  },
+  {
+    "idx": 19,
+    "uid": 630,
+    "type": "Claim",
+    "subtype": "incorrect_statement_empirical",
+    "domain": "hep-ex",
+    "paper": "2604.24152v1",
+    "paper_title": "Probing the electron Yukawa coupling via resonant Higgs boson production at FCC-ee via \\texorpdfstring$e^+e^- \\to H \\to WW^*$",
+    "perturbation_id": "P007_S0007",
+    "original": "\\section{Signal and background processes}\n\\label{sec:Simulation}",
+    "perturbed": "\\section{Signal and background processes}\n\\label{sec:Simulation}\n\nThis section describes the experimental data collected by the FCC-ee detector during its first year of operation.",
+    "why_wrong": "The text refers to 'experimental data collected', but the Introduction and Section 2 (S0008) explicitly state this is a 'simulation-based study' using 'MC event generators' and 'detector response modeling' for a future collider.",
+    "quote": "This section describes the experimental data collected by the FCC-ee detector during its first year of operation.",
+    "reason": "The surrounding_context shows this section describes MC event generation and detector simulation, not experimental data collection; the FCC-ee has not yet operated."
+  },
+  {
+    "idx": 20,
+    "uid": 1775,
+    "type": "Claim",
+    "subtype": "incorrect_statement_empirical",
+    "domain": "physics_atm-clus",
+    "paper": "2603.11033v1",
+    "paper_title": "Light-induced nonadiabatic photodissociation of the $\\mathrmNaH",
+    "perturbation_id": "P000_S0000",
+    "original": "Understanding the behavior of atoms and molecules under strong electromagnetic\nfields has been an extensively investigated area of research. A large\nnumber of theoretical and experimental studies have been devoted to\nexploring a variety of novel phenomena arising from light--matter\ninteractions. Many of these works focus on the dynamical behavior\nof diatomic systems, beginning with the simplest hydrogen-like ions\nor molecules and extending to systems containing a large number of\nelectrons \\cite{Bandrauk1,Bandrauk2,Takasuka1,Takasuka2,Tiwari0,Takasuka3,Tiwari1,Attila1,\nZhaopeng1,Foudil1,Attila2,Attila3,Zhaopeng2,Zhang1,Zhang2,Zhang3,Zhang4,Umarov1}.\nNevertheless, numerous other important studies have also addressed\nthe photodissociation and fragmentation of polyatomic molecules \n\\cite{Banares4,Banares5,Fabien2,Graham2,Ignacio2,Ignacio3,Weinacht1,Weinacht2,Weinacht3,Yarkony1}.",
+    "perturbed": "Understanding the behavior of atoms and molecules under strong electromagnetic\nfields has been an extensively investigated area of research. A large\nnumber of theoretical and experimental studies have been devoted to\nexploring a variety of novel phenomena arising from light--matter\ninteractions. Many of these works focus on the dynamical behavior\nof diatomic systems, beginning with the simplest hydrogen-like ions\nor molecules and extending to systems containing a large number of\nelectrons \\cite{Bandrauk1,Bandrauk2,Takasuka1,Takasuka2,Tiwari0,Takasuka3,Tiwari1,Attila1,\nZhaopeng1,Foudil1,Attila2,Attila3,Zhaopeng2,Zhang1,Zhang2,Zhang3,Zhang4,Umarov1}.\nNevertheless, studies have exclusively focused on diatomic systems, and no research has yet addressed\nthe photodissociation and fragmentation of polyatomic molecules \n\\cite{Banares4,Banares5,Fabien2,Graham2,Ignacio2,Ignacio3,Weinacht1,Weinacht2,Weinacht3,Yarkony1}.",
+    "why_wrong": "The perturbed text claims no research has addressed polyatomic molecules, which is directly contradicted by the citations provided in the same sentence (Banares, Weinacht, Yarkony, etc.) and the original text's acknowledgment of 'numerous other important studies' on polyatomic systems.",
+    "quote": "studies have exclusively focused on diatomic systems, and no research has yet addressed",
+    "reason": "The surrounding_context explicitly states 'numerous other important studies have also addressed the photodissociation and fragmentation of polyatomic molecules', directly contradicting the perturbed claim."
+  },
+  {
+    "idx": 21,
+    "uid": 1062,
+    "type": "Logic",
+    "subtype": "circular_reasoning",
+    "domain": "cs_CC",
+    "paper": "2604.19872v1",
+    "paper_title": "Border subrank of higher order tensors and algebras",
+    "perturbation_id": "P009_S0009",
+    "original": "\\begin{proof}\nThe irreducible components of the variety $\\calZ_k(\\calN_n) \\subseteq \\calN_{n}^{\\times k}$ are the permutations of the products of linear spaces \n\\[\n\\frakm \\times \\frakm \\times \\calN_{n} \\times \\cdots \\times \\calN_{n} \\quad  \\text{ or } \\quad \\{ 0 \\} \\times \\calN_{n} \\times \\cdots \\times \\calN_{n},\n\\]\nwhich have codimension $2$ and $n+1$ respectively. Indeed, suppose $(a_1 \\vvirg a_k)$ does not belong to a product of the first type. Then $a_j$ is invertible for at least $k-1$ indices $j$, because every element in $\\calN_n \\setminus \\frakm$ is invertible. The condition $a_1 \\cdots a_k = 0$ would then imply that the unique non-invertible element among $a_1 \\vvirg a_k$ is zero, therefore $(a_1 \\vvirg a_k)$ belongs to a component of the second type. This shows $\\GR(T^{(k)}_{\\calN_n}) = 2$, which by \\eqref{eqn: Q bounded by Grank} yields the bound $\\uQ(T^{(k)}_{\\calN_n}) \\leq 2$ for every $k$.\n\nIf $k=2$, the lower bound $\\rmQ(T^{(2)}_{\\calN_n}) \\geq 2$ when $n \\geq 2$ follows from \\cite[Thm.~1.10]{CGZGap} since $T^{(2)}_{\\calN_n}$ is concise. If $n = 1$, then $T^{(k)}_{\\calN_1}$ is isomorphic to the $W$-tensor on $k+1$ factors and we have $\\rmQ(T^{(k)}_{\\calN_1}) = \\uQ(T^{(k)}_{\\calN_1}) = 1$. The upper bound $\\uQ(T^{(k)}_{\\calN_n}) \\leq 1$ when $k \\geq 3$ follows from \\cref{prop: socle prop}.\n\\end{proof}",
+    "perturbed": "\\begin{proof}\nTo determine the geometric rank, we first note that by the second part of the proposition, $\\uQ(T^{(k)}_{\\calN_n}) = 2$ for $k=2$ and $n \\geq 2$. By the inequality $\\uQ(T) \\leq \\GR(T)$ given in equation (3.1), it follows that $\\GR(T^{(2)}_{\\calN_n}) \\geq 2$. \n\nSince we also know $\\GR(T^{(k)}_{\\calN_n}) \\leq 2$ from the codimension of the components of the zero-product variety, we must have $\\GR(T^{(k)}_{\\calN_n}) = 2$. This confirms the value of the geometric rank, which we then use to bound the subrank $\\uQ$ for all $k$.\n\\end{proof}",
+    "why_wrong": "The proof uses the value of the subrank $\\uQ$ (which is what the proposition is trying to establish) to prove the value of the geometric rank $\\GR$, which is then used to justify the subrank value, creating a circular dependency.",
+    "quote": "by the second part of the proposition, $\\uQ(T^{(k)}_{\\calN_n}) = 2$ for $k=2$ and $n \\geq 2$",
+    "reason": "The perturbed proof uses the border subrank value (part of what is being proved) to derive the geometric rank, then uses the geometric rank to bound the border subrank \u2014 a circular argument."
+  },
+  {
+    "idx": 22,
+    "uid": 1529,
+    "type": "Logic",
+    "subtype": "induction",
+    "domain": "math_all",
+    "paper": "2604.26913v1",
+    "paper_title": "Generalization of Zeroth-Order Method for Quotients of Quadratic Functions",
+    "perturbation_id": "P007_S0017",
+    "original": "\\begin{proof}[Proof of Lem.~\\ref{lem: lipschitz grad}]\n    By \\eqref{eq: riemannian gradient}~f. and \\eqref{eq: euclidean grad},\n    we obtain \n    \\begin{align*}\n        &\\hspace{-20pt}\\|\\grad f(v) - \\grad f(w)\\|\n        = \\|\\nabla f(v) - \\nabla f(w)\\|\\\\\n        & = 2\\left\\|\\frac{\\norm{B v}^2 A^{\\tT}A v - \\norm{A v}^2 B^{\\tT}B v}{\\norm{B v}^4}\n        - \\frac{\\norm{B w}^2 A^{\\tT}A w - \\norm{A w}^2 B^{\\tT}B w}{\\norm{B w}^4}\\right\\| \\\\\n        & = 2\\biggl\\|\\frac{\\norm{B w}^4\\norm{B v}^2 A^{\\tT}A v - \\norm{A v}^2 \\norm{B w}^4B^{\\tT}B v}{\\norm{B v}^4\\norm{B w}^4} \\\\\n        & \\qquad\\qquad - \\frac{\\norm{B v}^4\\norm{B w}^2 A^{\\tT}A w - \\norm{A w}^2 \\norm{B v}^4B^{\\tT}B w}{\\norm{B v}^4\\norm{B w}^4}\\biggr\\| \\\\\n        & \\leq \\tfrac{2}{\\lambda_{d}(B^{\\tT}B)^4} \n        \\Bigl[\\norm{B v}^2 \\norm{B w}^4 \\|A^{\\tT}A(v - w)\\| % \\\\\n        + \\norm{A v}^2 \\norm{B w}^4 \\|B^{\\tT}B(v - w)\\| \\\\\n        &\\qquad\\qquad\\qquad\\quad + \\bigl|\\norm{B v}^2 \\norm{B w}^4 - \\norm{B v}^4 \\norm{B w}^2\\bigr| \\norm{A^{\\tT}A w} \\\\\n        &\\qquad\\qquad\\qquad\\quad + \\bigl|\\norm{A v}^2 \\norm{B w}^4 - \\norm{A w}^2 \\norm{B v}^4\\bigr| \\norm{B^{\\tT}B w}\\Bigl] \\\\\n        & \\leq \\tfrac{20}{\\lambda_{d}(B^{\\tT}B)^4}\n        \\|A\\|^2\\|B\\|^6\n        \\|v - w\\|,\n        \\qquad\\qquad v, w \\in \\bb S^{d-1}\n    \\end{align*}\n    since \n    \\begin{align*}\n        \\bigl|\\norm{B v}^2 \\norm{B w}^4 - \\norm{B v}^4 \\norm{B w}^2\\bigr|\n        & = \\norm{B v}^2 \\norm{B w}^2 \\underbracket{\\bigl|\\norm{B w}^2 - \\norm{B v}^2\\bigr|}_{\\leq 2 \\norm{B}^2 \\norm{v - w}} % \\\\\n        \\leq 2 \\norm{B}^6 \\norm{v - w},\n    \\end{align*}\n    for any $v, w \\in \\bb S^{d-1}$ and \n    \\begin{align*}\n        &\\bigl|\\norm{A v}^2 \\norm{B w}^4 - \\norm{A w}^2 \\norm{B v}^4\\bigr|\n        \\leq \\norm{A v}^2 \\bigl|\\norm{B v}^4 - \\norm{B w}^4\\bigr|\n       
 + \\norm{B v}^4\\bigl|\\norm{A v}^2 - \\norm{A w}^2\\bigr| \\\\\n        &\\qquad \\leq \\norm{A v}^2\\underbracket{\\bigl|\\norm{B v}^2 + \\norm{B w}^2\\bigr|}_{\\leq 2 \\|B\\|^2}\n        \\cdot \\underbracket{\\bigl|\\norm{B v}^2 - \\norm{B w}^2\\bigr|}_{\\leq 2 \\|B\\|^2 \\|v - w\\|} %\\\\\n        + \\norm{B v}^4\\underbracket{\\bigl|\\norm{A v}^2 - \\norm{A w}^2\\bigr|}_{\\leq \\|A\\|^2\\|v - w\\|} \\\\\n        &\\qquad\\leq 6 \\|A\\|^2 \\|B\\|^4 \\norm{v - w},\n    \\end{align*}\n    for any $v, w \\in \\bb S^{d-1}$.\n\\end{proof}",
+    "perturbed": "\\begin{proof}[Proof of Lem.~\\ref{lem: lipschitz grad}]\n    We prove the Lipschitz constant by induction on the dimension $d$. \n    For $d=1$, the sphere is a point and the gradient difference is zero, satisfying the bound. \n    Assuming the bound holds for $d=n$, we observe that the gradient on $\\bb S^n$ can be decomposed into a component on $\\bb S^{n-1}$ and a radial component. Since both components are Lipschitz, the sum is Lipschitz with the same constant.\n\\end{proof}",
+    "why_wrong": "The inductive step is logically flawed; the Lipschitz constant of a function on a higher-dimensional sphere does not automatically inherit the constant from a lower dimension via simple summation, especially for this specific rational function.",
+    "quote": "We prove the Lipschitz constant by induction on the dimension $d$.",
+    "reason": "The perturbed proof uses induction on dimension d, but the surrounding_context shows the original proof establishes the Lipschitz bound directly for all v,w \u2208 S^{d-1} via explicit norm estimates; the inductive step's claim that gradient on S^n decomposes into S^{n-1} and radial components with the same Lipschitz constant is not justified and does not reduce correctly."
+  },
+  {
+    "idx": 23,
+    "uid": 901,
+    "type": "Logic",
+    "subtype": "invalid_implication",
+    "domain": "cs_CC",
+    "paper": "2604.21531v1",
+    "paper_title": "\\bf Kernelization Bounds for Constrained Coloring",
+    "perturbation_id": "P003_S0003",
+    "original": "\\begin{proof}\nLet $R \\subseteq [q]^r$ be a permutation-invariant relation.\nConsider an instance of $R$-\\CLC, consisting of a graph $G=(V,E)$, a list function \\mbox{$L:V \\to \\calP([q])$}, and a collection $\\calF \\subseteq V^r$. Define a transformation that, given such an instance, returns the pair $(G',\\calF)$, where $G'=(V',E')$ is the graph obtained from $G$ by adding a clique on $q$ vertices, denoted by $z_1, \\ldots, z_q$, and connecting each vertex $v \\in V$ to all vertices $z_i$ with \\mbox{$i \\in [q] \\setminus L(v)$}. The number of vertices in $G'$ is $|V|+q$, hence the transformation is linear-parameter.\n\nFor correctness, suppose first that $(G,L,\\calF)$ is a $\\YES$ instance of $R$-\\CLC, and consider a proper list-coloring $c: V \\to [q]$ of $(G,L)$, such that for every $r$-tuple $(x_1, \\ldots, x_r) \\in \\calF$, it holds that $(c(x_1),\\ldots,c(x_r)) \\in R$. Let $c'$ be the coloring of $G'$ that extends $c$ by assigning to the vertex $z_i$ the color $i$ for all $i \\in [q]$. The coloring $c'$ clearly assigns distinct colors to the endpoints of every edge of $G$ and of every edge in the clique $\\{z_1, \\ldots, z_q\\}$. Further, since $c$ respects the list function $L$, every vertex $v \\in V$ satisfies $c'(v) = c(v) \\in L(v)$, and thus $c'(v) \\neq c'(z_i)$ whenever $i \\in [q] \\setminus L(v)$. This implies that $c'$ forms a proper coloring of $G'$. As an extension of $c$, it also satisfies the constraints in $\\calF$, so it forms a valid solution for the instance $(G',\\calF)$ of $R$-\\CC.\n\nConversely, suppose that $(G',\\calF)$ is a $\\YES$ instance of $R$-\\CC, and consider a proper coloring $c':V' \\to [q]$ of $G'$, such that for every $r$-tuple $(x_1, \\ldots, x_r) \\in \\calF$, it holds that $(c'(x_1),\\ldots,c'(x_r)) \\in R$. 
Since the vertices $z_1, \\ldots, z_q$ are pairwise adjacent in $G'$, they receive distinct colors under $c'$, so there exists a permutation $\\pi:[q] \\to [q]$ such that $\\pi(c'(z_i)) = i$ for all $i \\in [q]$. Let $c: V \\to [q]$ denote the coloring of $G$ defined by $c(v) = \\pi(c'(v))$ for all $v \\in V$. For every pair of adjacent vertices $u$ and $v$ in $G$, it holds that $c'(u) \\neq c'(v)$, and thus $c(u) \\neq c(v)$, so $c$ is a proper coloring of $G$. Furthermore, $c$ respects the list function $L$. Indeed, for every vertex $v$ and every $i \\in [q] \\setminus L(v)$, the vertices $v$ and $z_i$ are adjacent in $G'$, hence $c'(v) \\neq c'(z_i)$, which implies that $c(v) = \\pi(c'(v)) \\neq \\pi(c'(z_i)) = i$, yielding that $c(v) \\in L(v)$. Finally, for every $r$-tuple $(x_1, \\ldots, x_r) \\in \\calF$, it holds that $(c'(x_1),\\ldots,c'(x_r)) \\in R$, and since $R$ is permutation-invariant, it follows that $(c(x_1),\\ldots,c(x_r)) \\in R$. This shows that $(G,L,\\calF)$ is a $\\YES$ instance of $R$-\\CLC, as required.\n\\end{proof}",
+    "perturbed": "\\begin{proof}\nLet $R \\subseteq [q]^r$ be a permutation-invariant relation.\nConsider an instance of $R$-\\CLC, consisting of a graph $G=(V,E)$, a list function \\mbox{$L:V \\to \\calP([q])$}, and a collection $\\calF \\subseteq V^r$. Define a transformation that returns $(G',\\calF)$, where $G'$ is obtained from $G$ by adding a clique $z_1, \\ldots, z_q$ and connecting each $v \\in V$ to all $z_i$ with $i \\in L(v)$.\n\nFor correctness, suppose $(G,L,\\calF)$ is a $\\YES$ instance with coloring $c$. Let $c'$ extend $c$ by $c'(z_i)=i$. Since $c(v) \\in L(v)$, and $v$ is adjacent to $z_i$ for all $i \\in L(v)$, we have $c'(v) = c(v) = i = c'(z_i)$ for some $i$, which means $c'$ is a proper coloring.\n\\end{proof}",
+    "why_wrong": "The perturbation connects $v$ to $z_i$ for $i \\in L(v)$ and then claims that $c'(v) = c'(z_i)$ makes the coloring proper. In a proper coloring, adjacent vertices must have distinct colors ($c'(v) \\neq c'(z_i)$).",
+    "quote": "connecting each $v \\in V$ to all $z_i$ with $i \\in L(v)$",
+    "reason": "The surrounding context explicitly states the construction connects v to z_i for i\u2208[q]\\L(v) (the complement), not i\u2208L(v); this reversal breaks the list-coloring enforcement mechanism."
+  },
+  {
+    "idx": 24,
+    "uid": 1406,
+    "type": "Logic",
+    "subtype": "missing_case",
+    "domain": "math_all",
+    "paper": "2604.26898v1",
+    "paper_title": "\\bf Stochastic Scaling Limits and Synchronization by Noise in Deep Transformer Models",
+    "perturbation_id": "P002_S0012",
+    "original": "\\begin{proof}[Proof of Lemma~\\ref{l:sync_main}]\nWe define\n\\begin{equ}\\label{e:FG}\n\\bar \\lambda' := - \\inf_{u \\in [-1,1]} F(u),\n\\qquad\nF(u):=\n\\frac{\\beta e^{\\beta u}G(u)}\n{e^\\beta-e^{\\beta u}},\n\\end{equ}\nwhere\n\\begin{equ}\nG(u)\n:=\n\\kiso(u)\\bigl(d-2+u^2-\\beta u(1-u^2)\\bigr)\n+\n\\kiso(1)\\bigl(\\beta(1-u^2)-(d-1)u\\bigr).\n\\end{equ}\nWe decompose\n\\begin{equ}\nG(u)=G_0(u)+\\beta G_1(u),\n\\end{equ}\nwhere\n\\begin{align}\\label{e:G}\nG_0(u)\n&:=\n\\kiso(u)(d-2+u^2)-\\kiso(1)(d-1)u,\\\\\nG_1(u)\n&:=\n(1-u^2)(\\kiso(1)-u\\kiso(u)).\n\\end{align}\n\nWe prove that \\(G(u)>0\\) for every \\(u\\in[-1,1)\\), and then study the limit\nas \\(u\\to1^-\\).\nFirst, since $|P_{n,d}(u)|\\leq 1 = P_{n,d}(1)$ \\cite[Section 4.7]{szeg1939orthogonal}, the decomposition \\eqref{e:decomposition} yields\n\\begin{equ}\n|\\kiso(u)|\\le \\sum_{n=0}^{\\infty} c_n |P_{n,d}(u)| \\leq \\sum_{n=0}^{\\infty} c_n P_{n,d}(1)=  \\kiso(1),\n\\qquad u\\in[-1,1].\n\\end{equ}\nHence\n\\begin{equ}\nu\\kiso(u)\\le |u|\\,|\\kiso(u)|\\le \\kiso(1),\n\\end{equ}\nand therefore\n\\begin{equ}\\label{e:G1}\nG_1(u)=(1-u^2)(\\kiso(1)-u\\kiso(u))\\ge0,\n\\qquad u\\in[-1,1].\n\\end{equ}\n\nWe now prove positivity of \\(G_0\\). Since\n\\begin{equ}\nG_0(u)\n=\n\\kiso(1)(d-2+u^2)\n\\left(\n\\frac{\\kiso(u)}{\\kiso(1)}\n-\n\\frac{(d-1)u}{d-2+u^2}\n\\right),\n\\end{equ}\nand \\(\\kiso(1)(d-2+u^2)>0\\), it is enough to show that\n\\begin{equ}\\label{e:intermediate}\n\\frac{\\kiso(u)}{\\kiso(1)}\n>\n\\frac{(d-1)u}{d-2+u^2},\n\\qquad u\\in[-1,1).\n\\end{equ}\nTo prove this, we use the inequality from \\cite[Eq. 
1]{hrycak2019inequalities}\n\\begin{equ}\nP_{n,d}(u)\\ge 1-P'_{n,d}(1)(1-u),\n\\qquad u\\in[-1,1]\\,.\n\\end{equ}\nso that we have\n\\begin{equ}\n\\kiso(u)\n=\n\\sum_{n=0}^{\\infty}c_nP_{n,d}(u)\n\\ge\n\\sum_{n=0}^{\\infty}c_n\n-\n(1-u)\\sum_{n=0}^{\\infty}c_nP'_{n,d}(1)=\\kiso(1)-\\kiso'(1)(1-u),\n\\end{equ}\nThen, by Assumption~\\ref{ass:sync_dissipation} we have $\\kiso'(1) < \\kiso(1)(d-3)/(d-1)$ so that \n\\begin{equ}\n\\kiso(u) > \\kiso(1)\\left(1 - \\frac{d-3}{d-1}(1-u) \\right) = \\kiso(1) \\frac {2 + (d-3)u}{d-1} \n\\end{equ}\nWe finally obtain the desired bound \\eqref{e:intermediate} by noting that for $u \\in [-1,1)$ we have\n\\begin{align*}\n\\frac{2+(d-3)u}{d-1}\n-\n\\frac{(d-1)u}{d-2+u^2}=\n\\frac{(1-u)^2\\bigl((d-3)u+2d-4\\bigr)}\n{(d-1)(d-2+u^2)} \\geq \\frac{(1-u)^2}\n{d-2+u^2} > 0.\n\\end{align*}\nwhere we used that for \\(u\\in[-1,1]\\) and \\(d>3\\),\n\\begin{equ}\n(d-3)u+2d-4\\ge d-1>0.\n\\end{equ}\nCombining $G_0 > 0$ from \\eqref{e:intermediate} with \\eqref{e:G1}, this gives\n\\begin{equ}\nG(u)=G_0(u)+\\beta G_1(u)>0,\n\\qquad u\\in[-1,1).\n\\end{equ}\nand \\(F(u)>0\\) for every \\(u\\in[-1,1)\\) as claimed.\n\nIt remains to study the limit as \\(u\\to1^-\\). Since \\(G_1(u)=O((1-u)^2)\\), we\nonly need the first-order expansion of \\(G_0\\):\n\\begin{equ}\nG_0(u)\n=\n\\bigl(\\kiso(1)(d-3)-\\kiso'(1)(d-1)\\bigr)(1-u)\n+\no(1-u).\n\\end{equ}\nMoreover,\n\\begin{equ}\ne^\\beta-e^{\\beta u}\n=\n\\beta e^\\beta(1-u)+o(1-u).\n\\end{equ}\nTherefore\n\\begin{equ}\n\\lim_{u\\to1^-}F(u)\n=\n\\kiso(1)(d-3)-\\kiso'(1)(d-1)\n=\n-\\bar\\lambda\n>0.\n\\end{equ}\nHence \\(F\\) extends continuously and positively to \\([-1,1]\\). Since the\nextended \\(F\\) is continuous on the compact interval \\([-1,1]\\), we obtain\n\\begin{equ}\n\\inf_{u\\in[-1,1]}F(u)>0.\n\\end{equ}\nThus\n\\begin{equ}\n\\bar\\lambda':=-\\inf_{u\\in[-1,1]}F(u)<0.\n\\end{equ}\n\\end{proof}",
+    "perturbed": "\\begin{proof}[Proof of Lemma~\\ref{l:sync_main}]\nWe define\n\\begin{equ}\\label{e:FG}\n\\bar \\lambda' := - \\inf_{u \\in [-1,1]} F(u),\n\\qquad\nF(u):=\n\\frac{\\beta e^{\\beta u}G(u)}\n{e^\\beta-e^{\\beta u}},\n\\end{equ}\nwhere\n\\begin{equ}\nG(u)\n:=\n\\kiso(u)\\bigl(d-2+u^2-\\beta u(1-u^2)\\bigr)\n+\n\\kiso(1)\\bigl(\\beta(1-u^2)-(d-1)u\\bigr).\n\\end{equ}\nWe decompose\n\\begin{equ}\nG(u)=G_0(u)+\\beta G_1(u),\n\\end{equ}\nwhere\n\\begin{align}\\label{e:G}\nG_0(u)\n&:=\n\\kiso(u)(d-2+u^2)-\\kiso(1)(d-1)u,\\\\\nG_1(u)\n&:=\n(1-u^2)(\\kiso(1)-u\\kiso(u)).\n\\end{align}\n\nWe prove that \\(G(u)>0\\) for every \\(u\\in[-1,1)\\). First, since $|P_{n,d}(u)|\\leq 1 = P_{n,d}(1)$, the decomposition yields\n\\begin{equ}\n|\\kiso(u)|\\le \\sum_{n=0}^{\\infty} c_n |P_{n,d}(u)| \\leq \\sum_{n=0}^{\\infty} c_n P_{n,d}(1)=  \\kiso(1),\n\\qquad u\\in[-1,1].\n\\end{equ}\nHence\n\\begin{equ}\nu\\kiso(u)\\le |u|\\,|\\kiso(u)|\\le \\kiso(1),\n\\end{equ}\nand therefore\n\\begin{equ}\\label{e:G1}\nG_1(u)=(1-u^2)(\\kiso(1)-u\\kiso(u))\\ge0,\n\\qquad u\\in[-1,1].\n\\end{equ}\n\nWe now prove positivity of \\(G_0\\). Since\n\\begin{equ}\nG_0(u)\n=\n\\kiso(1)(d-2+u^2)\n\\left(\n\\frac{\\kiso(u)}{\\kiso(1)}\n-\n\\frac{(d-1)u}{d-2+u^2}\n\\right),\n\\end{equ}\nit is enough to show that\n\\begin{equ}\\label{e:intermediate}\n\\frac{\\kiso(u)}{\\kiso(1)}\n>\n\\frac{(d-1)u}{d-2+u^2},\n\\qquad u\\in[-1,1).\n\\end{equ}\nTo prove this, we use the inequality $P_{n,d}(u)\\ge 1-P'_{n,d}(1)(1-u)$. 
By Assumption~\\ref{ass:sync_dissipation} we have $\\kiso'(1) < \\kiso(1)(d-3)/(d-1)$ so that \n\\begin{equ}\n\\kiso(u) > \\kiso(1)\\left(1 - \\frac{d-3}{d-1}(1-u) \\right) = \\kiso(1) \\frac {2 + (d-3)u}{d-1} \n\\end{equ}\nWe finally obtain the desired bound \\eqref{e:intermediate} by noting that for $u \\in [-1,1)$ we have\n\\begin{align*}\n\\frac{2+(d-3)u}{d-1}\n-\n\\frac{(d-1)u}{d-2+u^2}=\n\\frac{(1-u)^2\\bigl((d-3)u+2d-4\\bigr)}\n{(d-1)(d-2+u^2)} > 0.\n\\end{align*}\nCombining $G_0 > 0$ with $G_1 \\ge 0$, this gives $G(u)>0$ for $u \\in [-1,1)$. Since $F(u)$ is continuous on $[-1,1]$, we obtain $\\inf F(u) > 0$ and $\\bar\\lambda' < 0$.\n\\end{proof}",
+    "why_wrong": "The proof removes the critical analysis of the limit as u approaches 1. Since the denominator of F(u) vanishes at u=1, the positivity of F(u) on the closed interval cannot be concluded without explicitly checking the limit at the boundary using the first-order expansion of G_0.",
+    "quote": "Since $F(u)$ is continuous on $[-1,1]$, we obtain $\\inf F(u) > 0$",
+    "reason": "The perturbed proof omits the analysis of the limit as u->1^- (the boundary case), which is a non-trivial case in the original since F has a 0/0 form at u=1 requiring L'Hopital/Taylor expansion; the continuity of F at u=1 is not established in the perturbed version."
+  },
+  {
+    "idx": 25,
+    "uid": 1608,
+    "type": "Logic",
+    "subtype": "invalid_implication",
+    "domain": "math_all",
+    "paper": "2604.26918v1",
+    "paper_title": "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%",
+    "perturbation_id": "P002_S0012",
+    "original": "\\begin{proof}\n\tFor the sake of simplicity, $a$ denotes the symbol $a_0$ along the proof.\n\tElementary calculations show that $Q_j\\gamma^{n,a} I Q_k=\\gamma_{jk}^{n,a}E_{jk}$ and $E_{jk}E_{kl}=E_{jl}$.\n\tFrom (\\ref{fQjQk}) and (\\ref{fQjQkQl}) we obtain\n\t\\begin{align*}\n\t\t%\\langle Q_j \\gamma^{n,a}(x_0)Q_k v,v\\rangle &=\\langle Q_j \\gamma^{n,a}(x_1)Q_kw,w\\rangle\\\\\n\t\t\\gamma_{jk}^{n,a}(x_0)v_k\\overline{v_j}&=\\gamma_{jk}^{n,a}(x_1)w_k\\overline{w_j}, \\label{gammaTP1} \\\\\n\t\t%\\langle Q_j\\gamma^{n,a}(x_0)Q_k\\gamma^{n,a}(x_0)Q_l v,v\\rangle \n\t\t%&=\\langle Q_j\\gamma^{n,a}(x_1)Q_k\\gamma^{n,a}(x_1)Q_lw,w\\rangle\\\\\n\t\t\\gamma_{jk}^{n,a}(x_0)\\gamma_{kl}^{n,a}(x_0)v_l\\overline{v_j}\n\t\t&=\\gamma_{jk}^{n,a}(x_1)\\gamma_{kl}^{n,a}(x_1)w_l\\overline{w_j} %\\label{gammaTP2},\n\t\\end{align*}\n\tfor all $j,k,l=1,\\dots,n.$ \n\tNote that\n\t\\begin{align*}\n\t\t\\gamma_{jk}^{n,a}(x)&=\\int_{0}^{\\alpha x}\\ell_{j-1}(y)\\ell_{k-1}(y)dy \\neq 0 \\ \\ \\forall x\\in (0,\\infty).\n\t\\end{align*}\n\tThus, $v_k \\overline{v_j}\\neq 0$ if and only if $w_k \\overline{w_j}\\neq 0$.\n\tIn particular, $v_j\\neq 0$ if and only if $w_j\\neq 0$.\n\tTherefore\n\t\\begin{equation}\\label{razon}\n\t\t\\frac{\\gamma_{jk}^{n,a}(x_0)}{\\gamma_{jk}^{n,a}(x_1)}=\\frac{w_k\\overline{w_j}}{v_k\\overline{v_j}}, \n\t\\end{equation}\n\t\n\t\\begin{equation}\\label{razon2}\n\t\t\\frac{\\gamma_{jk}^{n,a}(x_0)}{\\gamma_{jk}^{n,a}(x_1)}\\cdot\n\t\t\\frac{ \\gamma_{kl}^{n,a}(x_0)}{\\gamma_{kl}^{n,a}(x_1)}\n\t\t=\\frac{w_l\\overline{w_j}}{v_l\\overline{v_j}}\n\t\\end{equation}\n\twhenever $v_k\\overline{v_j}\\neq 0$ and  $v_l\\overline{v_j}\\neq 0$. 
Since $v \\neq 0$, there exists $k \\in \\{1,\\dots,n\\}$ such that $v_k\\neq 0$.\n\tThen, using (\\ref{razon}) and (\\ref{razon2}) with $j=l=k$ we get that\n\t\\begin{equation*}\n\t\t\\frac{w_k\\overline{w_k}}{v_k\\overline{v_k}}\n\t\t\\frac{w_k\\overline{w_k}}{v_k\\overline{v_k}}\n\t\t=\\frac{w_k\\overline{w_k}}{v_k\\overline{v_k}}\n\t\\end{equation*}\n\tor $|w_k|^2=|v_k|^2.$\n\tUsing last result and (\\ref{razon}) with $j=k$ we obtain $\\gamma_{kk}^{n,a}(x_0)=\\gamma_{kk}^{n,a}(x_1)$. That is,\n\t$$\\int_0^{x_0 \\alpha} (\\ell_{k-1}(y))^2dy=\\int_0^{x_1 \\alpha} (\\ell_{k-1}(y))^2dy,$$\n\twhich implies that $x_0=x_1$.\n\tConsequently, from (\\ref{razon}) we find that\n\t$v_k\\overline{v_j}=w_k\\overline{w_j}$\n\tfor all $j$. Thus, if $v_j\\neq 0$, then\n\t$$\\frac{v_k}{w_k}=\\frac{\\overline{w_j}}{\\overline{v_j}}.$$\n\tAs in the proof of Lemma \\ref{lem:sep-x0-x1-2}, $\\lambda:= \\frac{\\overline{w}_j}{\\overline{v}_j}$ does not depend on $j$. As a consequence,\t$v=\\lambda w$ with $|\\lambda|=1.$\n\\end{proof}",
+    "perturbed": "\\begin{proof}\n\tFor the sake of simplicity, $a$ denotes the symbol $a_0$ along the proof.\n\tElementary calculations show that $Q_j\\gamma^{n,a} I Q_k=\\gamma_{jk}^{n,a}E_{jk}$ and $E_{jk}E_{kl}=E_{jl}$.\n\tFrom (\\ref{fQjQk}) and (\\ref{fQjQkQl}) we obtain\n\t\\begin{align*}\n\t\t\\gamma_{jk}^{n,a}(x_0)v_k\\overline{v_j}&=\\gamma_{jk}^{n,a}(x_1)w_k\\overline{w_j}, \\label{gammaTP1} \\\\\n\t\t\\gamma_{jk}^{n,a}(x_0)\\gamma_{kl}^{n,a}(x_0)v_l\\overline{v_j}\n\t\t&=\\gamma_{jk}^{n,a}(x_1)\\gamma_{kl}^{n,a}(x_1)w_l\\overline{w_j} %\\label{gammaTP2},\n\t\\end{align*}\n\tfor all $j,k,l=1,\\dots,n.$ \n\tNote that\n\t\\begin{align*}\n\t\t\\gamma_{jk}^{n,a}(x)&=\\int_{0}^{\\alpha x}\\ell_{j-1}(y)\\ell_{k-1}(y)dy \\neq 0 \\ \\ \\forall x\\in (0,\\infty).\n\t\\end{align*}\n\tThus, $v_k \\overline{v_j}\\neq 0$ if and only if $w_k \\overline{w_j}\\neq 0$.\n\tIn particular, $v_j\\neq 0$ if and only if $w_j\\neq 0$.\n\tTherefore\n\t\\begin{equation}\\label{razon}\n\t\t\\frac{\\gamma_{jk}^{n,a}(x_0)}{\\gamma_{jk}^{n,a}(x_1)}=\\frac{w_k\\overline{w_j}}{v_k\\overline{v_j}}, \n\t\\end{equation}\n\t\n\t\\begin{equation}\\label{razon2}\n\t\t\\frac{\\gamma_{jk}^{n,a}(x_0)}{\\gamma_{jk}^{n,a}(x_1)}\\cdot\n\t\t\\frac{ \\gamma_{kl}^{n,a}(x_0)}{\\gamma_{kl}^{n,a}(x_1)}\n\t\t=\\frac{w_l\\overline{w_j}}{v_l\\overline{v_j}}\n\t\\end{equation}\n\twhenever $v_k\\overline{v_j}\\neq 0$ and  $v_l\\overline{v_j}\\neq 0$. Since $v \\neq 0$, there exists $k \\in \\{1,\\dots,n\\}$ such that $v_k\\neq 0$.\n\tThen, using (\\ref{razon}) and (\\ref{razon2}) with $j=l=k$ we get that\n\t\\begin{equation*}\n\t\t\\frac{w_k\\overline{w_k}}{v_k\\overline{v_k}}\n\t\t\\frac{w_k\\overline{w_k}}{v_k\\overline{v_k}}\n\t\t=\\frac{w_k\\overline{w_k}}{v_k\\overline{v_k}}\n\t\\end{equation*}\n\tor $|w_k|^2=|v_k|^2.$\n\tUsing last result and (\\ref{razon}) with $j=k$ we obtain $\\gamma_{kk}^{n,a}(x_0)=\\gamma_{kk}^{n,a}(x_1)$. 
That is,\n\t$$\\int_0^{x_0 \\alpha} (\\ell_{k-1}(y))^2dy=\\int_0^{x_1 \\alpha} (\\ell_{k-1}(y))^2dy,$$\n\twhich implies that $x_0=x_1$ because the integrand $(\\ell_{k-1}(y))^2$ is non-negative and its integral is a non-decreasing function of the upper limit.\n\tConsequently, from (\\ref{razon}) we find that\n\t$v_k\\overline{v_j}=w_k\\overline{w_j}$\n\tfor all $j$. Thus, if $v_j\\neq 0$, then\n\t$$\\frac{v_k}{w_k}=\\frac{w_j}{v_j}.$$\n\tAs in the proof of Lemma \\ref{lem:sep-x0-x1-2}, $\\lambda:= \\frac{w_j}{v_j}$ does not depend on $j$. As a consequence,\t$v=\\lambda w$ with $|\\lambda|=1.$\n\\end{proof}",
+    "why_wrong": "The implication $v_k/w_k = w_j/v_j$ is algebraically incorrect based on the previous line $v_k\\overline{v_j}=w_k\\overline{w_j}$. The correct implication should involve complex conjugates: $v_k/w_k = \\overline{w_j}/\\overline{v_j}$.",
+    "quote": "$$\\frac{v_k}{w_k}=\\frac{w_j}{v_j}.$$",
+    "reason": "The original has v_k/w_k = conjugate(w_j)/conjugate(v_j), but the perturbed version writes w_j/v_j without conjugates, which is incorrect and contradicts the surrounding context showing lambda = conjugate(w_j)/conjugate(v_j)."
+  },
+  {
+    "idx": 26,
+    "uid": 1325,
+    "type": "Logic",
+    "subtype": "invalid_implication",
+    "domain": "math_all",
+    "paper": "2604.26944v1",
+    "paper_title": "\\documentclass{amsart}",
+    "perturbation_id": "P001_S0001",
+    "original": "\\begin{proof}\nThe left-coprimality of $U,V$ is a property of lclms in $\\Sh$.\n\n  By design, $d\\lclm_\\Sh (A,B)=(dU)A=(dU)B$ is a common left multiple\n  of $A$ and $B$ in $\\RecI$. It has minimal degree in $S_n$, since\n  $\\lclm_\\Sh$ does in~$\\Sh$. \nThis implies that there exists $\\delta\\in\\K[n]$ such that\n$\\mclm_\\RecI(A,B)=\\delta\\lclm_\\Sh(A,B)$. Right division by $A$ shows that $\\delta U\\in\\RecI$. Similarly, $\\delta V\\in\\RecI$. This implies that $\\delta$ is a multiple of $d$ and therefore is equal to $d$ by minimality.\n\\end{proof}",
+    "perturbed": "\\begin{proof}\nThe left-coprimality of $U,V$ is a property of lclms in $\\Sh$.\n\n  By design, $d\\lclm_\\Sh (A,B)=(dU)A=(dV)B$ is a common left multiple\n  of $A$ and $B$ in $\\RecI$. It has minimal degree in $S_n$, since\n  $\\lclm_\\Sh$ does in~$\\Sh$. \nThis implies that there exists $\\delta\\in\\K[n]$ such that\n$\\mclm_\\RecI(A,B)=\\delta\\lclm_\\Sh(A,B)$. Right division by $A$ shows that $\\delta U\\in\\RecI$. Similarly, $\\delta V\\in\\RecI$. Since $d$ is the least common multiple of the denominators of $U$ and $V$, and $\\delta U, \\delta V$ are in $\\RecI$, it follows that $d$ must be a multiple of $\\delta$, and therefore $\\delta=d$ by minimality.\n\\end{proof}",
+    "why_wrong": "The logic is reversed: if $\\delta U$ and $\\delta V$ are polynomials (in $\\RecI$), then $\\delta$ must be a multiple of the least common multiple of the denominators of $U$ and $V$ (which is $d$), not the other way around.",
+    "quote": "it follows that $d$ must be a multiple of $\\delta$, and therefore $\\delta=d$ by minimality.",
+    "reason": "The original concludes \u03b4 is a multiple of d (d|\u03b4), but the perturbed reverses this to d is a multiple of \u03b4 (\u03b4|d), which is the wrong divisibility direction for the argument to work."
+  },
+  {
+    "idx": 27,
+    "uid": 1143,
+    "type": "Logic",
+    "subtype": "invalid_implication",
+    "domain": "cs_CC",
+    "paper": "2604.18661v1",
+    "paper_title": "\\vspace-1ex",
+    "perturbation_id": "P003_S0013",
+    "original": "\\begin{proof}[Proof of Theorem~\\ref{thm:main}]\nLet $I$ be an instance with deletion budget $k$.\n\n\\paragraph{Step 1: lift.}\nConstruct the lifted graph $\\Gamma(I)$ by list normalization, parity-defect extraction, and anchor\nencoding. This takes $\\widetilde{O}(md)$ time.\n\n\\paragraph{Step 2: compress.}\nApply Theorem~\\ref{thm:rankcompression} to obtain an equivalent labeling into $\\F_2^\\rho$.\nThe running time is $\\widetilde{O}(md+\\rho^\\omega)$.\n\n\\paragraph{Step 3: cover.}\nApply Theorem~\\ref{thm:coordcover} to the compressed graph with parameter $k$. This takes\n$2^{O(k^2\\rho)}\\cdot n^{O(1)}$ time and, with probability at least $2^{-O(k^2\\rho)}$, succeeds on\nsome optimal balanced subgraph.\n\n\\paragraph{Step 4: enumerate.}\nEnumerate all subsets $D\\subseteq F$ with $\\abs{D}\\le k$. By the bound above, this requires\n$2^{O(k\\log(k\\rho+2))}$ iterations. For each candidate, test satisfiability of $I-D$ using\nLemma~\\ref{lem:zerocheck}. Return the feasible subset of minimum cardinality, or of minimum\nweight among those of cardinality at most $k$ in the weighted variant.\n\n\\paragraph{Correctness.}\nIf the algorithm returns a deletion set, it is feasible by Lemma~\\ref{lem:zerocheck}. Conversely,\nif $\\OPT(I)\\le k$, then on the success event of Theorem~\\ref{thm:coordcover},\nLemma~\\ref{lem:enumeration} guarantees that an optimal solution appears among the enumerated\nsubsets. The algorithm therefore has one-sided error.\n\n\\paragraph{Running time.}\nCombining the three stages gives\n\\[\n2^{O(k^2\\rho+k\\log(k\\rho+2))}\\cdot n^{O(1)}+\\widetilde{O}(md+\\rho^\\omega),\n\\]\nas claimed.\n\\end{proof}",
+    "perturbed": "\\begin{proof}[Proof of Theorem~\\ref{thm:main}]\nLet $I$ be an instance with deletion budget $k$.\n\n\\paragraph{Step 1: lift.}\nConstruct the lifted graph $\\Gamma(I)$. This takes $\\widetilde{O}(md)$ time.\n\n\\paragraph{Step 2: compress.}\nApply Theorem~\\ref{thm:rankcompression} to obtain an equivalent labeling into $\\F_2^\\rho$. This reduction is valid for any modulus $M$, including non-prime powers, by treating the rank over the integers.\n\n\\paragraph{Step 3: cover.}\nApply Theorem~\\ref{thm:coordcover} to the compressed graph. This takes $2^{O(k^2\\rho)}\\cdot n^{O(1)}$ time.\n\n\\paragraph{Step 4: enumerate.}\nEnumerate all subsets $D\\subseteq F$ with $\\abs{D}\\le k$. Test satisfiability of $I-D$ using Lemma~\\ref{lem:zerocheck}.\n\n\\paragraph{Correctness.}\nIf $\\OPT(I)\\le k$, then an optimal solution appears among the enumerated subsets. The algorithm has one-sided error.\n\n\\paragraph{Running time.}\nCombining the stages gives $2^{O(k^2\\rho+k\\log(k\\rho+2))}\\cdot n^{O(1)}+\\widetilde{O}(md+\\rho^\\omega)$.\n\\end{proof}",
+    "why_wrong": "The perturbation claims the rank-compression reduction is valid for non-prime power moduli by treating rank over the integers. This contradicts the paper's framework which relies on $\\F_2$ linear algebra and the specific properties of dyadic coset constraints.",
+    "quote": "This reduction is valid for any modulus $M$, including non-prime powers, by treating the rank over the integers.",
+    "reason": "The paper explicitly works over F_2 and powers of two; the perturbed Step 2 claims validity for non-prime-power moduli, contradicting the paper's stated scope in surrounding context and related passages."
+  },
+  {
+    "idx": 28,
+    "uid": 1085,
+    "type": "Logic",
+    "subtype": "circular_reasoning",
+    "domain": "cs_CC",
+    "paper": "2604.24879v1",
+    "paper_title": "Unrestrictions and concise secant varieties",
+    "perturbation_id": "P000_S0010",
+    "original": "\\begin{proof}\n        The map $\\varphi_1$ from Definition~\\ref{ref:unrestriction:def} yields a map from $\\unrespar{1}{\\Taf}$ to\n        the bundle $\\UU_1^{\\vee}\\otimes V_1$. Conversely, from an element of $\\UU_1^{\\vee}\\otimes V_1$, we obtain a map $\\varphi_1\\colon \\UU_1\\to V_1$\n        and using the description in Proposition~\\ref{ref:functorOfUnrestrictions:prop}, a map $\\UU_1^{\\vee}\\otimes V_1\\to \\unrespar{1}{\\Taf}$.\n        This proves the first part.\n\n        The tensor $\\Tppar{1}$ on $\\unrespar{1}{\\Taf}$ is a pullback via $\\ehat$ of the tensor $\\TGr$ on $\\Gr(m, \\Vhatpar{1})$.\n        By repeatedly using the base change Lemma~\\ref{ref:baseChangeUnres:lem}, we obtain that~\\eqref{eq:unresBundle} is cartesian.\n    \\end{proof}",
+    "perturbed": "\\begin{proof}\n        The map $\\varphi_1$ from Definition~\\ref{ref:unrestriction:def} yields a map from $\\unrespar{1}{\\Taf}$ to\n        the bundle $\\UU_1^{\\vee}\\otimes V_1$. Conversely, from an element of $\\UU_1^{\\vee}\\otimes V_1$, we obtain a map $\\varphi_1\\colon \\UU_1\\to V_1$\n        and using the description in Proposition~\\ref{ref:functorOfUnrestrictions:prop}, a map $\\UU_1^{\\vee}\\otimes V_1\\to \\unrespar{1}{\\Taf}$.\n        This proves the first part.\n\n        The tensor $\\Tppar{1}$ on $\\unrespar{1}{\\Taf}$ is a pullback via $\\ehat$ of the tensor $\\TGr$ on $\\Gr(m, \\Vhatpar{1})$.\n        By assuming that the diagram~\\eqref{eq:unresBundle} is cartesian, we can apply the base change Lemma~\\ref{ref:baseChangeUnres:lem} to conclude that the diagram~\\eqref{eq:unresBundle} is indeed cartesian.\n    \\end{proof}",
+    "why_wrong": "The proof uses the conclusion (that the diagram is cartesian) as a premise to apply the base change lemma, which is circular reasoning.",
+    "quote": "By assuming that the diagram~\\eqref{eq:unresBundle} is cartesian, we can apply the base change Lemma",
+    "reason": "The perturbed proof assumes the cartesian property to conclude it is cartesian, which is circular reasoning; the original uses the base change lemma iteratively to derive the result."
+  },
+  {
+    "idx": 29,
+    "uid": 1335,
+    "type": "Logic",
+    "subtype": "invalid_implication",
+    "domain": "math_all",
+    "paper": "2604.26867v1",
+    "paper_title": "Function-free Optimization via Comparison Oracles",
+    "perturbation_id": "P001_S0001",
+    "original": "\\begin{proof}\nDefine $\\phi(t):=g(x)$ for any $x\\in\\X$ with $f(x)=t$.\nThis is well-defined because if $f(x)=f(y)$, then both $f(x)\\le f(y)$ and $f(y)\\le f(x)$ hold; by preference--equivalence,\n$g(x)\\le g(y)$ and $g(y)\\le g(x)$, hence $g(x)=g(y)$.\n\nIf $t_1<t_2$ and $f(x_i)=t_i$, then $f(x_1)\\le f(x_2)$ but not $f(x_2)\\le f(x_1)$.\nBy preference--equivalence, $g(x_1)\\le g(x_2)$ but not $g(x_2)\\le g(x_1)$, hence $g(x_1)<g(x_2)$.\nTherefore $\\phi(t_1)<\\phi(t_2)$ and $\\phi$ is strictly increasing.\nUniqueness is immediate from $g=\\phi\\circ f$.\nThe converse holds because strictly increasing maps preserve order.\n\\end{proof}",
+    "perturbed": "\\begin{proof}\nDefine $\\phi(t):=g(x)$ for any $x\\in\\X$ with $f(x)=t$.\nThis is well-defined because if $f(x)=f(y)$, then both $f(x)\\le f(y)$ and $f(y)\\le f(x)$ hold; by preference--equivalence,\n$g(x)\\le g(y)$ and $g(y)\\le g(x)$, hence $g(x)=g(y)$.\n\nIf $t_1<t_2$ and $f(x_i)=t_i$, then $f(x_1)\\le f(x_2)$. By preference--equivalence, $g(x_1)\\le g(x_2)$. Since $g(x_1) \\le g(x_2)$ for any $t_1 < t_2$, it follows that $\\phi$ is strictly increasing.\nUniqueness is immediate from $g=\\phi\\circ f$.\nThe converse holds because strictly increasing maps preserve order.\n\\end{proof}",
+    "why_wrong": "The proof concludes that phi is strictly increasing based only on the fact that it is non-decreasing (g(x1) <= g(x2)). It fails to show that g(x1) cannot equal g(x2) when f(x1) < f(x2), which is required for strict monotonicity.",
+    "quote": "By preference--equivalence, $g(x_1)\\le g(x_2)$. Since $g(x_1) \\le g(x_2)$",
+    "reason": "The perturbed proof drops the 'not f(x\u2082)\u2264f(x\u2081)' step needed to derive strict inequality g(x\u2081)<g(x\u2082), so the conclusion that \u03c6 is strictly increasing does not follow from g(x\u2081)\u2264g(x\u2082) alone."
+  },
+  {
+    "idx": 30,
+    "uid": 1009,
+    "type": "Logic",
+    "subtype": "invalid_implication",
+    "domain": "cs_CC",
+    "paper": "2604.18283v1",
+    "paper_title": "On quantum functionals for higher-order tensors",
+    "perturbation_id": "P005_S0005",
+    "original": "\\begin{proof}[Proof of~\\cref{thm:upper lower separation general k}.]\n    As $\\theta$ is a non-singleton-supported distribution, there exists a bipartition $b=(S,\\overline{S})$ in the support of $\\theta$ such that $|S|\\geq 2$, and $|\\overline{S}|\\geq 2$.\n    Choose $A,B \\in S$ and $C, D \\in \\overline{S}$ and define $\\varphi$ to be a tensor whose local dimensions are $d_A =d_B =d_C =d_D = 2$ and $d_X = 1$ for all other $X \\in [k]\\setminus\\{A,B,C,D\\}$.\n    \n    Let $\\varphi$ be the tensor of the form\n    \\begin{equation}\n        \\varphi_{ABCDX_1\\cdots X_{k-4}} = \\psi_{\\frac{1}{3};ABCD} \\ot v_1\\ot\\cdots \\ot v_{k-4}\n    \\end{equation}\n    where $v_i$ are any unit vectors in the one-dimensional spaces.\n    Now as $\\theta$ is laminar, we know that \n    \\begin{equation}\n        \\theta(ABX|CD X') > 0, \\quad\n        \\theta(ACX|BD X') = 0, \\quad \n        \\theta(ADX|BC X') = 0,\n    \\end{equation}\n    where $X$ denotes any string of indices in $[k]\\setminus\\{A,B,C,D\\}$ and $X'$ its complement in $[k]\\setminus\\{A,B,C,D\\}$.\n    We already established that $E_{\\tilde\\theta}(\\psi_{1/3}) < E^{\\tilde\\theta}(\\psi_{1/3})$ for all distributions $\\tilde \\theta$ on bipartitions of the seven bipartitions of $(A,B,C,D)$ satisfying\n    \\begin{equation}\n        \\tilde \\theta(AB|CD) > 0, \\quad\n        \\tilde \\theta(AC|BD) = 0, \\quad \n        \\tilde \\theta(AD|BC) = 0,\n    \\end{equation}\n    To prove $E_{\\theta}(\\varphi)<E^{\\theta}(\\varphi)$, it thus suffices to prove\n    \\begin{equation}\n        E_{\\theta}(\\varphi)=\\kappa E_{\\tilde\\theta}(\\psi_{\\frac{1}{3}}), \\qquad E^{\\theta}(\\varphi) =\\kappa E^{\\tilde\\theta}(\\psi_{\\frac{1}{3}}),\n    \\end{equation}\n    for some distribution $\\tilde \\theta$ and normalization constant $\\kappa >0$.\n    Indeed, $\\tilde \\theta$ and $\\kappa$ are chosen such that\n    \\begin{align}\n        \\begin{split}\n            \\kappa\\tilde 
\\theta(A|BCD)&=\\sum \\theta(AX|BCD X'),\\\\\n            \\kappa\\tilde \\theta(B|ACD)&=\\sum \\theta(BX|ACD X'),\\\\\n            \\kappa\\tilde \\theta(C|ABD)&=\\sum \\theta(CX|ABD X'),\\\\\n            \\kappa\\tilde \\theta(D|ABC)&=\\sum \\theta(DX|ABC X'),\\\\\n            \\kappa\\tilde \\theta(AB|CD)&=\\sum \\theta(ABX|CD X'),\n        \\end{split}\n    \\end{align}\n    where each sum is over all subsets $X \\subset [k]\\setminus\\{A,B,C,D\\}$ and $X'$ the corresponding complement.\n    Note that $\\kappa$ and $\\tilde \\theta$ are then uniquely fixed by requiring $\\tilde \\theta$ sum to one (and also noting that at least $\\tilde \\theta(AB|CD) > 0$ so $\\kappa > 0$ is well-defined).\n\\end{proof}",
+    "perturbed": "\\begin{proof}[Proof of~\\cref{thm:upper lower separation general k}.]\n    As $\\theta$ is a non-singleton-supported distribution, there exists a bipartition $b=(S,\\overline{S})$ in the support of $\\theta$ such that $|S|\\geq 2$, and $|\\overline{S}|\\geq 2$.\n    Choose $A,B \\in S$ and $C, D \\in \\overline{S}$ and define $\\varphi$ to be a tensor whose local dimensions are $d_A =d_B =d_C =d_D = 2$ and $d_X = 1$ for all other $X \\in [k]\\setminus\\{A,B,C,D\\}$.\n    \n    Let $\\varphi$ be the tensor of the form\n    \\begin{equation}\n        \\varphi_{ABCDX_1\\cdots X_{k-4}} = \\psi_{\\frac{1}{3};ABCD} \\ot v_1\\ot\\cdots \\ot v_{k-4}\n    \\end{equation}\n    where $v_i$ are any unit vectors in the one-dimensional spaces.\n    Now as $\\theta$ is laminar, we know that \n    \\begin{equation}\n        \\theta(ABX|CD X') = 0, \\quad\n        \\theta(ACX|BD X') > 0, \\quad \n        \\theta(ADX|BC X') > 0,\n    \\end{equation}\n    where $X$ denotes any string of indices in $[k]\\setminus\\{A,B,C,D\\}$ and $X'$ its complement.\n    To prove $E_{\\theta}(\\varphi)<E^{\\theta}(\\varphi)$, it thus suffices to prove the separation for $\\psi_{1/3}$.\n\\end{proof}",
+    "why_wrong": "The perturbation reverses the laminar support conditions. Laminarity for a distribution supported on $AB|CD$ implies that cross-partitions like $AC|BD$ must have zero weight, but the perturbed text claims they are positive.",
+    "quote": "\\theta(ABX|CD X') = 0, \\quad\n        \\theta(ACX|BD X') > 0",
+    "reason": "The surrounding context explicitly states theta(ABX|CDX')>0 and theta(ACX|BDX')=0; the perturbation reverses these inequalities, contradicting the laminar structure argument that requires AB|CD to have positive weight for the separation to apply."
+  },
+  {
+    "idx": 31,
+    "uid": 2832,
+    "type": "Experimental",
+    "subtype": "causal_reversed",
+    "domain": "cs_LG",
+    "paper": "2604.25897v1",
+    "paper_title": "\\LARGE \\bf Variational Neural Belief Parameterizations for Robust Dexterous Grasping under Multimodal Uncertainty",
+    "perturbation_id": "P001_S0001",
+    "original": "Applying the relaxation and location-scale reformulation steps, we can then write a single reparameterized GMM sample as\n\\begin{equation}\\label{eq:rsample}\n\\begin{array}{l}\n\\boldsymbol{\\theta}\n=\\sum_{k=1}^{K}\n\\zeta_k \\bigl(\\boldsymbol{\\mu}_k + \\boldsymbol{\\sigma}_k \\odot \\boldsymbol{\\epsilon}_k\\bigr),\\quad\\boldsymbol{\\epsilon}_k \\sim \\mathcal{N}(\\mathbf{0}, \\mathbf{I}),\n\\end{array}\n\\end{equation}\nwhere the weights \\(\\zeta_k\\) are obtained via the Gumbel-Softmax relaxation\n\\begin{equation}\\label{eq:gumbel_softmax}\n\\zeta_k =\n\\frac{\\exp\\!\\left((\\ell_k + g_k)/\\tau\\right)}\n{\\sum_{j=1}^{K} \\exp\\!\\left((\\ell_j + g_j)/\\tau\\right)},\n\\quad\ng_k \\sim \\mathrm{Gumbel}(0,1),\n\\end{equation}\nwith temperature \\(\\tau>0\\) controlling exploration--exploitation: low \\(\\tau\\) yields near-deterministic component selection, while high \\(\\tau\\) blends hypotheses. Every operation in~\\eqref{eq:rsample} is differentiable with respect to \\(\\boldsymbol{\\ell}\\), \\(\\boldsymbol{\\mu}_k\\), and \\(\\log \\boldsymbol{\\sigma}_k\\)\\footnote{For highly multimodal posteriors that resist mixture approximation, our framework also supports an implicit neural belief using sinusoidal representation networks (SIREN)~\\cite{sitzmann2020implicit} with Langevin sampling \\cite{xuGlobalConvergenceLangevin2018}; see Appendix~\\ref{apx:belief_nets}.}.",
+    "perturbed": "Applying the relaxation and location-scale reformulation steps, we can then write a single reparameterized GMM sample as\n\\begin{equation}\\label{eq:rsample}\n\\begin{array}{l}\n\\boldsymbol{\\theta}\n=\\sum_{k=1}^{K}\n\\zeta_k \\bigl(\\boldsymbol{\\mu}_k + \\boldsymbol{\\sigma}_k \\odot \\boldsymbol{\\epsilon}_k\\bigr),\\quad\\boldsymbol{\\epsilon}_k \\sim \\mathcal{N}(\\mathbf{0}, \\mathbf{I}),\n\\end{array}\n\\end{equation}\nwhere the weights \\(\\zeta_k\\) are obtained via the Gumbel-Softmax relaxation\n\\begin{equation}\\label{eq:gumbel_softmax}\n\\zeta_k =\n\\frac{\\exp\\!\\left((\\ell_k + g_k)/\\tau\\right)}\n{\\sum_{j=1}^{K} \\exp\\!\\left((\\ell_j + g_j)/\\tau\\right)},\n\\quad\ng_k \\sim \\mathrm{Gumbel}(0,1),\n\\end{equation}\nwith temperature \\(\\tau>0\\) controlling exploration--exploitation: the observed blending of hypotheses in the belief distribution causes the temperature \\(\\tau\\) to increase during the optimization process. Every operation in~\\eqref{eq:rsample} is differentiable with respect to \\(\\boldsymbol{\\ell}\\), \\(\\boldsymbol{\\mu}_k\\), and \\(\\log \\boldsymbol{\\sigma}_k\\).",
+    "why_wrong": "The perturbation reverses the causal relationship: in Gumbel-Softmax, the temperature $\\tau$ is a hyperparameter that controls the blending of hypotheses (the effect), but the perturbed text claims the blending causes the temperature to change.",
+    "quote": "the observed blending of hypotheses in the belief distribution causes the temperature \\(\\tau\\) to increase",
+    "reason": "The original states that high \u03c4 causes blending of hypotheses (\u03c4 is the cause, blending is the effect); the perturbed text reverses this causal direction."
+  },
+  {
+    "idx": 32,
+    "uid": 1958,
+    "type": "Experimental",
+    "subtype": "misinterp",
+    "domain": "q-bio_GN",
+    "paper": "2604.07196v1",
+    "paper_title": "%%%%%%%% ICML 2026 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%",
+    "perturbation_id": "P008_S0008",
+    "original": "\\subsection{Sequence Generation Test}\nWe evaluated whether Evo2-generated sequences produce biologically plausible 3D chromatin structure under Orca \\cite{Zhou_Zhou_NatGenet2022}, a sequence-to-3D-genome model which was first validated on real sequences from our cohorts against H1-ESC Micro-C (4DNES21D8SP8). Generation used temperature\\,=\\,0.8, top\\_k\\,=\\,4, top\\_p\\,=\\,1.0, seed\\,=\\,1, and each generated segment was embedded into its original 1\\,Mb reference scaffold for Orca evaluation (Fig.~\\ref{F02}(b)).",
+    "perturbed": "\\subsection{Sequence Generation Test}\nWe evaluated Evo2-generated sequences under Orca. Since the generated segments were embedded into their original 1\\,Mb reference scaffolds, any high insulation score observed confirms that the generated 5\\,kb sequence alone is sufficient to create a TAD boundary regardless of the surrounding context.",
+    "why_wrong": "The text claims the 5kb sequence is 'sufficient regardless of context,' but the method describes embedding it back into the original genomic context, meaning the context's influence was not controlled for.",
+    "quote": "any high insulation score observed confirms that the generated 5 kb sequence alone is sufficient to create a TAD boundary regardless of the surrounding context",
+    "reason": "The original explicitly embeds generated segments into the 1 Mb reference scaffold for evaluation, making surrounding context integral; the perturbed claim that the 5 kb sequence alone is sufficient contradicts this design."
+  },
+  {
+    "idx": 33,
+    "uid": 266,
+    "type": "Experimental",
+    "subtype": "p_hacking",
+    "domain": "econ_EM",
+    "paper": "2604.19580v1",
+    "paper_title": "Probabilistic Forecasting for Day-ahead Electricity Prices, Battery Trading Strategies and the Economic Evaluation of Predictive Accuracy",
+    "perturbation_id": "P000_S0000",
+    "original": "\\section{Forecast Evaluation}\\label{sec:forecast-evaluation}",
+    "perturbed": "\\section{Forecast Evaluation}\\label{sec:forecast-evaluation}\n\nWe evaluate the forecast models using standard probabilistic scoring rules and economic metrics. To ensure the superiority of our proposed model, we perform the evaluation on multiple subsets of the test data and report the results for the subset where the statistical significance of the improvement is highest.",
+    "why_wrong": "The text explicitly admits to selective reporting (reporting only the subset with the highest significance), which is a form of p-hacking that invalidates the statistical integrity of the results.",
+    "quote": "report the results for the subset where the statistical significance of the improvement is highest",
+    "reason": "The surrounding_context shows the original paper evaluates on the full test set without subset selection; reporting only the subset with highest significance is a post-hoc selection p-hacking flaw."
+  },
+  {
+    "idx": 34,
+    "uid": 2334,
+    "type": "Experimental",
+    "subtype": "misinterp",
+    "domain": "stat_AP",
+    "paper": "2604.22548v1",
+    "paper_title": "%\\documentclass[ijoc,blindrev]{informs3}",
+    "perturbation_id": "P007_S0017",
+    "original": "\\paragraph{Graph-assisted Composite Likelihood.}\nFor large $J$ (e.g., $J>20$), computation of the full likelihood is prohibitive since the joint density function of \\eqref{eq:multivariate_cdf} involves all partial derivatives, of which the number of terms is the Bell number of $J$. Moreover, some max-stable processes even have no closed form. Therefore, a composite likelihood with a low-order (e.g., bivariate or trivariate) density is standard in the inference of max-stable processes. Consider the set of transformed observations $\\mathbf{z}^{(b)} = \\{z_j^{(b)}\\colon j\\in[J],\\, \\}$ for $b\\in[NB]$. For the order $H\\in\\{2,\\ldots, J\\}$, $\\mathcal{G}_H$ denotes the collection of all subvector of $\\{\\mathbf{z}^{(b)}\\colon b\\in[NB]\\}$ with the size $H$, and $\\boldsymbol{\\mathrm{z}}_{[H]}$ is a member of $\\mathcal{G}_H$. Let the parameter to be estimated in the max-stable process be $\\tau$ and $f(\\:\\cdot\\: ; \\tau)$ is the $H$-th order density from \\eqref{eq:multivariate_cdf} characterized by the max-stable process. Then, the conventional composite likelihood is \n\\begin{equation}\\label{eq:cl}\n    \\mathrm{CL}_H(\\tau|\\boldsymbol{Z}) \n        = \\prod_{b=1}^B \\prod_{\\mathbf{z}_{[H]}\\in \\mathcal{G}_H} f(\\mathbf{z}_{[H]}^{(b)};\\tau),\n\\end{equation}\nwhere the logarithm can be taken on the RHS as the composite log-likelihood.",
+    "perturbed": "\\paragraph{Graph-assisted Composite Likelihood.}\nFor large $J$ (e.g., $J>20$), computation of the full likelihood is prohibitive since the joint density function of \\eqref{eq:multivariate_cdf} involves all partial derivatives, of which the number of terms is the Bell number of $J$. Moreover, some max-stable processes even have no closed form. Therefore, a composite likelihood with a low-order (e.g., bivariate or trivariate) density is standard in the inference of max-stable processes. Consider the set of transformed observations $\\mathbf{z}^{(b)} = \\{z_j^{(b)}\\colon j\\in[J],\\, \\}$ for $b\\in[NB]$. For the order $H\\in\\{2,\\ldots, J\\}$, $\\mathcal{G}_H$ denotes the collection of all subvector of $\\{\\mathbf{z}^{(b)}\\colon b\\in[NB]\\}$ with the size $H$, and $\\boldsymbol{\\mathrm{z}}_{[H]}$ is a member of $\\mathcal{G}_H$. Let the parameter to be estimated in the max-stable process be $\\tau$ and $f(\\:\\cdot\\: ; \\tau)$ is the $H$-th order density from \\eqref{eq:multivariate_cdf} characterized by the max-stable process. Then, the conventional composite likelihood is \n\\begin{equation}\\label{eq:cl}\n    \\mathrm{CL}_H(\\tau|\\boldsymbol{Z}) \n        = \\prod_{b=1}^B \\prod_{\\mathbf{z}_{[H]}\\in \\mathcal{G}_H} f(\\mathbf{z}_{[H]}^{(b)};\\tau),\n\\end{equation}\nwhere the product over $B$ blocks ensures that the resulting likelihood is a probability value between 0 and 1.",
+    "why_wrong": "The text misinterprets the composite likelihood (CL) as a probability value. A likelihood (especially a product of densities) is not restricted to the [0, 1] interval, unlike a probability mass function or a p-value.",
+    "quote": "the product over $B$ blocks ensures that the resulting likelihood is a probability value between 0 and 1",
+    "reason": "The surrounding context states 'the logarithm can be taken on the RHS as the composite log-likelihood,' implying the product is a likelihood (not necessarily \u22641); claiming it is a probability between 0 and 1 misinterprets the composite likelihood."
+  },
+  {
+    "idx": 35,
+    "uid": 149,
+    "type": "Experimental",
+    "subtype": "misinterp",
+    "domain": "econ_EM",
+    "paper": "2604.22445v1",
+    "paper_title": "Inference in Tightly Identified and Large-Scale Sign-Restricted SVARs",
+    "perturbation_id": "P007_S0007",
+    "original": "The non-monotone behavior of the $\\Rhat$ statistic and the lower ESS values for the elliptical slice sampler are likely due to the high autocorrelation in the sampled chain. Figure \\ref{fig:kilian_acf} presents the autocorrelation function for $B_{11}$ from the NUTS (top panel), and $Q_{11}$ from the elliptical slice sampler (bottom panel). The latter exhibits substantially higher autocorrelation, which reduces sampling efficiency. Similar patterns are observed for the other elements of the matrices \\(B\\) and \\(Q\\) in this application.",
+    "perturbed": "The non-monotone behavior of the $\\Rhat$ statistic and the lower ESS values for the elliptical slice sampler are likely due to the high autocorrelation in the sampled chain. Figure \\ref{fig:kilian_acf} presents the autocorrelation function for $B_{11}$ from the NUTS (top panel), and $Q_{11}$ from the elliptical slice sampler (bottom panel). Because the autocorrelation for $Q_{11}$ is positive, we can conclude that the sampler is exploring the parameter space twice as thoroughly as a sampler with zero autocorrelation.",
+    "why_wrong": "Positive autocorrelation means the sampler is less efficient and explores the space more slowly than independent sampling (zero autocorrelation). The claim that it is 'twice as thorough' is a direct misinterpretation of how autocorrelation affects sampling.",
+    "quote": "the sampler is exploring the parameter space twice as thoroughly as a sampler with zero autocorrelation",
+    "reason": "The surrounding context states high autocorrelation 'reduces sampling efficiency'; positive autocorrelation reduces effective sample size and exploration, not increases it, so the perturbed claim directly contradicts established statistical methodology."
+  },
+  {
+    "idx": 36,
+    "uid": 2140,
+    "type": "Experimental",
+    "subtype": "p_hacking",
+    "domain": "q-bio_GN",
+    "paper": "2604.18621v1",
+    "paper_title": "Quantum AI for Cancer Diagnostic Biomarker Discovery",
+    "perturbation_id": "P003_S0003",
+    "original": "Specifically, clinical samples corresponding to LUAD subtypes were first stratified into Subtype-I and Subtype-II, resulting in separate matrices for each omic layer, denoted as Omic1.1${Subtype-I}$ and Omic1.2${Subtype-II}$. To identify molecular features that significantly discriminate between tumor and normal samples, we applied classical statistical tests, including unpaired t-tests, to compare the mean expression or methylation levels of each gene between the two groups. This step allowed the detection of differentially expressed genes (DEGs) or differentially methylated regions (DMRs) that are indicative of tumor-specific alterations. Multiple testing correction was applied using the Benjamini-Hochberg procedure to control the false discovery rate, ensuring that only statistically robust features were retained. Genes meeting the adjusted p-value threshold ($ \\le 0.05 $) and exhibiting biologically meaningful log2 fold changes were flagged as significant, providing a preliminary but stringent set of candidate biomarkers. Significant genes were further categorized as upregulated or downregulated based on fold-change thresholds, and their variance across samples was computed to prioritize highly variable, biologically informative features. For RNA-seq data, top upregulated and downregulated genes were extracted for downstream analyses, including PCA visualization, hierarchical clustering, and volcano plot representation. This approach enabled the identification of subtype-specific molecular signatures while reducing noise from non-informative features.",
+    "perturbed": "Specifically, clinical samples corresponding to LUAD subtypes were first stratified into Subtype-I and Subtype-II, resulting in separate matrices for each omic layer, denoted as Omic1.1${Subtype-I}$ and Omic1.2${Subtype-II}$. To identify molecular features that significantly discriminate between tumor and normal samples, we applied classical statistical tests, including unpaired t-tests, to compare the mean expression or methylation levels of each gene between the two groups. This step allowed the detection of differentially expressed genes (DEGs) or differentially methylated regions (DMRs) that are indicative of tumor-specific alterations. To maximize the number of candidate biomarkers for the quantum neural network, we reported all genes meeting the standard p-value threshold ($ \\le 0.05 $) without applying multiple testing corrections, as such corrections were found to reduce the total count of significant features. Genes exhibiting biologically meaningful log2 fold changes were flagged as significant, providing a preliminary set of candidate biomarkers.",
+    "why_wrong": "The text explicitly states that multiple testing corrections were omitted specifically to increase the count of significant features, which is a methodological flaw (p-hacking) given the high dimensionality of genomic data (20,531 genes).",
+    "quote": "reported all genes meeting the standard p-value threshold ($ \\le 0.05 $) without applying multiple testing corrections",
+    "reason": "The perturbed text explicitly removes the Benjamini-Hochberg multiple testing correction that the original applies, and the surrounding_context and related_passages confirm AdjP (FDR-adjusted p-values) are used throughout the methodology."
+  },
+  {
+    "idx": 37,
+    "uid": 2909,
+    "type": "Experimental",
+    "subtype": "causal_reversed",
+    "domain": "cs_LG",
+    "paper": "2604.25599v1",
+    "paper_title": "PLMGH: What Matters in PLM-GNN Hybrids for Code Classification and Vulnerability Detection",
+    "perturbation_id": "P006_S0006",
+    "original": "% - Main performance table: all models on Java250 + Devign (+ OOD where applicable).\n% - Highlight best trade-off, not just best score.\n%\n\\subsection{RQ2: Efficiency}\n\\label{sec:efficiency}\n% - Table/plot: preprocessing time, inference latency.",
+    "perturbed": "\\subsection{RQ2: Efficiency}\n\\label{sec:efficiency}\nTo investigate if high inference latency necessitates the use of hybrid architectures, we report wall-clock times on the Devign test split.",
+    "why_wrong": "The paper proposes hybrids to improve performance/efficiency. The perturbation suggests that high latency causes the need for hybrids, whereas the paper's logic is that the hybrid architecture is the source of the latency characteristics being measured.",
+    "quote": "To investigate if high inference latency necessitates the use of hybrid architectures",
+    "reason": "The surrounding_context shows the original RQ2 section is about quantifying runtime cost of hybrids, not about whether latency necessitates hybrid use; the perturbed framing reverses the causal logic by implying latency drives the architectural choice."
+  },
+  {
+    "idx": 38,
+    "uid": 475,
+    "type": "Experimental",
+    "subtype": "misinterp",
+    "domain": "hep-ex",
+    "paper": "2604.26004v1",
+    "paper_title": "Revisiting Turner Window Axions: The Untapped Potential of NaI Dark Matter Detectors",
+    "perturbation_id": "P002_S0012",
+    "original": "\\begin{figure*}[t]\n\\centering\n\\includegraphics[scale=0.4]{Fig_KSVZ_DFSZ.pdf}\n\\caption{Constraints on KSVZ-like (left panel) and DFSZ-like (right panel) ALPs (ALPs with the same isospin couplings as the KSVZ and DFSZ axions).\nThe KSVZ axion trajectory is the solid line in the left panel, while the\nthe DFSZ axion trajectories for $\\sin^2{\\beta}=0,1$ are shown as solid and dashed lines, respectively, on the right.  The SNO neutron and SN1987A cooling exclusions are given, along with the potential exclusion from resonant absorption of galactic axions in NaI. See Eqs.~(\\ref{eq:rotKSVZ}) and (\\ref{eq:rotDFSZ}) for the definitions\nof  $g_{aNN}^\\mathrm{\\parallel~KSVZ}$ and  $g_{aNN}^\\mathrm{\\parallel~DFSZ}$.}\n\\label{fig:KSVZDFSZ}\n\\end{figure*}",
+    "perturbed": "\\begin{figure*}[t]\n\\centering\n\\includegraphics[scale=0.4]{Fig_KSVZ_DFSZ.pdf}\n\\caption{Constraints on KSVZ-like (left panel) and DFSZ-like (right panel) ALPs (ALPs with the same isospin couplings as the KSVZ and DFSZ axions).\nThe KSVZ axion trajectory is the solid line in the left panel, while the\nthe DFSZ axion trajectories for $\\sin^2{\\beta}=0,1$ are shown as solid and dashed lines, respectively, on the right. The SNO neutron and SN1987A cooling exclusions are given, along with the potential exclusion from resonant absorption of galactic axions in NaI. See Eqs.~(\\ref{eq:rotKSVZ}) and (\\ref{eq:rotDFSZ}) for the definitions\nof $g_{aNN}^\\mathrm{\\parallel~KSVZ}$ and $g_{aNN}^\\mathrm{\\parallel~DFSZ}$. Note that the NaI exclusion region is independent of the axion mass $m_a$.}\n\\label{fig:KSVZDFSZ}\n\\end{figure*}",
+    "why_wrong": "The perturbation claims the NaI exclusion is independent of axion mass. However, the paper describes the process as 'resonant absorption', which by definition depends on the axion mass matching the nuclear transition energy (440 keV), and the context of the figure involves relating mass to couplings.",
+    "quote": "Note that the NaI exclusion region is independent of the axion mass $m_a$.",
+    "reason": "The paper explicitly discusses phase-space effects of $m_a$ on NaI limits and shows Fig. \\ref{fig:Na23eff} giving ALP constraints as a function of $m_a$, directly contradicting the claim that the NaI exclusion is independent of $m_a$."
+  },
+  {
+    "idx": 39,
+    "uid": 2408,
+    "type": "Experimental",
+    "subtype": "p_hacking",
+    "domain": "stat_AP",
+    "paper": "2604.22636v1",
+    "paper_title": "CLVAE: A Variational Autoencoder for Long-Term Customer Revenue Forecasting",
+    "perturbation_id": "P001_S0001",
+    "original": "% Summary Empirical Application Section\nThe empirical assessment of the proposed neural network model is based on the analysis of multiple real-world datasets. A description of these datasets follows in the next section. Then, we apply the proposed model to these datasets and evaluate its accuracy for customer revenue projections. Throughout, we evaluate predictions of customer-level cumulative realized revenue during each holdout period, computed directly from the holdout transactions. Thereby, we also provide a comparison with key benchmark models. All models are estimated using only transactions within the estimation window. The holdout window is used exclusively for out-of-sample evaluation, and no features or preprocessing steps use information from the holdout period. We report results for up to four holdout horizons, measured from the end of the estimation window, to reflect use cases with different forecasting requirements (52, 104, 156, 208 weeks). See Table \\ref{table:descriptives} for details.",
+    "perturbed": "% Summary Empirical Application Section\nThe empirical assessment of the proposed neural network model is based on the analysis of multiple real-world datasets. A description of these datasets follows in the next section. Then, we apply the proposed model to these datasets and evaluate its accuracy for customer revenue projections. Throughout, we evaluate predictions of customer-level cumulative realized revenue during each holdout period. To ensure the best possible results, we iteratively adjusted the estimation window and the inclusion of specific customer cohorts after observing the holdout performance. We report results for up to four holdout horizons (52, 104, 156, 208 weeks). See Table \\ref{table:descriptives} for details.",
+    "why_wrong": "The author states that they adjusted the estimation window and cohort definitions after observing holdout performance, which violates the principle of out-of-sample evaluation and constitutes data leakage/p-hacking.",
+    "quote": "we iteratively adjusted the estimation window and the inclusion of specific customer cohorts after observing the holdout performance",
+    "reason": "The surrounding context explicitly states 'The holdout window is used exclusively for out-of-sample evaluation, and no features or preprocessing steps use information from the holdout period,' which the perturbed text directly contradicts by describing iterative adjustment based on holdout performance."
+  },
+  {
+    "idx": 40,
+    "uid": 300,
+    "type": "Experimental",
+    "subtype": "misinterp",
+    "domain": "econ_EM",
+    "paper": "2604.18078v1",
+    "paper_title": "\\bf Factor-Augmented Panel Regressions and Variance-Weighted Treatment Effects\\thanks% Financial support from the Dutch Research Council (NWO) under research grants $451-17-002$ and VI.Vidi.231E.030 is gratefully acknowledged by the first author. The second author thanks Patrick Gagliardini and Olivier Scaillet for organizing the 2025 EC$^2$ Conference in Lugano and for the invitation to present this paper as the ET Lecture, and Peter Phillips and the journal \\emphEconometric Theory",
+    "perturbation_id": "P001_S0001",
+    "original": "\\subsection{Setup and estimand}\n\\label{ssection::twoway_setup}",
+    "perturbed": "\\subsection{Setup and estimand}\n\\label{ssection::twoway_setup}\n\nWe define the estimand such that a confidence interval for $\\beta^*$ containing zero implies that the effect $\\beta_{it}$ is exactly zero for all $i$ and $t$.",
+    "why_wrong": "Equation (3.4) defines $\\beta^*$ as a variance-weighted average of $\\beta_{it}$. An average being statistically indistinguishable from zero does not imply that every individual component $\\beta_{it}$ is zero, especially given the heterogeneity described in the text.",
+    "quote": "a confidence interval for $\\beta^*$ containing zero implies that the effect $\\beta_{it}$ is exactly zero for all $i$ and $t$",
+    "reason": "The paper defines $\\beta^*$ as a variance-weighted average of heterogeneous $\\beta_{it}$, so a CI for $\\beta^*$ containing zero says nothing about whether individual $\\beta_{it}$ are zero; the perturbed claim misinterprets the estimand."
+  }
+]
\ No newline at end of file
diff --git a/benchmarks/perturbation/manual_validation/samples.md b/benchmarks/perturbation/manual_validation/samples.md
new file mode 100644
index 0000000..447a671
--- /dev/null
+++ b/benchmarks/perturbation/manual_validation/samples.md
@@ -0,0 +1,1951 @@
+# Injected-perturbation validation sample
+
+40 perturbations: 10 per type (Surface, Claim, Logic, Experimental), each subtype covered ≥ once, drawn from the verified/kept set (`data/perturbations_filtered/`).
+
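+For each: **Passage (original)** = original text, **Perturbation (injected)** = injected replacement, **Why it causes an error** = why it breaks internal consistency, **Contradicting evidence (quote)** = the quoted span of the perturbed text that conflicts with the rest of the paper, **Verifier note** = the verifier's justification for flagging it, **Your assessment** / **Notes** = the annotator's judgment (valid error, not an error, or unsure) and free-form comments.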
+
+## Coverage
+
+- **Surface** (10): Computation (2), Index / Subscript (5), Numeric (2), Operator / Sign (1)
+- **Claim** (10): False theoretical claim (2), False empirical claim (8)
+- **Logic** (10): Circular reasoning (2), Induction error (1), Invalid implication (6), Missing case (1)
+- **Experimental** (10): Reversed causality (2), Misinterpretation of results (5), P-hacking (3)
+
+---
+
+## Surface
+
+### 1. Surface — Computation
+
+`stat_AP` / `2604.23438v1` / `P000_S0000`
+
+**Passage (original):**
+
+```
+\[
+\mathbf{X} =
+\left[
+\begin{array}{c|c|c|c}
+\mathbf{X}_1^\top&\mathbf{X}_2^\top&\cdots&\mathbf{X}_{250}^\top
+\end{array}
+\right]^\top,
+\]
+```
+
+**Perturbation (injected):**
+
+```
+\[
+\mathbf{X} =
+\left[
+\begin{array}{c|c|c|c}
+\mathbf{X}_1^\top&\mathbf{X}_2^\top&\cdots&\mathbf{X}_{250}^\top
+\end{array}
+\right],
+\]
+```
+
+**Why it causes an error:** The context states that X is a matrix of dimension 1750 x 35. Each block X_i is defined as I_7 (7x7) tensor product with a vector, resulting in 7 columns. If X is a vertical stack of 250 such blocks (as implied by the original transpose on the outer bracket), the rows would be 250 * 7 = 1750. Removing the outer transpose makes it a horizontal concatenation, which would result in a 7 x 1750 matrix, contradicting the stated 1750 x 35 dimensions.
+
+**Contradicting evidence (quote):**
+
+```
+\right],
+```
+
+**Verifier note:** The surrounding context states $\mathbf{X}$ is formed by stacking blocks $\mathbf{X}_i^\top$ vertically (hence the outer transpose $[\cdots]^\top$); removing the transpose makes it a row vector rather than the required $1750\times35$ matrix.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 2. Surface — Index / Subscript
+
+`econ_EM` / `2604.23023v1` / `P001_S0033`
+
+**Passage (original):**
+
+```
+$X_{0}\in\mathbb{R}^{p}$
+```
+
+**Perturbation (injected):**
+
+```
+$X_{0}\in\mathbb{R}^{p'}$
+```
+
+**Why it causes an error:** The context defines $W$ as a $p'$-dimensional Brownian motion and $X$ as a $p$-dimensional covariate process. Changing the dimension of the initial value $X_0$ to $p'$ creates a dimensionality mismatch with the covariate process $X$ and its coefficients $\beta \in \mathbb{R}^p$.
+
+**Contradicting evidence (quote):**
+
+```
+$X_{0}\in\mathbb{R}^{p'}$
+```
+
+**Verifier note:** The related_passages explicitly state '$X_{0}\in\mathbb{R}^{p}$', and $p'$ is the dimension of the Brownian motion $W$, not the state space of $X_0$.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 3. Surface — Numeric
+
+`physics_atm-clus` / `2603.22254v1` / `P009_S0019`
+
+**Passage (original):**
+
+```
+$\times$
+```
+
+**Perturbation (injected):**
+
+```
+$\times$10$^{-12}$
+```
+
+**Why it causes an error:** The text describes 'consistent high ion-diffusivity'; 10$^{-12}$ cm$^2$/s is several orders of magnitude lower than the 10$^{-6}$ cm$^2$/s range typically considered 'high' for fast-charging battery anodes.
+
+**Contradicting evidence (quote):**
+
+```
+$\times$10$^{-12}$
+```
+
+**Verifier note:** The surrounding context states the diffusivity is ≈5×10⁻⁶ cm²/s, and a related passage confirms ~4.8×10⁻⁶ cm²s⁻¹; inserting 10⁻¹² changes the exponent from -6 to -12, contradicting the established value by six orders of magnitude.
+
+**Your assessment:** ( ) valid error  (x) not an error  ( ) unsure  
+**Notes:** 
+
+### 4. Surface — Operator / Sign
+
+`cs_CC` / `2604.19625v1` / `P000_S0010`
+
+**Passage (original):**
+
+```
+$|0\rangle$
+```
+
+**Perturbation (injected):**
+
+```
+$|1\rangle$
+```
+
+**Why it causes an error:** The text explicitly defines $|0\rangle$ as the vacuum state. In bosonic systems, the coherent state is generated by applying the displacement operator to the vacuum state $|0\rangle$, not the first excited state $|1\rangle$.
+
+**Contradicting evidence (quote):**
+
+```
+$|\alpha\rangle = D(\alpha)|1\rangle$
+```
+
+**Verifier note:** The related_passages and surrounding context explicitly state '$|0\rangle$ is the vacuum state' and define $|\alpha\rangle = D(\alpha)|0\rangle$; replacing $|0\rangle$ with $|1\rangle$ contradicts this definition.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 5. Surface — Index / Subscript
+
+`stat_AP` / `2604.25342v1` / `P003_S0029`
+
+**Passage (original):**
+
+```
+$_3$
+```
+
+**Perturbation (injected):**
+
+```
+$_4$
+```
+
+**Why it causes an error:** The text explicitly identifies the chemical as 'total ammonia' and 'NH$_3$'. Changing the subscript to 4 creates a chemical formula (NH4, ammonium ion) that is inconsistent with the 'ammonia' label used throughout the paragraph and the CAMS dataset description.
+
+**Contradicting evidence (quote):**
+
+```
+NH$_4$
+```
+
+**Verifier note:** The surrounding_context and related passages consistently refer to ammonia as NH$_3$, not NH$_4$, making the subscript change a clear error.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 6. Surface — Index / Subscript
+
+`hep-ex` / `2604.26386v1` / `P005_S0009`
+
+**Passage (original):**
+
+```
+$^{-3}$
+```
+
+**Perturbation (injected):**
+
+```
+$^{-2}$
+```
+
+**Why it causes an error:** The text defines this value as a 'bulk concentration', which must be measured in units of inverse volume (cm$^{-3}$), not inverse area (cm$^{-2}$).
+
+**Contradicting evidence (quote):**
+
+```
+cm$^{-2}$
+```
+
+**Verifier note:** The surrounding context states the Boron bulk concentration in cm$^{-3}$ (volumetric units), so changing the exponent to $^{-2}$ gives incorrect units for a volumetric concentration.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 7. Surface — Computation
+
+`q-bio_GN` / `2604.20488v1` / `P002_S0006`
+
+**Passage (original):**
+
+```
+$\mathcal{V} = \{A, C, G, T, \bm{m}\}$
+```
+
+**Perturbation (injected):**
+
+```
+$\mathcal{V} = \{A, C, G, T\}$
+```
+
+**Why it causes an error:** The text explicitly states that the vocabulary includes a special mask token 'm' used for the diffusion process. Removing 'm' from the set definition contradicts the subsequent sentence which defines 'm' as a member of the vocabulary.
+
+**Contradicting evidence (quote):**
+
+```
+$\mathcal{V} = \{A, C, G, T\}$
+```
+
+**Verifier note:** The surrounding context and related passages explicitly state the vocabulary includes the mask token m and that V=5, but the perturbed form omits m, making the set have only 4 elements contradicting V=5.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 8. Surface — Numeric
+
+`cs_LG` / `2604.25489v1` / `P002_S0022`
+
+**Passage (original):**
+
+```
+\begin{equation}
+      s(z) = \mathrm{Tukey}\!\left(\frac{z}{z_{\max}}, \alpha\right),
+      \label{eq:tukey}
+    \end{equation}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{equation}
+      s(z) = \mathrm{Tukey}\!\left(\frac{z}{z_{\max}}, 1.5\right),
+      \label{eq:tukey}
+    \end{equation}
+```
+
+**Why it causes an error:** The text immediately following the equation specifies that the taper parameter alpha is 0.5. Setting it to 1.5 in the equation creates a direct contradiction with the descriptive text.
+
+**Contradicting evidence (quote):**
+
+```
+\mathrm{Tukey}\!\left(\frac{z}{z_{\max}}, 1.5\right)
+```
+
+**Verifier note:** The surrounding context immediately after states 'with taper parameter α=0.5', contradicting the hardcoded value of 1.5 replacing the parameter α.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 9. Surface — Index / Subscript
+
+`physics_atm-clus` / `2602.14334v1` / `P003_S0017`
+
+**Passage (original):**
+
+```
+$\mu_\text{g}$
+```
+
+**Perturbation (injected):**
+
+```
+$\mu_\text{total}$
+```
+
+**Why it causes an error:** The text defines the variable as the average molecular mass of the gas. Changing the subscript to 'total' creates an inconsistency with the standard notation for gas molecular mass ($\mu_g$) used in astrophysical disk literature cited in the context.
+
+**Contradicting evidence (quote):**
+
+```
+$\mu_\text{total}$
+```
+
+**Verifier note:** The surrounding context and related passages consistently use $\mu_\text{g}$ for the average molecular mass of the gas; replacing it with $\mu_\text{total}$ introduces an unbound symbol not defined anywhere.
+
+**Your assessment:** ( ) valid error  ( ) not an error  (x) unsure  
+**Notes:** 
+
+### 10. Surface — Index / Subscript
+
+`stat_AP` / `2604.23792v1` / `P001_S0001`
+
+**Passage (original):**
+
+```
+$$p_{t,*}\geq \rho \quad \forall i \in \{\tau-k+1, \tau-k+2, \ldots, \tau\},$$
+```
+
+**Perturbation (injected):**
+
+```
+$$p_{t,*}\geq \rho \quad \forall i \in \{\tau-k, \tau-k+1, \ldots, \tau-1\},$$
+```
+
+**Why it causes an error:** The Accuracy Condition is defined for $k$ consecutive classifications up to and including time $\tau$. The perturbed index range ends at $\tau-1$, which excludes the current classification at time $\tau$ mentioned in the preceding text.
+
+**Contradicting evidence (quote):**
+
+```
+\forall i \in \{\tau-k, \tau-k+1, \ldots, \tau-1\}
+```
+
+**Verifier note:** The surrounding context states the Accuracy Condition applies over {τ-k+1,...,τ} (k timesteps up to and including τ); the perturbed range {τ-k,...,τ-1} shifts the window and excludes τ while including an extra earlier point.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+---
+
+## Claim
+
+### 11. Claim — False theoretical claim
+
+`math_all` / `2604.26864v1` / `P000_S0010`
+
+**Passage (original):**
+
+```
+\begin{lemma} \label{lem:1st}
+	If $\theta_0\geq 1$ is sufficiently large and $\varepsilon>0$ is small enough, then
+	\begin{align}  \nonumber %\label{1st.sub}
+		\|e_{n+}''\|_{H_*^s(\Omega_{T})}+\|e_{n-}''\|_{H^{s+1}(\Omega_{T})}+ \|\tilde{e}_{n}''\|_{H^{s+1}(\Sigma_{T})}
+		\lesssim \varepsilon^2 \theta_n^{\varsigma_2(s)-1}\varDelta_{n}
+	\end{align}
+	for $n=0,1,\ldots,{N}-1$  and $s=6,7,\ldots,\widetilde{\alpha}-2 $,	
+	where $$\varsigma_2(s):=\max\{(s+2-{\alpha })_++12-2{\alpha },\, s+8-2{\alpha } \}.$$
+\end{lemma}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{lemma} \label{lem:1st}
+	If $\theta_0\geq 1$ is sufficiently large and $\varepsilon>0$ is small enough, then
+	\begin{align}  \nonumber %\label{1st.sub}
+		\|e_{n+}''\|_{H_*^s(\Omega_{T})}+\|e_{n-}''\|_{H^{s+1}(\Omega_{T})}+ \|\tilde{e}_{n}''\|_{H^{s+1}(\Sigma_{T})}
+		\lesssim \varepsilon \theta_n^{\varsigma_2(s)-1}\varDelta_{n}
+	\end{align}
+	for $n=0,1,\ldots,{N}-1$  and $s=6,7,\ldots,\widetilde{\alpha}-2 $,	
+	where $$\varsigma_2(s):=\max\{(s+2-{\alpha })_++12-2{\alpha },\, s+8-2{\alpha } \}.$$
+\end{lemma}
+```
+
+**Why it causes an error:** The scaling with respect to the speed of light parameter epsilon is incorrect. In the context of the quadratic substitution errors for the Nash-Moser iteration in RMHD, the error terms should scale as epsilon squared ($\varepsilon^2$) to ensure the convergence of the scheme toward the non-relativistic limit, as seen in the surrounding lemmas (e.g., Lemma 4.11 or 4.12 in similar RMHD literature).
+
+**Contradicting evidence (quote):**
+
+```
+\lesssim \varepsilon \theta_n^{\varsigma_2(s)-1}\varDelta_{n}
+```
+
+**Verifier note:** The surrounding context shows the original bound has ε², but the perturbation changes it to ε, which is a weaker bound inconsistent with the quadratic error term nature described in the context.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 12. Claim — False empirical claim
+
+`hep-ex` / `2604.24435v1` / `P000_S0010`
+
+**Passage (original):**
+
+```
+The ATLAS detector~\cite{PERF-2007-01} at the LHC covers nearly the entire solid angle around the collision point.\footnote{ATLAS uses a right-handed coordinate system with its origin at the nominal interaction point (IP)
+in the center of the detector and the \(z\)-axis along the beam pipe.
+The \(x\)-axis points from the IP to the center of the LHC ring,
+and the \(y\)-axis points upwards.
+Polar coordinates \((r,\phi)\) are used in the transverse plane,
+\(\phi\) being the azimuthal angle around the \(z\)-axis.
+The pseudorapidity is defined in terms of the polar angle \(\theta\) as \(\eta = -\ln \tan(\theta/2)\) and is equal to the rapidity
+$ y = \frac{1}{2} \ln \left( \frac{E + p_z}{E - p_z} \right) $ in the relativistic limit.
+Angular distance is measured in units of \(\Delta R \equiv \sqrt{(\Delta y)^{2} + (\Delta\phi)^{2}}\).}
+It consists of an inner tracking detector surrounded by a thin superconducting solenoid, electromagnetic and hadronic calorimeters,
+and a muon spectrometer incorporating three large superconducting air-core toroidal magnets.
+```
+
+**Perturbation (injected):**
+
+```
+The ATLAS detector~\cite{PERF-2007-01} at the LHC covers nearly the entire solid angle around the collision point.\footnote{ATLAS uses a left-handed coordinate system with its origin at the nominal interaction point (IP)
+in the center of the detector and the \(z\)-axis along the beam pipe.
+The \(x\)-axis points from the IP to the center of the LHC ring,
+and the \(y\)-axis points upwards.
+Polar coordinates \((r,\phi)\) are used in the transverse plane,
+\(\phi\) being the azimuthal angle around the \(z\)-axis.
+The pseudorapidity is defined in terms of the polar angle \(\theta\) as \(\eta = -\ln \tan(\theta/2)\) and is equal to the rapidity
+$ y = \frac{1}{2} \ln \left( \frac{E + p_z}{E - p_z} \right) $ in the relativistic limit.
+Angular distance is measured in units of \(\Delta R \equiv \sqrt{(\Delta y)^{2} + (\Delta\phi)^{2}}\).}
+It consists of an inner tracking detector surrounded by a thin superconducting solenoid, electromagnetic and hadronic calorimeters,
+and a muon spectrometer incorporating three large superconducting air-core toroidal magnets.
+```
+
+**Why it causes an error:** The text incorrectly describes the ATLAS coordinate system as left-handed, whereas standard particle physics experiments like ATLAS use a right-handed coordinate system as defined in the cited performance papers.
+
+**Contradicting evidence (quote):**
+
+```
+ATLAS uses a left-handed coordinate system
+```
+
+**Verifier note:** The surrounding context explicitly states 'ATLAS uses a right-handed coordinate system', and the perturbed version changes this to 'left-handed', directly contradicting the established fact.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 13. Claim — False empirical claim
+
+`hep-ex` / `2604.25034v1` / `P008_S0008`
+
+**Passage (original):**
+
+```
+\begin{centering}
+Jack~Clarke\,\orcidlink{0000-0001-8055-449X},$^{1}$~  
+Preslav Asenov\,\orcidlink{0009-0003-4232-4521},$^{1}$~
+Jesse Smeets\,\orcidlink{0009-0009-5205-4936},$^{2,\,3}$~ \\
+Jia-Shian Wang\,\orcidlink{0000-0003-2499-7039},$^{1}$~ 
+David B. Cassidy\,\orcidlink{0000-0001-8332-5553},$^{1}$~
+and Alessio Serafini\,\orcidlink{0000-0002-4509-7470}$^{1}$\\
+\end{centering}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{centering}
+Albert~Einstein\,$^{1}$~  
+Niels~Bohr\,$^{1}$~
+Erwin~Schrodinger\,$^{2,\,3}$~ \\
+Werner~Heisenberg\,$^{1}$~ 
+Max~Born\,$^{1}$~
+and Alessio Serafini\,\orcidlink{0000-0002-4509-7470}$^{1}$\\
+\end{centering}
+```
+
+**Why it causes an error:** The author list in the Supplemental Material is changed to famous historical physicists, which contradicts the author list that would be associated with the citations and the main paper's metadata.
+
+**Contradicting evidence (quote):**
+
+```
+Albert~Einstein\,$^{1}$~  
+Niels~Bohr\,$^{1}$~
+Erwin~Schrodinger\,$^{2,\,3}$
+```
+
+**Verifier note:** The related_passages explicitly list the actual authors (Jack Clarke, Preslav Asenov, Jesse Smeets, Jia-Shian Wang, David B. Cassidy, Alessio Serafini), contradicting the perturbed author list.
+
+**Your assessment:** ( ) valid error  (x) not an error  ( ) unsure  
+**Notes:** 
+
+### 14. Claim — False empirical claim
+
+`cs_LG` / `2604.25499v1` / `P006_S0006`
+
+**Passage (original):**
+
+```
+\section{Related Work}
+\label{s2}
+\subsection{Feature-based Time Series Classification Approaches}
+The primary objective of feature-based time series classification approaches is to transform a time series into discriminative feature vectors, thereby enhancing the performance of downstream classification tasks. This subsection reviews typical work of three different kinds of approaches.
+```
+
+**Perturbation (injected):**
+
+```
+\section{Related Work}
+\label{s2}
+\subsection{Feature-based Time Series Classification Approaches}
+The primary objective of feature-based time series classification approaches is to transform a time series into random noise vectors, thereby decreasing the performance of downstream classification tasks. This subsection reviews typical work of three different kinds of approaches.
+```
+
+**Why it causes an error:** The text claims the objective of feature-based TSC is to 'decrease performance' and create 'random noise,' which contradicts the definition of 'feature-based approaches' provided in the Introduction (S0000) as methods to 'facilitate downstream classification'.
+
+**Contradicting evidence (quote):**
+
+```
+transform a time series into random noise vectors, thereby decreasing the performance of downstream classification tasks
+```
+
+**Verifier note:** The surrounding_context explicitly states the objective is 'to transform a time series into discriminative feature vectors, thereby enhancing the performance of downstream classification tasks,' directly contradicting the perturbed claim.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 15. Claim — False empirical claim
+
+`physics_atm-clus` / `2603.02894v1` / `P004_S0004`
+
+**Passage (original):**
+
+```
+%	\begin{equation}\label{14}
+%		\psi_{c,k_{e}}^{-}=\left (2\pi   \right )^{-3/2}e^{\pi/2\mathbf{k_{e}}}e^{i\mathbf{k_e} \cdot \mathbf{r}} \Gamma\left ( 1+i/k_{e}\right )_1F_{1}\left [ -i/k_e,1,-i\left (k_{e}r_{1}+\mathbf{k_{e}}\cdot \mathbf{r} \right ) \right ],
+%	\end{equation}
+%	where $E_{k_{e}}$ is ejected electron energy, $M_{n,k_{e}}=\left< \Phi_{n} \left|\boldsymbol{\varepsilon_{0}}\cdot \mathbf{r} \right|\psi_{C,k_{e}}^{-}\right>$.
+	Using wave functions of target, incident, scattered, and ejected electrons in the first Born $\mathit{T}$-matrix element (\eqref{4}) and integrating over time, we obtain $T_{fi}^{B1}$ for the circularly polarized light as:
+	\begin{equation}\label{10}
+		T_{fi}^{B1}=\left ( 2\pi  \right )^{-1} i\sum_{l=-\infty}^{+\infty}\delta \left (E_{k_{s}} + E_{k_{e}} - E_{k_{i}}- E_{0}-l\omega  \right )  e^{i l \gamma}f_{ion}^{B1,l}
+	\end{equation}
+    where 
+    \begin{equation}\label{11}
+\tan(\gamma)
+=\frac{
+-\Delta_{k_s}\sin(\gamma_{k_s})
++\Delta_{k_e}\sin(\gamma_{k_e})
+-\Delta_{k_i}\sin(\gamma_{k_i})
+}{
+\Delta_{k_s}\cos(\gamma_{k_s})
++\Delta_{k_e}\cos(\gamma_{k_e})
++\Delta_{k_i}\cos(\gamma_{k_i})
+}
+\end{equation}
+```
+
+**Perturbation (injected):**
+
+```
+	Using wave functions of target, incident, scattered, and ejected electrons in the first Born $\mathit{T}$-matrix element (\eqref{4}) and integrating over time, we obtain $T_{fi}^{B1}$ for the circularly polarized light as:
+	\begin{equation}\label{10}
+		T_{fi}^{B1}=\left ( 2\pi  \right )^{-1} i\sum_{l=-\infty}^{+\infty}\delta \left (E_{k_{s}} + E_{k_{e}} - E_{k_{i}}- E_{0}-l\omega  \right )  e^{i l \gamma}f_{ion}^{B1,l}
+	\end{equation}
+    where 
+    \begin{equation}\label{11}
+\tan(\gamma)
+=\frac{
+-\Delta_{k_s}\sin(\gamma_{k_s})
+-\Delta_{k_e}\sin(\gamma_{k_e})
+-\Delta_{k_i}\sin(\gamma_{k_i})
+}{
+\Delta_{k_s}\cos(\gamma_{k_s})
++\Delta_{k_e}\cos(\gamma_{k_e})
++\Delta_{k_i}\cos(\gamma_{k_i})
+}
+\end{equation}
+```
+
+**Why it causes an error:** In Eq. 11, the sign of the ejected electron term $\Delta_{k_e}\sin(\gamma_{k_e})$ in the numerator has been changed from positive to negative. This violates the phase matching condition required by the conservation of energy and momentum in the Volkov states defined in Eq. 5.
+
+**Contradicting evidence (quote):**
+
+```
+-\Delta_{k_e}\sin(\gamma_{k_e})
+```
+
+**Verifier note:** The surrounding_context shows the original numerator has '+\Delta_{k_e}\sin(\gamma_{k_e})' but the perturbed version changes it to '-\Delta_{k_e}\sin(\gamma_{k_e})', contradicting the established formula.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 16. Claim — False empirical claim
+
+`econ_EM` / `2604.17676v2` / `P004_S0004`
+
+**Passage (original):**
+
+```
+The contribution of this paper is threefold.
+First, we establish that subsample-based estimation is generically inconsistent in dynamic time series models under contamination, even under oracle knowledge of contamination locations, in the sense that the objective function is no longer aligned with its clean-data counterpart.
+In particular, pointwise subsampling is structurally incompatible with dynamic contamination.
+```
+
+**Perturbation (injected):**
+
+```
+The contribution of this paper is threefold.
+First, we establish that subsample-based estimation is generically consistent in dynamic time series models under contamination, provided the locations are known, because the objective function remains perfectly aligned with its clean-data counterpart.
+In particular, pointwise subsampling is structurally compatible with dynamic contamination.
+```
+
+**Why it causes an error:** The paper's central thesis is the opposite: that subsample-based estimation is generically inconsistent and structurally incompatible with dynamic contamination.
+
+**Contradicting evidence (quote):**
+
+```
+subsample-based estimation is generically consistent in dynamic time series models under contamination
+```
+
+**Verifier note:** The surrounding_context explicitly states 'subsample-based estimation is generically inconsistent in dynamic time series models under contamination', directly contradicting the perturbed claim.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 17. Claim — False empirical claim
+
+`q-bio_GN` / `2604.02203v2` / `P002_S0002`
+
+**Passage (original):**
+
+```
+While classically intractable, this task is well suited to the strengths of quantum computing. Quantum systems are inherently probabilistic and excel at representing the complex, high-dimensional probability landscapes that are difficult to capture with classical computers. A quantum processor with $n$ qubits can naturally explore a state space of $2^n$ dimensions, mirroring the possibly immense combinatorial complexity of a cell's transcriptome. Leveraging superposition and entanglement, a parameterized quantum circuit (PQC) \cite{benedetti2019parameterized} can be trained to approximate intricate probability distributions, making it a promising tool for capturing subtle, system-wide shifts induced by cell-cell interactions. This capability is harnessed within a hybrid quantum-classical optimization loop \cite{cerezo2021variational}, in which a classical optimizer iteratively refines the quantum circuit's parameters to minimize a data-driven cost function.
+```
+
+**Perturbation (injected):**
+
+```
+While classically intractable, this task is well suited to the strengths of quantum computing. Quantum systems are inherently probabilistic and excel at representing the complex, high-dimensional probability landscapes that are difficult to capture with classical computers. A quantum processor with $n$ qubits can naturally explore a state space of $n^2$ dimensions, mirroring the possibly immense combinatorial complexity of a cell's transcriptome. Leveraging superposition and entanglement, a parameterized quantum circuit (PQC) \cite{benedetti2019parameterized} can be trained to approximate intricate probability distributions, making it a promising tool for capturing subtle, system-wide shifts induced by cell-cell interactions. This capability is harnessed within a hybrid quantum-classical optimization loop \cite{cerezo2021variational}, in which a classical optimizer iteratively refines the quantum circuit's parameters to minimize a data-driven cost function.
+```
+
+**Why it causes an error:** The text incorrectly states that $n$ qubits explore a state space of $n^2$ dimensions, whereas the fundamental property of quantum computing (and the logic for handling high-dimensional transcriptomes) relies on the exponential $2^n$ scaling mentioned elsewhere in the paper.
+
+**Contradicting evidence (quote):**
+
+```
+a state space of $n^2$ dimensions
+```
+
+**Verifier note:** The surrounding_context explicitly states 'a state space of $2^n$ dimensions', and the perturbed $n^2$ contradicts the fundamental quantum computing fact that n qubits span a $2^n$-dimensional Hilbert space.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 18. Claim — False theoretical claim
+
+`cs_CC` / `2604.25251v1` / `P000_S0020`
+
+**Passage (original):**
+
+```
+\begin{lemma}\label{lem:strict} 
+For every $\Sigma^{1,b}_1$-formula $\varphi$ there is a 
+$\hat\Sigma^{1,b}_1$-formula $\hat\varphi$ such that
+%\begin{enumerate}\itemsep=0pt
+%\item[(a)] 
+$\U^1_2\vdash (\varphi \to \hat\varphi)$
+%\item[(b)] 
+ and $\S^1_2(\alpha)\vdash (\hat\varphi \to \varphi)$.
+%\end{enumerate}
+\end{lemma}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{lemma}\label{lem:strict} 
+For every $\Sigma^{1,b}_1$-formula $\varphi$ there is a 
+$\hat\Sigma^{1,b}_1$-formula $\hat\varphi$ such that
+$\S^1_2(\alpha)\vdash (\varphi \to \hat\varphi)$
+ and $\U^1_2\vdash (\hat\varphi \to \varphi)$.
+\end{lemma}
+```
+
+**Why it causes an error:** The perturbation swaps the roles of the theories $S^1_2$ and $U^1_2$. In bounded arithmetic, $U^1_2$ is a stronger theory than $S^1_2$; therefore, the implication from the general formula to the restricted (strict) formula should be provable in the stronger theory $U^1_2$, while the reverse implication (that the strict version implies the general one) is provable in the weaker theory $S^1_2$.
+
+**Contradicting evidence (quote):**
+
+```
+$\S^1_2(\alpha)\vdash (\varphi \to \hat\varphi)$ and $\U^1_2\vdash (\hat\varphi \to \varphi)$
+```
+
+**Verifier note:** The surrounding_context shows the original has $\U^1_2\vdash (\varphi \to \hat\varphi)$ and $\S^1_2(\alpha)\vdash (\hat\varphi \to \varphi)$; the perturbation swaps the theories in both implications.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 19. Claim — False empirical claim
+
+`hep-ex` / `2604.24152v1` / `P007_S0007`
+
+**Passage (original):**
+
+```
+\section{Signal and background processes}
+\label{sec:Simulation}
+```
+
+**Perturbation (injected):**
+
+```
+\section{Signal and background processes}
+\label{sec:Simulation}
+
+This section describes the experimental data collected by the FCC-ee detector during its first year of operation.
+```
+
+**Why it causes an error:** The text refers to 'experimental data collected', but the Introduction and Section 2 (S0008) explicitly state this is a 'simulation-based study' using 'MC event generators' and 'detector response modeling' for a future collider.
+
+**Contradicting evidence (quote):**
+
+```
+This section describes the experimental data collected by the FCC-ee detector during its first year of operation.
+```
+
+**Verifier note:** The surrounding_context shows this section describes MC event generation and detector simulation, not experimental data collection; the FCC-ee has not yet operated.
+
+**Your assessment:** ( ) valid error  ( ) not an error  (x) unsure  
+**Notes:** 
+
+### 20. Claim — False empirical claim
+
+`physics_atm-clus` / `2603.11033v1` / `P000_S0000`
+
+**Passage (original):**
+
+```
+Understanding the behavior of atoms and molecules under strong electromagnetic
+fields has been an extensively investigated area of research. A large
+number of theoretical and experimental studies have been devoted to
+exploring a variety of novel phenomena arising from light--matter
+interactions. Many of these works focus on the dynamical behavior
+of diatomic systems, beginning with the simplest hydrogen-like ions
+or molecules and extending to systems containing a large number of
+electrons \cite{Bandrauk1,Bandrauk2,Takasuka1,Takasuka2,Tiwari0,Takasuka3,Tiwari1,Attila1,
+Zhaopeng1,Foudil1,Attila2,Attila3,Zhaopeng2,Zhang1,Zhang2,Zhang3,Zhang4,Umarov1}.
+Nevertheless, numerous other important studies have also addressed
+the photodissociation and fragmentation of polyatomic molecules 
+\cite{Banares4,Banares5,Fabien2,Graham2,Ignacio2,Ignacio3,Weinacht1,Weinacht2,Weinacht3,Yarkony1}.
+```
+
+**Perturbation (injected):**
+
+```
+Understanding the behavior of atoms and molecules under strong electromagnetic
+fields has been an extensively investigated area of research. A large
+number of theoretical and experimental studies have been devoted to
+exploring a variety of novel phenomena arising from light--matter
+interactions. Many of these works focus on the dynamical behavior
+of diatomic systems, beginning with the simplest hydrogen-like ions
+or molecules and extending to systems containing a large number of
+electrons \cite{Bandrauk1,Bandrauk2,Takasuka1,Takasuka2,Tiwari0,Takasuka3,Tiwari1,Attila1,
+Zhaopeng1,Foudil1,Attila2,Attila3,Zhaopeng2,Zhang1,Zhang2,Zhang3,Zhang4,Umarov1}.
+Nevertheless, studies have exclusively focused on diatomic systems, and no research has yet addressed
+the photodissociation and fragmentation of polyatomic molecules 
+\cite{Banares4,Banares5,Fabien2,Graham2,Ignacio2,Ignacio3,Weinacht1,Weinacht2,Weinacht3,Yarkony1}.
+```
+
+**Why it causes an error:** The perturbed text claims no research has addressed polyatomic molecules, which is directly contradicted by the citations provided in the same sentence (Banares, Weinacht, Yarkony, etc.) and the original text's acknowledgment of 'numerous other important studies' on polyatomic systems.
+
+**Contradicting evidence (quote):**
+
+```
+studies have exclusively focused on diatomic systems, and no research has yet addressed
+```
+
+**Verifier note:** The surrounding_context explicitly states 'numerous other important studies have also addressed the photodissociation and fragmentation of polyatomic molecules', directly contradicting the perturbed claim.
+
+**Your assessment:** ( ) valid error  ( ) not an error  (x) unsure  
+**Notes:** 
+
+---
+
+## Logic
+
+### 21. Logic — Circular reasoning
+
+`cs_CC` / `2604.19872v1` / `P009_S0009`
+
+**Passage (original):**
+
+```
+\begin{proof}
+The irreducible components of the variety $\calZ_k(\calN_n) \subseteq \calN_{n}^{\times k}$ are the permutations of the products of linear spaces 
+\[
+\frakm \times \frakm \times \calN_{n} \times \cdots \times \calN_{n} \quad  \text{ or } \quad \{ 0 \} \times \calN_{n} \times \cdots \times \calN_{n},
+\]
+which have codimension $2$ and $n+1$ respectively. Indeed, suppose $(a_1 \vvirg a_k)$ does not belong to a product of the first type. Then $a_j$ is invertible for at least $k-1$ indices $j$, because every element in $\calN_n \setminus \frakm$ is invertible. The condition $a_1 \cdots a_k = 0$ would then imply that the unique non-invertible element among $a_1 \vvirg a_k$ is zero, therefore $(a_1 \vvirg a_k)$ belongs to a component of the second type. This shows $\GR(T^{(k)}_{\calN_n}) = 2$, which by \eqref{eqn: Q bounded by Grank} yields the bound $\uQ(T^{(k)}_{\calN_n}) \leq 2$ for every $k$.
+
+If $k=2$, the lower bound $\rmQ(T^{(2)}_{\calN_n}) \geq 2$ when $n \geq 2$ follows from \cite[Thm.~1.10]{CGZGap} since $T^{(2)}_{\calN_n}$ is concise. If $n = 1$, then $T^{(k)}_{\calN_1}$ is isomorphic to the $W$-tensor on $k+1$ factors and we have $\rmQ(T^{(k)}_{\calN_1}) = \uQ(T^{(k)}_{\calN_1}) = 1$. The upper bound $\uQ(T^{(k)}_{\calN_n}) \leq 1$ when $k \geq 3$ follows from \cref{prop: socle prop}.
+\end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}
+To determine the geometric rank, we first note that by the second part of the proposition, $\uQ(T^{(k)}_{\calN_n}) = 2$ for $k=2$ and $n \geq 2$. By the inequality $\uQ(T) \leq \GR(T)$ given in equation (3.1), it follows that $\GR(T^{(2)}_{\calN_n}) \geq 2$. 
+
+Since we also know $\GR(T^{(k)}_{\calN_n}) \leq 2$ from the codimension of the components of the zero-product variety, we must have $\GR(T^{(k)}_{\calN_n}) = 2$. This confirms the value of the geometric rank, which we then use to bound the subrank $\uQ$ for all $k$.
+\end{proof}
+```
+
+**Why it causes an error:** The proof uses the value of the border subrank $\uQ$ (which is what the proposition is trying to establish) to prove the value of the geometric rank $\GR$, which is then used to justify the border subrank value, creating a circular dependency.
+
+**Contradicting evidence (quote):**
+
+```
+by the second part of the proposition, $\uQ(T^{(k)}_{\calN_n}) = 2$ for $k=2$ and $n \geq 2$
+```
+
+**Verifier note:** The perturbed proof uses the border subrank value (part of what is being proved) to derive the geometric rank, then uses the geometric rank to bound the border subrank — a circular argument.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 22. Logic — Induction error
+
+`math_all` / `2604.26913v1` / `P007_S0017`
+
+**Passage (original):**
+
+```
+\begin{proof}[Proof of Lem.~\ref{lem: lipschitz grad}]
+    By \eqref{eq: riemannian gradient}~f. and \eqref{eq: euclidean grad},
+    we obtain 
+    \begin{align*}
+        &\hspace{-20pt}\|\grad f(v) - \grad f(w)\|
+        = \|\nabla f(v) - \nabla f(w)\|\\
+        & = 2\left\|\frac{\norm{B v}^2 A^{\tT}A v - \norm{A v}^2 B^{\tT}B v}{\norm{B v}^4}
+        - \frac{\norm{B w}^2 A^{\tT}A w - \norm{A w}^2 B^{\tT}B w}{\norm{B w}^4}\right\| \\
+        & = 2\biggl\|\frac{\norm{B w}^4\norm{B v}^2 A^{\tT}A v - \norm{A v}^2 \norm{B w}^4B^{\tT}B v}{\norm{B v}^4\norm{B w}^4} \\
+        & \qquad\qquad - \frac{\norm{B v}^4\norm{B w}^2 A^{\tT}A w - \norm{A w}^2 \norm{B v}^4B^{\tT}B w}{\norm{B v}^4\norm{B w}^4}\biggr\| \\
+        & \leq \tfrac{2}{\lambda_{d}(B^{\tT}B)^4} 
+        \Bigl[\norm{B v}^2 \norm{B w}^4 \|A^{\tT}A(v - w)\| % \\
+        + \norm{A v}^2 \norm{B w}^4 \|B^{\tT}B(v - w)\| \\
+        &\qquad\qquad\qquad\quad + \bigl|\norm{B v}^2 \norm{B w}^4 - \norm{B v}^4 \norm{B w}^2\bigr| \norm{A^{\tT}A w} \\
+        &\qquad\qquad\qquad\quad + \bigl|\norm{A v}^2 \norm{B w}^4 - \norm{A w}^2 \norm{B v}^4\bigr| \norm{B^{\tT}B w}\Bigl] \\
+        & \leq \tfrac{20}{\lambda_{d}(B^{\tT}B)^4}
+        \|A\|^2\|B\|^6
+        \|v - w\|,
+        \qquad\qquad v, w \in \bb S^{d-1}
+    \end{align*}
+    since 
+    \begin{align*}
+        \bigl|\norm{B v}^2 \norm{B w}^4 - \norm{B v}^4 \norm{B w}^2\bigr|
+        & = \norm{B v}^2 \norm{B w}^2 \underbracket{\bigl|\norm{B w}^2 - \norm{B v}^2\bigr|}_{\leq 2 \norm{B}^2 \norm{v - w}} % \\
+        \leq 2 \norm{B}^6 \norm{v - w},
+    \end{align*}
+    for any $v, w \in \bb S^{d-1}$ and 
+    \begin{align*}
+        &\bigl|\norm{A v}^2 \norm{B w}^4 - \norm{A w}^2 \norm{B v}^4\bigr|
+        \leq \norm{A v}^2 \bigl|\norm{B v}^4 - \norm{B w}^4\bigr|
+        + \norm{B v}^4\bigl|\norm{A v}^2 - \norm{A w}^2\bigr| \\
+        &\qquad \leq \norm{A v}^2\underbracket{\bigl|\norm{B v}^2 + \norm{B w}^2\bigr|}_{\leq 2 \|B\|^2}
+        \cdot \underbracket{\bigl|\norm{B v}^2 - \norm{B w}^2\bigr|}_{\leq 2 \|B\|^2 \|v - w\|} %\\
+        + \norm{B v}^4\underbracket{\bigl|\norm{A v}^2 - \norm{A w}^2\bigr|}_{\leq \|A\|^2\|v - w\|} \\
+        &\qquad\leq 6 \|A\|^2 \|B\|^4 \norm{v - w},
+    \end{align*}
+    for any $v, w \in \bb S^{d-1}$.
+\end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}[Proof of Lem.~\ref{lem: lipschitz grad}]
+    We prove the Lipschitz constant by induction on the dimension $d$. 
+    For $d=1$, the sphere is a point and the gradient difference is zero, satisfying the bound. 
+    Assuming the bound holds for $d=n$, we observe that the gradient on $\bb S^n$ can be decomposed into a component on $\bb S^{n-1}$ and a radial component. Since both components are Lipschitz, the sum is Lipschitz with the same constant.
+\end{proof}
+```
+
+**Why it causes an error:** The inductive step is logically flawed; the Lipschitz constant of a function on a higher-dimensional sphere does not automatically inherit the constant from a lower dimension via simple summation, especially for this specific rational function.
+
+**Contradicting evidence (quote):**
+
+```
+We prove the Lipschitz constant by induction on the dimension $d$.
+```
+
+**Verifier note:** The perturbed proof uses induction on dimension d, but the surrounding_context shows the original proof establishes the Lipschitz bound directly for all v,w ∈ S^{d-1} via explicit norm estimates; the inductive step's claim that the gradient on S^n decomposes into S^{n-1} and radial components with the same Lipschitz constant is not justified and does not reduce correctly.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 23. Logic — Invalid implication
+
+`cs_CC` / `2604.21531v1` / `P003_S0003`
+
+**Passage (original):**
+
+```
+\begin{proof}
+Let $R \subseteq [q]^r$ be a permutation-invariant relation.
+Consider an instance of $R$-\CLC, consisting of a graph $G=(V,E)$, a list function \mbox{$L:V \to \calP([q])$}, and a collection $\calF \subseteq V^r$. Define a transformation that, given such an instance, returns the pair $(G',\calF)$, where $G'=(V',E')$ is the graph obtained from $G$ by adding a clique on $q$ vertices, denoted by $z_1, \ldots, z_q$, and connecting each vertex $v \in V$ to all vertices $z_i$ with \mbox{$i \in [q] \setminus L(v)$}. The number of vertices in $G'$ is $|V|+q$, hence the transformation is linear-parameter.
+
+For correctness, suppose first that $(G,L,\calF)$ is a $\YES$ instance of $R$-\CLC, and consider a proper list-coloring $c: V \to [q]$ of $(G,L)$, such that for every $r$-tuple $(x_1, \ldots, x_r) \in \calF$, it holds that $(c(x_1),\ldots,c(x_r)) \in R$. Let $c'$ be the coloring of $G'$ that extends $c$ by assigning to the vertex $z_i$ the color $i$ for all $i \in [q]$. The coloring $c'$ clearly assigns distinct colors to the endpoints of every edge of $G$ and of every edge in the clique $\{z_1, \ldots, z_q\}$. Further, since $c$ respects the list function $L$, every vertex $v \in V$ satisfies $c'(v) = c(v) \in L(v)$, and thus $c'(v) \neq c'(z_i)$ whenever $i \in [q] \setminus L(v)$. This implies that $c'$ forms a proper coloring of $G'$. As an extension of $c$, it also satisfies the constraints in $\calF$, so it forms a valid solution for the instance $(G',\calF)$ of $R$-\CC.
+
+Conversely, suppose that $(G',\calF)$ is a $\YES$ instance of $R$-\CC, and consider a proper coloring $c':V' \to [q]$ of $G'$, such that for every $r$-tuple $(x_1, \ldots, x_r) \in \calF$, it holds that $(c'(x_1),\ldots,c'(x_r)) \in R$. Since the vertices $z_1, \ldots, z_q$ are pairwise adjacent in $G'$, they receive distinct colors under $c'$, so there exists a permutation $\pi:[q] \to [q]$ such that $\pi(c'(z_i)) = i$ for all $i \in [q]$. Let $c: V \to [q]$ denote the coloring of $G$ defined by $c(v) = \pi(c'(v))$ for all $v \in V$. For every pair of adjacent vertices $u$ and $v$ in $G$, it holds that $c'(u) \neq c'(v)$, and thus $c(u) \neq c(v)$, so $c$ is a proper coloring of $G$. Furthermore, $c$ respects the list function $L$. Indeed, for every vertex $v$ and every $i \in [q] \setminus L(v)$, the vertices $v$ and $z_i$ are adjacent in $G'$, hence $c'(v) \neq c'(z_i)$, which implies that $c(v) = \pi(c'(v)) \neq \pi(c'(z_i)) = i$, yielding that $c(v) \in L(v)$. Finally, for every $r$-tuple $(x_1, \ldots, x_r) \in \calF$, it holds that $(c'(x_1),\ldots,c'(x_r)) \in R$, and since $R$ is permutation-invariant, it follows that $(c(x_1),\ldots,c(x_r)) \in R$. This shows that $(G,L,\calF)$ is a $\YES$ instance of $R$-\CLC, as required.
+\end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}
+Let $R \subseteq [q]^r$ be a permutation-invariant relation.
+Consider an instance of $R$-\CLC, consisting of a graph $G=(V,E)$, a list function \mbox{$L:V \to \calP([q])$}, and a collection $\calF \subseteq V^r$. Define a transformation that returns $(G',\calF)$, where $G'$ is obtained from $G$ by adding a clique $z_1, \ldots, z_q$ and connecting each $v \in V$ to all $z_i$ with $i \in L(v)$.
+
+For correctness, suppose $(G,L,\calF)$ is a $\YES$ instance with coloring $c$. Let $c'$ extend $c$ by $c'(z_i)=i$. Since $c(v) \in L(v)$, and $v$ is adjacent to $z_i$ for all $i \in L(v)$, we have $c'(v) = c(v) = i = c'(z_i)$ for some $i$, which means $c'$ is a proper coloring.
+\end{proof}
+```
+
+**Why it causes an error:** The perturbation connects $v$ to $z_i$ for $i \in L(v)$ and then claims that $c'(v) = c'(z_i)$ makes the coloring proper. In a proper coloring, adjacent vertices must have distinct colors ($c'(v) \neq c'(z_i)$).
+
+**Contradicting evidence (quote):**
+
+```
+connecting each $v \in V$ to all $z_i$ with $i \in L(v)$
+```
+
+**Verifier note:** The surrounding context explicitly states the construction connects v to z_i for i∈[q]\L(v) (the complement), not i∈L(v); this reversal breaks the list-coloring enforcement mechanism.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 24. Logic — Missing case
+
+`math_all` / `2604.26898v1` / `P002_S0012`
+
+**Passage (original):**
+
+```
+\begin{proof}[Proof of Lemma~\ref{l:sync_main}]
+We define
+\begin{equ}\label{e:FG}
+\bar \lambda' := - \inf_{u \in [-1,1]} F(u),
+\qquad
+F(u):=
+\frac{\beta e^{\beta u}G(u)}
+{e^\beta-e^{\beta u}},
+\end{equ}
+where
+\begin{equ}
+G(u)
+:=
+\kiso(u)\bigl(d-2+u^2-\beta u(1-u^2)\bigr)
++
+\kiso(1)\bigl(\beta(1-u^2)-(d-1)u\bigr).
+\end{equ}
+We decompose
+\begin{equ}
+G(u)=G_0(u)+\beta G_1(u),
+\end{equ}
+where
+\begin{align}\label{e:G}
+G_0(u)
+&:=
+\kiso(u)(d-2+u^2)-\kiso(1)(d-1)u,\\
+G_1(u)
+&:=
+(1-u^2)(\kiso(1)-u\kiso(u)).
+\end{align}
+
+We prove that \(G(u)>0\) for every \(u\in[-1,1)\), and then study the limit
+as \(u\to1^-\).
+First, since $|P_{n,d}(u)|\leq 1 = P_{n,d}(1)$ \cite[Section 4.7]{szeg1939orthogonal}, the decomposition \eqref{e:decomposition} yields
+\begin{equ}
+|\kiso(u)|\le \sum_{n=0}^{\infty} c_n |P_{n,d}(u)| \leq \sum_{n=0}^{\infty} c_n P_{n,d}(1)=  \kiso(1),
+\qquad u\in[-1,1].
+\end{equ}
+Hence
+\begin{equ}
+u\kiso(u)\le |u|\,|\kiso(u)|\le \kiso(1),
+\end{equ}
+and therefore
+\begin{equ}\label{e:G1}
+G_1(u)=(1-u^2)(\kiso(1)-u\kiso(u))\ge0,
+\qquad u\in[-1,1].
+\end{equ}
+
+We now prove positivity of \(G_0\). Since
+\begin{equ}
+G_0(u)
+=
+\kiso(1)(d-2+u^2)
+\left(
+\frac{\kiso(u)}{\kiso(1)}
+-
+\frac{(d-1)u}{d-2+u^2}
+\right),
+\end{equ}
+and \(\kiso(1)(d-2+u^2)>0\), it is enough to show that
+\begin{equ}\label{e:intermediate}
+\frac{\kiso(u)}{\kiso(1)}
+>
+\frac{(d-1)u}{d-2+u^2},
+\qquad u\in[-1,1).
+\end{equ}
+To prove this, we use the inequality from \cite[Eq. 1]{hrycak2019inequalities}
+\begin{equ}
+P_{n,d}(u)\ge 1-P'_{n,d}(1)(1-u),
+\qquad u\in[-1,1]\,.
+\end{equ}
+so that we have
+\begin{equ}
+\kiso(u)
+=
+\sum_{n=0}^{\infty}c_nP_{n,d}(u)
+\ge
+\sum_{n=0}^{\infty}c_n
+-
+(1-u)\sum_{n=0}^{\infty}c_nP'_{n,d}(1)=\kiso(1)-\kiso'(1)(1-u),
+\end{equ}
+Then, by Assumption~\ref{ass:sync_dissipation} we have $\kiso'(1) < \kiso(1)(d-3)/(d-1)$ so that 
+\begin{equ}
+\kiso(u) > \kiso(1)\left(1 - \frac{d-3}{d-1}(1-u) \right) = \kiso(1) \frac {2 + (d-3)u}{d-1} 
+\end{equ}
+We finally obtain the desired bound \eqref{e:intermediate} by noting that for $u \in [-1,1)$ we have
+\begin{align*}
+\frac{2+(d-3)u}{d-1}
+-
+\frac{(d-1)u}{d-2+u^2}=
+\frac{(1-u)^2\bigl((d-3)u+2d-4\bigr)}
+{(d-1)(d-2+u^2)} \geq \frac{(1-u)^2}
+{d-2+u^2} > 0.
+\end{align*}
+where we used that for \(u\in[-1,1]\) and \(d>3\),
+\begin{equ}
+(d-3)u+2d-4\ge d-1>0.
+\end{equ}
+Combining $G_0 > 0$ from \eqref{e:intermediate} with \eqref{e:G1}, this gives
+\begin{equ}
+G(u)=G_0(u)+\beta G_1(u)>0,
+\qquad u\in[-1,1).
+\end{equ}
+and \(F(u)>0\) for every \(u\in[-1,1)\) as claimed.
+
+It remains to study the limit as \(u\to1^-\). Since \(G_1(u)=O((1-u)^2)\), we
+only need the first-order expansion of \(G_0\):
+\begin{equ}
+G_0(u)
+=
+\bigl(\kiso(1)(d-3)-\kiso'(1)(d-1)\bigr)(1-u)
++
+o(1-u).
+\end{equ}
+Moreover,
+\begin{equ}
+e^\beta-e^{\beta u}
+=
+\beta e^\beta(1-u)+o(1-u).
+\end{equ}
+Therefore
+\begin{equ}
+\lim_{u\to1^-}F(u)
+=
+\kiso(1)(d-3)-\kiso'(1)(d-1)
+=
+-\bar\lambda
+>0.
+\end{equ}
+Hence \(F\) extends continuously and positively to \([-1,1]\). Since the
+extended \(F\) is continuous on the compact interval \([-1,1]\), we obtain
+\begin{equ}
+\inf_{u\in[-1,1]}F(u)>0.
+\end{equ}
+Thus
+\begin{equ}
+\bar\lambda':=-\inf_{u\in[-1,1]}F(u)<0.
+\end{equ}
+\end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}[Proof of Lemma~\ref{l:sync_main}]
+We define
+\begin{equ}\label{e:FG}
+\bar \lambda' := - \inf_{u \in [-1,1]} F(u),
+\qquad
+F(u):=
+\frac{\beta e^{\beta u}G(u)}
+{e^\beta-e^{\beta u}},
+\end{equ}
+where
+\begin{equ}
+G(u)
+:=
+\kiso(u)\bigl(d-2+u^2-\beta u(1-u^2)\bigr)
++
+\kiso(1)\bigl(\beta(1-u^2)-(d-1)u\bigr).
+\end{equ}
+We decompose
+\begin{equ}
+G(u)=G_0(u)+\beta G_1(u),
+\end{equ}
+where
+\begin{align}\label{e:G}
+G_0(u)
+&:=
+\kiso(u)(d-2+u^2)-\kiso(1)(d-1)u,\\
+G_1(u)
+&:=
+(1-u^2)(\kiso(1)-u\kiso(u)).
+\end{align}
+
+We prove that \(G(u)>0\) for every \(u\in[-1,1)\). First, since $|P_{n,d}(u)|\leq 1 = P_{n,d}(1)$, the decomposition yields
+\begin{equ}
+|\kiso(u)|\le \sum_{n=0}^{\infty} c_n |P_{n,d}(u)| \leq \sum_{n=0}^{\infty} c_n P_{n,d}(1)=  \kiso(1),
+\qquad u\in[-1,1].
+\end{equ}
+Hence
+\begin{equ}
+u\kiso(u)\le |u|\,|\kiso(u)|\le \kiso(1),
+\end{equ}
+and therefore
+\begin{equ}\label{e:G1}
+G_1(u)=(1-u^2)(\kiso(1)-u\kiso(u))\ge0,
+\qquad u\in[-1,1].
+\end{equ}
+
+We now prove positivity of \(G_0\). Since
+\begin{equ}
+G_0(u)
+=
+\kiso(1)(d-2+u^2)
+\left(
+\frac{\kiso(u)}{\kiso(1)}
+-
+\frac{(d-1)u}{d-2+u^2}
+\right),
+\end{equ}
+it is enough to show that
+\begin{equ}\label{e:intermediate}
+\frac{\kiso(u)}{\kiso(1)}
+>
+\frac{(d-1)u}{d-2+u^2},
+\qquad u\in[-1,1).
+\end{equ}
+To prove this, we use the inequality $P_{n,d}(u)\ge 1-P'_{n,d}(1)(1-u)$. By Assumption~\ref{ass:sync_dissipation} we have $\kiso'(1) < \kiso(1)(d-3)/(d-1)$ so that 
+\begin{equ}
+\kiso(u) > \kiso(1)\left(1 - \frac{d-3}{d-1}(1-u) \right) = \kiso(1) \frac {2 + (d-3)u}{d-1} 
+\end{equ}
+We finally obtain the desired bound \eqref{e:intermediate} by noting that for $u \in [-1,1)$ we have
+\begin{align*}
+\frac{2+(d-3)u}{d-1}
+-
+\frac{(d-1)u}{d-2+u^2}=
+\frac{(1-u)^2\bigl((d-3)u+2d-4\bigr)}
+{(d-1)(d-2+u^2)} > 0.
+\end{align*}
+Combining $G_0 > 0$ with $G_1 \ge 0$, this gives $G(u)>0$ for $u \in [-1,1)$. Since $F(u)$ is continuous on $[-1,1]$, we obtain $\inf F(u) > 0$ and $\bar\lambda' < 0$.
+\end{proof}
+```
+
+**Why it causes an error:** The proof removes the critical analysis of the limit as $u \to 1^-$. Since the denominator $e^\beta-e^{\beta u}$ of $F(u)$ vanishes at $u=1$, $F$ is not even defined there, so neither its continuity on the closed interval $[-1,1]$ nor $\inf F > 0$ can be concluded without explicitly checking the limit at the boundary using the first-order expansion of $G_0$.
+
+**Contradicting evidence (quote):**
+
+```
+Since $F(u)$ is continuous on $[-1,1]$, we obtain $\inf F(u) > 0$
+```
+
+**Verifier note:** The perturbed proof omits the analysis of the limit as u→1⁻ (the boundary case), which is a non-trivial case in the original since F has a 0/0 form at u=1 that requires a L'Hôpital or Taylor-expansion argument; the continuity of F at u=1 is not established in the perturbed version.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 25. Logic — Invalid implication
+
+`math_all` / `2604.26918v1` / `P002_S0012`
+
+**Passage (original):**
+
+```
+\begin{proof}
+	For the sake of simplicity, $a$ denotes the symbol $a_0$ along the proof.
+	Elementary calculations show that $Q_j\gamma^{n,a} I Q_k=\gamma_{jk}^{n,a}E_{jk}$ and $E_{jk}E_{kl}=E_{jl}$.
+	From (\ref{fQjQk}) and (\ref{fQjQkQl}) we obtain
+	\begin{align*}
+		%\langle Q_j \gamma^{n,a}(x_0)Q_k v,v\rangle &=\langle Q_j \gamma^{n,a}(x_1)Q_kw,w\rangle\\
+		\gamma_{jk}^{n,a}(x_0)v_k\overline{v_j}&=\gamma_{jk}^{n,a}(x_1)w_k\overline{w_j}, \label{gammaTP1} \\
+		%\langle Q_j\gamma^{n,a}(x_0)Q_k\gamma^{n,a}(x_0)Q_l v,v\rangle 
+		%&=\langle Q_j\gamma^{n,a}(x_1)Q_k\gamma^{n,a}(x_1)Q_lw,w\rangle\\
+		\gamma_{jk}^{n,a}(x_0)\gamma_{kl}^{n,a}(x_0)v_l\overline{v_j}
+		&=\gamma_{jk}^{n,a}(x_1)\gamma_{kl}^{n,a}(x_1)w_l\overline{w_j} %\label{gammaTP2},
+	\end{align*}
+	for all $j,k,l=1,\dots,n.$ 
+	Note that
+	\begin{align*}
+		\gamma_{jk}^{n,a}(x)&=\int_{0}^{\alpha x}\ell_{j-1}(y)\ell_{k-1}(y)dy \neq 0 \ \ \forall x\in (0,\infty).
+	\end{align*}
+	Thus, $v_k \overline{v_j}\neq 0$ if and only if $w_k \overline{w_j}\neq 0$.
+	In particular, $v_j\neq 0$ if and only if $w_j\neq 0$.
+	Therefore
+	\begin{equation}\label{razon}
+		\frac{\gamma_{jk}^{n,a}(x_0)}{\gamma_{jk}^{n,a}(x_1)}=\frac{w_k\overline{w_j}}{v_k\overline{v_j}}, 
+	\end{equation}
+	
+	\begin{equation}\label{razon2}
+		\frac{\gamma_{jk}^{n,a}(x_0)}{\gamma_{jk}^{n,a}(x_1)}\cdot
+		\frac{ \gamma_{kl}^{n,a}(x_0)}{\gamma_{kl}^{n,a}(x_1)}
+		=\frac{w_l\overline{w_j}}{v_l\overline{v_j}}
+	\end{equation}
+	whenever $v_k\overline{v_j}\neq 0$ and  $v_l\overline{v_j}\neq 0$. Since $v \neq 0$, there exists $k \in \{1,\dots,n\}$ such that $v_k\neq 0$.
+	Then, using (\ref{razon}) and (\ref{razon2}) with $j=l=k$ we get that
+	\begin{equation*}
+		\frac{w_k\overline{w_k}}{v_k\overline{v_k}}
+		\frac{w_k\overline{w_k}}{v_k\overline{v_k}}
+		=\frac{w_k\overline{w_k}}{v_k\overline{v_k}}
+	\end{equation*}
+	or $|w_k|^2=|v_k|^2.$
+	Using last result and (\ref{razon}) with $j=k$ we obtain $\gamma_{kk}^{n,a}(x_0)=\gamma_{kk}^{n,a}(x_1)$. That is,
+	$$\int_0^{x_0 \alpha} (\ell_{k-1}(y))^2dy=\int_0^{x_1 \alpha} (\ell_{k-1}(y))^2dy,$$
+	which implies that $x_0=x_1$.
+	Consequently, from (\ref{razon}) we find that
+	$v_k\overline{v_j}=w_k\overline{w_j}$
+	for all $j$. Thus, if $v_j\neq 0$, then
+	$$\frac{v_k}{w_k}=\frac{\overline{w_j}}{\overline{v_j}}.$$
+	As in the proof of Lemma \ref{lem:sep-x0-x1-2}, $\lambda:= \frac{\overline{w}_j}{\overline{v}_j}$ does not depend on $j$. As a consequence,	$v=\lambda w$ with $|\lambda|=1.$
+\end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}
+	For the sake of simplicity, $a$ denotes the symbol $a_0$ along the proof.
+	Elementary calculations show that $Q_j\gamma^{n,a} I Q_k=\gamma_{jk}^{n,a}E_{jk}$ and $E_{jk}E_{kl}=E_{jl}$.
+	From (\ref{fQjQk}) and (\ref{fQjQkQl}) we obtain
+	\begin{align*}
+		\gamma_{jk}^{n,a}(x_0)v_k\overline{v_j}&=\gamma_{jk}^{n,a}(x_1)w_k\overline{w_j}, \label{gammaTP1} \\
+		\gamma_{jk}^{n,a}(x_0)\gamma_{kl}^{n,a}(x_0)v_l\overline{v_j}
+		&=\gamma_{jk}^{n,a}(x_1)\gamma_{kl}^{n,a}(x_1)w_l\overline{w_j} %\label{gammaTP2},
+	\end{align*}
+	for all $j,k,l=1,\dots,n.$ 
+	Note that
+	\begin{align*}
+		\gamma_{jk}^{n,a}(x)&=\int_{0}^{\alpha x}\ell_{j-1}(y)\ell_{k-1}(y)dy \neq 0 \ \ \forall x\in (0,\infty).
+	\end{align*}
+	Thus, $v_k \overline{v_j}\neq 0$ if and only if $w_k \overline{w_j}\neq 0$.
+	In particular, $v_j\neq 0$ if and only if $w_j\neq 0$.
+	Therefore
+	\begin{equation}\label{razon}
+		\frac{\gamma_{jk}^{n,a}(x_0)}{\gamma_{jk}^{n,a}(x_1)}=\frac{w_k\overline{w_j}}{v_k\overline{v_j}}, 
+	\end{equation}
+	
+	\begin{equation}\label{razon2}
+		\frac{\gamma_{jk}^{n,a}(x_0)}{\gamma_{jk}^{n,a}(x_1)}\cdot
+		\frac{ \gamma_{kl}^{n,a}(x_0)}{\gamma_{kl}^{n,a}(x_1)}
+		=\frac{w_l\overline{w_j}}{v_l\overline{v_j}}
+	\end{equation}
+	whenever $v_k\overline{v_j}\neq 0$ and  $v_l\overline{v_j}\neq 0$. Since $v \neq 0$, there exists $k \in \{1,\dots,n\}$ such that $v_k\neq 0$.
+	Then, using (\ref{razon}) and (\ref{razon2}) with $j=l=k$ we get that
+	\begin{equation*}
+		\frac{w_k\overline{w_k}}{v_k\overline{v_k}}
+		\frac{w_k\overline{w_k}}{v_k\overline{v_k}}
+		=\frac{w_k\overline{w_k}}{v_k\overline{v_k}}
+	\end{equation*}
+	or $|w_k|^2=|v_k|^2.$
+	Using last result and (\ref{razon}) with $j=k$ we obtain $\gamma_{kk}^{n,a}(x_0)=\gamma_{kk}^{n,a}(x_1)$. That is,
+	$$\int_0^{x_0 \alpha} (\ell_{k-1}(y))^2dy=\int_0^{x_1 \alpha} (\ell_{k-1}(y))^2dy,$$
+	which implies that $x_0=x_1$ because the integrand $(\ell_{k-1}(y))^2$ is non-negative and its integral is a non-decreasing function of the upper limit.
+	Consequently, from (\ref{razon}) we find that
+	$v_k\overline{v_j}=w_k\overline{w_j}$
+	for all $j$. Thus, if $v_j\neq 0$, then
+	$$\frac{v_k}{w_k}=\frac{w_j}{v_j}.$$
+	As in the proof of Lemma \ref{lem:sep-x0-x1-2}, $\lambda:= \frac{w_j}{v_j}$ does not depend on $j$. As a consequence,	$v=\lambda w$ with $|\lambda|=1.$
+\end{proof}
+```
+
+**Why it causes an error:** The implication $v_k/w_k = w_j/v_j$ does not follow from the previous line $v_k\overline{v_j}=w_k\overline{w_j}$. Dividing both sides of that identity by $w_k\overline{v_j}$ gives the correct statement, which involves complex conjugates: $v_k/w_k = \overline{w_j}/\overline{v_j}$.
+
+**Contradicting evidence (quote):**
+
+```
+$$\frac{v_k}{w_k}=\frac{w_j}{v_j}.$$
+```
+
+**Verifier note:** The original has v_k/w_k = conjugate(w_j)/conjugate(v_j), but the perturbed version writes w_j/v_j without conjugates, which is incorrect and contradicts the surrounding context showing lambda = conjugate(w_j)/conjugate(v_j).
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 26. Logic — Invalid implication
+
+`math_all` / `2604.26944v1` / `P001_S0001`
+
+**Passage (original):**
+
+```
+\begin{proof}
+The left-coprimality of $U,V$ is a property of lclms in $\Sh$.
+
+  By design, $d\lclm_\Sh (A,B)=(dU)A=(dU)B$ is a common left multiple
+  of $A$ and $B$ in $\RecI$. It has minimal degree in $S_n$, since
+  $\lclm_\Sh$ does in~$\Sh$. 
+This implies that there exists $\delta\in\K[n]$ such that
+$\mclm_\RecI(A,B)=\delta\lclm_\Sh(A,B)$. Right division by $A$ shows that $\delta U\in\RecI$. Similarly, $\delta V\in\RecI$. This implies that $\delta$ is a multiple of $d$ and therefore is equal to $d$ by minimality.
+\end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}
+The left-coprimality of $U,V$ is a property of lclms in $\Sh$.
+
+  By design, $d\lclm_\Sh (A,B)=(dU)A=(dV)B$ is a common left multiple
+  of $A$ and $B$ in $\RecI$. It has minimal degree in $S_n$, since
+  $\lclm_\Sh$ does in~$\Sh$. 
+This implies that there exists $\delta\in\K[n]$ such that
+$\mclm_\RecI(A,B)=\delta\lclm_\Sh(A,B)$. Right division by $A$ shows that $\delta U\in\RecI$. Similarly, $\delta V\in\RecI$. Since $d$ is the least common multiple of the denominators of $U$ and $V$, and $\delta U, \delta V$ are in $\RecI$, it follows that $d$ must be a multiple of $\delta$, and therefore $\delta=d$ by minimality.
+\end{proof}
+```
+
+**Why it causes an error:** The logic is reversed: if $\delta U$ and $\delta V$ are polynomials (in $\RecI$), then $\delta$ must be a multiple of the least common multiple of the denominators of $U$ and $V$ (which is $d$), not the other way around.
+
+**Contradicting evidence (quote):**
+
+```
+it follows that $d$ must be a multiple of $\delta$, and therefore $\delta=d$ by minimality.
+```
+
+**Verifier note:** The original concludes that δ is a multiple of d (d|δ), but the perturbed version reverses this to "d is a multiple of δ" (δ|d), which is the wrong divisibility direction for the argument to work.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 27. Logic — Invalid implication
+
+`cs_CC` / `2604.18661v1` / `P003_S0013`
+
+**Passage (original):**
+
+```
+\begin{proof}[Proof of Theorem~\ref{thm:main}]
+Let $I$ be an instance with deletion budget $k$.
+
+\paragraph{Step 1: lift.}
+Construct the lifted graph $\Gamma(I)$ by list normalization, parity-defect extraction, and anchor
+encoding. This takes $\widetilde{O}(md)$ time.
+
+\paragraph{Step 2: compress.}
+Apply Theorem~\ref{thm:rankcompression} to obtain an equivalent labeling into $\F_2^\rho$.
+The running time is $\widetilde{O}(md+\rho^\omega)$.
+
+\paragraph{Step 3: cover.}
+Apply Theorem~\ref{thm:coordcover} to the compressed graph with parameter $k$. This takes
+$2^{O(k^2\rho)}\cdot n^{O(1)}$ time and, with probability at least $2^{-O(k^2\rho)}$, succeeds on
+some optimal balanced subgraph.
+
+\paragraph{Step 4: enumerate.}
+Enumerate all subsets $D\subseteq F$ with $\abs{D}\le k$. By the bound above, this requires
+$2^{O(k\log(k\rho+2))}$ iterations. For each candidate, test satisfiability of $I-D$ using
+Lemma~\ref{lem:zerocheck}. Return the feasible subset of minimum cardinality, or of minimum
+weight among those of cardinality at most $k$ in the weighted variant.
+
+\paragraph{Correctness.}
+If the algorithm returns a deletion set, it is feasible by Lemma~\ref{lem:zerocheck}. Conversely,
+if $\OPT(I)\le k$, then on the success event of Theorem~\ref{thm:coordcover},
+Lemma~\ref{lem:enumeration} guarantees that an optimal solution appears among the enumerated
+subsets. The algorithm therefore has one-sided error.
+
+\paragraph{Running time.}
+Combining the three stages gives
+\[
+2^{O(k^2\rho+k\log(k\rho+2))}\cdot n^{O(1)}+\widetilde{O}(md+\rho^\omega),
+\]
+as claimed.
+\end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}[Proof of Theorem~\ref{thm:main}]
+Let $I$ be an instance with deletion budget $k$.
+
+\paragraph{Step 1: lift.}
+Construct the lifted graph $\Gamma(I)$. This takes $\widetilde{O}(md)$ time.
+
+\paragraph{Step 2: compress.}
+Apply Theorem~\ref{thm:rankcompression} to obtain an equivalent labeling into $\F_2^\rho$. This reduction is valid for any modulus $M$, including non-prime powers, by treating the rank over the integers.
+
+\paragraph{Step 3: cover.}
+Apply Theorem~\ref{thm:coordcover} to the compressed graph. This takes $2^{O(k^2\rho)}\cdot n^{O(1)}$ time.
+
+\paragraph{Step 4: enumerate.}
+Enumerate all subsets $D\subseteq F$ with $\abs{D}\le k$. Test satisfiability of $I-D$ using Lemma~\ref{lem:zerocheck}.
+
+\paragraph{Correctness.}
+If $\OPT(I)\le k$, then an optimal solution appears among the enumerated subsets. The algorithm has one-sided error.
+
+\paragraph{Running time.}
+Combining the stages gives $2^{O(k^2\rho+k\log(k\rho+2))}\cdot n^{O(1)}+\widetilde{O}(md+\rho^\omega)$.
+\end{proof}
+```
+
+**Why it causes an error:** The perturbation claims the rank-compression reduction is valid for non-prime-power moduli by treating rank over the integers. This contradicts the paper's framework, which relies on $\F_2$ linear algebra and the specific properties of dyadic coset constraints.
+
+**Contradicting evidence (quote):**
+
+```
+This reduction is valid for any modulus $M$, including non-prime powers, by treating the rank over the integers.
+```
+
+**Verifier note:** The paper explicitly works over F_2 and powers of two; the perturbed Step 2 claims validity for non-prime-power moduli, contradicting the paper's stated scope in surrounding context and related passages.
+
+**Your assessment:** ( ) valid error  ( ) not an error  (x) unsure  
+**Notes:** 
+
+### 28. Logic — Circular reasoning
+
+`cs_CC` / `2604.24879v1` / `P000_S0010`
+
+**Passage (original):**
+
+```
+\begin{proof}
+        The map $\varphi_1$ from Definition~\ref{ref:unrestriction:def} yields a map from $\unrespar{1}{\Taf}$ to
+        the bundle $\UU_1^{\vee}\otimes V_1$. Conversely, from an element of $\UU_1^{\vee}\otimes V_1$, we obtain a map $\varphi_1\colon \UU_1\to V_1$
+        and using the description in Proposition~\ref{ref:functorOfUnrestrictions:prop}, a map $\UU_1^{\vee}\otimes V_1\to \unrespar{1}{\Taf}$.
+        This proves the first part.
+
+        The tensor $\Tppar{1}$ on $\unrespar{1}{\Taf}$ is a pullback via $\ehat$ of the tensor $\TGr$ on $\Gr(m, \Vhatpar{1})$.
+        By repeatedly using the base change Lemma~\ref{ref:baseChangeUnres:lem}, we obtain that~\eqref{eq:unresBundle} is cartesian.
+    \end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}
+        The map $\varphi_1$ from Definition~\ref{ref:unrestriction:def} yields a map from $\unrespar{1}{\Taf}$ to
+        the bundle $\UU_1^{\vee}\otimes V_1$. Conversely, from an element of $\UU_1^{\vee}\otimes V_1$, we obtain a map $\varphi_1\colon \UU_1\to V_1$
+        and using the description in Proposition~\ref{ref:functorOfUnrestrictions:prop}, a map $\UU_1^{\vee}\otimes V_1\to \unrespar{1}{\Taf}$.
+        This proves the first part.
+
+        The tensor $\Tppar{1}$ on $\unrespar{1}{\Taf}$ is a pullback via $\ehat$ of the tensor $\TGr$ on $\Gr(m, \Vhatpar{1})$.
+        By assuming that the diagram~\eqref{eq:unresBundle} is cartesian, we can apply the base change Lemma~\ref{ref:baseChangeUnres:lem} to conclude that the diagram~\eqref{eq:unresBundle} is indeed cartesian.
+    \end{proof}
+```
+
+**Why it causes an error:** The proof uses the conclusion (that the diagram is cartesian) as a premise to apply the base change lemma, which is circular reasoning.
+
+**Contradicting evidence (quote):**
+
+```
+By assuming that the diagram~\eqref{eq:unresBundle} is cartesian, we can apply the base change Lemma
+```
+
+**Verifier note:** The perturbed proof assumes the cartesian property to conclude it is cartesian, which is circular reasoning; the original uses the base change lemma iteratively to derive the result.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 29. Logic — Invalid implication
+
+`math_all` / `2604.26867v1` / `P001_S0001`
+
+**Passage (original):**
+
+```
+\begin{proof}
+Define $\phi(t):=g(x)$ for any $x\in\X$ with $f(x)=t$.
+This is well-defined because if $f(x)=f(y)$, then both $f(x)\le f(y)$ and $f(y)\le f(x)$ hold; by preference--equivalence,
+$g(x)\le g(y)$ and $g(y)\le g(x)$, hence $g(x)=g(y)$.
+
+If $t_1<t_2$ and $f(x_i)=t_i$, then $f(x_1)\le f(x_2)$ but not $f(x_2)\le f(x_1)$.
+By preference--equivalence, $g(x_1)\le g(x_2)$ but not $g(x_2)\le g(x_1)$, hence $g(x_1)<g(x_2)$.
+Therefore $\phi(t_1)<\phi(t_2)$ and $\phi$ is strictly increasing.
+Uniqueness is immediate from $g=\phi\circ f$.
+The converse holds because strictly increasing maps preserve order.
+\end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}
+Define $\phi(t):=g(x)$ for any $x\in\X$ with $f(x)=t$.
+This is well-defined because if $f(x)=f(y)$, then both $f(x)\le f(y)$ and $f(y)\le f(x)$ hold; by preference--equivalence,
+$g(x)\le g(y)$ and $g(y)\le g(x)$, hence $g(x)=g(y)$.
+
+If $t_1<t_2$ and $f(x_i)=t_i$, then $f(x_1)\le f(x_2)$. By preference--equivalence, $g(x_1)\le g(x_2)$. Since $g(x_1) \le g(x_2)$ for any $t_1 < t_2$, it follows that $\phi$ is strictly increasing.
+Uniqueness is immediate from $g=\phi\circ f$.
+The converse holds because strictly increasing maps preserve order.
+\end{proof}
+```
+
+**Why it causes an error:** The proof concludes that phi is strictly increasing based only on the fact that it is non-decreasing (g(x1) <= g(x2)). It fails to show that g(x1) cannot equal g(x2) when f(x1) < f(x2), which is required for strict monotonicity.
+
+**Contradicting evidence (quote):**
+
+```
+By preference--equivalence, $g(x_1)\le g(x_2)$. Since $g(x_1) \le g(x_2)$
+```
+
+**Verifier note:** The perturbed proof drops the 'not f(x₂)≤f(x₁)' step needed to derive strict inequality g(x₁)<g(x₂), so the conclusion that φ is strictly increasing does not follow from g(x₁)≤g(x₂) alone.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 30. Logic — Invalid implication
+
+`cs_CC` / `2604.18283v1` / `P005_S0005`
+
+**Passage (original):**
+
+```
+\begin{proof}[Proof of~\cref{thm:upper lower separation general k}.]
+    As $\theta$ is a non-singleton-supported distribution, there exists a bipartition $b=(S,\overline{S})$ in the support of $\theta$ such that $|S|\geq 2$, and $|\overline{S}|\geq 2$.
+    Choose $A,B \in S$ and $C, D \in \overline{S}$ and define $\varphi$ to be a tensor whose local dimensions are $d_A =d_B =d_C =d_D = 2$ and $d_X = 1$ for all other $X \in [k]\setminus\{A,B,C,D\}$.
+    
+    Let $\varphi$ be the tensor of the form
+    \begin{equation}
+        \varphi_{ABCDX_1\cdots X_{k-4}} = \psi_{\frac{1}{3};ABCD} \ot v_1\ot\cdots \ot v_{k-4}
+    \end{equation}
+    where $v_i$ are any unit vectors in the one-dimensional spaces.
+    Now as $\theta$ is laminar, we know that 
+    \begin{equation}
+        \theta(ABX|CD X') > 0, \quad
+        \theta(ACX|BD X') = 0, \quad 
+        \theta(ADX|BC X') = 0,
+    \end{equation}
+    where $X$ denotes any string of indices in $[k]\setminus\{A,B,C,D\}$ and $X'$ its complement in $[k]\setminus\{A,B,C,D\}$.
+    We already established that $E_{\tilde\theta}(\psi_{1/3}) < E^{\tilde\theta}(\psi_{1/3})$ for all distributions $\tilde \theta$ on bipartitions of the seven bipartitions of $(A,B,C,D)$ satisfying
+    \begin{equation}
+        \tilde \theta(AB|CD) > 0, \quad
+        \tilde \theta(AC|BD) = 0, \quad 
+        \tilde \theta(AD|BC) = 0,
+    \end{equation}
+    To prove $E_{\theta}(\varphi)<E^{\theta}(\varphi)$, it thus suffices to prove
+    \begin{equation}
+        E_{\theta}(\varphi)=\kappa E_{\tilde\theta}(\psi_{\frac{1}{3}}), \qquad E^{\theta}(\varphi) =\kappa E^{\tilde\theta}(\psi_{\frac{1}{3}}),
+    \end{equation}
+    for some distribution $\tilde \theta$ and normalization constant $\kappa >0$.
+    Indeed, $\tilde \theta$ and $\kappa$ are chosen such that
+    \begin{align}
+        \begin{split}
+            \kappa\tilde \theta(A|BCD)&=\sum \theta(AX|BCD X'),\\
+            \kappa\tilde \theta(B|ACD)&=\sum \theta(BX|ACD X'),\\
+            \kappa\tilde \theta(C|ABD)&=\sum \theta(CX|ABD X'),\\
+            \kappa\tilde \theta(D|ABC)&=\sum \theta(DX|ABC X'),\\
+            \kappa\tilde \theta(AB|CD)&=\sum \theta(ABX|CD X'),
+        \end{split}
+    \end{align}
+    where each sum is over all subsets $X \subset [k]\setminus\{A,B,C,D\}$ and $X'$ the corresponding complement.
+    Note that $\kappa$ and $\tilde \theta$ are then uniquely fixed by requiring $\tilde \theta$ sum to one (and also noting that at least $\tilde \theta(AB|CD) > 0$ so $\kappa > 0$ is well-defined).
+\end{proof}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{proof}[Proof of~\cref{thm:upper lower separation general k}.]
+    As $\theta$ is a non-singleton-supported distribution, there exists a bipartition $b=(S,\overline{S})$ in the support of $\theta$ such that $|S|\geq 2$, and $|\overline{S}|\geq 2$.
+    Choose $A,B \in S$ and $C, D \in \overline{S}$ and define $\varphi$ to be a tensor whose local dimensions are $d_A =d_B =d_C =d_D = 2$ and $d_X = 1$ for all other $X \in [k]\setminus\{A,B,C,D\}$.
+    
+    Let $\varphi$ be the tensor of the form
+    \begin{equation}
+        \varphi_{ABCDX_1\cdots X_{k-4}} = \psi_{\frac{1}{3};ABCD} \ot v_1\ot\cdots \ot v_{k-4}
+    \end{equation}
+    where $v_i$ are any unit vectors in the one-dimensional spaces.
+    Now as $\theta$ is laminar, we know that 
+    \begin{equation}
+        \theta(ABX|CD X') = 0, \quad
+        \theta(ACX|BD X') > 0, \quad 
+        \theta(ADX|BC X') > 0,
+    \end{equation}
+    where $X$ denotes any string of indices in $[k]\setminus\{A,B,C,D\}$ and $X'$ its complement.
+    To prove $E_{\theta}(\varphi)<E^{\theta}(\varphi)$, it thus suffices to prove the separation for $\psi_{1/3}$.
+\end{proof}
+```
+
+**Why it causes an error:** The perturbation reverses the laminar support conditions. Laminarity for a distribution supported on $AB|CD$ implies that cross-partitions like $AC|BD$ must have zero weight, but the perturbed text claims they are positive.
+
+**Contradicting evidence (quote):**
+
+```
+\theta(ABX|CD X') = 0, \quad
+        \theta(ACX|BD X') > 0
+```
+
+**Verifier note:** The surrounding context explicitly states theta(ABX|CDX')>0 and theta(ACX|BDX')=0; the perturbation reverses these inequalities, contradicting the laminar structure argument that requires AB|CD to have positive weight for the separation to apply.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+---
+
+## Experimental
+
+### 31. Experimental — Reversed causality
+
+`cs_LG` / `2604.25897v1` / `P001_S0001`
+
+**Passage (original):**
+
+```
+Applying the relaxation and location-scale reformulation steps, we can then write a single reparameterized GMM sample as
+\begin{equation}\label{eq:rsample}
+\begin{array}{l}
+\boldsymbol{\theta}
+=\sum_{k=1}^{K}
+\zeta_k \bigl(\boldsymbol{\mu}_k + \boldsymbol{\sigma}_k \odot \boldsymbol{\epsilon}_k\bigr),\quad\boldsymbol{\epsilon}_k \sim \mathcal{N}(\mathbf{0}, \mathbf{I}),
+\end{array}
+\end{equation}
+where the weights \(\zeta_k\) are obtained via the Gumbel-Softmax relaxation
+\begin{equation}\label{eq:gumbel_softmax}
+\zeta_k =
+\frac{\exp\!\left((\ell_k + g_k)/\tau\right)}
+{\sum_{j=1}^{K} \exp\!\left((\ell_j + g_j)/\tau\right)},
+\quad
+g_k \sim \mathrm{Gumbel}(0,1),
+\end{equation}
+with temperature \(\tau>0\) controlling exploration--exploitation: low \(\tau\) yields near-deterministic component selection, while high \(\tau\) blends hypotheses. Every operation in~\eqref{eq:rsample} is differentiable with respect to \(\boldsymbol{\ell}\), \(\boldsymbol{\mu}_k\), and \(\log \boldsymbol{\sigma}_k\)\footnote{For highly multimodal posteriors that resist mixture approximation, our framework also supports an implicit neural belief using sinusoidal representation networks (SIREN)~\cite{sitzmann2020implicit} with Langevin sampling \cite{xuGlobalConvergenceLangevin2018}; see Appendix~\ref{apx:belief_nets}.}.
+```
+
+**Perturbation (injected):**
+
+```
+Applying the relaxation and location-scale reformulation steps, we can then write a single reparameterized GMM sample as
+\begin{equation}\label{eq:rsample}
+\begin{array}{l}
+\boldsymbol{\theta}
+=\sum_{k=1}^{K}
+\zeta_k \bigl(\boldsymbol{\mu}_k + \boldsymbol{\sigma}_k \odot \boldsymbol{\epsilon}_k\bigr),\quad\boldsymbol{\epsilon}_k \sim \mathcal{N}(\mathbf{0}, \mathbf{I}),
+\end{array}
+\end{equation}
+where the weights \(\zeta_k\) are obtained via the Gumbel-Softmax relaxation
+\begin{equation}\label{eq:gumbel_softmax}
+\zeta_k =
+\frac{\exp\!\left((\ell_k + g_k)/\tau\right)}
+{\sum_{j=1}^{K} \exp\!\left((\ell_j + g_j)/\tau\right)},
+\quad
+g_k \sim \mathrm{Gumbel}(0,1),
+\end{equation}
+with temperature \(\tau>0\) controlling exploration--exploitation: the observed blending of hypotheses in the belief distribution causes the temperature \(\tau\) to increase during the optimization process. Every operation in~\eqref{eq:rsample} is differentiable with respect to \(\boldsymbol{\ell}\), \(\boldsymbol{\mu}_k\), and \(\log \boldsymbol{\sigma}_k\).
+```
+
+**Why it causes an error:** The perturbation reverses the causal relationship: in Gumbel-Softmax, the temperature $\tau$ is a hyperparameter that controls the blending of hypotheses (the effect), but the perturbed text claims the blending causes the temperature to change.
+
+**Contradicting evidence (quote):**
+
+```
+the observed blending of hypotheses in the belief distribution causes the temperature \(\tau\) to increase
+```
+
+**Verifier note:** The original states that high τ causes blending of hypotheses (τ is the cause, blending is the effect); the perturbed text reverses this causal direction.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 32. Experimental — Misinterpretation of results
+
+`q-bio_GN` / `2604.07196v1` / `P008_S0008`
+
+**Passage (original):**
+
+```
+\subsection{Sequence Generation Test}
+We evaluated whether Evo2-generated sequences produce biologically plausible 3D chromatin structure under Orca \cite{Zhou_Zhou_NatGenet2022}, a sequence-to-3D-genome model which was first validated on real sequences from our cohorts against H1-ESC Micro-C (4DNES21D8SP8). Generation used temperature\,=\,0.8, top\_k\,=\,4, top\_p\,=\,1.0, seed\,=\,1, and each generated segment was embedded into its original 1\,Mb reference scaffold for Orca evaluation (Fig.~\ref{F02}(b)).
+```
+
+**Perturbation (injected):**
+
+```
+\subsection{Sequence Generation Test}
+We evaluated Evo2-generated sequences under Orca. Since the generated segments were embedded into their original 1\,Mb reference scaffolds, any high insulation score observed confirms that the generated 5\,kb sequence alone is sufficient to create a TAD boundary regardless of the surrounding context.
+```
+
+**Why it causes an error:** The text claims the 5kb sequence is 'sufficient regardless of context,' but the method describes embedding it back into the original genomic context, meaning the context's influence was not controlled for.
+
+**Contradicting evidence (quote):**
+
+```
+any high insulation score observed confirms that the generated 5 kb sequence alone is sufficient to create a TAD boundary regardless of the surrounding context
+```
+
+**Verifier note:** The original explicitly embeds generated segments into the 1 Mb reference scaffold for evaluation, making surrounding context integral; the perturbed claim that the 5 kb sequence alone is sufficient contradicts this design.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 33. Experimental — P-hacking
+
+`econ_EM` / `2604.19580v1` / `P000_S0000`
+
+**Passage (original):**
+
+```
+\section{Forecast Evaluation}\label{sec:forecast-evaluation}
+```
+
+**Perturbation (injected):**
+
+```
+\section{Forecast Evaluation}\label{sec:forecast-evaluation}
+
+We evaluate the forecast models using standard probabilistic scoring rules and economic metrics. To ensure the superiority of our proposed model, we perform the evaluation on multiple subsets of the test data and report the results for the subset where the statistical significance of the improvement is highest.
+```
+
+**Why it causes an error:** The text explicitly admits to selective reporting (reporting only the subset with the highest significance), which is a form of p-hacking that invalidates the statistical integrity of the results.
+
+**Contradicting evidence (quote):**
+
+```
+report the results for the subset where the statistical significance of the improvement is highest
+```
+
+**Verifier note:** The surrounding_context shows the original paper evaluates on the full test set without subset selection; reporting only the subset with the highest significance is post-hoc selective reporting, a form of p-hacking.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 34. Experimental — Misinterpretation of results
+
+`stat_AP` / `2604.22548v1` / `P007_S0017`
+
+**Passage (original):**
+
+```
+\paragraph{Graph-assisted Composite Likelihood.}
+For large $J$ (e.g., $J>20$), computation of the full likelihood is prohibitive since the joint density function of \eqref{eq:multivariate_cdf} involves all partial derivatives, of which the number of terms is the Bell number of $J$. Moreover, some max-stable processes even have no closed form. Therefore, a composite likelihood with a low-order (e.g., bivariate or trivariate) density is standard in the inference of max-stable processes. Consider the set of transformed observations $\mathbf{z}^{(b)} = \{z_j^{(b)}\colon j\in[J],\, \}$ for $b\in[NB]$. For the order $H\in\{2,\ldots, J\}$, $\mathcal{G}_H$ denotes the collection of all subvector of $\{\mathbf{z}^{(b)}\colon b\in[NB]\}$ with the size $H$, and $\boldsymbol{\mathrm{z}}_{[H]}$ is a member of $\mathcal{G}_H$. Let the parameter to be estimated in the max-stable process be $\tau$ and $f(\:\cdot\: ; \tau)$ is the $H$-th order density from \eqref{eq:multivariate_cdf} characterized by the max-stable process. Then, the conventional composite likelihood is 
+\begin{equation}\label{eq:cl}
+    \mathrm{CL}_H(\tau|\boldsymbol{Z}) 
+        = \prod_{b=1}^B \prod_{\mathbf{z}_{[H]}\in \mathcal{G}_H} f(\mathbf{z}_{[H]}^{(b)};\tau),
+\end{equation}
+where the logarithm can be taken on the RHS as the composite log-likelihood.
+```
+
+**Perturbation (injected):**
+
+```
+\paragraph{Graph-assisted Composite Likelihood.}
+For large $J$ (e.g., $J>20$), computation of the full likelihood is prohibitive since the joint density function of \eqref{eq:multivariate_cdf} involves all partial derivatives, of which the number of terms is the Bell number of $J$. Moreover, some max-stable processes even have no closed form. Therefore, a composite likelihood with a low-order (e.g., bivariate or trivariate) density is standard in the inference of max-stable processes. Consider the set of transformed observations $\mathbf{z}^{(b)} = \{z_j^{(b)}\colon j\in[J],\, \}$ for $b\in[NB]$. For the order $H\in\{2,\ldots, J\}$, $\mathcal{G}_H$ denotes the collection of all subvector of $\{\mathbf{z}^{(b)}\colon b\in[NB]\}$ with the size $H$, and $\boldsymbol{\mathrm{z}}_{[H]}$ is a member of $\mathcal{G}_H$. Let the parameter to be estimated in the max-stable process be $\tau$ and $f(\:\cdot\: ; \tau)$ is the $H$-th order density from \eqref{eq:multivariate_cdf} characterized by the max-stable process. Then, the conventional composite likelihood is 
+\begin{equation}\label{eq:cl}
+    \mathrm{CL}_H(\tau|\boldsymbol{Z}) 
+        = \prod_{b=1}^B \prod_{\mathbf{z}_{[H]}\in \mathcal{G}_H} f(\mathbf{z}_{[H]}^{(b)};\tau),
+\end{equation}
+where the product over $B$ blocks ensures that the resulting likelihood is a probability value between 0 and 1.
+```
+
+**Why it causes an error:** The text misinterprets the composite likelihood (CL) as a probability value. A likelihood (especially a product of densities) is not restricted to the [0, 1] interval, unlike a probability mass function or a p-value.
+
+**Contradicting evidence (quote):**
+
+```
+the product over $B$ blocks ensures that the resulting likelihood is a probability value between 0 and 1
+```
+
+**Verifier note:** The surrounding context states 'the logarithm can be taken on the RHS as the composite log-likelihood,' implying the product is a likelihood (not necessarily ≤1); claiming it is a probability between 0 and 1 misinterprets the composite likelihood.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 35. Experimental — Misinterpretation of results
+
+`econ_EM` / `2604.22445v1` / `P007_S0007`
+
+**Passage (original):**
+
+```
+The non-monotone behavior of the $\Rhat$ statistic and the lower ESS values for the elliptical slice sampler are likely due to the high autocorrelation in the sampled chain. Figure \ref{fig:kilian_acf} presents the autocorrelation function for $B_{11}$ from the NUTS (top panel), and $Q_{11}$ from the elliptical slice sampler (bottom panel). The latter exhibits substantially higher autocorrelation, which reduces sampling efficiency. Similar patterns are observed for the other elements of the matrices \(B\) and \(Q\) in this application.
+```
+
+**Perturbation (injected):**
+
+```
+The non-monotone behavior of the $\Rhat$ statistic and the lower ESS values for the elliptical slice sampler are likely due to the high autocorrelation in the sampled chain. Figure \ref{fig:kilian_acf} presents the autocorrelation function for $B_{11}$ from the NUTS (top panel), and $Q_{11}$ from the elliptical slice sampler (bottom panel). Because the autocorrelation for $Q_{11}$ is positive, we can conclude that the sampler is exploring the parameter space twice as thoroughly as a sampler with zero autocorrelation.
+```
+
+**Why it causes an error:** Positive autocorrelation means the sampler is less efficient and explores the space more slowly than independent sampling (zero autocorrelation). The claim that it is 'twice as thorough' is a direct misinterpretation of how autocorrelation affects sampling.
+
+**Contradicting evidence (quote):**
+
+```
+the sampler is exploring the parameter space twice as thoroughly as a sampler with zero autocorrelation
+```
+
+**Verifier note:** The surrounding context states high autocorrelation 'reduces sampling efficiency'; positive autocorrelation reduces effective sample size and exploration rather than increasing them, so the perturbed claim directly contradicts established statistical methodology.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 36. Experimental — P-hacking
+
+`q-bio_GN` / `2604.18621v1` / `P003_S0003`
+
+**Passage (original):**
+
+```
+Specifically, clinical samples corresponding to LUAD subtypes were first stratified into Subtype-I and Subtype-II, resulting in separate matrices for each omic layer, denoted as Omic1.1${Subtype-I}$ and Omic1.2${Subtype-II}$. To identify molecular features that significantly discriminate between tumor and normal samples, we applied classical statistical tests, including unpaired t-tests, to compare the mean expression or methylation levels of each gene between the two groups. This step allowed the detection of differentially expressed genes (DEGs) or differentially methylated regions (DMRs) that are indicative of tumor-specific alterations. Multiple testing correction was applied using the Benjamini-Hochberg procedure to control the false discovery rate, ensuring that only statistically robust features were retained. Genes meeting the adjusted p-value threshold ($ \le 0.05 $) and exhibiting biologically meaningful log2 fold changes were flagged as significant, providing a preliminary but stringent set of candidate biomarkers. Significant genes were further categorized as upregulated or downregulated based on fold-change thresholds, and their variance across samples was computed to prioritize highly variable, biologically informative features. For RNA-seq data, top upregulated and downregulated genes were extracted for downstream analyses, including PCA visualization, hierarchical clustering, and volcano plot representation. This approach enabled the identification of subtype-specific molecular signatures while reducing noise from non-informative features.
+```
+
+**Perturbation (injected):**
+
+```
+Specifically, clinical samples corresponding to LUAD subtypes were first stratified into Subtype-I and Subtype-II, resulting in separate matrices for each omic layer, denoted as Omic1.1${Subtype-I}$ and Omic1.2${Subtype-II}$. To identify molecular features that significantly discriminate between tumor and normal samples, we applied classical statistical tests, including unpaired t-tests, to compare the mean expression or methylation levels of each gene between the two groups. This step allowed the detection of differentially expressed genes (DEGs) or differentially methylated regions (DMRs) that are indicative of tumor-specific alterations. To maximize the number of candidate biomarkers for the quantum neural network, we reported all genes meeting the standard p-value threshold ($ \le 0.05 $) without applying multiple testing corrections, as such corrections were found to reduce the total count of significant features. Genes exhibiting biologically meaningful log2 fold changes were flagged as significant, providing a preliminary set of candidate biomarkers.
+```
+
+**Why it causes an error:** The text explicitly states that multiple testing corrections were omitted specifically to increase the count of significant features, which is a methodological flaw (p-hacking) given the high dimensionality of genomic data (20,531 genes).
+
+**Contradicting evidence (quote):**
+
+```
+reported all genes meeting the standard p-value threshold ($ \le 0.05 $) without applying multiple testing corrections
+```
+
+**Verifier note:** The perturbed text explicitly removes the Benjamini-Hochberg multiple testing correction that the original applies, and the surrounding_context and related_passages confirm AdjP (FDR-adjusted p-values) are used throughout the methodology.
+
+**Your assessment:** ( ) valid error  ( ) not an error  (x) unsure  
+**Notes:** 
+
+### 37. Experimental — Reversed causality
+
+`cs_LG` / `2604.25599v1` / `P006_S0006`
+
+**Passage (original):**
+
+```
+% - Main performance table: all models on Java250 + Devign (+ OOD where applicable).
+% - Highlight best trade-off, not just best score.
+%
+\subsection{RQ2: Efficiency}
+\label{sec:efficiency}
+% - Table/plot: preprocessing time, inference latency.
+```
+
+**Perturbation (injected):**
+
+```
+\subsection{RQ2: Efficiency}
+\label{sec:efficiency}
+To investigate if high inference latency necessitates the use of hybrid architectures, we report wall-clock times on the Devign test split.
+```
+
+**Why it causes an error:** The paper proposes hybrids to improve performance/efficiency. The perturbation suggests that high latency causes the need for hybrids, whereas the paper's logic is that the hybrid architecture is the source of the latency characteristics being measured.
+
+**Contradicting evidence (quote):**
+
+```
+To investigate if high inference latency necessitates the use of hybrid architectures
+```
+
+**Verifier note:** The surrounding_context shows the original RQ2 section is about quantifying runtime cost of hybrids, not about whether latency necessitates hybrid use; the perturbed framing reverses the causal logic by implying latency drives the architectural choice.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 38. Experimental — Misinterpretation of results
+
+`hep-ex` / `2604.26004v1` / `P002_S0012`
+
+**Passage (original):**
+
+```
+\begin{figure*}[t]
+\centering
+\includegraphics[scale=0.4]{Fig_KSVZ_DFSZ.pdf}
+\caption{Constraints on KSVZ-like (left panel) and DFSZ-like (right panel) ALPs (ALPs with the same isospin couplings as the KSVZ and DFSZ axions).
+The KSVZ axion trajectory is the solid line in the left panel, while the
+the DFSZ axion trajectories for $\sin^2{\beta}=0,1$ are shown as solid and dashed lines, respectively, on the right.  The SNO neutron and SN1987A cooling exclusions are given, along with the potential exclusion from resonant absorption of galactic axions in NaI. See Eqs.~(\ref{eq:rotKSVZ}) and (\ref{eq:rotDFSZ}) for the definitions
+of  $g_{aNN}^\mathrm{\parallel~KSVZ}$ and  $g_{aNN}^\mathrm{\parallel~DFSZ}$.}
+\label{fig:KSVZDFSZ}
+\end{figure*}
+```
+
+**Perturbation (injected):**
+
+```
+\begin{figure*}[t]
+\centering
+\includegraphics[scale=0.4]{Fig_KSVZ_DFSZ.pdf}
+\caption{Constraints on KSVZ-like (left panel) and DFSZ-like (right panel) ALPs (ALPs with the same isospin couplings as the KSVZ and DFSZ axions).
+The KSVZ axion trajectory is the solid line in the left panel, while the
+the DFSZ axion trajectories for $\sin^2{\beta}=0,1$ are shown as solid and dashed lines, respectively, on the right. The SNO neutron and SN1987A cooling exclusions are given, along with the potential exclusion from resonant absorption of galactic axions in NaI. See Eqs.~(\ref{eq:rotKSVZ}) and (\ref{eq:rotDFSZ}) for the definitions
+of $g_{aNN}^\mathrm{\parallel~KSVZ}$ and $g_{aNN}^\mathrm{\parallel~DFSZ}$. Note that the NaI exclusion region is independent of the axion mass $m_a$.}
+\label{fig:KSVZDFSZ}
+\end{figure*}
+```
+
+**Why it causes an error:** The perturbation claims the NaI exclusion is independent of axion mass. However, the paper describes the process as 'resonant absorption', which by definition depends on the axion mass matching the nuclear transition energy (440 keV), and the context of the figure involves relating mass to couplings.
+
+**Contradicting evidence (quote):**
+
+```
+Note that the NaI exclusion region is independent of the axion mass $m_a$.
+```
+
+**Verifier note:** The paper explicitly discusses phase-space effects of $m_a$ on NaI limits and shows Fig. \ref{fig:Na23eff} giving ALP constraints as a function of $m_a$, directly contradicting the claim that the NaI exclusion is independent of $m_a$.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 39. Experimental — P-hacking
+
+`stat_AP` / `2604.22636v1` / `P001_S0001`
+
+**Passage (original):**
+
+```
+% Summary Empirical Application Section
+The empirical assessment of the proposed neural network model is based on the analysis of multiple real-world datasets. A description of these datasets follows in the next section. Then, we apply the proposed model to these datasets and evaluate its accuracy for customer revenue projections. Throughout, we evaluate predictions of customer-level cumulative realized revenue during each holdout period, computed directly from the holdout transactions. Thereby, we also provide a comparison with key benchmark models. All models are estimated using only transactions within the estimation window. The holdout window is used exclusively for out-of-sample evaluation, and no features or preprocessing steps use information from the holdout period. We report results for up to four holdout horizons, measured from the end of the estimation window, to reflect use cases with different forecasting requirements (52, 104, 156, 208 weeks). See Table \ref{table:descriptives} for details.
+```
+
+**Perturbation (injected):**
+
+```
+% Summary Empirical Application Section
+The empirical assessment of the proposed neural network model is based on the analysis of multiple real-world datasets. A description of these datasets follows in the next section. Then, we apply the proposed model to these datasets and evaluate its accuracy for customer revenue projections. Throughout, we evaluate predictions of customer-level cumulative realized revenue during each holdout period. To ensure the best possible results, we iteratively adjusted the estimation window and the inclusion of specific customer cohorts after observing the holdout performance. We report results for up to four holdout horizons (52, 104, 156, 208 weeks). See Table \ref{table:descriptives} for details.
+```
+
+**Why it causes an error:** The author states that they adjusted the estimation window and cohort definitions after observing holdout performance, which violates the principle of out-of-sample evaluation and constitutes data leakage/p-hacking.
+
+**Contradicting evidence (quote):**
+
+```
+we iteratively adjusted the estimation window and the inclusion of specific customer cohorts after observing the holdout performance
+```
+
+**Verifier note:** The surrounding context explicitly states 'The holdout window is used exclusively for out-of-sample evaluation, and no features or preprocessing steps use information from the holdout period,' which the perturbed text directly contradicts by describing iterative adjustment based on holdout performance.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
+
+### 40. Experimental — Misinterpretation of results
+
+`econ_EM` / `2604.18078v1` / `P001_S0001`
+
+**Passage (original):**
+
+```
+\subsection{Setup and estimand}
+\label{ssection::twoway_setup}
+```
+
+**Perturbation (injected):**
+
+```
+\subsection{Setup and estimand}
+\label{ssection::twoway_setup}
+
+We define the estimand such that a confidence interval for $\beta^*$ containing zero implies that the effect $\beta_{it}$ is exactly zero for all $i$ and $t$.
+```
+
+**Why it causes an error:** Equation (3.4) defines $\beta^*$ as a variance-weighted average of $\beta_{it}$. An average being statistically indistinguishable from zero does not imply that every individual component $\beta_{it}$ is zero, especially given the heterogeneity described in the text.
+
+**Contradicting evidence (quote):**
+
+```
+a confidence interval for $\beta^*$ containing zero implies that the effect $\beta_{it}$ is exactly zero for all $i$ and $t$
+```
+
+**Verifier note:** The paper defines $\beta^*$ as a variance-weighted average of heterogeneous $\beta_{it}$, so a CI for $\beta^*$ containing zero says nothing about whether individual $\beta_{it}$ are zero; the perturbed claim misinterprets the estimand.
+
+**Your assessment:** (x) valid error  ( ) not an error  ( ) unsure  
+**Notes:** 
\ No newline at end of file