diff --git a/benchmarks/conference_study/analyses/ci_auc.py b/benchmarks/conference_study/analyses/ci_auc.py
new file mode 100644
index 0000000..2fd67a8
--- /dev/null
+++ b/benchmarks/conference_study/analyses/ci_auc.py
@@ -0,0 +1,243 @@
+"""Cluster-bootstrap-over-papers 95% CIs for the pairwise-accuracy (Mann-Whitney
+AUC) cells in the conference study.
+
+Each quality proxy splits the papers into two sides: a high-quality group and a
+low-quality group. The AUC is the pairwise accuracy of that split, the
+probability that the system puts more comments on a randomly drawn low-quality
+paper than on a randomly drawn high-quality one (0.5 means no separation).
+
+Resampling unit = paper, stratified by (proxy, side): within each proxy we
+resample the high-side papers and the low-side papers with replacement and take
+2.5/97.5 percentiles. No significance tests.
+
+Paper sets, result dirs, and the tables to print are defined in a JSON config
+(--config, defaults to ci_auc_tables.json). The config and the result JSONs and
+manifests it points at are kept locally (gitignored, large); the math is covered
+by tests/test_ci_auc.py on in-memory data. Three table kinds, one per paper
+format: "comment_volume" (Table 1, mean comments + delta + overall AUC),
+"by_proxy" (Table 2, accuracy per quality proxy), and "by_severity" (Table 9,
+accuracy per severity tier).
+"""
+import sys
+from pathlib import Path
+from collections import defaultdict
+import numpy as np
+
+HERE = Path(__file__).resolve().parent
+from compute_auc import load_manifest, load_counts, cell_summary, auc_from  # noqa: E402
+
+B = 5000
+RNG = np.random.default_rng(42)
+
+
+def by_proxy_totals(counts_for_cell, slug_to_mem):
+    """Group total comment counts as proxy_id -> {'high': [...], 'low': [...]}."""
+    grouped = defaultdict(lambda: {"high": [], "low": []})
+    for slug, rec in counts_for_cell.items():
+        # The manifest stores the quality-proxy id under the key "pair".
+        for membership in slug_to_mem.get(slug, []):
+            grouped[membership["pair"]][membership["side"]].append(rec["total"])
+    return grouped
+
+
+def cell_aucs(by_proxy):
+    """Point estimates: (AUC pooled over every proxy's pairs, {proxy_id: AUC})."""
+    per_proxy = {}
+    pooled_hits = pooled_pairs = 0.0
+    for proxy_id, sides in by_proxy.items():
+        per_proxy[proxy_id], hits, pairs = auc_from(sides["high"], sides["low"])
+        pooled_hits, pooled_pairs = pooled_hits + hits, pooled_pairs + pairs
+    overall = pooled_hits / pooled_pairs if pooled_pairs else np.nan
+    return overall, per_proxy
+
+
+def bootstrap(by_proxy):
+    """Cluster-bootstrap 95% CIs for one cell's overall and per-proxy AUC.
+
+    Runs B draws. In each draw, every proxy with papers on both sides has its
+    high-side and low-side papers resampled with replacement, each side on its
+    own. The proxy's AUC on the resample is recorded, and its hits and pairs
+    are pooled across proxies to give that draw's overall AUC. Returns
+    ((overall_lo, overall_hi), {proxy_id: (lo, hi)}): the 2.5/97.5 percentiles.
+    """
+    def percentile_ci(draws):
+        # A proxy with papers on only one side never records a draw; give it
+        # nan bounds, as the point-estimate paths do, instead of failing.
+        return tuple(np.percentile(draws, [2.5, 97.5])) if draws else (float("nan"), float("nan"))
+
+    proxy_ids = sorted(by_proxy)
+    sides = {p: (np.asarray(by_proxy[p]["high"]), np.asarray(by_proxy[p]["low"])) for p in proxy_ids}
+    usable = [p for p in proxy_ids if sides[p][0].size and sides[p][1].size]
+    overall_draws, proxy_draws = [], {p: [] for p in proxy_ids}
+    for _ in range(B):
+        pooled_hits = pooled_pairs = 0.0
+        for p in usable:
+            high, low = sides[p]
+            # High side is drawn before low, keeping the shared RNG stream order.
+            high_draw = high[RNG.integers(0, high.size, high.size)]
+            low_draw = low[RNG.integers(0, low.size, low.size)]
+            auc, hits, pairs = auc_from(high_draw, low_draw)
+            proxy_draws[p].append(auc)
+            pooled_hits += hits; pooled_pairs += pairs
+        overall_draws.append(pooled_hits / pooled_pairs if pooled_pairs else np.nan)
+    return percentile_ci(overall_draws), {p: percentile_ci(proxy_draws[p]) for p in proxy_ids}
+
+
+PROXY = {1: "Community", 2: "Conference", 3: "Reviewer", 4: "Composite"}
+
+
+# ---- per-severity-tier AUCs (appendix tier tables) ----
+
+def by_proxy_recs(counts_for_cell, slug_to_mem):
+    """Group whole per-paper count recs as proxy_id -> {'high': [rec], 'low': [rec]}.
+
+    A rec is one paper's comment counts from load_counts, keyed 'total',
+    'major', 'moderate' and 'minor'. The whole rec is kept, not just its
+    total, so that AUCs can also be computed per severity tier."""
+    grouped = defaultdict(lambda: {"high": [], "low": []})
+    for slug, rec in counts_for_cell.items():
+        for membership in slug_to_mem.get(slug, []):
+            grouped[membership["pair"]][membership["side"]].append(rec)
+    return grouped
+
+
+def bootstrap_tiers(by_proxy, tiers):
+    """Cluster-bootstrap 95% CI of the pooled AUC per tier. Returns {tier: (lo, hi)}.
+
+    'tiers' may include 'total' (the overall comment count) alongside the
+    severity tiers. One paper resample per (proxy, side) is shared across all
+    tiers within a draw, so the tiers stay correlated; proxies with papers on
+    only one side are skipped. Supplies the CIs for the volume and tier tables.
+    """
+    # Per-tier count arrays are the same in every draw: build them once, up front.
+    strata = []
+    for proxy_id in sorted(by_proxy):
+        high, low = by_proxy[proxy_id]["high"], by_proxy[proxy_id]["low"]
+        if high and low:
+            cols = {t: (np.asarray([r[t] for r in high]), np.asarray([r[t] for r in low])) for t in tiers}
+            strata.append((len(high), len(low), cols))
+    draws = {t: [] for t in tiers}
+    for _ in range(B):
+        hits_sum = dict.fromkeys(tiers, 0.0); pairs_sum = dict.fromkeys(tiers, 0.0)
+        for n_high, n_low, cols in strata:
+            high_idx, low_idx = RNG.integers(0, n_high, n_high), RNG.integers(0, n_low, n_low)
+            for t, (high_col, low_col) in cols.items():
+                _, hits, pairs = auc_from(high_col[high_idx], low_col[low_idx])
+                hits_sum[t] += hits; pairs_sum[t] += pairs
+        for t in tiers:
+            draws[t].append(hits_sum[t] / pairs_sum[t] if pairs_sum[t] else np.nan)
+    return {t: tuple(np.percentile(draws[t], [2.5, 97.5])) for t in tiers}
+
+
+def auc_cell(point, lo_hi):
+    """Render an AUC and its CI bounds as 'point [lo, hi]', two decimals each."""
+    lower, upper = lo_hi
+    return "{:.2f} [{:.2f}, {:.2f}]".format(point, lower, upper)
+
+
+def _labels_and_width(cells):
+    """Build 'method__model' row labels and a first-column width fitting them and the header."""
+    labels = [f"{method}__{model}" for method, model in cells]
+    return labels, 2 + max(map(len, ["method__model", *labels]))
+
+
+def run_comment_volume(manifest, dirs, cells):
+    """Table 1 (model-aggregate): mean comments on the high/low-quality groups, their
+    difference (delta, % increase), and overall AUC with CI; NO DATA for unusable cells."""
+    slug_to_mem = load_manifest(manifest)
+    counts = load_counts(dirs, set(slug_to_mem))
+    labels, w = _labels_and_width(cells)
+    print(f"{'method__model':<{w}}{'c_high':>8}{'c_low':>8}{'delta':>8}{'%inc':>8}   {'Overall':<18}".rstrip())
+    for (method, model), label in zip(cells, labels):
+        cc = counts.get((method, model))
+        s = cell_summary(cc, slug_to_mem) if cc else {}  # {} too when no proxy has both sides
+        if not s:
+            print(f"{label:<{w}}NO DATA"); continue
+        overall_ci = bootstrap_tiers(by_proxy_recs(cc, slug_to_mem), ["total"])["total"]
+        row = (f"{label:<{w}}{s['c_high']:>8.2f}{s['c_low']:>8.2f}{s['delta']:>+8.2f}"
+               f"{s['pct_increase']:>7.1f}%   {auc_cell(s['auc_overall'], overall_ci)}")
+        print(row.rstrip())
+
+
+def run_by_proxy(manifest, dirs, cells):
+    """Table 2 (system-deltas): mean comments and per-quality-proxy AUC plus overall,
+    with CIs, one (method, model) per row; a cell without usable data prints NO DATA."""
+    slug_to_mem = load_manifest(manifest)
+    counts = load_counts(dirs, set(slug_to_mem))
+    labels, w = _labels_and_width(cells)
+    proxy_ids = sorted(PROXY)  # Community / Conference / Reviewer / Composite
+    head = (f"{'method__model':<{w}}{'c_bar':>8}   "
+            + "".join(f"{PROXY[p]:<18}" for p in proxy_ids) + f"{'Overall':<18}")
+    print(head.rstrip())
+    for (method, model), label in zip(cells, labels):
+        cc = counts.get((method, model))
+        s = cell_summary(cc, slug_to_mem) if cc else {}  # {} too when no proxy has both sides
+        if not s:
+            print(f"{label:<{w}}NO DATA"); continue
+        by_proxy = by_proxy_totals(cc, slug_to_mem)
+        overall, per = cell_aucs(by_proxy)
+        (olo, ohi), perci = bootstrap(by_proxy)
+        c_bar = (s["c_high"] + s["c_low"]) / 2
+        row = f"{label:<{w}}{c_bar:>8.2f}   "
+        row += "".join(f"{(auc_cell(per[p], perci[p]) if p in per else '-'):<18}" for p in proxy_ids)
+        row += f"{auc_cell(overall, (olo, ohi)):<18}"
+        print(row.rstrip())
+
+
+def run_by_severity(manifest, dirs, cells):
+    """Table 9 (severity-aggregate): mean comments on the low-quality group and per-tier AUC
+    (Major / Moderate / Minor) plus overall, with CIs; NO DATA for cells without usable data."""
+    slug_to_mem = load_manifest(manifest)
+    counts = load_counts(dirs, set(slug_to_mem))
+    labels, w = _labels_and_width(cells)
+    # (column header, cell_summary point key, bootstrap tier key)
+    TIER_COLS = [("Major", "auc_major", "major"), ("Moderate", "auc_moderate", "moderate"),
+                 ("Minor", "auc_minor", "minor"), ("Overall", "auc_overall", "total")]
+    print((f"{'method__model':<{w}}{'c_low':>8}   "
+           + "".join(f"{name:<18}" for name, _, _ in TIER_COLS)).rstrip())
+    for (method, model), label in zip(cells, labels):
+        cc = counts.get((method, model))
+        s = cell_summary(cc, slug_to_mem) if cc else {}  # {} too when no proxy has both sides
+        if not s:
+            print(f"{label:<{w}}NO DATA"); continue
+        ci = bootstrap_tiers(by_proxy_recs(cc, slug_to_mem), [t for _, _, t in TIER_COLS])
+        row = f"{label:<{w}}{s['c_low']:>8.2f}   "
+        row += "".join(f"{auc_cell(s[pkey], ci[tkey]):<18}" for _, pkey, tkey in TIER_COLS)
+        print(row.rstrip())
+
+
+# Table kind -> the report function that prints it. cells come from the config.
+# Each kind maps to one paper-table format: comment_volume = Table 1
+# (model-aggregate), by_proxy = Table 2 (system-deltas), by_severity = Table 9
+# (severity-aggregate).
+DISPATCH = {"comment_volume": run_comment_volume, "by_proxy": run_by_proxy, "by_severity": run_by_severity}
+
+
+def run_cohort(config, name, base):
+    """Print every table configured for cohort `name`. The cohort's manifest
+    and dirs are resolved against `base` (the config file's directory)."""
+    cohort = config["cohorts"][name]
+    manifest, dirs = base / cohort["manifest"], [base / d for d in cohort["dirs"]]
+    for table in cohort["tables"]:
+        print(f"\n########## {table['title']} ##########")
+        cells = [tuple(cell) for cell in table["cells"]]
+        report = DISPATCH[table["kind"]]  # table kind -> report function
+        report(manifest, dirs, cells)
+
+
+if __name__ == "__main__":
+    import argparse
+    import json
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--config", type=Path, default=HERE / "ci_auc_tables.json",
+                    help="JSON defining cohorts, result dirs, and tables. "
+                         "Paths inside it are relative to the config's directory. "
+                         "Defaults to ci_auc_tables.json.")
+    ap.add_argument("--cohort", help="cohort name from the config (default: the first one defined)")
+    args = ap.parse_args()
+    # Read the JSON as UTF-8 rather than the platform default; no cohorts -> ap.error below.
+    config = json.loads(args.config.read_text(encoding="utf-8"))
+    cohort = args.cohort or next(iter(config["cohorts"]), None)
+    if cohort not in config["cohorts"]:
+        ap.error(f"unknown cohort {cohort!r}; choices: {list(config['cohorts'])}")
+    run_cohort(config, cohort, args.config.resolve().parent)
diff --git a/benchmarks/conference_study/analyses/compute_auc.py b/benchmarks/conference_study/analyses/compute_auc.py
index d32f1a9..3ff04a6 100644
--- a/benchmarks/conference_study/analyses/compute_auc.py
+++ b/benchmarks/conference_study/analyses/compute_auc.py
@@ -10,33 +10,9 @@
   - Per-tier AUC (major, moderate, minor) using the same pair-hit definition
     on per-tier comment counts.
   - Mean comments per paper on the high and low groups, Δ, % increase.
-  - Paper coverage and pair-count denominator.
 
 \\coarse's native {minor, major, critical} severity tiers are normalized to
-{minor, moderate, major} via {critical→major, major→moderate, minor→minor}
-so that severity AUCs are comparable across systems.
-
-The script keys papers by the full ``slug`` field from the manifest (not by
-forum-id prefix), since some forum-ids start with ``-`` and would collide
-under naive splitting.
-
-Usage:
-    # Frontier subset (Tables 1/2 in the paper)
-    python compute_auc.py \\
-        --manifest manifests/v2_frontier/combined.json \\
-        --dir frontier_subset_progressive frontier_subset_zero_shot \\
-              scaleup_v2_progressive scaleup_v2_zero_shot coarse_v2 \\
-              scaleup_v2_grok_progressive scaleup_v2_grok_zero_shot coarse_v2_grok
-
-    # Full 240-paper cohort (Tables 3/4 + appendix)
-    python compute_auc.py \\
-        --manifest manifests/v2/combined.json \\
-        --dir scaleup_v2_zero_shot scaleup_v2_progressive coarse_v2 \\
-              scaleup_v2_grok_zero_shot scaleup_v2_grok_progressive coarse_v2_grok
-
-    # Just one (method, model)
-    python compute_auc.py --manifest ... --dir ... \\
-        --method progressive_original --model grok-4.1-fast
+{minor, moderate, major} so that severity AUCs are comparable across systems.
 """
 from __future__ import annotations
 
@@ -47,6 +23,8 @@
 from pathlib import Path
 from statistics import mean
 
+import numpy as np
+
 HERE = Path(__file__).resolve().parent
 REPO_ROOT = HERE.parent  # benchmarks/conference_study/
 RESULTS_ROOT = REPO_ROOT / "results"
@@ -56,6 +34,9 @@
 
 
 def normalize_severity(method: str, raw: str | None) -> str | None:
+    """Map a raw severity label to {major, moderate, minor}, or None if absent
+    or unrecognized. coarse's {critical, major, minor} is remapped so tiers are
+    comparable across systems."""
     if not raw:
         return None
     raw = raw.lower()
@@ -99,29 +80,27 @@ def load_counts(
     return counts
 
 
-def pair_auc(highs: list[int], lows: list[int]) -> tuple[float, int, int]:
-    """Returns (auc, strict_wins, total_pairs). 0.5 credit for ties."""
-    hits = 0.0
-    wins = 0
-    total = 0
-    for cl in lows:
-        for ch in highs:
-            if cl > ch:
-                hits += 1
-                wins += 1
-            elif cl == ch:
-                hits += 0.5
-            total += 1
-    return (hits / total if total else float("nan"), wins, total)
+def auc_from(highs, lows) -> tuple[float, float, int]:
+    """Pairwise-accuracy AUC over the high x low outer product; a tie earns
+    half a hit. Returns (auc, hits, total_pairs), or (nan, 0.0, 0) if a side is empty."""
+    high = np.asarray(highs); low = np.asarray(lows)
+    if high.size == 0 or low.size == 0:
+        return float("nan"), 0.0, 0
+    # Compare directly rather than subtracting: unsigned counts would wrap around.
+    low_col, high_row = low[:, None], high[None, :]
+    hits = float((low_col > high_row).sum() + 0.5 * (low_col == high_row).sum())
+    return hits / (high.size * low.size), hits, high.size * low.size
 
 
 def cell_summary(
     counts_for_cell: dict[str, dict],
     slug_to_memberships: dict[str, list[dict]],
 ) -> dict:
-    """Compute per-cell summary stats. counts_for_cell is {slug: rec}."""
-    # Group counts by (pair, side) — for the Δ and AUC computations
-    by_pair: dict[int, dict[str, dict[str, list[int]]]] = defaultdict(
+    """Per-cell summary: comment means, Δ, % increase, and overall + per-tier
+    AUC, pooling hits and pairs across proxies. counts_for_cell is {slug: rec}."""
+    # Group counts by (proxy, side). "pair" is the manifest's field name for a
+    # quality proxy.
+    by_proxy: dict[int, dict[str, dict[str, list[int]]]] = defaultdict(
         lambda: {"high": {"total": [], "major": [], "moderate": [], "minor": []},
                  "low": {"total": [], "major": [], "moderate": [], "minor": []}}
     )
@@ -129,7 +108,7 @@ def cell_summary(
         for m in slug_to_memberships.get(slug, []):
             side = m["side"]
             for k in ("total", "major", "moderate", "minor"):
-                by_pair[m["pair"]][side][k].append(rec[k])
+                by_proxy[m["pair"]][side][k].append(rec[k])
 
     # Δ and means across the four (or fewer) proxies
     highs_means: list[float] = []
@@ -137,68 +116,48 @@ def cell_summary(
     deltas: list[float] = []
     auc_overall_hits = 0.0
     auc_overall_total = 0
-    auc_overall_wins = 0
-    auc_tier: dict[str, list[float]] = {t: [] for t in SEVERITY_TIERS}
     auc_tier_hits: dict[str, float] = {t: 0.0 for t in SEVERITY_TIERS}
     auc_tier_total: dict[str, int] = {t: 0 for t in SEVERITY_TIERS}
-    auc_tier_wins: dict[str, int] = {t: 0 for t in SEVERITY_TIERS}
 
-    for pid in sorted(by_pair):
-        h_total = by_pair[pid]["high"]["total"]
-        l_total = by_pair[pid]["low"]["total"]
-        if not h_total or not l_total:
+    for proxy_id in sorted(by_proxy):
+        high_total = by_proxy[proxy_id]["high"]["total"]
+        low_total = by_proxy[proxy_id]["low"]["total"]
+        if not high_total or not low_total:
             continue
-        h_mean = mean(h_total)
-        l_mean = mean(l_total)
-        highs_means.append(h_mean)
-        lows_means.append(l_mean)
-        deltas.append(l_mean - h_mean)
-        # Overall pair AUC
-        _, w, t = pair_auc(h_total, l_total)
-        auc_overall_wins += w
-        auc_overall_total += t
-        # accumulate by counting hits directly to avoid float-mean drift
-        for cl in l_total:
-            for ch in h_total:
-                if cl > ch:
-                    auc_overall_hits += 1
-                elif cl == ch:
-                    auc_overall_hits += 0.5
-        # Per-tier AUCs
+        high_mean = mean(high_total)
+        low_mean = mean(low_total)
+        highs_means.append(high_mean)
+        lows_means.append(low_mean)
+        deltas.append(low_mean - high_mean)
+        _, hits, total = auc_from(high_total, low_total)
+        auc_overall_hits += hits
+        auc_overall_total += total
         for tier in SEVERITY_TIERS:
-            ht = by_pair[pid]["high"][tier]
-            lt = by_pair[pid]["low"][tier]
-            for cl in lt:
-                for ch in ht:
-                    if cl > ch:
-                        auc_tier_hits[tier] += 1
-                        auc_tier_wins[tier] += 1
-                    elif cl == ch:
-                        auc_tier_hits[tier] += 0.5
-                    auc_tier_total[tier] += 1
+            _, tier_hits, tier_total = auc_from(by_proxy[proxy_id]["high"][tier],
+                                                by_proxy[proxy_id]["low"][tier])
+            auc_tier_hits[tier] += tier_hits
+            auc_tier_total[tier] += tier_total
 
     if not highs_means:
         return {}
 
-    h = mean(highs_means)
-    l = mean(lows_means)
-    d = mean(deltas)
-    pct = (d / h * 100) if h else float("nan")
+    c_high = mean(highs_means)
+    c_low = mean(lows_means)
+    delta = mean(deltas)
+    pct = (delta / c_high * 100) if c_high else float("nan")
     out = {
         "n_papers": len(counts_for_cell),
-        "c_high": h,
-        "c_low": l,
-        "delta": d,
+        "c_high": c_high,
+        "c_low": c_low,
+        "delta": delta,
         "pct_increase": pct,
         "auc_overall": auc_overall_hits / auc_overall_total if auc_overall_total else float("nan"),
-        "auc_overall_wins": auc_overall_wins,
         "auc_overall_total": auc_overall_total,
     }
     for tier in SEVERITY_TIERS:
-        tot = auc_tier_total[tier]
-        out[f"auc_{tier}"] = auc_tier_hits[tier] / tot if tot else float("nan")
-        out[f"auc_{tier}_wins"] = auc_tier_wins[tier]
-        out[f"auc_{tier}_total"] = tot
+        total = auc_tier_total[tier]
+        out[f"auc_{tier}"] = auc_tier_hits[tier] / total if total else float("nan")
+        out[f"auc_{tier}_total"] = total
     return out
 
 
@@ -207,7 +166,7 @@ def render_markdown(rows: list[tuple[str, str, dict]]) -> str:
     lines = []
     lines.append(
         "| method | model | n_papers | c_high | c_low | Δ | %inc | "
-        "AUC overall | AUC major | AUC mod | AUC minor | pairs |"
+        "AUC overall | AUC major | AUC moderate | AUC minor | pairs |"
     )
     lines.append(
         "|---|---|---|---|---|---|---|---|---|---|---|---|"
@@ -228,6 +187,8 @@ def render_markdown(rows: list[tuple[str, str, dict]]) -> str:
 
 
 def main() -> int:
+    """CLI: load the manifest and result dirs, compute each (method, model)
+    cell's point summary, and print them as a markdown table."""
     p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
     p.add_argument("--manifest", required=True, type=Path, help="Path to manifest JSON (e.g., manifests/v2/combined.json).")
     p.add_argument("--dir", nargs="+", required=True, dest="dirs",
diff --git a/tests/test_ci_auc.py b/tests/test_ci_auc.py
new file mode 100644
index 0000000..9f00dd5
--- /dev/null
+++ b/tests/test_ci_auc.py
@@ -0,0 +1,106 @@
+"""Unit tests for the conference-study AUC + bootstrap-CI helpers (ci_auc.py).
+
+Uses tiny in-memory count tables (no result files), so the math is checkable by
+hand: two high-quality papers with few comments and two low-quality papers with
+more, giving perfect low-over-high separation (AUC = 1.0).
+"""
+
+import math
+import sys
+from pathlib import Path
+
+_ANALYSES = Path(__file__).resolve().parents[1] / "benchmarks" / "conference_study" / "analyses"
+if str(_ANALYSES) not in sys.path:
+    sys.path.insert(0, str(_ANALYSES))
+
+from ci_auc import (  # noqa: E402
+    DISPATCH,
+    auc_from,
+    bootstrap,
+    bootstrap_tiers,
+    by_proxy_recs,
+    by_proxy_totals,
+)
+from compute_auc import cell_summary  # noqa: E402
+
+
+# ---- auc_from: pairwise accuracy over the high x low outer product ----
+
+def test_auc_from_perfect_separation():
+    # Both low papers out-comment both high papers: 4 hits out of 2 x 2 pairs.
+    assert tuple(auc_from([1, 1], [2, 2])) == (1.0, 4.0, 4)
+
+
+def test_auc_from_reversed():
+    # Every low paper has fewer comments than every high paper: no hits at all.
+    assert auc_from([2, 2], [1, 1])[0] == 0.0
+
+
+def test_auc_from_ties_get_half_credit():
+    # A single tied pair counts as half a hit.
+    assert tuple(auc_from([1], [1])) == (0.5, 0.5, 1)
+
+
+def test_auc_from_empty_side_is_nan():
+    auc, *pooled = auc_from([], [1, 2])  # no high papers, so no pairs to score
+    assert math.isnan(auc) and pooled == [0.0, 0]
+
+
+# ---- cell_summary / bootstrap on a toy cell ----
+
+def _toy_cell():
+    """Toy cell with a single proxy: high papers p1/p2 have 2 comments each,
+    low papers p3/p4 have 5 each, so every low paper beats every high one.
+    Returns (counts, memberships)."""
+    few = {"total": 2, "major": 1, "moderate": 1, "minor": 0}
+    many = {"total": 5, "major": 2, "moderate": 2, "minor": 1}
+    # Copy the templates so no two papers share one rec dict.
+    counts = {"p1": dict(few), "p2": dict(few),
+              "p3": dict(many), "p4": dict(many)}
+    sides = {"p1": "high", "p2": "high", "p3": "low", "p4": "low"}
+    # Every paper belongs to proxy 1 ("pair" is the manifest's key for the proxy).
+    mem = {slug: [{"pair": 1, "side": side}] for slug, side in sides.items()}
+    return counts, mem
+
+
+def test_cell_summary_point_estimates():
+    counts, mem = _toy_cell()
+    summary = cell_summary(counts, mem)
+    expected = {"c_high": 2.0, "c_low": 5.0, "delta": 3.0,
+                "auc_overall": 1.0, "auc_major": 1.0}  # low always exceeds high
+    for key, value in expected.items():
+        assert summary[key] == value, key
+
+
+def test_bootstrap_tiers_ci_shape():
+    tiers = ("total", "major")
+    ci = bootstrap_tiers(by_proxy_recs(*_toy_cell()), tiers)
+    for lo, hi in (ci[tier] for tier in tiers):
+        assert 0.0 <= lo <= hi <= 1.0
+        assert hi == 1.0  # degenerate perfect-separation data
+
+
+def test_bootstrap_overall_and_per_proxy():
+    (lower, upper), ci_by_proxy = bootstrap(by_proxy_totals(*_toy_cell()))
+    assert 0.0 <= lower <= upper <= 1.0
+    assert 1 in ci_by_proxy  # the toy cell's only proxy gets a CI
+
+
+def test_bootstrap_single_sided_proxy_no_crash():
+    # Proxy 1 has a paper on each side; proxy 2 has only a high-quality paper,
+    # so its per-proxy CI must be nan rather than a crash.
+    placement = {"p1": (1, "high"), "p2": (1, "low"), "p3": (2, "high")}
+    counts = {
+        "p1": {"total": 2, "major": 1, "moderate": 1, "minor": 0},
+        "p2": {"total": 5, "major": 2, "moderate": 2, "minor": 1},
+        "p3": {"total": 3, "major": 1, "moderate": 1, "minor": 1},
+    }
+    mem = {slug: [{"pair": proxy, "side": side}] for slug, (proxy, side) in placement.items()}
+    (_lo, _hi), ci_by_proxy = bootstrap(by_proxy_totals(counts, mem))
+    nan_lo, nan_hi = ci_by_proxy[2]
+    assert math.isnan(nan_lo) and math.isnan(nan_hi)
+    assert 0.0 <= ci_by_proxy[1][0] <= ci_by_proxy[1][1] <= 1.0
+
+
+def test_dispatch_covers_the_three_kinds():
+    assert sorted(DISPATCH) == ["by_proxy", "by_severity", "comment_volume"]