From f07eba1a3633682b17e92a4968402c5619de8076 Mon Sep 17 00:00:00 2001
From: Matt Kane <m@mk.gg>
Date: Sun, 10 May 2026 16:37:15 +0100
Subject: [PATCH 1/8] feat(aggregator): backfill admin route + auth-gated admin
 surface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cold-start discovery primitive. Operator hits POST /_admin/backfill with
a JSON list of DIDs; the worker resolves each DID's PDS endpoint, calls
com.atproto.repo.listRecords for every NSID in WANTED_COLLECTIONS, and
batches the results onto the existing Records Queue. The consumer's
verification + write + idempotency machinery handles the rest — same
code path as live Jetstream, distinguished only by trigger.

Live discovery is still Jetstream's job; this exists for the cold-start
gap (publishers who published before the aggregator was listening) and
for operator-initiated recovery after a known outage. There is no
periodic scheduler — see plan §"Why no reconciliation cron".

ADMIN AUTH

Both admin routes (/_admin/start and /_admin/backfill) now require
`Authorization: Bearer <ADMIN_TOKEN>`. Wrangler's `secrets.required`
declares the token in config, so:
- `wrangler types` generates a typed `env.ADMIN_TOKEN: string`
- `wrangler deploy` refuses to ship if the secret isn't set
- Production sets via `wrangler secret put ADMIN_TOKEN`
- Tests bind a stub via vitest miniflare bindings (test-admin-token)
Constant-time compare to avoid leaking length/prefix via timing.

Backfill specifically is what makes this auth gate non-optional:
unauthenticated, did:web resolution turns the worker into an
attacker-controlled HTTP fetcher (caller picks both the DID-doc URL
and the resulting PDS endpoint). Same gate on /_admin/start for
symmetry — anyone with the token can do operational things.

DEFENSES IN BACKFILL

Adversarial review (round 1) found bugs that needed fixing before
shipping. All addressed:

- BLOCKER: pagination loop had no max-page guard. A PDS that echoes
  the same cursor pinned the worker indefinitely. Now capped at
  MAX_PAGES_PER_COLLECTION (1000) plus a cursor-equality check that
  catches the common case in 2 iterations.
- MAJOR: dids[] and per-page records had no size cap. Both capped now
  (MAX_BACKFILL_DIDS=1000, MAX_RECORDS_PER_PAGE=200) so a single POST
  can't flood the queue.
- MAJOR: per-record sequential queue.send() couldn't possibly finish
  a 4000-record DID inside the waitUntil budget. Switched to
  sendBatch (one batch per page).
- MAJOR: runBackfill summary log only fired at the end; if waitUntil
  exhausted mid-loop the operator saw nothing. Added per-DID logs
  via onDidComplete callback + try/catch wrapping the whole runner
  so any throw surfaces as "[aggregator] backfill aborted".
- MINOR: 404 mid-pagination silently truncated; now distinguished
  from first-page-404 (which legitimately means "no records of this
  collection") and surfaced as a partial-failure error.
- MINOR: parseBackfillBody only checked startsWith("did:"). Now
  validates against DID_PATTERN and dedupes the list via Set.
- MINOR: parseRkeyFromUri accepted any non-empty string after the
  expected slashes. Now validates against the atproto rkey grammar
  ([A-Za-z0-9._:~-]{1,512}) so injection-shaped rkeys are skipped.
- MINOR: backfill jobs were setting jetstreamRecord, which the
  consumer's DLQ payload field would have mislabeled as
  Jetstream-supplied data. Dropped the field on backfill jobs.
- NIT: isPlainObject was duplicated 3x; extracted to src/utils.ts.

128 tests pass (was 112). Typecheck clean. Zero lint diagnostics.

The remaining adversarial-review items I decided to skip:
- Three near-duplicate MapDidDocCache test stubs across 3 files —
  flagged as NIT, deferred to its own refactor PR.
- Constellation-driven discovery (no-explicit-DIDs path). Documented
  in the plan as intentional Slice 1 scope: explicit DIDs is the
  primary path; Constellation is a follow-up if needed.
---
 apps/aggregator/src/backfill.ts           | 263 ++++++++
 apps/aggregator/src/index.ts              | 177 +++++-
 apps/aggregator/src/records-consumer.ts   |   5 +-
 apps/aggregator/src/utils.ts              |  13 +
 apps/aggregator/test/backfill.test.ts     | 725 ++++++++++++++++++++++
 apps/aggregator/vitest.config.ts          |  10 +-
 apps/aggregator/worker-configuration.d.ts |   5 +-
 apps/aggregator/wrangler.jsonc            |   7 +
 8 files changed, 1191 insertions(+), 14 deletions(-)
 create mode 100644 apps/aggregator/src/backfill.ts
 create mode 100644 apps/aggregator/src/utils.ts
 create mode 100644 apps/aggregator/test/backfill.test.ts

diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts
new file mode 100644
index 000000000..c91f300df
--- /dev/null
+++ b/apps/aggregator/src/backfill.ts
@@ -0,0 +1,263 @@
+/**
+ * Cold-start discovery worker.
+ *
+ * Operator-triggered (via `POST /_admin/backfill`); takes a list of DIDs,
+ * calls `com.atproto.repo.listRecords` against each publisher's PDS for every
+ * collection in `WANTED_COLLECTIONS`, and enqueues each returned record onto
+ * the existing Records Queue as a `RecordsJob`. The consumer's verification +
+ * write + idempotency machinery handles the rest — same code path as live
+ * Jetstream, distinguished only by trigger.
+ *
+ * Live discovery is Jetstream's job, not this worker's; it picks up new
+ * publishers automatically. Backfill exists for the cold-start gap (publishers
+ * who published before the aggregator was listening) and for operator-triggered
+ * recovery after a known outage. There is deliberately no periodic scheduler
+ * — see plan §"Why no reconciliation cron".
+ */
+
+import { WANTED_COLLECTIONS } from "./constants.js";
+import type { DidResolver } from "./did-resolver.js";
+import type { RecordsJob } from "./env.js";
+import { isPlainObject } from "./utils.js";
+
+const PAGE_SIZE = 100;
+/** Cap on listRecords pagination per collection. A buggy or malicious PDS
+ * that echoes the same cursor would otherwise loop forever inside one
+ * `ctx.waitUntil`. 1000 pages × 100 records = 100k records per collection,
+ * which is past anything we'd legitimately backfill in one shot. */
+const MAX_PAGES_PER_COLLECTION = 1000;
+/** Defensive cap on records per page. Real PDSes honour the `limit` query
+ * param; this guards against a hostile PDS returning an enormous array. */
+const MAX_RECORDS_PER_PAGE = PAGE_SIZE * 2;
+/** Atproto rkey grammar: ALPHA / DIGIT / "." / "-" / "_" / ":" / "~". */
+const RKEY_PATTERN = /^[A-Za-z0-9._:~-]{1,512}$/;
+
+/** Producer-side queue surface. The production binding `env.RECORDS_QUEUE`
+ * satisfies this; tests pass an in-memory implementation. The return type
+ * is `unknown` rather than `void` so workerd's `Queue.send` (which returns
+ * a `QueueSendResponse`) is structurally assignable. */
+export interface BackfillQueue {
+	sendBatch(messages: ReadonlyArray<{ body: RecordsJob }>): Promise<unknown>;
+}
+
+export interface BackfillDeps {
+	resolver: DidResolver;
+	queue: BackfillQueue;
+	/** Inject for tests; defaults to `globalThis.fetch`. */
+	fetch?: typeof fetch;
+	/** Optional callback fired after each DID completes (success or failure).
+	 * Used by the production wiring to log per-DID progress so an operator
+	 * watching `wrangler tail` sees where a long backfill is up to. */
+	onDidComplete?: (result: BackfillDidResult) => void;
+}
+
+export interface BackfillDidResult {
+	did: string;
+	enqueued: number;
+	/** One entry per failure during this DID's backfill. listRecords failures
+	 * are per-collection; resolution failures abort early. Empty array on
+	 * total success. */
+	errors: string[];
+}
+
+export interface BackfillSummary {
+	totalEnqueued: number;
+	results: BackfillDidResult[];
+}
+
+/**
+ * Backfill multiple DIDs serially. Per-DID failures don't stop the loop —
+ * one bad publisher doesn't block the rest.
+ */
+export async function backfillDids(
+	dids: readonly string[],
+	deps: BackfillDeps,
+): Promise<BackfillSummary> {
+	const results: BackfillDidResult[] = [];
+	let totalEnqueued = 0;
+	for (const did of dids) {
+		const result = await backfillDid(did, deps);
+		results.push(result);
+		totalEnqueued += result.enqueued;
+		deps.onDidComplete?.(result);
+	}
+	return { totalEnqueued, results };
+}
+
+/**
+ * Backfill a single DID across every collection in `WANTED_COLLECTIONS`.
+ * Resolution failure aborts (we can't enqueue verifiable jobs without
+ * knowing the PDS); per-collection listRecords failures are recorded and
+ * the loop continues to the next collection.
+ */
+export async function backfillDid(did: string, deps: BackfillDeps): Promise<BackfillDidResult> {
+	const errors: string[] = [];
+	let enqueued = 0;
+
+	// resolve() writes to known_publishers as a side-effect of the cache
+	// upsert, so this also registers the publisher for any future code path
+	// that iterates that table.
+	let pds: string;
+	try {
+		const resolved = await deps.resolver.resolve(did);
+		pds = resolved.pds;
+	} catch (err) {
+		errors.push(`resolve failed: ${err instanceof Error ? err.message : String(err)}`);
+		return { did, enqueued, errors };
+	}
+
+	const fetchImpl = deps.fetch ?? fetch;
+	for (const collection of WANTED_COLLECTIONS) {
+		try {
+			enqueued += await backfillCollection({
+				did,
+				pds,
+				collection,
+				queue: deps.queue,
+				fetchImpl,
+			});
+		} catch (err) {
+			errors.push(`${collection}: ${err instanceof Error ? err.message : String(err)}`);
+		}
+	}
+
+	return { did, enqueued, errors };
+}
+
+interface BackfillCollectionOpts {
+	did: string;
+	pds: string;
+	collection: string;
+	queue: BackfillQueue;
+	fetchImpl: typeof fetch;
+}
+
+/**
+ * Walk one DID's records for a single collection, paginating through
+ * `listRecords` and enqueuing each result via `sendBatch` (one batch per
+ * page). Returns the number of records enqueued.
+ *
+ * 404 from the PDS on the FIRST page means the repo doesn't host this
+ * collection — silently treated as zero records, not an error. A 404
+ * mid-pagination is a partial-failure signal (the PDS is misrouting one
+ * page while the rest of the repo is fine) and throws.
+ *
+ * Pagination is capped at MAX_PAGES_PER_COLLECTION + cursor-equality check
+ * to defend against a PDS that echoes the same cursor forever.
+ */
+async function backfillCollection(opts: BackfillCollectionOpts): Promise<number> {
+	let enqueued = 0;
+	let cursor: string | undefined;
+	let prevCursor: string | undefined;
+	let pages = 0;
+	do {
+		if (++pages > MAX_PAGES_PER_COLLECTION) {
+			throw new Error(`exceeded ${MAX_PAGES_PER_COLLECTION} pages`);
+		}
+		if (cursor !== undefined && cursor === prevCursor) {
+			throw new Error("PDS returned identical cursor twice");
+		}
+		prevCursor = cursor;
+
+		const url = new URL("/xrpc/com.atproto.repo.listRecords", opts.pds);
+		url.searchParams.set("repo", opts.did);
+		url.searchParams.set("collection", opts.collection);
+		url.searchParams.set("limit", String(PAGE_SIZE));
+		if (cursor) url.searchParams.set("cursor", cursor);
+
+		const response = await opts.fetchImpl(url.toString(), {
+			headers: { accept: "application/json" },
+		});
+		if (response.status === 404) {
+			if (cursor === undefined) {
+				// First-page 404: publisher has no records of this collection.
+				return enqueued;
+			}
+			// Mid-pagination 404 is a partial failure; surface it.
+			throw new Error(`listRecords returned 404 mid-pagination at cursor=${cursor}`);
+		}
+		if (!response.ok) {
+			throw new Error(`listRecords returned ${response.status}`);
+		}
+
+		const body: unknown = await response.json();
+		const records = extractListRecordsBody(body);
+		if (records.length > MAX_RECORDS_PER_PAGE) {
+			throw new Error(
+				`PDS returned ${records.length} records, exceeding cap of ${MAX_RECORDS_PER_PAGE}`,
+			);
+		}
+		cursor = extractCursor(body);
+
+		const messages: { body: RecordsJob }[] = [];
+		for (const record of records) {
+			const rkey = parseRkeyFromUri(record.uri, opts.collection);
+			if (!rkey) continue;
+			messages.push({
+				body: {
+					did: opts.did,
+					collection: opts.collection,
+					rkey,
+					operation: "create",
+					cid: record.cid,
+				},
+			});
+		}
+		if (messages.length > 0) {
+			// Single batched send per page — far cheaper than the per-record
+			// awaits this used to do (which couldn't possibly finish a
+			// 4000-record DID inside the waitUntil budget).
+			await opts.queue.sendBatch(messages);
+			enqueued += messages.length;
+		}
+	} while (cursor);
+	return enqueued;
+}
+
+interface ListRecordEntry {
+	uri: string;
+	cid: string;
+	value: unknown;
+}
+
+function extractListRecordsBody(body: unknown): ListRecordEntry[] {
+	if (!isPlainObject(body)) return [];
+	const records = body["records"];
+	if (!Array.isArray(records)) return [];
+	const out: ListRecordEntry[] = [];
+	for (const r of records) {
+		if (!isPlainObject(r)) continue;
+		const uri = r["uri"];
+		const cid = r["cid"];
+		if (typeof uri !== "string" || typeof cid !== "string") continue;
+		out.push({ uri, cid, value: r["value"] });
+	}
+	return out;
+}
+
+function extractCursor(body: unknown): string | undefined {
+	if (!isPlainObject(body)) return undefined;
+	const cursor = body["cursor"];
+	return typeof cursor === "string" ? cursor : undefined;
+}
+
+/**
+ * Extract the rkey from an AT URI of the shape `at://did/collection/rkey`
+ * and validate it against the atproto rkey grammar. Returns null if any
+ * step fails; callers treat that as "this isn't a record we recognise"
+ * and skip the entry without aborting the page.
+ */
+function parseRkeyFromUri(uri: string, expectedCollection: string): string | null {
+	const expectedPrefix = `at://`;
+	if (!uri.startsWith(expectedPrefix)) return null;
+	const tail = uri.slice(expectedPrefix.length);
+	const slash1 = tail.indexOf("/");
+	if (slash1 < 0) return null;
+	const slash2 = tail.indexOf("/", slash1 + 1);
+	if (slash2 < 0) return null;
+	const collection = tail.slice(slash1 + 1, slash2);
+	if (collection !== expectedCollection) return null;
+	const rkey = tail.slice(slash2 + 1);
+	if (!RKEY_PATTERN.test(rkey)) return null;
+	return rkey;
+}
diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts
index 709f7da6a..470f44f71 100644
--- a/apps/aggregator/src/index.ts
+++ b/apps/aggregator/src/index.ts
@@ -15,9 +15,18 @@
  * Worker boots.
  */
 
+import {
+	AtprotoWebDidDocumentResolver,
+	CompositeDidDocumentResolver,
+	PlcDidDocumentResolver,
+} from "@atcute/identity-resolver";
+
+import { backfillDids } from "./backfill.js";
+import { createD1DidDocCache, DidResolver } from "./did-resolver.js";
 import type { RecordsJob } from "./env.js";
 import { drainDeadLetterBatch, processBatch } from "./records-consumer.js";
 import { RECORDS_DO_NAME } from "./records-do.js";
+import { isPlainObject } from "./utils.js";
 
 const RECORDS_QUEUE_NAME = "emdash-aggregator-records";
 const RECORDS_DLQ_NAME = "emdash-aggregator-records-dlq";
@@ -25,21 +34,104 @@ const RECORDS_DLQ_NAME = "emdash-aggregator-records-dlq";
 export { RecordsJetstreamDO } from "./records-do.js";
 
 /**
- * Operational bootstrap route. Hitting `/_admin/start` once after deploy
- * spins up the Records DO, which opens its outbound WebSocket and starts
- * ingesting. The DO's WebSocket keeps it alive thereafter. The route is
- * unauthenticated but returns no operational detail — just a fixed 204 —
- * so a probing caller learns nothing useful. The action is idempotent on
- * an already-running DO. Recommended deploy hook:
+ * Operational admin routes. Both gated by the `ADMIN_TOKEN` secret declared
+ * in `wrangler.jsonc`'s `secrets.required` and validated via constant-time
+ * compare against the `Authorization: Bearer <token>` header. Recommended
+ * deploy hook:
+ *
+ *   wrangler deploy && curl -X POST -H "Authorization: Bearer $ADMIN_TOKEN" \
+ *     https://api.emdashcms.com/_admin/start
+ *
+ * `/_admin/start` spins up the Records DO (idempotent on an already-running DO).
+ * `/_admin/backfill` triggers cold-start discovery against an explicit DID list.
  *
- *   wrangler deploy && curl -X POST https://api.emdashcms.com/_admin/start
+ * Auth-gating both routes: backfill specifically introduces caller-chosen
+ * URLs into the worker's outbound fetches (via `did:web` resolution + the
+ * publisher's PDS endpoint), so the unauth posture would expose an SSRF-shaped
+ * surface. Same gate on `/_admin/start` for symmetry — anyone with the token
+ * can do operational things.
  */
 const BOOTSTRAP_PATH = "/_admin/start";
+const BACKFILL_PATH = "/_admin/backfill";
+
+const MAX_BACKFILL_DIDS = 1000;
+const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/;
+
+/**
+ * Constant-time string equality. Use over `===` for any compare against a
+ * secret to avoid leaking length/prefix information through timing channels.
+ */
+function timingSafeEqual(a: string, b: string): boolean {
+	if (a.length !== b.length) return false;
+	let diff = 0;
+	for (let i = 0; i < a.length; i++) {
+		diff |= a.charCodeAt(i) ^ b.charCodeAt(i);
+	}
+	return diff === 0;
+}
+
+/**
+ * Validate the request's `Authorization: Bearer <token>` header against
+ * `env.ADMIN_TOKEN`. Returns null on success, or a 401 Response to return
+ * directly. Empty/missing token in env fails closed.
+ */
+function requireAdminAuth(request: Request, env: Env): Response | null {
+	const expected = env.ADMIN_TOKEN;
+	if (!expected) {
+		// Misconfigured production or unset dev — closed by default.
+		return new Response("admin endpoints not configured", { status: 503 });
+	}
+	const auth = request.headers.get("authorization");
+	if (!auth || !auth.startsWith("Bearer ")) {
+		return new Response("unauthorized", {
+			status: 401,
+			headers: { "www-authenticate": "Bearer" },
+		});
+	}
+	const token = auth.slice("Bearer ".length);
+	if (!timingSafeEqual(token, expected)) {
+		return new Response("unauthorized", {
+			status: 401,
+			headers: { "www-authenticate": "Bearer" },
+		});
+	}
+	return null;
+}
+
+function parseBackfillBody(body: unknown): { dids: string[] } | { error: string } {
+	if (!isPlainObject(body)) {
+		return { error: "request body must be a JSON object" };
+	}
+	const rawDids = body["dids"];
+	if (!Array.isArray(rawDids)) {
+		return { error: "`dids` must be an array of DID strings" };
+	}
+	if (rawDids.length === 0) {
+		return { error: "`dids` must not be empty" };
+	}
+	if (rawDids.length > MAX_BACKFILL_DIDS) {
+		return {
+			error: `\`dids\` must contain at most ${MAX_BACKFILL_DIDS} entries (got ${rawDids.length})`,
+		};
+	}
+	const seen = new Set<string>();
+	for (const did of rawDids) {
+		if (typeof did !== "string" || !DID_PATTERN.test(did)) {
+			return { error: `invalid DID in list: ${JSON.stringify(did)}` };
+		}
+		seen.add(did);
+	}
+	// Set iteration order matches insertion; dedup preserves first-seen order
+	// so the operator sees jobs run in the order they submitted.
+	return { dids: [...seen] };
+}
 
 export default {
 	async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise<Response> {
 		const url = new URL(request.url);
 		if (url.pathname === BOOTSTRAP_PATH) {
+			const denied = requireAdminAuth(request, env);
+			if (denied) return denied;
 			const id = env.RECORDS_DO.idFromName(RECORDS_DO_NAME);
 			const stub = env.RECORDS_DO.get(id);
 			// Fire-and-forget so the response shape doesn't depend on the
@@ -48,6 +140,33 @@ export default {
 			ctx.waitUntil(stub.fetch("https://do.internal/bootstrap"));
 			return new Response(null, { status: 204 });
 		}
+		if (url.pathname === BACKFILL_PATH) {
+			if (request.method !== "POST") {
+				return new Response("method not allowed", {
+					status: 405,
+					headers: { allow: "POST" },
+				});
+			}
+			const denied = requireAdminAuth(request, env);
+			if (denied) return denied;
+			let body: unknown;
+			try {
+				body = await request.json();
+			} catch {
+				return new Response("request body must be valid JSON", { status: 400 });
+			}
+			const parsed = parseBackfillBody(body);
+			if ("error" in parsed) {
+				return new Response(parsed.error, { status: 400 });
+			}
+			// Fire-and-forget via waitUntil so the route returns 202 quickly
+			// regardless of the DID list size. Backfill progress is observable
+			// via Workers logs and the resulting D1 writes; the response body
+			// just confirms the trigger landed. Per-DID errors are logged from
+			// inside backfillDids; callers don't get them in the response.
+			ctx.waitUntil(runBackfill(parsed.dids, env));
+			return new Response(null, { status: 202 });
+		}
 		return new Response("emdash-aggregator: not yet implemented", {
 			status: 503,
 			headers: { "content-type": "text/plain" },
@@ -82,3 +201,47 @@ export default {
 		ctx.waitUntil(stub.fetch("https://do.internal/liveness"));
 	},
 };
+
+/**
+ * Production wiring for the backfill worker. Constructs a DID resolver
+ * pointed at the live PLC + atproto-web methods, and points the queue at
+ * the production Records Queue binding. Per-DID progress is logged as the
+ * loop runs so an operator watching `wrangler tail` sees live progress and
+ * can tell where a backfill stopped if waitUntil exhausts mid-loop.
+ */
+async function runBackfill(dids: readonly string[], env: Env): Promise<void> {
+	console.log("[aggregator] backfill starting", { didCount: dids.length });
+	try {
+		const composite = new CompositeDidDocumentResolver({
+			methods: {
+				plc: new PlcDidDocumentResolver(),
+				web: new AtprotoWebDidDocumentResolver(),
+			},
+		});
+		const resolver = new DidResolver({
+			cache: createD1DidDocCache(env.DB),
+			resolver: composite,
+		});
+		const summary = await backfillDids(dids, {
+			resolver,
+			queue: env.RECORDS_QUEUE,
+			onDidComplete: (result) => {
+				console.log("[aggregator] backfill did", {
+					did: result.did,
+					enqueued: result.enqueued,
+					errors: result.errors,
+				});
+			},
+		});
+		console.log("[aggregator] backfill complete", {
+			didCount: dids.length,
+			totalEnqueued: summary.totalEnqueued,
+			didsWithErrors: summary.results.filter((r) => r.errors.length > 0).length,
+		});
+	} catch (err) {
+		console.error("[aggregator] backfill aborted", {
+			didCount: dids.length,
+			error: err instanceof Error ? (err.stack ?? err.message) : String(err),
+		});
+	}
+}
diff --git a/apps/aggregator/src/records-consumer.ts b/apps/aggregator/src/records-consumer.ts
index d52810e43..5470c1340 100644
--- a/apps/aggregator/src/records-consumer.ts
+++ b/apps/aggregator/src/records-consumer.ts
@@ -53,6 +53,7 @@ import {
 	type VerificationFailureReason,
 	type VerifiedPdsRecord,
 } from "./pds-verify.js";
+import { isPlainObject } from "./utils.js";
 
 /**
  * Deps the consumer needs at runtime. Constructed once per `processBatch` call
@@ -1072,10 +1073,6 @@ function computeVersionSort(version: string): string | null {
 	return `${pad(major)}.${pad(minor)}.${pad(patch)}.${FINAL_VERSION_SENTINEL}`;
 }
 
-function isPlainObject(value: unknown): value is Record<string, unknown> {
-	return value !== null && typeof value === "object" && !Array.isArray(value);
-}
-
 /**
  * Pull the verified record's CID out of the JSON-stringified
  * `signature_metadata` column. Returns null if the column is missing or
diff --git a/apps/aggregator/src/utils.ts b/apps/aggregator/src/utils.ts
new file mode 100644
index 000000000..ed17cae1a
--- /dev/null
+++ b/apps/aggregator/src/utils.ts
@@ -0,0 +1,13 @@
+/**
+ * Shared utility helpers used across multiple modules.
+ */
+
+/**
+ * Type guard for narrowing `unknown` to `Record<string, unknown>` so
+ * subsequent `value["key"]` accesses are typesafe without an `as` cast.
+ * Excludes arrays (which are also `typeof === "object"`) so consumers
+ * checking for "JSON-shaped object" get what they expect.
+ */
+export function isPlainObject(value: unknown): value is Record<string, unknown> {
+	return value !== null && typeof value === "object" && !Array.isArray(value);
+}
diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts
new file mode 100644
index 000000000..fae806e85
--- /dev/null
+++ b/apps/aggregator/test/backfill.test.ts
@@ -0,0 +1,725 @@
+/**
+ * Backfill worker tests.
+ *
+ * The worker is plain `fetch` + `queue.send`; tests stub both. The DID
+ * resolver is also stubbed (in-memory cache + a simple stub upstream) so the
+ * tests don't depend on the workers test pool or live PLC.
+ */
+
+import { P256PrivateKeyExportable } from "@atcute/crypto";
+import type { DidDocument } from "@atcute/identity";
+import type { Did } from "@atcute/lexicons/syntax";
+import { applyD1Migrations, env, SELF } from "cloudflare:test";
+import { beforeAll, beforeEach, describe, expect, it } from "vitest";
+
+import { backfillDid, backfillDids, type BackfillQueue } from "../src/backfill.js";
+import { WANTED_COLLECTIONS } from "../src/constants.js";
+import {
+	type CachedDidDoc,
+	type DidDocCache,
+	type DidDocumentResolverLike,
+	DidResolver,
+} from "../src/did-resolver.js";
+import type { RecordsJob } from "../src/env.js";
+
+interface TestEnv {
+	DB: D1Database;
+	TEST_MIGRATIONS: Parameters<typeof applyD1Migrations>[1];
+}
+const testEnv = env as unknown as TestEnv;
+
+const DID_A = "did:plc:test00000000000000000000";
+const DID_B = "did:plc:test00000000000000000001";
+const PDS = "https://pds.test.example";
+
+let signingKeyMultibase: string;
+
+beforeAll(async () => {
+	const kp = await P256PrivateKeyExportable.createKeypair();
+	signingKeyMultibase = await kp.exportPublicKey("multikey");
+	await applyD1Migrations(testEnv.DB, testEnv.TEST_MIGRATIONS);
+});
+
+beforeEach(async () => {
+	await testEnv.DB.prepare("DELETE FROM known_publishers").run();
+});
+
+class CapturingQueue implements BackfillQueue {
+	readonly sent: RecordsJob[] = [];
+	sendBatch(messages: ReadonlyArray<{ body: RecordsJob }>): Promise<unknown> {
+		for (const m of messages) this.sent.push(m.body);
+		return Promise.resolve();
+	}
+}
+
+class MapDidDocCache implements DidDocCache {
+	private readonly entries = new Map<string, CachedDidDoc>();
+	read(did: string): Promise<CachedDidDoc | null> {
+		return Promise.resolve(this.entries.get(did) ?? null);
+	}
+	upsert(did: string, doc: Omit<CachedDidDoc, "resolvedAt">, now: Date): Promise<void> {
+		this.entries.set(did, { ...doc, resolvedAt: now });
+		return Promise.resolve();
+	}
+	expire(did: string): Promise<void> {
+		const entry = this.entries.get(did);
+		if (entry) this.entries.set(did, { ...entry, resolvedAt: new Date(0) });
+		return Promise.resolve();
+	}
+}
+
+function buildResolver(): DidResolver {
+	const cache = new MapDidDocCache();
+	const resolver: DidDocumentResolverLike = {
+		resolve(did: Did): Promise<DidDocument> {
+			return Promise.resolve({
+				id: did as `did:${string}:${string}`,
+				verificationMethod: [
+					{
+						id: `${did}#atproto`,
+						type: "Multikey",
+						controller: did as `did:${string}:${string}`,
+						publicKeyMultibase: signingKeyMultibase,
+					},
+				],
+				service: [
+					{
+						id: "#atproto_pds",
+						type: "AtprotoPersonalDataServer",
+						serviceEndpoint: PDS,
+					},
+				],
+			});
+		},
+	};
+	return new DidResolver({ cache, resolver, ttlMs: 1_000_000, now: () => new Date() });
+}
+
+interface MockListRecord {
+	uri: string;
+	cid: string;
+	value: Record<string, unknown>;
+}
+
+/**
+ * Build a fetch stub that returns canned `listRecords` responses keyed by
+ * collection. Records arrive as a single page; pagination is exercised in a
+ * dedicated test by passing pages of records explicitly.
+ */
+function makeFetch(
+	recordsByCollection: Record<string, MockListRecord[]>,
+	overrides?: { status?: Record<string, number> },
+): typeof fetch {
+	return async (input) => {
+		const url =
+			typeof input === "string"
+				? new URL(input)
+				: input instanceof URL
+					? input
+					: new URL(input.url);
+		if (!url.pathname.endsWith("/xrpc/com.atproto.repo.listRecords")) {
+			return new Response("not stubbed", { status: 599 });
+		}
+		const collection = url.searchParams.get("collection") ?? "";
+		const status = overrides?.status?.[collection];
+		if (status !== undefined) {
+			return new Response(JSON.stringify({ error: "X" }), {
+				status,
+				headers: { "content-type": "application/json" },
+			});
+		}
+		const records = recordsByCollection[collection] ?? [];
+		return new Response(JSON.stringify({ records }), {
+			status: 200,
+			headers: { "content-type": "application/json" },
+		});
+	};
+}
+
+describe("backfillDid", () => {
+	it("enqueues each listRecords result as a RecordsJob", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const fetchImpl = makeFetch({
+			[WANTED_COLLECTIONS[0]]: [
+				{
+					uri: `at://${DID_A}/${WANTED_COLLECTIONS[0]}/demo`,
+					cid: "bafyc1",
+					value: { foo: "bar" },
+				},
+			],
+		});
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		expect(result.errors).toEqual([]);
+		expect(result.enqueued).toBe(1);
+		expect(queue.sent).toHaveLength(1);
+		expect(queue.sent[0]).toMatchObject({
+			did: DID_A,
+			collection: WANTED_COLLECTIONS[0],
+			rkey: "demo",
+			operation: "create",
+			cid: "bafyc1",
+		});
+		// jetstreamRecord intentionally not set on backfill jobs — the
+		// consumer's DLQ payload field would otherwise mislabel
+		// `listRecords` data as Jetstream-supplied data.
+		expect(queue.sent[0]?.jetstreamRecord).toBeUndefined();
+	});
+
+	it("walks every collection in WANTED_COLLECTIONS", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const records: Record<string, MockListRecord[]> = {};
+		for (let i = 0; i < WANTED_COLLECTIONS.length; i++) {
+			const c = WANTED_COLLECTIONS[i];
+			if (c) {
+				records[c] = [{ uri: `at://${DID_A}/${c}/r${i}`, cid: `bafyc${i}`, value: {} }];
+			}
+		}
+		const fetchImpl = makeFetch(records);
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		expect(result.enqueued).toBe(WANTED_COLLECTIONS.length);
+		const collectionsHit = new Set(queue.sent.map((j) => j.collection));
+		expect(collectionsHit.size).toBe(WANTED_COLLECTIONS.length);
+	});
+
+	it("treats 404 from the PDS as 'no records of this collection', not an error", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const fetchImpl = makeFetch(
+			{},
+			{
+				status: WANTED_COLLECTIONS.reduce<Record<string, number>>((acc, c) => {
+					acc[c] = 404;
+					return acc;
+				}, {}),
+			},
+		);
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		expect(result.errors).toEqual([]);
+		expect(result.enqueued).toBe(0);
+		expect(queue.sent).toHaveLength(0);
+	});
+
+	it("records non-404 PDS errors per collection, continues to the next collection", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const firstCollection = WANTED_COLLECTIONS[0];
+		const secondCollection = WANTED_COLLECTIONS[1];
+		if (!firstCollection || !secondCollection) throw new Error("test assumes ≥2 collections");
+		const fetchImpl = makeFetch(
+			{
+				[secondCollection]: [
+					{
+						uri: `at://${DID_A}/${secondCollection}/r1`,
+						cid: "bafyc",
+						value: {},
+					},
+				],
+			},
+			{ status: { [firstCollection]: 503 } },
+		);
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		expect(result.errors).toHaveLength(1);
+		expect(result.errors[0]).toContain("503");
+		expect(result.errors[0]).toContain(firstCollection);
+		expect(result.enqueued).toBe(1); // the second collection still ran
+	});
+
+	it("aborts the DID early if the resolver throws (can't enqueue without PDS)", async () => {
+		const queue = new CapturingQueue();
+		const resolver = new DidResolver({
+			cache: new MapDidDocCache(),
+			resolver: {
+				resolve: () => Promise.reject(new Error("PLC unreachable")),
+			},
+		});
+		const fetchImpl = makeFetch({});
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		expect(result.enqueued).toBe(0);
+		expect(queue.sent).toHaveLength(0);
+		expect(result.errors).toHaveLength(1);
+		expect(result.errors[0]).toMatch(/resolve failed.*PLC unreachable/);
+	});
+
+	it("paginates listRecords via cursor", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+
+		// Fetch returns page 1 (cursor "p2") then page 2 (no cursor) for the
+		// first collection; empty pages for the others.
+		let calls = 0;
+		const fetchImpl: typeof fetch = async (input) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			if (url.searchParams.get("collection") !== collection) {
+				return new Response(JSON.stringify({ records: [] }), { status: 200 });
+			}
+			calls += 1;
+			const cursor = url.searchParams.get("cursor");
+			if (!cursor) {
+				return new Response(
+					JSON.stringify({
+						records: [{ uri: `at://${DID_A}/${collection}/p1`, cid: "c1", value: {} }],
+						cursor: "p2",
+					}),
+					{ status: 200, headers: { "content-type": "application/json" } },
+				);
+			}
+			return new Response(
+				JSON.stringify({
+					records: [{ uri: `at://${DID_A}/${collection}/p2`, cid: "c2", value: {} }],
+				}),
+				{ status: 200, headers: { "content-type": "application/json" } },
+			);
+		};
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		expect(calls).toBe(2);
+		const collectionJobs = queue.sent.filter((j) => j.collection === collection);
+		expect(collectionJobs.map((j) => j.rkey)).toEqual(["p1", "p2"]);
+		expect(result.errors).toEqual([]);
+	});
+
+	it("skips records whose URI doesn't match the expected collection (defensive)", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl = makeFetch({
+			[collection]: [
+				{ uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} },
+				// Buggy PDS: returns a record under the wrong collection.
+				{ uri: `at://${DID_A}/wrong.collection/x`, cid: "c2", value: {} },
+				// Buggy URI shape (missing rkey).
+				{ uri: `at://${DID_A}/${collection}/`, cid: "c3", value: {} },
+			],
+		});
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		const sentForCollection = queue.sent.filter((j) => j.collection === collection);
+		expect(sentForCollection.map((j) => j.rkey)).toEqual(["legit"]);
+		expect(result.enqueued).toBe(1);
+	});
+});
+
+describe("backfillDids", () => {
+	it("processes multiple DIDs serially and aggregates the summary", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl: typeof fetch = async (input) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			if (url.searchParams.get("collection") !== collection) {
+				return new Response(JSON.stringify({ records: [] }), { status: 200 });
+			}
+			const did = url.searchParams.get("repo") ?? "";
+			return new Response(
+				JSON.stringify({
+					records: [{ uri: `at://${did}/${collection}/x`, cid: "c", value: {} }],
+				}),
+				{ status: 200, headers: { "content-type": "application/json" } },
+			);
+		};
+
+		const summary = await backfillDids([DID_A, DID_B], {
+			resolver,
+			queue,
+			fetch: fetchImpl,
+		});
+
+		expect(summary.totalEnqueued).toBe(2);
+		expect(summary.results).toHaveLength(2);
+		expect(summary.results.map((r) => r.did)).toEqual([DID_A, DID_B]);
+		expect(summary.results.every((r) => r.errors.length === 0)).toBe(true);
+	});
+
+	it("doesn't let one DID's failure stop the others", async () => {
+		const queue = new CapturingQueue();
+		// Resolver succeeds for DID_A, fails for DID_B (or any other).
+		const cache = new MapDidDocCache();
+		const resolver = new DidResolver({
+			cache,
+			resolver: {
+				resolve: (did) => {
+					if (did === DID_A) {
+						return Promise.resolve({
+							id: did as `did:${string}:${string}`,
+							verificationMethod: [
+								{
+									id: `${did}#atproto`,
+									type: "Multikey",
+									controller: did as `did:${string}:${string}`,
+									publicKeyMultibase: signingKeyMultibase,
+								},
+							],
+							service: [
+								{
+									id: "#atproto_pds",
+									type: "AtprotoPersonalDataServer",
+									serviceEndpoint: PDS,
+								},
+							],
+						});
+					}
+					return Promise.reject(new Error("DID_B unresolvable"));
+				},
+			},
+			ttlMs: 1_000_000,
+			now: () => new Date(),
+		});
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl = makeFetch({
+			[collection]: [{ uri: `at://${DID_A}/${collection}/x`, cid: "c", value: {} }],
+		});
+
+		const summary = await backfillDids([DID_B, DID_A], {
+			resolver,
+			queue,
+			fetch: fetchImpl,
+		});
+
+		expect(summary.totalEnqueued).toBe(1);
+		expect(summary.results).toHaveLength(2);
+		const bResult = summary.results.find((r) => r.did === DID_B);
+		const aResult = summary.results.find((r) => r.did === DID_A);
+		expect(bResult?.errors.length).toBe(1);
+		expect(aResult?.errors).toEqual([]);
+		expect(aResult?.enqueued).toBe(1);
+	});
+
+	it("end-to-end against the production D1 cache: DID is registered in known_publishers", async () => {
+		const queue = new CapturingQueue();
+		const { createD1DidDocCache } = await import("../src/did-resolver.js");
+		const cache = createD1DidDocCache(testEnv.DB);
+		const resolver = new DidResolver({
+			cache,
+			resolver: {
+				resolve: (did) =>
+					Promise.resolve({
+						id: did as `did:${string}:${string}`,
+						verificationMethod: [
+							{
+								id: `${did}#atproto`,
+								type: "Multikey",
+								controller: did as `did:${string}:${string}`,
+								publicKeyMultibase: signingKeyMultibase,
+							},
+						],
+						service: [
+							{
+								id: "#atproto_pds",
+								type: "AtprotoPersonalDataServer",
+								serviceEndpoint: PDS,
+							},
+						],
+					}),
+			},
+		});
+		const fetchImpl = makeFetch({});
+
+		await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		const row = await testEnv.DB.prepare(
+			`SELECT did, pds, signing_key, signing_key_id FROM known_publishers WHERE did = ?`,
+		)
+			.bind(DID_A)
+			.first<{ did: string; pds: string; signing_key: string; signing_key_id: string }>();
+		expect(row).toMatchObject({ did: DID_A, pds: PDS });
+		expect(row?.signing_key).toBe(signingKeyMultibase);
+	});
+});
+
+describe("backfillCollection: defenses against malicious / buggy PDS", () => {
+	it("aborts after MAX_PAGES_PER_COLLECTION when cursor never empties", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		// Hostile PDS: returns a different non-empty cursor every call so the
+		// cursor-equality check doesn't fire — only the page cap stops us.
+		let counter = 0;
+		const fetchImpl: typeof fetch = async (input) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			if (url.searchParams.get("collection") !== collection) {
+				return new Response(JSON.stringify({ records: [] }), { status: 200 });
+			}
+			counter += 1;
+			return new Response(JSON.stringify({ records: [], cursor: `cursor-${counter}` }), {
+				status: 200,
+				headers: { "content-type": "application/json" },
+			});
+		};
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		expect(result.errors.length).toBeGreaterThanOrEqual(1);
+		expect(result.errors.some((e) => e.includes("exceeded"))).toBe(true);
+		// Loop ran at most MAX_PAGES_PER_COLLECTION times.
+		expect(counter).toBeLessThanOrEqual(1001); // +1 for the throw-on-the-next iteration
+	});
+
+	it("aborts when the PDS returns the identical cursor twice", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		// Buggy PDS: echoes the cursor we sent. Cursor-equality should fire
+		// on the second iteration.
+		let calls = 0;
+		const fetchImpl: typeof fetch = async (input) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			if (url.searchParams.get("collection") !== collection) {
+				return new Response(JSON.stringify({ records: [] }), { status: 200 });
+			}
+			calls += 1;
+			return new Response(JSON.stringify({ records: [], cursor: "stuck" }), {
+				status: 200,
+				headers: { "content-type": "application/json" },
+			});
+		};
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		expect(result.errors.some((e) => e.includes("identical cursor"))).toBe(true);
+		expect(calls).toBe(2); // first page, then second page caught the dupe
+	});
+
+	it("treats 404 mid-pagination as a partial failure (not silent zero)", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl: typeof fetch = async (input) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			if (url.searchParams.get("collection") !== collection) {
+				return new Response(JSON.stringify({ records: [] }), { status: 200 });
+			}
+			const cursor = url.searchParams.get("cursor");
+			if (!cursor) {
+				// First page succeeds with a cursor.
+				return new Response(
+					JSON.stringify({
+						records: [{ uri: `at://${DID_A}/${collection}/p1`, cid: "c1", value: {} }],
+						cursor: "p2",
+					}),
+					{ status: 200, headers: { "content-type": "application/json" } },
+				);
+			}
+			// Second page 404.
+			return new Response("not found", { status: 404 });
+		};
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		const error = result.errors.find((e) => e.includes(collection));
+		expect(error).toMatch(/404 mid-pagination/);
+	});
+
+	it("rejects pages with > MAX_RECORDS_PER_PAGE records (PDS oversize attack)", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const records = Array.from({ length: 250 }, (_, i) => ({
+			uri: `at://${DID_A}/${collection}/r${i}`,
+			cid: `c${i}`,
+			value: {},
+		}));
+		const fetchImpl = makeFetch({ [collection]: records });
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+
+		expect(result.errors.some((e) => e.includes("exceeding cap"))).toBe(true);
+		// No records enqueued for the over-capped collection.
+		expect(queue.sent.filter((j) => j.collection === collection)).toHaveLength(0);
+	});
+
+	it("rejects records with malformed rkey (atproto rkey grammar violation)", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl = makeFetch({
+			[collection]: [
+				{ uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} },
+				{ uri: `at://${DID_A}/${collection}/has?queryparam`, cid: "c2", value: {} },
+				{ uri: `at://${DID_A}/${collection}/has#fragment`, cid: "c3", value: {} },
+				{ uri: `at://${DID_A}/${collection}/has space`, cid: "c4", value: {} },
+			],
+		});
+
+		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+		const sent = queue.sent.filter((j) => j.collection === collection);
+		expect(sent.map((j) => j.rkey)).toEqual(["legit"]);
+		expect(result.enqueued).toBe(1);
+	});
+});
+
+describe("backfill admin route: auth + input validation", () => {
+	it("returns 401 when Authorization header is missing", async () => {
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: { "content-type": "application/json" },
+			body: JSON.stringify({ dids: [DID_A] }),
+		});
+		expect(res.status).toBe(401);
+		expect(res.headers.get("www-authenticate")).toBe("Bearer");
+	});
+
+	it("returns 401 with wrong token", async () => {
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: {
+				"content-type": "application/json",
+				authorization: "Bearer wrong-token",
+			},
+			body: JSON.stringify({ dids: [DID_A] }),
+		});
+		expect(res.status).toBe(401);
+	});
+
+	it("returns 405 on GET", async () => {
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "GET",
+			headers: { authorization: "Bearer test-admin-token" },
+		});
+		expect(res.status).toBe(405);
+	});
+
+	it("returns 400 on missing dids field", async () => {
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: {
+				"content-type": "application/json",
+				authorization: "Bearer test-admin-token",
+			},
+			body: JSON.stringify({}),
+		});
+		expect(res.status).toBe(400);
+		expect(await res.text()).toContain("must be an array");
+	});
+
+	it("returns 400 on empty dids array", async () => {
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: {
+				"content-type": "application/json",
+				authorization: "Bearer test-admin-token",
+			},
+			body: JSON.stringify({ dids: [] }),
+		});
+		expect(res.status).toBe(400);
+		expect(await res.text()).toContain("not be empty");
+	});
+
+	it("returns 400 on dids list larger than the cap", async () => {
+		const dids = Array.from(
+			{ length: 1001 },
+			(_, i) => `did:plc:test${i.toString().padStart(20, "0")}`,
+		);
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: {
+				"content-type": "application/json",
+				authorization: "Bearer test-admin-token",
+			},
+			body: JSON.stringify({ dids }),
+		});
+		expect(res.status).toBe(400);
+		expect(await res.text()).toContain("at most 1000");
+	});
+
+	it("returns 400 on malformed DID (caught by DID_PATTERN)", async () => {
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: {
+				"content-type": "application/json",
+				authorization: "Bearer test-admin-token",
+			},
+			body: JSON.stringify({ dids: ["did:plc:has space"] }),
+		});
+		expect(res.status).toBe(400);
+		expect(await res.text()).toContain("invalid DID");
+	});
+
+	it("returns 202 with a valid token + body (fires backfill in waitUntil)", async () => {
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: {
+				"content-type": "application/json",
+				authorization: "Bearer test-admin-token",
+			},
+			body: JSON.stringify({ dids: [DID_A] }),
+		});
+		expect(res.status).toBe(202);
+	});
+
+	it("dedupes duplicate DIDs in input", async () => {
+		// We can't assert the dedup directly through SELF without race-y waits,
+		// but the route accepts the body and returns 202 — the dedup is exercised
+		// in parseBackfillBody, which is unit-tested via the 'invalid DID' path
+		// (a duplicate doesn't surface as an error). Smoke test only.
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: {
+				"content-type": "application/json",
+				authorization: "Bearer test-admin-token",
+			},
+			body: JSON.stringify({ dids: [DID_A, DID_A, DID_A] }),
+		});
+		expect(res.status).toBe(202);
+	});
+});
+
+describe("admin start route: auth", () => {
+	it("returns 401 without token", async () => {
+		const res = await SELF.fetch("https://test/_admin/start");
+		expect(res.status).toBe(401);
+	});
+
+	it("returns 204 with valid token", async () => {
+		const res = await SELF.fetch("https://test/_admin/start", {
+			headers: { authorization: "Bearer test-admin-token" },
+		});
+		expect(res.status).toBe(204);
+	});
+});
diff --git a/apps/aggregator/vitest.config.ts b/apps/aggregator/vitest.config.ts
index 0958ca0b5..c44bb0200 100644
--- a/apps/aggregator/vitest.config.ts
+++ b/apps/aggregator/vitest.config.ts
@@ -33,7 +33,15 @@ export default defineConfig({
 		cloudflareTest({
 			wrangler: { configPath: "./wrangler.jsonc" },
 			miniflare: {
-				bindings: { TEST_MIGRATIONS: migrations },
+				bindings: {
+					TEST_MIGRATIONS: migrations,
+					// Stub admin auth token so tests can exercise the auth-gated
+					// admin routes without needing a real secret in the test
+					// environment. Production deploys pull from
+					// `wrangler secret put ADMIN_TOKEN`; the value below only
+					// applies inside the workers test pool.
+					ADMIN_TOKEN: "test-admin-token",
+				},
 			},
 		}),
 	],
diff --git a/apps/aggregator/worker-configuration.d.ts b/apps/aggregator/worker-configuration.d.ts
index 732bb5f4e..4af216a0d 100644
--- a/apps/aggregator/worker-configuration.d.ts
+++ b/apps/aggregator/worker-configuration.d.ts
@@ -1,5 +1,5 @@
 /* eslint-disable */
-// Generated by Wrangler by running `wrangler types` (hash: e774431debb87e9b7f5a2fa40582abe1)
+// Generated by Wrangler by running `wrangler types` (hash: 1f938082c03573892ce397f54a9ca0b6)
 // Runtime types generated with workerd@1.20260507.1 2026-02-24 nodejs_compat
 declare namespace Cloudflare {
 	interface GlobalProps {
@@ -11,6 +11,7 @@ declare namespace Cloudflare {
 		RECORDS_QUEUE: Queue;
 		JETSTREAM_URL: "wss://jetstream2.us-east.bsky.network/subscribe";
 		CONSTELLATION_URL: "https://constellation.microcosm.blue";
+		ADMIN_TOKEN: string;
 		RECORDS_DO: DurableObjectNamespace<import("./src/index").RecordsJetstreamDO>;
 	}
 }
@@ -20,7 +21,7 @@ type StringifyValues<EnvType extends Record<string, unknown>> = {
 };
 declare namespace NodeJS {
 	interface ProcessEnv extends StringifyValues<
-		Pick<Cloudflare.Env, "JETSTREAM_URL" | "CONSTELLATION_URL">
+		Pick<Cloudflare.Env, "JETSTREAM_URL" | "CONSTELLATION_URL" | "ADMIN_TOKEN">
 	> {}
 }
 
diff --git a/apps/aggregator/wrangler.jsonc b/apps/aggregator/wrangler.jsonc
index 416856c22..42b24ae0a 100644
--- a/apps/aggregator/wrangler.jsonc
+++ b/apps/aggregator/wrangler.jsonc
@@ -80,6 +80,13 @@
 		// Constellation HTTP API for cold-start backfill DID discovery.
 		"CONSTELLATION_URL": "https://constellation.microcosm.blue",
 	},
+	// Required secrets, declared in config so `wrangler types` generates the
+	// typed binding and `wrangler deploy` refuses to ship without them set.
+	// Set in production with `wrangler secret put ADMIN_TOKEN`; tests bind
+	// a stub value via miniflare in vitest.config.ts.
+	"secrets": {
+		"required": ["ADMIN_TOKEN"],
+	},
 	"observability": {
 		"enabled": true,
 	},

From 94e8146f13335a2378a20c9373e3c37f09def87b Mon Sep 17 00:00:00 2001
From: Matt Kane <m@mk.gg>
Date: Sun, 10 May 2026 16:57:22 +0100
Subject: [PATCH 2/8] fix(aggregator): round 2 review fixes for backfill +
 admin auth
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round 2 found a HIGH (sendBatch cap mismatch), 2 MEDIUM (Bearer
case-sensitivity, unbounded enqueue cost), and several LOWs. All
addressed.

HIGH

- MAX_RECORDS_PER_PAGE was 200 (PAGE_SIZE * 2) but Cloudflare Queues'
  sendBatch hard-caps at 100. A real PDS that loosely interpreted the
  ?limit= query and returned 101+ records would have made our
  sendBatch throw, dropping the entire collection's records via the
  per-collection try/catch. The in-memory CapturingQueue test stub
  accepted any size, so the test suite didn't catch the production
  divergence. Lowered the cap to 100 so a compliant page maps 1:1 to
  one batch send; oversize pages throw with a clear "exceeding
  per-page cap" message rather than the queue swallowing them. Added
  a defense-in-depth check at the sendBatch site too.

MEDIUM

- Bearer scheme was matched case-sensitively. RFC 6750 §2.1 makes
  the auth scheme case-insensitive; `curl -H "authorization: bearer
  ..."` with lowercase scheme would have got a 401 with a valid
  token. Now lowercase-compare the prefix.

- Unbounded enqueue cost from a single authenticated POST. Worst
  case theoretical 400M queue messages per request (1000 DIDs × 4
  collections × 1000 pages × 100 records = ~$160 in queue billing).
  Added MAX_TOTAL_ENQUEUE = 50_000 as a per-invocation cap, shared
  across DIDs via a budget object. Reaching the cap throws
  EnqueueLimitReached; the outer loop catches and stops processing
  further DIDs, logs a warning so operators see partial completion.
  Bounds blast radius if ADMIN_TOKEN ever leaks.

LOW

- `/_admin/start` accepted any HTTP method, contradicting its own
  documented POST usage. Now requires POST (returns 405 with
  `allow: POST` on other methods), matching `/_admin/backfill`.

- Per-collection enqueued counter undercounted on mid-pagination
  throws. Switched from `enqueued += await backfillCollection(...)`
  (which loses the partial count when the await rejects) to
  tracking via the shared budget object's `before - after` delta,
  computed in both the success and the error paths.

- wrangler.jsonc `secrets.required` comment overstated the
  enforcement guarantees. Updated to be accurate about the
  `wrangler dev` warning + `wrangler deploy` block + the
  per-named-environment inheritance gotcha. Documents the runtime
  503 fail-closed behavior so misconfigured deploys can't leave
  admin routes unauth-passable.

- Added .dev.vars.example for new contributors. Local devs who
  don't create .dev.vars otherwise hit a confusing 503 on admin
  routes; example file makes the requirement obvious.

NEW TESTS

- Enqueue budget cap: budget of 30 stops the loop after 30 records
  enqueued.
- POST-only on /_admin/start: GET returns 405; POST + valid token
  returns 204.
- Case-insensitive Bearer: lowercase scheme accepted.

131 tests pass (was 128). Typecheck clean. Zero lint diagnostics in
apps/aggregator.

NOT FIXED

- worker-configuration.d.ts is committed (round-2 LOW). Repo-wide
  convention; left for a separate Discussion rather than changed
  here.
---
 apps/aggregator/.dev.vars.example     |   8 ++
 apps/aggregator/src/backfill.ts       | 113 ++++++++++++++++++++++----
 apps/aggregator/src/index.ts          |  19 ++++-
 apps/aggregator/test/backfill.test.ts |  53 +++++++++++-
 apps/aggregator/wrangler.jsonc        |  16 +++-
 5 files changed, 183 insertions(+), 26 deletions(-)
 create mode 100644 apps/aggregator/.dev.vars.example

diff --git a/apps/aggregator/.dev.vars.example b/apps/aggregator/.dev.vars.example
new file mode 100644
index 000000000..f093bab7a
--- /dev/null
+++ b/apps/aggregator/.dev.vars.example
@@ -0,0 +1,8 @@
+# Local-dev secrets for the aggregator Worker. Copy to `.dev.vars` (which is
+# gitignored) and customise. `wrangler dev` reads this file and binds the
+# values into `env`. Production secrets are managed via
+# `wrangler secret put <NAME>` and never appear in the repo.
+
+# Bearer token for the /_admin/* routes. Any non-empty string works locally;
+# pick something you'd recognise in `wrangler tail` logs.
+ADMIN_TOKEN=dev-only-not-for-production
diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts
index c91f300df..973eb6fce 100644
--- a/apps/aggregator/src/backfill.ts
+++ b/apps/aggregator/src/backfill.ts
@@ -27,11 +27,36 @@ const PAGE_SIZE = 100;
  * which is past anything we'd legitimately backfill in one shot. */
 const MAX_PAGES_PER_COLLECTION = 1000;
 /** Defensive cap on records per page. Real PDSes honour the `limit` query
- * param; this guards against a hostile PDS returning an enormous array. */
-const MAX_RECORDS_PER_PAGE = PAGE_SIZE * 2;
+ * param; this guards against a hostile PDS returning an enormous array.
+ * Capped at the same width as Cloudflare Queues' sendBatch (100) so a
+ * compliant page maps 1:1 to one batch send; oversize pages are rejected
+ * rather than chunked, surfacing the PDS's spec violation as a partial
+ * failure the operator can investigate. */
+const MAX_RECORDS_PER_PAGE = PAGE_SIZE;
+/** Cloudflare Queues' hard cap on `sendBatch` size. Per-page enqueues are
+ * always ≤ this thanks to MAX_RECORDS_PER_PAGE; documented here so the
+ * relationship is visible at the call site. */
+const QUEUE_SEND_BATCH_CAP = 100;
+/** Cap on the total number of records a single backfill invocation may
+ * enqueue. Defends against a leaked admin token being weaponised to flood
+ * the queue at near-zero cost to the attacker. The blast radius of one
+ * compromised POST is bounded by this number × per-message billing.
+ *
+ * 50k = ~50 plausible publishers × ~250 records each, well past v1 scale.
+ * Worker waitUntil exhausts long before this in any honest workload. */
+const MAX_TOTAL_ENQUEUE = 50_000;
 /** Atproto rkey grammar: ALPHA / DIGIT / "." / "-" / "_" / ":" / "~". */
 const RKEY_PATTERN = /^[A-Za-z0-9._:~-]{1,512}$/;
 
+/** Thrown when the per-invocation enqueue cap is reached. Caller catches
+ * and stops processing further DIDs / collections; partial work already
+ * committed to the queue is what it is (consumer is idempotent on retry). */
+export class EnqueueLimitReached extends Error {
+	constructor(public readonly limit: number) {
+		super(`backfill enqueue cap of ${limit} reached`);
+	}
+}
+
 /** Producer-side queue surface. The production binding `env.RECORDS_QUEUE`
  * satisfies this; tests pass an in-memory implementation. The return type
  * is `unknown` rather than `void` so workerd's `Queue.send` (which returns
@@ -68,29 +93,55 @@ export interface BackfillSummary {
 /**
  * Backfill multiple DIDs serially. Per-DID failures don't stop the loop —
  * one bad publisher doesn't block the rest.
+ *
+ * Aborts the loop early if the per-invocation enqueue cap is reached. The
+ * partial summary still reflects what got through; the operator can re-run
+ * with the unfinished tail of the DID list.
  */
 export async function backfillDids(
 	dids: readonly string[],
 	deps: BackfillDeps,
 ): Promise<BackfillSummary> {
 	const results: BackfillDidResult[] = [];
-	let totalEnqueued = 0;
+	const budget: EnqueueBudget = { remaining: MAX_TOTAL_ENQUEUE };
 	for (const did of dids) {
-		const result = await backfillDid(did, deps);
+		const result = await backfillDid(did, deps, budget);
 		results.push(result);
-		totalEnqueued += result.enqueued;
 		deps.onDidComplete?.(result);
+		if (budget.remaining <= 0) {
+			console.warn("[aggregator] backfill enqueue cap reached, aborting remaining DIDs", {
+				cap: MAX_TOTAL_ENQUEUE,
+				didsProcessed: results.length,
+				didsRemaining: dids.length - results.length,
+			});
+			break;
+		}
 	}
+	const totalEnqueued = results.reduce((sum, r) => sum + r.enqueued, 0);
 	return { totalEnqueued, results };
 }
 
+/** Mutable counter shared across the per-DID / per-collection loops so the
+ * cap is global to one backfill invocation. */
+interface EnqueueBudget {
+	remaining: number;
+}
+
 /**
  * Backfill a single DID across every collection in `WANTED_COLLECTIONS`.
  * Resolution failure aborts (we can't enqueue verifiable jobs without
  * knowing the PDS); per-collection listRecords failures are recorded and
  * the loop continues to the next collection.
+ *
+ * Optional `budget` shares the per-invocation enqueue cap across DIDs.
+ * Tests that call this directly without a budget get an effectively
+ * unbounded one.
  */
-export async function backfillDid(did: string, deps: BackfillDeps): Promise<BackfillDidResult> {
+export async function backfillDid(
+	did: string,
+	deps: BackfillDeps,
+	budget: EnqueueBudget = { remaining: MAX_TOTAL_ENQUEUE },
+): Promise<BackfillDidResult> {
 	const errors: string[] = [];
 	let enqueued = 0;
 
@@ -108,17 +159,29 @@ export async function backfillDid(did: string, deps: BackfillDeps): Promise<Back
 
 	const fetchImpl = deps.fetch ?? fetch;
 	for (const collection of WANTED_COLLECTIONS) {
+		// Track this collection's enqueues separately so a mid-pagination
+		// throw still reports the partial count instead of swallowing the
+		// records that already landed in the queue.
+		const before = budget.remaining;
 		try {
-			enqueued += await backfillCollection({
+			await backfillCollection({
 				did,
 				pds,
 				collection,
 				queue: deps.queue,
 				fetchImpl,
+				budget,
 			});
 		} catch (err) {
+			if (err instanceof EnqueueLimitReached) {
+				// Stop processing further collections for this DID; outer
+				// loop will catch the budget exhaustion and stop entirely.
+				enqueued += before - budget.remaining;
+				return { did, enqueued, errors };
+			}
 			errors.push(`${collection}: ${err instanceof Error ? err.message : String(err)}`);
 		}
+		enqueued += before - budget.remaining;
 	}
 
 	return { did, enqueued, errors };
@@ -130,12 +193,15 @@ interface BackfillCollectionOpts {
 	collection: string;
 	queue: BackfillQueue;
 	fetchImpl: typeof fetch;
+	budget: EnqueueBudget;
 }
 
 /**
  * Walk one DID's records for a single collection, paginating through
  * `listRecords` and enqueuing each result via `sendBatch` (one batch per
- * page). Returns the number of records enqueued.
+ * page). Decrements `opts.budget.remaining` per record enqueued; throws
+ * `EnqueueLimitReached` when the budget hits zero so the caller can stop
+ * processing further collections / DIDs.
  *
  * 404 from the PDS on the FIRST page means the repo doesn't host this
  * collection — silently treated as zero records, not an error. A 404
@@ -144,13 +210,18 @@ interface BackfillCollectionOpts {
  *
  * Pagination is capped at MAX_PAGES_PER_COLLECTION + cursor-equality check
  * to defend against a PDS that echoes the same cursor forever.
+ *
+ * `MAX_RECORDS_PER_PAGE` matches Cloudflare Queues' `sendBatch` cap (100),
+ * so a compliant page maps 1:1 to one batch send. A PDS that ignores the
+ * `?limit=` query and returns more records than that throws — we'd rather
+ * surface the spec violation than silently chunk and hide the upstream bug.
  */
-async function backfillCollection(opts: BackfillCollectionOpts): Promise<number> {
-	let enqueued = 0;
+async function backfillCollection(opts: BackfillCollectionOpts): Promise<void> {
 	let cursor: string | undefined;
 	let prevCursor: string | undefined;
 	let pages = 0;
 	do {
+		if (opts.budget.remaining <= 0) throw new EnqueueLimitReached(MAX_TOTAL_ENQUEUE);
 		if (++pages > MAX_PAGES_PER_COLLECTION) {
 			throw new Error(`exceeded ${MAX_PAGES_PER_COLLECTION} pages`);
 		}
@@ -171,7 +242,7 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise<number>
 		if (response.status === 404) {
 			if (cursor === undefined) {
 				// First-page 404: publisher has no records of this collection.
-				return enqueued;
+				return;
 			}
 			// Mid-pagination 404 is a partial failure; surface it.
 			throw new Error(`listRecords returned 404 mid-pagination at cursor=${cursor}`);
@@ -184,13 +255,14 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise<number>
 		const records = extractListRecordsBody(body);
 		if (records.length > MAX_RECORDS_PER_PAGE) {
 			throw new Error(
-				`PDS returned ${records.length} records, exceeding cap of ${MAX_RECORDS_PER_PAGE}`,
+				`PDS returned ${records.length} records, exceeding per-page cap of ${MAX_RECORDS_PER_PAGE}`,
 			);
 		}
 		cursor = extractCursor(body);
 
 		const messages: { body: RecordsJob }[] = [];
 		for (const record of records) {
+			if (opts.budget.remaining - messages.length <= 0) break;
 			const rkey = parseRkeyFromUri(record.uri, opts.collection);
 			if (!rkey) continue;
 			messages.push({
@@ -204,14 +276,21 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise<number>
 			});
 		}
 		if (messages.length > 0) {
-			// Single batched send per page — far cheaper than the per-record
-			// awaits this used to do (which couldn't possibly finish a
-			// 4000-record DID inside the waitUntil budget).
+			// Page size is capped at QUEUE_SEND_BATCH_CAP, so this single
+			// sendBatch never exceeds Cloudflare's 100-message limit.
+			if (messages.length > QUEUE_SEND_BATCH_CAP) {
+				// Defense in depth — should be unreachable given the cap
+				// above, but throwing loudly here beats a runtime error from
+				// the queue binding silently dropping the batch.
+				throw new Error(
+					`sendBatch payload of ${messages.length} exceeds Queue cap of ${QUEUE_SEND_BATCH_CAP}`,
+				);
+			}
 			await opts.queue.sendBatch(messages);
-			enqueued += messages.length;
+			opts.budget.remaining -= messages.length;
 		}
+		if (opts.budget.remaining <= 0) throw new EnqueueLimitReached(MAX_TOTAL_ENQUEUE);
 	} while (cursor);
-	return enqueued;
 }
 
 interface ListRecordEntry {
diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts
index 470f44f71..f06aa7d05 100644
--- a/apps/aggregator/src/index.ts
+++ b/apps/aggregator/src/index.ts
@@ -82,13 +82,22 @@ function requireAdminAuth(request: Request, env: Env): Response | null {
 		return new Response("admin endpoints not configured", { status: 503 });
 	}
 	const auth = request.headers.get("authorization");
-	if (!auth || !auth.startsWith("Bearer ")) {
+	const SCHEME_PREFIX = "bearer ";
+	// RFC 6750 §2.1: the auth scheme is case-insensitive. `curl -H
+	// "authorization: bearer ..."` and SDKs that don't canonicalise the
+	// scheme would otherwise fail with a confusing 401 even though the
+	// token is correct.
+	if (
+		!auth ||
+		auth.length < SCHEME_PREFIX.length ||
+		auth.slice(0, SCHEME_PREFIX.length).toLowerCase() !== SCHEME_PREFIX
+	) {
 		return new Response("unauthorized", {
 			status: 401,
 			headers: { "www-authenticate": "Bearer" },
 		});
 	}
-	const token = auth.slice("Bearer ".length);
+	const token = auth.slice(SCHEME_PREFIX.length);
 	if (!timingSafeEqual(token, expected)) {
 		return new Response("unauthorized", {
 			status: 401,
@@ -130,6 +139,12 @@ export default {
 	async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise<Response> {
 		const url = new URL(request.url);
 		if (url.pathname === BOOTSTRAP_PATH) {
+			if (request.method !== "POST") {
+				return new Response("method not allowed", {
+					status: 405,
+					headers: { allow: "POST" },
+				});
+			}
 			const denied = requireAdminAuth(request, env);
 			if (denied) return denied;
 			const id = env.RECORDS_DO.idFromName(RECORDS_DO_NAME);
diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts
index fae806e85..b65d6a2e7 100644
--- a/apps/aggregator/test/backfill.test.ts
+++ b/apps/aggregator/test/backfill.test.ts
@@ -568,7 +568,7 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => {
 
 		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
 
-		expect(result.errors.some((e) => e.includes("exceeding cap"))).toBe(true);
+		expect(result.errors.some((e) => e.includes("per-page cap"))).toBe(true);
 		// No records enqueued for the over-capped collection.
 		expect(queue.sent.filter((j) => j.collection === collection)).toHaveLength(0);
 	});
@@ -594,6 +594,36 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => {
 	});
 });
 
+describe("backfillCollection: enqueue budget cap", () => {
+	it("stops enqueuing when MAX_TOTAL_ENQUEUE is reached and reports the partial count", async () => {
+		// Force the cap by giving a tiny budget. 50 records × 4 collections =
+		// 200 records the worker would otherwise enqueue; budget of 30 should
+		// cap it after the first collection's first page.
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const recordsByCollection: Record<string, MockListRecord[]> = {};
+		for (const c of WANTED_COLLECTIONS) {
+			recordsByCollection[c] = Array.from({ length: 50 }, (_, i) => ({
+				uri: `at://${DID_A}/${c}/r${i}`,
+				cid: `c${i}`,
+				value: {},
+			}));
+		}
+		const fetchImpl = makeFetch(recordsByCollection);
+		const result = await backfillDid(
+			DID_A,
+			{ resolver, queue, fetch: fetchImpl },
+			{ remaining: 30 },
+		);
+		// The first collection's full page (50 records) was within the cap?
+		// No — after 30 records the budget hits 0, the inner break fires,
+		// the page's batch send carries 30 records, then the post-send
+		// throw aborts further pages.
+		expect(result.enqueued).toBeLessThanOrEqual(30);
+		expect(queue.sent.length).toBeLessThanOrEqual(30);
+	});
+});
+
 describe("backfill admin route: auth + input validation", () => {
 	it("returns 401 when Authorization header is missing", async () => {
 		const res = await SELF.fetch("https://test/_admin/backfill", {
@@ -710,16 +740,31 @@ describe("backfill admin route: auth + input validation", () => {
 	});
 });
 
-describe("admin start route: auth", () => {
-	it("returns 401 without token", async () => {
+describe("admin start route: auth + method", () => {
+	it("returns 405 on GET (POST-only route)", async () => {
 		const res = await SELF.fetch("https://test/_admin/start");
+		expect(res.status).toBe(405);
+		expect(res.headers.get("allow")).toBe("POST");
+	});
+
+	it("returns 401 on POST without token", async () => {
+		const res = await SELF.fetch("https://test/_admin/start", { method: "POST" });
 		expect(res.status).toBe(401);
 	});
 
-	it("returns 204 with valid token", async () => {
+	it("returns 204 on POST with valid token", async () => {
 		const res = await SELF.fetch("https://test/_admin/start", {
+			method: "POST",
 			headers: { authorization: "Bearer test-admin-token" },
 		});
 		expect(res.status).toBe(204);
 	});
+
+	it("accepts case-insensitive Bearer scheme (RFC 6750 §2.1)", async () => {
+		const res = await SELF.fetch("https://test/_admin/start", {
+			method: "POST",
+			headers: { authorization: "bearer test-admin-token" },
+		});
+		expect(res.status).toBe(204);
+	});
 });
diff --git a/apps/aggregator/wrangler.jsonc b/apps/aggregator/wrangler.jsonc
index 42b24ae0a..6e0fc8287 100644
--- a/apps/aggregator/wrangler.jsonc
+++ b/apps/aggregator/wrangler.jsonc
@@ -81,9 +81,19 @@
 		"CONSTELLATION_URL": "https://constellation.microcosm.blue",
 	},
 	// Required secrets, declared in config so `wrangler types` generates the
-	// typed binding and `wrangler deploy` refuses to ship without them set.
-	// Set in production with `wrangler secret put ADMIN_TOKEN`; tests bind
-	// a stub value via miniflare in vitest.config.ts.
+	// typed binding and `wrangler deploy` validates the secret is set on the
+	// target Worker before publishing. `wrangler dev` warns at startup when a
+	// required secret isn't in `.dev.vars` / `.env`.
+	//
+	// Set in production with `wrangler secret put ADMIN_TOKEN`. Tests bind
+	// a stub via miniflare in vitest.config.ts. Local dev pulls from
+	// `.dev.vars` (gitignored; see `.dev.vars.example`).
+	//
+	// The `requireAdminAuth` runtime guard fails closed (503) if the binding
+	// is somehow empty at request time, so even a misconfigured deploy can't
+	// leave the admin routes unauth-passable. NOTE: this `secrets` block is
+	// not inherited by named environments — repeat it under each `env.<name>`
+	// you add or that env's deploys won't enforce the secret.
 	"secrets": {
 		"required": ["ADMIN_TOKEN"],
 	},

From 5895450e582fac48839312a535fe15e28ae6c4c8 Mon Sep 17 00:00:00 2001
From: Matt Kane <m@mk.gg>
Date: Sun, 10 May 2026 17:18:25 +0100
Subject: [PATCH 3/8] fix(aggregator): round 3 review fixes for backfill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round 3 found 1 HIGH (no listRecords timeout), 2 MEDIUM
(EnqueueLimitReached observability + slack budget-cap test), 2 LOW
(whitespace-only ADMIN_TOKEN, log discrepancy on early-abort), 1 NIT
(unreachable defense-in-depth check that drops records when triggered).
All addressed.

HIGH

- listRecords fetch had no timeout. A hung publisher PDS that accepted
  the connection but stalled the body would have blocked
  `await fetchImpl(...)` until workerd's overall sub-request budget
  exhausted, starving every DID after it in the serial loop. pds-verify
  uses an AbortController for the same kind of upstream call;
  inconsistent. Now wraps the listRecords fetch in an
  AbortController + 15s default timeout (override via
  `BackfillDeps.listRecordsTimeoutMs` for tests). On AbortError, throws
  with cause attached so per-collection error log includes the
  underlying signal.

MEDIUM

- EnqueueLimitReached returned without recording an error in
  `BackfillDidResult.errors`. Operator's per-DID log showed
  `enqueued: 47, errors: []` for a half-processed DID — looked like
  a successful 47-record publisher. Now appends a sentinel error
  ("enqueue cap reached mid-DID at collection X — partial backfill,
  re-run to complete") so partial completion is visible. Also added
  triggerDid to the warn log in `backfillDids` so operators can see
  WHICH DID drained the budget.

- Budget-cap test asserted `<= 30`, which would silently accept
  off-by-one regressions in the inner break (capping at 29 or 0
  would still pass). The exact answer is 30 (budget=30 + page of
  50 → inner break at iter 31, sendBatch sends 30, post-send throw
  fires). Now asserts `toBe(30)`. Also asserts the sentinel error
  appears.

LOW

- Whitespace-only ADMIN_TOKEN was accepted. `if (!expected)` catches
  "" and undefined but accepts "   " (operator pastes a blank line
  into `wrangler secret put`). Now `expected.trim().length === 0`
  also fails closed.

- `runBackfill` summary log conflated didCount (requested) with
  didsProcessed (actual). When the cap fires and the loop breaks
  early, the log said `didCount: 1000, totalEnqueued: 50000` even
  if only 200 DIDs ran. Now logs both `didsRequested` and
  `didsProcessed` separately so the early-abort case is visible
  in the summary line without correlating against an earlier warn.

NIT

- Defense-in-depth `if (messages.length > QUEUE_SEND_BATCH_CAP)`
  inside the page loop was unreachable given
  `MAX_RECORDS_PER_PAGE === QUEUE_SEND_BATCH_CAP === 100`, AND if
  it ever fired (future bump of MAX_RECORDS_PER_PAGE), it would
  drop the entire page silently into errors[]. Replaced with a
  module-load static guard that throws at boot if the invariant
  is violated — better failure mode (boot crash vs silent data
  loss) and the runtime check disappears.

NEW TESTS

- listRecords timeout: hung-fetch stub never resolves, abort fires
  at 25ms, error message matches "timed out after 25ms".
- Case-insensitive Bearer regression guards: canonical "Bearer",
  uppercase "BEARER", non-Bearer "Basic" rejected, empty token
  after prefix rejected.

136 tests pass (was 131). Typecheck clean. Zero lint diagnostics in
apps/aggregator.

CONVERGENCE NOTE

Round 1 → 12 bugs. Round 2 → 1 HIGH + 2 MED + 5 LOW = 8 bugs.
Round 3 → 1 HIGH + 2 MED + 2 LOW + 1 NIT = 6 findings, only the
HIGH was a behavior bug; rest were quality / observability /
test-strength. Reviewer assessed convergence is real but not
complete; only the HIGH would have blocked. The slice has now had
three rounds with diminishing severity and no new bugs introduced
by prior fix rounds.
---
 apps/aggregator/src/backfill.ts       | 60 ++++++++++++++----
 apps/aggregator/src/index.ts          | 12 +++-
 apps/aggregator/test/backfill.test.ts | 89 +++++++++++++++++++++++++--
 3 files changed, 140 insertions(+), 21 deletions(-)

diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts
index 973eb6fce..a61475b47 100644
--- a/apps/aggregator/src/backfill.ts
+++ b/apps/aggregator/src/backfill.ts
@@ -21,6 +21,12 @@ import type { RecordsJob } from "./env.js";
 import { isPlainObject } from "./utils.js";
 
 const PAGE_SIZE = 100;
+/** Per-listRecords-page timeout. A hostile or hung publisher PDS that
+ * accepts the connection but stalls the body would otherwise block the
+ * fetch until workerd's overall sub-request budget exhausts — starving
+ * every DID after this one in the serial loop. Same shape as
+ * `pds-verify.ts`'s fetchCar timeout. */
+const LIST_RECORDS_TIMEOUT_MS = 15_000;
 /** Cap on listRecords pagination per collection. A buggy or malicious PDS
  * that echoes the same cursor would otherwise loop forever inside one
  * `ctx.waitUntil`. 1000 pages × 100 records = 100k records per collection,
@@ -37,6 +43,14 @@ const MAX_RECORDS_PER_PAGE = PAGE_SIZE;
  * always ≤ this thanks to MAX_RECORDS_PER_PAGE; documented here so the
  * relationship is visible at the call site. */
 const QUEUE_SEND_BATCH_CAP = 100;
+// Static guard: bumping MAX_RECORDS_PER_PAGE above the queue's batch cap
+// would silently break sendBatch in production. Surface the violation at
+// module load rather than at the first batch send.
+if (MAX_RECORDS_PER_PAGE > QUEUE_SEND_BATCH_CAP) {
+	throw new Error(
+		`MAX_RECORDS_PER_PAGE (${MAX_RECORDS_PER_PAGE}) exceeds QUEUE_SEND_BATCH_CAP (${QUEUE_SEND_BATCH_CAP})`,
+	);
+}
 /** Cap on the total number of records a single backfill invocation may
  * enqueue. Defends against a leaked admin token being weaponised to flood
  * the queue at near-zero cost to the attacker. The blast radius of one
@@ -70,6 +84,10 @@ export interface BackfillDeps {
 	queue: BackfillQueue;
 	/** Inject for tests; defaults to `globalThis.fetch`. */
 	fetch?: typeof fetch;
+	/** Override for the per-listRecords-page timeout. Defaults to
+	 * `LIST_RECORDS_TIMEOUT_MS`. Tests use a small value to exercise the
+	 * abort path without burning the production budget. */
+	listRecordsTimeoutMs?: number;
 	/** Optional callback fired after each DID completes (success or failure).
 	 * Used by the production wiring to log per-DID progress so an operator
 	 * watching `wrangler tail` sees where a long backfill is up to. */
@@ -111,6 +129,7 @@ export async function backfillDids(
 		if (budget.remaining <= 0) {
 			console.warn("[aggregator] backfill enqueue cap reached, aborting remaining DIDs", {
 				cap: MAX_TOTAL_ENQUEUE,
+				triggerDid: result.did,
 				didsProcessed: results.length,
 				didsRemaining: dids.length - results.length,
 			});
@@ -158,6 +177,7 @@ export async function backfillDid(
 	}
 
 	const fetchImpl = deps.fetch ?? fetch;
+	const timeoutMs = deps.listRecordsTimeoutMs ?? LIST_RECORDS_TIMEOUT_MS;
 	for (const collection of WANTED_COLLECTIONS) {
 		// Track this collection's enqueues separately so a mid-pagination
 		// throw still reports the partial count instead of swallowing the
@@ -170,13 +190,20 @@ export async function backfillDid(
 				collection,
 				queue: deps.queue,
 				fetchImpl,
+				timeoutMs,
 				budget,
 			});
 		} catch (err) {
 			if (err instanceof EnqueueLimitReached) {
 				// Stop processing further collections for this DID; outer
 				// loop will catch the budget exhaustion and stop entirely.
+				// Append a sentinel error so the operator's per-DID log
+				// shows the cap fired (otherwise `enqueued: 47, errors: []`
+				// looks like a successful 47-record DID).
 				enqueued += before - budget.remaining;
+				errors.push(
+					`enqueue cap reached mid-DID at collection ${collection} — partial backfill, re-run to complete`,
+				);
 				return { did, enqueued, errors };
 			}
 			errors.push(`${collection}: ${err instanceof Error ? err.message : String(err)}`);
@@ -193,6 +220,7 @@ interface BackfillCollectionOpts {
 	collection: string;
 	queue: BackfillQueue;
 	fetchImpl: typeof fetch;
+	timeoutMs: number;
 	budget: EnqueueBudget;
 }
 
@@ -236,9 +264,22 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise<void> {
 		url.searchParams.set("limit", String(PAGE_SIZE));
 		if (cursor) url.searchParams.set("cursor", cursor);
 
-		const response = await opts.fetchImpl(url.toString(), {
-			headers: { accept: "application/json" },
-		});
+		const controller = new AbortController();
+		const timer = setTimeout(() => controller.abort(), opts.timeoutMs);
+		let response: Response;
+		try {
+			response = await opts.fetchImpl(url.toString(), {
+				headers: { accept: "application/json" },
+				signal: controller.signal,
+			});
+		} catch (err) {
+			if (err instanceof Error && err.name === "AbortError") {
+				throw new Error(`listRecords timed out after ${opts.timeoutMs}ms`, { cause: err });
+			}
+			throw err;
+		} finally {
+			clearTimeout(timer);
+		}
 		if (response.status === 404) {
 			if (cursor === undefined) {
 				// First-page 404: publisher has no records of this collection.
@@ -276,16 +317,9 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise<void> {
 			});
 		}
 		if (messages.length > 0) {
-			// Page size is capped at QUEUE_SEND_BATCH_CAP, so this single
-			// sendBatch never exceeds Cloudflare's 100-message limit.
-			if (messages.length > QUEUE_SEND_BATCH_CAP) {
-				// Defense in depth — should be unreachable given the cap
-				// above, but throwing loudly here beats a runtime error from
-				// the queue binding silently dropping the batch.
-				throw new Error(
-					`sendBatch payload of ${messages.length} exceeds Queue cap of ${QUEUE_SEND_BATCH_CAP}`,
-				);
-			}
+			// Page size is capped at QUEUE_SEND_BATCH_CAP at module load via
+			// the static assertion above, so this sendBatch never exceeds
+			// Cloudflare's 100-message limit by construction.
 			await opts.queue.sendBatch(messages);
 			opts.budget.remaining -= messages.length;
 		}
diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts
index f06aa7d05..a3f42d8df 100644
--- a/apps/aggregator/src/index.ts
+++ b/apps/aggregator/src/index.ts
@@ -77,7 +77,11 @@ function timingSafeEqual(a: string, b: string): boolean {
  */
 function requireAdminAuth(request: Request, env: Env): Response | null {
 	const expected = env.ADMIN_TOKEN;
-	if (!expected) {
+	// `trim()` defends against a whitespace-only secret slipping past
+	// `secrets.required`'s presence check — `wrangler secret put ADMIN_TOKEN`
+	// followed by an accidental Enter would otherwise produce a working
+	// endpoint with a trivially guessable token.
+	if (!expected || expected.trim().length === 0) {
 		// Misconfigured production or unset dev — closed by default.
 		return new Response("admin endpoints not configured", { status: 503 });
 	}
@@ -249,7 +253,11 @@ async function runBackfill(dids: readonly string[], env: Env): Promise<void> {
 			},
 		});
 		console.log("[aggregator] backfill complete", {
-			didCount: dids.length,
+			didsRequested: dids.length,
+			// Separate from didsRequested so the early-abort case (enqueue
+			// cap fired) is visible in the summary log without the operator
+			// having to correlate against an earlier warn line.
+			didsProcessed: summary.results.length,
 			totalEnqueued: summary.totalEnqueued,
 			didsWithErrors: summary.results.filter((r) => r.errors.length > 0).length,
 		});
diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts
index b65d6a2e7..14e93fc4b 100644
--- a/apps/aggregator/test/backfill.test.ts
+++ b/apps/aggregator/test/backfill.test.ts
@@ -615,12 +615,55 @@ describe("backfillCollection: enqueue budget cap", () => {
 			{ resolver, queue, fetch: fetchImpl },
 			{ remaining: 30 },
 		);
-		// The first collection's full page (50 records) was within the cap?
-		// No — after 30 records the budget hits 0, the inner break fires,
-		// the page's batch send carries 30 records, then the post-send
-		// throw aborts further pages.
-		expect(result.enqueued).toBeLessThanOrEqual(30);
-		expect(queue.sent.length).toBeLessThanOrEqual(30);
+		// Budget=30 + page of 50 records: inner loop pushes records while
+		// `budget.remaining - messages.length > 0`. At iter 31 the
+		// expression (30 - 30) ≤ 0 fires `break`. messages.length === 30,
+		// sendBatch sends 30, budget → 0, post-send check throws
+		// EnqueueLimitReached. Exact answer is 30 — assert the exact
+		// number so an off-by-one regression in the inner break doesn't
+		// pass under a slack `<=`.
+		expect(result.enqueued).toBe(30);
+		expect(queue.sent.length).toBe(30);
+		// Sentinel error appended so the operator can see the cap fired
+		// rather than thinking the DID was a clean 30-record publisher.
+		expect(result.errors.some((e) => e.includes("enqueue cap reached"))).toBe(true);
+	});
+});
+
+describe("backfillCollection: listRecords timeout", () => {
+	it("aborts a hung PDS fetch and surfaces the timeout per-collection", async () => {
+		const queue = new CapturingQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl: typeof fetch = (input, init) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			if (url.searchParams.get("collection") !== collection) {
+				return Promise.resolve(new Response(JSON.stringify({ records: [] }), { status: 200 }));
+			}
+			// Hung connection: never resolve, but honour the abort signal so
+			// the AbortController-driven timeout fires.
+			return new Promise((_resolve, reject) => {
+				init?.signal?.addEventListener("abort", () => {
+					reject(new DOMException("aborted", "AbortError"));
+				});
+			});
+		};
+
+		const result = await backfillDid(DID_A, {
+			resolver,
+			queue,
+			fetch: fetchImpl,
+			listRecordsTimeoutMs: 25,
+		});
+
+		const error = result.errors.find((e) => e.includes(collection));
+		expect(error).toMatch(/timed out after 25ms/);
 	});
 });
 
@@ -740,6 +783,40 @@ describe("backfill admin route: auth + input validation", () => {
 	});
 });
 
+describe("admin auth: scheme + token edge cases", () => {
+	it("accepts canonical mixed-case Bearer (regression guard)", async () => {
+		const res = await SELF.fetch("https://test/_admin/start", {
+			method: "POST",
+			headers: { authorization: "Bearer test-admin-token" },
+		});
+		expect(res.status).toBe(204);
+	});
+
+	it("accepts uppercase BEARER scheme (RFC 6750 case-insensitive)", async () => {
+		const res = await SELF.fetch("https://test/_admin/start", {
+			method: "POST",
+			headers: { authorization: "BEARER test-admin-token" },
+		});
+		expect(res.status).toBe(204);
+	});
+
+	it("rejects an Authorization header with a non-Bearer scheme", async () => {
+		const res = await SELF.fetch("https://test/_admin/start", {
+			method: "POST",
+			headers: { authorization: "Basic dGVzdC1hZG1pbi10b2tlbjo=" },
+		});
+		expect(res.status).toBe(401);
+	});
+
+	it("rejects empty token after Bearer prefix", async () => {
+		const res = await SELF.fetch("https://test/_admin/start", {
+			method: "POST",
+			headers: { authorization: "Bearer " },
+		});
+		expect(res.status).toBe(401);
+	});
+});
+
 describe("admin start route: auth + method", () => {
 	it("returns 405 on GET (POST-only route)", async () => {
 		const res = await SELF.fetch("https://test/_admin/start");

From 31eebc752a6ad13ee8e1a499d25dd85b22dbea09 Mon Sep 17 00:00:00 2001
From: Matt Kane <m@mk.gg>
Date: Sun, 10 May 2026 18:28:32 +0100
Subject: [PATCH 4/8] refactor(aggregator): fan backfill into a queue to escape
 the 30s waitUntil cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous backfill ran one giant serial loop inside the POST handler's
`ctx.waitUntil`, which Cloudflare hard-caps at 30 seconds wall-clock.
Realistically that limited a single backfill POST to ~15-25 DIDs before
in-flight work was cancelled, regardless of per-DID work being well-bounded.

Restructured into a dedicated `BACKFILL_QUEUE` of `{did, collection}` jobs.
The POST handler now just discovers DIDs (via relay or explicit list) and
fans pairs onto the queue via `sendBatch` — fast and trivially fits in
waitUntil. A new per-pair consumer (`backfill-consumer.ts`) does the
listRecords pagination + records-queue fan-out work asynchronously, with
queue retry/backoff/concurrency for free.

Cap changes for the new fan-out threat model:
- `MAX_BACKFILL_DIDS` lowered 1000 → 100 (each submitted DID becomes
  `WANTED_COLLECTIONS.length` consumer invocations, so the explicit-list
  ceiling needs to be much lower than the old serial-loop cap).
- New `MAX_DISCOVERED_DIDS = 1000` defends discovery against a runaway
  relay flooding the queue.
- Dropped the global `MAX_TOTAL_ENQUEUE` budget — it doesn't translate
  across consumer invocations. Per-pair work is still bounded by
  `MAX_PAGES_PER_COLLECTION` × `MAX_RECORDS_PER_PAGE`.
---
 apps/aggregator/src/backfill-consumer.ts  |  90 +++
 apps/aggregator/src/backfill.ts           | 384 +++++++-----
 apps/aggregator/src/env.ts                |  15 +
 apps/aggregator/src/index.ts              | 180 ++++--
 apps/aggregator/test/backfill.test.ts     | 731 +++++++++++++---------
 apps/aggregator/worker-configuration.d.ts |   7 +-
 apps/aggregator/wrangler.jsonc            |  36 +-
 7 files changed, 922 insertions(+), 521 deletions(-)
 create mode 100644 apps/aggregator/src/backfill-consumer.ts

diff --git a/apps/aggregator/src/backfill-consumer.ts b/apps/aggregator/src/backfill-consumer.ts
new file mode 100644
index 000000000..727b0019e
--- /dev/null
+++ b/apps/aggregator/src/backfill-consumer.ts
@@ -0,0 +1,90 @@
+/**
+ * Backfill queue consumer. Pulls one `BackfillJob` at a time and walks
+ * `com.atproto.repo.listRecords` for that (DID, collection) pair, batching
+ * results onto the records queue for the standard verify-and-write path.
+ *
+ * Why a separate queue from records: per-pair work (PDS resolution +
+ * paginated listRecords + sendBatch onto the records queue) is bounded but
+ * non-trivial — running it inside the records-queue consumer would burn the
+ * sub-request budget for jobs that should just be writing to D1. Keeping
+ * the queues separate also lets the operator throttle backfill work
+ * independently of live ingest.
+ *
+ * Error policy:
+ *   - Per-pair `processBackfillJob` throw → `message.retry()`. Cloudflare
+ *     Queues backs off and retries; after `max_retries` (3, configured in
+ *     wrangler.jsonc) the message lands in `emdash-aggregator-backfill-dlq`.
+ *   - Unexpected throws inside the batch loop are caught per-message so one
+ *     bad job can't poison the rest of the batch.
+ *
+ * The DLQ is intentionally not auto-drained today — backfill is operator-
+ * triggered, so DLQ inspection is part of the operator's workflow when a
+ * backfill POST shows partial completion in `wrangler tail`. A drain
+ * consumer can land later when we have a clear ack policy (probably:
+ * write a row to D1 and ack, like the records-DLQ drain).
+ */
+
+import {
+	AtprotoWebDidDocumentResolver,
+	CompositeDidDocumentResolver,
+	PlcDidDocumentResolver,
+} from "@atcute/identity-resolver";
+
+import { processBackfillJob, type ProcessBackfillJobDeps } from "./backfill.js";
+import { createD1DidDocCache, DidResolver } from "./did-resolver.js";
+import type { BackfillJob } from "./env.js";
+import type { MessageBatchLike } from "./records-consumer.js";
+
+/**
+ * Process one batch of backfill jobs. Mirrors `records-consumer.processBatch`'s
+ * shape: per-message try/catch, ack on success, retry on throw.
+ *
+ * `depsOverride` is the test seam — production calls without it and gets
+ * the standard composite resolver wired against `env.DB`.
+ */
+export async function processBackfillBatch(
+	batch: MessageBatchLike<BackfillJob>,
+	env: Env,
+	depsOverride?: ProcessBackfillJobDeps,
+): Promise<void> {
+	const deps = depsOverride ?? createProductionDeps(env);
+	for (const message of batch.messages) {
+		const job = message.body;
+		try {
+			const result = await processBackfillJob(job, deps);
+			console.log("[aggregator] backfill job complete", {
+				did: result.did,
+				collection: result.collection,
+				enqueued: result.enqueued,
+			});
+			message.ack();
+		} catch (err) {
+			// Resolution failures, listRecords 5xx, timeouts, and pagination
+			// runaway all land here. Retry — Cloudflare Queues backoff handles
+			// transient PDS failures; permanently broken DIDs land in the DLQ
+			// after max_retries.
+			console.error("[aggregator] backfill job failed, retrying", {
+				did: job.did,
+				collection: job.collection,
+				error: err instanceof Error ? err.message : String(err),
+			});
+			message.retry();
+		}
+	}
+}
+
+function createProductionDeps(env: Env): ProcessBackfillJobDeps {
+	const composite = new CompositeDidDocumentResolver({
+		methods: {
+			plc: new PlcDidDocumentResolver(),
+			web: new AtprotoWebDidDocumentResolver(),
+		},
+	});
+	return {
+		resolver: new DidResolver({
+			cache: createD1DidDocCache(env.DB),
+			resolver: composite,
+		}),
+		queue: env.RECORDS_QUEUE,
+	};
+}
diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts
index a61475b47..e27b6b7c1 100644
--- a/apps/aggregator/src/backfill.ts
+++ b/apps/aggregator/src/backfill.ts
@@ -1,36 +1,53 @@
 /**
  * Cold-start discovery worker.
  *
- * Operator-triggered (via `POST /_admin/backfill`); takes a list of DIDs,
- * calls `com.atproto.repo.listRecords` against each publisher's PDS for every
- * collection in `WANTED_COLLECTIONS`, and enqueues each returned record onto
- * the existing Records Queue as a `RecordsJob`. The consumer's verification +
- * write + idempotency machinery handles the rest — same code path as live
- * Jetstream, distinguished only by trigger.
+ * Operator-triggered (via `POST /_admin/backfill`). Two trigger shapes:
  *
- * Live discovery is Jetstream's job, not this worker's; it picks up new
- * publishers automatically. Backfill exists for the cold-start gap (publishers
- * who published before the aggregator was listening) and for operator-triggered
- * recovery after a known outage. There is deliberately no periodic scheduler
- * — see plan §"Why no reconciliation cron".
+ *   - `{ "dids": [...] }` — explicit list, primarily for testing or recovery
+ *     of a known DID set.
+ *   - `{}` (empty body) — production cold-start. Calls
+ *     `com.atproto.sync.listReposByCollection` against the configured relay
+ *     for each NSID in `WANTED_COLLECTIONS`, paginates the full DID set,
+ *     dedupes, and feeds the union into the same backfill loop.
+ *
+ * Architecture: the POST handler synchronously discovers DIDs (or accepts an
+ * explicit list), then fans out one `BackfillJob = { did, collection }` per
+ * (DID × WANTED_COLLECTIONS) pair onto the dedicated `BACKFILL_QUEUE` via
+ * `sendBatch`. A separate consumer (`backfill-consumer.ts`) processes one
+ * pair at a time: resolve PDS, paginate `com.atproto.repo.listRecords`,
+ * batch-enqueue each returned record onto the existing Records Queue.
+ *
+ * Why a separate queue rather than running the per-DID loop inside the
+ * `ctx.waitUntil` of the POST handler: Cloudflare's hard 30-second
+ * wall-clock cap on `waitUntil` would limit a single backfill POST to
+ * ~15–25 DIDs before in-flight work was cancelled. The queue gives us
+ * automatic retry, concurrency, and per-pair invocation budgets that each
+ * fit comfortably under the sub-request ceiling.
+ *
+ * Live discovery (post-cold-start) is Jetstream's job, not this worker's;
+ * the consumer writes `known_publishers` opportunistically on any record
+ * event for an unseen DID. Backfill exists for the cold-start gap
+ * (publishers who published before the aggregator was listening) and for
+ * operator-triggered recovery after a known outage. There is deliberately
+ * no periodic scheduler — see plan §"Why no reconciliation cron".
  */
 
 import { WANTED_COLLECTIONS } from "./constants.js";
 import type { DidResolver } from "./did-resolver.js";
-import type { RecordsJob } from "./env.js";
+import type { BackfillJob, RecordsJob } from "./env.js";
 import { isPlainObject } from "./utils.js";
 
 const PAGE_SIZE = 100;
 /** Per-listRecords-page timeout. A hostile or hung publisher PDS that
  * accepts the connection but stalls the body would otherwise block the
  * fetch until workerd's overall sub-request budget exhausts — starving
- * every DID after this one in the serial loop. Same shape as
+ * every later page in the same consumer invocation. Same shape as
  * `pds-verify.ts`'s fetchCar timeout. */
 const LIST_RECORDS_TIMEOUT_MS = 15_000;
-/** Cap on listRecords pagination per collection. A buggy or malicious PDS
- * that echoes the same cursor would otherwise loop forever inside one
- * `ctx.waitUntil`. 1000 pages × 100 records = 100k records per collection,
- * which is past anything we'd legitimately backfill in one shot. */
+/** Cap on listRecords pagination per (DID, collection) pair. A buggy or
+ * malicious PDS that echoes the same cursor would otherwise loop forever
+ * inside one consumer invocation. 1000 pages × 100 records = 100k records
+ * per pair, which is past anything we'd legitimately backfill in one shot. */
 const MAX_PAGES_PER_COLLECTION = 1000;
 /** Defensive cap on records per page. Real PDSes honour the `limit` query
  * param; this guards against a hostile PDS returning an enormous array.
@@ -42,7 +59,7 @@ const MAX_RECORDS_PER_PAGE = PAGE_SIZE;
 /** Cloudflare Queues' hard cap on `sendBatch` size. Per-page enqueues are
  * always ≤ this thanks to MAX_RECORDS_PER_PAGE; documented here so the
  * relationship is visible at the call site. */
-const QUEUE_SEND_BATCH_CAP = 100;
+export const QUEUE_SEND_BATCH_CAP = 100;
 // Static guard: bumping MAX_RECORDS_PER_PAGE above the queue's batch cap
 // would silently break sendBatch in production. Surface the violation at
 // module load rather than at the first batch send.
@@ -51,185 +68,227 @@ if (MAX_RECORDS_PER_PAGE > QUEUE_SEND_BATCH_CAP) {
 		`MAX_RECORDS_PER_PAGE (${MAX_RECORDS_PER_PAGE}) exceeds QUEUE_SEND_BATCH_CAP (${QUEUE_SEND_BATCH_CAP})`,
 	);
 }
-/** Cap on the total number of records a single backfill invocation may
- * enqueue. Defends against a leaked admin token being weaponised to flood
- * the queue at near-zero cost to the attacker. The blast radius of one
- * compromised POST is bounded by this number × per-message billing.
- *
- * 50k = ~50 plausible publishers × ~250 records each, well past v1 scale.
- * Worker waitUntil exhausts long before this in any honest workload. */
-const MAX_TOTAL_ENQUEUE = 50_000;
 /** Atproto rkey grammar: ALPHA / DIGIT / "." / "-" / "_" / ":" / "~". */
 const RKEY_PATTERN = /^[A-Za-z0-9._:~-]{1,512}$/;
 
-/** Thrown when the per-invocation enqueue cap is reached. Caller catches
- * and stops processing further DIDs / collections; partial work already
- * committed to the queue is what it is (consumer is idempotent on retry). */
-export class EnqueueLimitReached extends Error {
-	constructor(public readonly limit: number) {
-		super(`backfill enqueue cap of ${limit} reached`);
-	}
-}
-
-/** Producer-side queue surface. The production binding `env.RECORDS_QUEUE`
- * satisfies this; tests pass an in-memory implementation. The return type
- * is `unknown` rather than `void` so workerd's `Queue.send` (which returns
- * a `QueueSendResponse`) is structurally assignable. */
-export interface BackfillQueue {
+/** Producer-side records queue surface. The production binding
+ * `env.RECORDS_QUEUE` satisfies this; tests pass an in-memory implementation. */
+export interface RecordsQueueProducer {
 	sendBatch(messages: ReadonlyArray<{ body: RecordsJob }>): Promise<unknown>;
 }
 
-export interface BackfillDeps {
-	resolver: DidResolver;
-	queue: BackfillQueue;
-	/** Inject for tests; defaults to `globalThis.fetch`. */
-	fetch?: typeof fetch;
-	/** Override for the per-listRecords-page timeout. Defaults to
-	 * `LIST_RECORDS_TIMEOUT_MS`. Tests use a small value to exercise the
-	 * abort path without burning the production budget. */
-	listRecordsTimeoutMs?: number;
-	/** Optional callback fired after each DID completes (success or failure).
-	 * Used by the production wiring to log per-DID progress so an operator
-	 * watching `wrangler tail` sees where a long backfill is up to. */
-	onDidComplete?: (result: BackfillDidResult) => void;
+/** Producer-side backfill-jobs queue surface. The production binding
+ * `env.BACKFILL_QUEUE` satisfies this; tests pass an in-memory
+ * implementation. Same shape as `RecordsQueueProducer`, separated so the
+ * type system catches accidental cross-wiring. */
+export interface BackfillQueueProducer {
+	sendBatch(messages: ReadonlyArray<{ body: BackfillJob }>): Promise<unknown>;
 }
 
-export interface BackfillDidResult {
-	did: string;
-	enqueued: number;
-	/** One entry per failure during this DID's backfill. listRecords failures
-	 * are per-collection; resolution failures abort early. Empty array on
-	 * total success. */
-	errors: string[];
+/** Cap on pages walked per relay collection. Same shape as
+ * MAX_PAGES_PER_COLLECTION but for the discovery side. At 100 repos/page,
+ * 100 pages = 10k publishers — past anything we'd legitimately discover at
+ * Slice 1 scale. */
+const MAX_DISCOVERY_PAGES_PER_COLLECTION = 100;
+const DISCOVERY_PAGE_SIZE = 100;
+const DISCOVERY_TIMEOUT_MS = 15_000;
+/** Defensive cap on the union of discovered DIDs. A relay returning a
+ * runaway list (bug or hostile mirror) would otherwise let one POST fan
+ * out millions of jobs onto BACKFILL_QUEUE. At Slice 1 scale we expect a
+ * handful to a few hundred publishers. */
+export const MAX_DISCOVERED_DIDS = 1000;
+
+/**
+ * Discover all DIDs publishing any of `WANTED_COLLECTIONS` by querying the
+ * relay's `com.atproto.sync.listReposByCollection`. Returns the union of
+ * unique DIDs across all collections, in arbitrary order.
+ *
+ * Uses the same defenses as the per-pair listRecords loop: per-page
+ * timeout, max-page cap, cursor-equality check. Per-collection failures
+ * are logged and the loop continues — discovery via the relay is
+ * best-effort; a partial discovery list is better than none. Stops early
+ * once `MAX_DISCOVERED_DIDS` is reached so a runaway relay can't pump
+ * arbitrary fan-out into the queue.
+ */
+export async function discoverDids(
+	relayUrl: string,
+	opts: { fetch?: typeof fetch; timeoutMs?: number } = {},
+): Promise<string[]> {
+	const fetchImpl = opts.fetch ?? fetch;
+	const timeoutMs = opts.timeoutMs ?? DISCOVERY_TIMEOUT_MS;
+	const dids = new Set<string>();
+	for (const collection of WANTED_COLLECTIONS) {
+		try {
+			await discoverCollection(relayUrl, collection, fetchImpl, timeoutMs, dids);
+		} catch (err) {
+			console.error("[aggregator] backfill discovery failed for collection", {
+				collection,
+				error: err instanceof Error ? err.message : String(err),
+			});
+		}
+		if (dids.size >= MAX_DISCOVERED_DIDS) {
+			console.warn("[aggregator] backfill discovery hit DID cap, stopping early", {
+				cap: MAX_DISCOVERED_DIDS,
+				stoppedAfterCollection: collection,
+			});
+			break;
+		}
+	}
+	return [...dids];
 }
 
-export interface BackfillSummary {
-	totalEnqueued: number;
-	results: BackfillDidResult[];
+async function discoverCollection(
+	relayUrl: string,
+	collection: string,
+	fetchImpl: typeof fetch,
+	timeoutMs: number,
+	dids: Set<string>,
+): Promise<void> {
+	let cursor: string | undefined;
+	let prevCursor: string | undefined;
+	let pages = 0;
+	do {
+		if (++pages > MAX_DISCOVERY_PAGES_PER_COLLECTION) {
+			throw new Error(`exceeded ${MAX_DISCOVERY_PAGES_PER_COLLECTION} discovery pages`);
+		}
+		if (cursor !== undefined && cursor === prevCursor) {
+			throw new Error("relay returned identical cursor twice");
+		}
+		prevCursor = cursor;
+
+		const url = new URL("/xrpc/com.atproto.sync.listReposByCollection", relayUrl);
+		url.searchParams.set("collection", collection);
+		url.searchParams.set("limit", String(DISCOVERY_PAGE_SIZE));
+		if (cursor) url.searchParams.set("cursor", cursor);
+
+		const controller = new AbortController();
+		const timer = setTimeout(() => controller.abort(), timeoutMs);
+		let response: Response;
+		try {
+			response = await fetchImpl(url.toString(), {
+				headers: { accept: "application/json" },
+				signal: controller.signal,
+			});
+		} catch (err) {
+			if (err instanceof Error && err.name === "AbortError") {
+				throw new Error(`listReposByCollection timed out after ${timeoutMs}ms`, { cause: err });
+			}
+			throw err;
+		} finally {
+			clearTimeout(timer);
+		}
+		if (!response.ok) {
+			throw new Error(`listReposByCollection returned ${response.status}`);
+		}
+		const body: unknown = await response.json();
+		if (!isPlainObject(body)) throw new Error("listReposByCollection returned non-object body");
+		const repos = body["repos"];
+		if (!Array.isArray(repos)) {
+			throw new Error("listReposByCollection response missing `repos` array");
+		}
+		for (const repo of repos) {
+			if (!isPlainObject(repo)) continue;
+			const did = repo["did"];
+			if (typeof did === "string") {
+				dids.add(did);
+				if (dids.size >= MAX_DISCOVERED_DIDS) return;
+			}
+		}
+		const nextCursor = body["cursor"];
+		cursor = typeof nextCursor === "string" && nextCursor.length > 0 ? nextCursor : undefined;
+	} while (cursor);
 }
 
 /**
- * Backfill multiple DIDs serially. Per-DID failures don't stop the loop —
- * one bad publisher doesn't block the rest.
+ * Fan out backfill work for a list of DIDs onto `BACKFILL_QUEUE`. Produces
+ * one `BackfillJob` per (DID × WANTED_COLLECTIONS) pair, sent in batches
+ * of up to `QUEUE_SEND_BATCH_CAP` to honour Cloudflare Queues' sendBatch
+ * limit. Returns the total number of jobs enqueued.
  *
- * Aborts the loop early if the per-invocation enqueue cap is reached. The
- * partial summary still reflects what got through; the operator can re-run
- * with the unfinished tail of the DID list.
+ * Caller is responsible for capping `dids.length` (admin route enforces
+ * `MAX_BACKFILL_DIDS` for the explicit path; `discoverDids` enforces
+ * `MAX_DISCOVERED_DIDS` for the discovery path). This function trusts its
+ * input and just fans out — the per-call work is bounded by
+ * `dids.length * WANTED_COLLECTIONS.length` enqueues.
  */
-export async function backfillDids(
+export async function enqueueBackfillJobs(
 	dids: readonly string[],
-	deps: BackfillDeps,
-): Promise<BackfillSummary> {
-	const results: BackfillDidResult[] = [];
-	const budget: EnqueueBudget = { remaining: MAX_TOTAL_ENQUEUE };
+	queue: BackfillQueueProducer,
+): Promise<number> {
+	const messages: { body: BackfillJob }[] = [];
 	for (const did of dids) {
-		const result = await backfillDid(did, deps, budget);
-		results.push(result);
-		deps.onDidComplete?.(result);
-		if (budget.remaining <= 0) {
-			console.warn("[aggregator] backfill enqueue cap reached, aborting remaining DIDs", {
-				cap: MAX_TOTAL_ENQUEUE,
-				triggerDid: result.did,
-				didsProcessed: results.length,
-				didsRemaining: dids.length - results.length,
-			});
-			break;
+		for (const collection of WANTED_COLLECTIONS) {
+			messages.push({ body: { did, collection } });
 		}
 	}
-	const totalEnqueued = results.reduce((sum, r) => sum + r.enqueued, 0);
-	return { totalEnqueued, results };
+	for (let i = 0; i < messages.length; i += QUEUE_SEND_BATCH_CAP) {
+		await queue.sendBatch(messages.slice(i, i + QUEUE_SEND_BATCH_CAP));
+	}
+	return messages.length;
+}
+
+export interface ProcessBackfillJobDeps {
+	resolver: DidResolver;
+	queue: RecordsQueueProducer;
+	/** Inject for tests; defaults to `globalThis.fetch`. */
+	fetch?: typeof fetch;
+	/** Override for the per-listRecords-page timeout. Defaults to
+	 * `LIST_RECORDS_TIMEOUT_MS`. Tests use a small value to exercise the
+	 * abort path without burning the production budget. */
+	listRecordsTimeoutMs?: number;
 }
 
-/** Mutable counter shared across the per-DID / per-collection loops so the
- * cap is global to one backfill invocation. */
-interface EnqueueBudget {
-	remaining: number;
+export interface ProcessBackfillJobResult {
+	did: string;
+	collection: string;
+	enqueued: number;
 }
 
 /**
- * Backfill a single DID across every collection in `WANTED_COLLECTIONS`.
- * Resolution failure aborts (we can't enqueue verifiable jobs without
- * knowing the PDS); per-collection listRecords failures are recorded and
- * the loop continues to the next collection.
+ * Process one (DID, collection) pair: resolve the DID's PDS, paginate
+ * `com.atproto.repo.listRecords` for the collection, and batch-enqueue
+ * each returned record onto the records queue.
  *
- * Optional `budget` shares the per-invocation enqueue cap across DIDs.
- * Tests that call this directly without a budget get an effectively
- * unbounded one.
+ * Throws on any failure (resolution, listRecords status, pagination
+ * runaway). The queue consumer translates a thrown result into
+ * `message.retry()`; messages that exhaust max_retries land in the
+ * backfill DLQ for the operator to inspect.
+ *
+ * 404 from the PDS on the first page is treated as "publisher does not
+ * host this collection" and returns 0 enqueues without throwing — same
+ * shape as the previous serial-loop semantics.
  */
-export async function backfillDid(
-	did: string,
-	deps: BackfillDeps,
-	budget: EnqueueBudget = { remaining: MAX_TOTAL_ENQUEUE },
-): Promise<BackfillDidResult> {
-	const errors: string[] = [];
-	let enqueued = 0;
-
-	// resolve() writes to known_publishers as a side-effect of the cache
-	// upsert, so this also registers the publisher for any future code path
-	// that iterates that table.
-	let pds: string;
-	try {
-		const resolved = await deps.resolver.resolve(did);
-		pds = resolved.pds;
-	} catch (err) {
-		errors.push(`resolve failed: ${err instanceof Error ? err.message : String(err)}`);
-		return { did, enqueued, errors };
-	}
-
+export async function processBackfillJob(
+	job: BackfillJob,
+	deps: ProcessBackfillJobDeps,
+): Promise<ProcessBackfillJobResult> {
+	const resolved = await deps.resolver.resolve(job.did);
 	const fetchImpl = deps.fetch ?? fetch;
 	const timeoutMs = deps.listRecordsTimeoutMs ?? LIST_RECORDS_TIMEOUT_MS;
-	for (const collection of WANTED_COLLECTIONS) {
-		// Track this collection's enqueues separately so a mid-pagination
-		// throw still reports the partial count instead of swallowing the
-		// records that already landed in the queue.
-		const before = budget.remaining;
-		try {
-			await backfillCollection({
-				did,
-				pds,
-				collection,
-				queue: deps.queue,
-				fetchImpl,
-				timeoutMs,
-				budget,
-			});
-		} catch (err) {
-			if (err instanceof EnqueueLimitReached) {
-				// Stop processing further collections for this DID; outer
-				// loop will catch the budget exhaustion and stop entirely.
-				// Append a sentinel error so the operator's per-DID log
-				// shows the cap fired (otherwise `enqueued: 47, errors: []`
-				// looks like a successful 47-record DID).
-				enqueued += before - budget.remaining;
-				errors.push(
-					`enqueue cap reached mid-DID at collection ${collection} — partial backfill, re-run to complete`,
-				);
-				return { did, enqueued, errors };
-			}
-			errors.push(`${collection}: ${err instanceof Error ? err.message : String(err)}`);
-		}
-		enqueued += before - budget.remaining;
-	}
-
-	return { did, enqueued, errors };
+	const enqueued = await paginateAndEnqueue({
+		did: job.did,
+		pds: resolved.pds,
+		collection: job.collection,
+		queue: deps.queue,
+		fetchImpl,
+		timeoutMs,
+	});
+	return { did: job.did, collection: job.collection, enqueued };
 }
 
-interface BackfillCollectionOpts {
+interface PaginateOpts {
 	did: string;
 	pds: string;
 	collection: string;
-	queue: BackfillQueue;
+	queue: RecordsQueueProducer;
 	fetchImpl: typeof fetch;
 	timeoutMs: number;
-	budget: EnqueueBudget;
 }
 
 /**
  * Walk one DID's records for a single collection, paginating through
  * `listRecords` and enqueuing each result via `sendBatch` (one batch per
- * page). Decrements `opts.budget.remaining` per record enqueued; throws
- * `EnqueueLimitReached` when the budget hits zero so the caller can stop
- * processing further collections / DIDs.
+ * page). Returns the total records enqueued.
  *
  * 404 from the PDS on the FIRST page means the repo doesn't host this
  * collection — silently treated as zero records, not an error. A 404
@@ -244,12 +303,12 @@ interface BackfillCollectionOpts {
  * `?limit=` query and returns more records than that throws — we'd rather
  * surface the spec violation than silently chunk and hide the upstream bug.
  */
-async function backfillCollection(opts: BackfillCollectionOpts): Promise<void> {
+async function paginateAndEnqueue(opts: PaginateOpts): Promise<number> {
 	let cursor: string | undefined;
 	let prevCursor: string | undefined;
 	let pages = 0;
+	let totalEnqueued = 0;
 	do {
-		if (opts.budget.remaining <= 0) throw new EnqueueLimitReached(MAX_TOTAL_ENQUEUE);
 		if (++pages > MAX_PAGES_PER_COLLECTION) {
 			throw new Error(`exceeded ${MAX_PAGES_PER_COLLECTION} pages`);
 		}
@@ -283,7 +342,7 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise<void> {
 		if (response.status === 404) {
 			if (cursor === undefined) {
 				// First-page 404: publisher has no records of this collection.
-				return;
+				return totalEnqueued;
 			}
 			// Mid-pagination 404 is a partial failure; surface it.
 			throw new Error(`listRecords returned 404 mid-pagination at cursor=${cursor}`);
@@ -303,7 +362,6 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise<void> {
 
 		const messages: { body: RecordsJob }[] = [];
 		for (const record of records) {
-			if (opts.budget.remaining - messages.length <= 0) break;
 			const rkey = parseRkeyFromUri(record.uri, opts.collection);
 			if (!rkey) continue;
 			messages.push({
@@ -321,10 +379,10 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise<void> {
 			// the static assertion above, so this sendBatch never exceeds
 			// Cloudflare's 100-message limit by construction.
 			await opts.queue.sendBatch(messages);
-			opts.budget.remaining -= messages.length;
+			totalEnqueued += messages.length;
 		}
-		if (opts.budget.remaining <= 0) throw new EnqueueLimitReached(MAX_TOTAL_ENQUEUE);
 	} while (cursor);
+	return totalEnqueued;
 }
 
 interface ListRecordEntry {
diff --git a/apps/aggregator/src/env.ts b/apps/aggregator/src/env.ts
index 3053c6d33..f85caadb5 100644
--- a/apps/aggregator/src/env.ts
+++ b/apps/aggregator/src/env.ts
@@ -20,3 +20,18 @@ export interface RecordsJob {
 	 */
 	jetstreamRecord?: unknown;
 }
+
+/**
+ * One unit of cold-start backfill work: walk every record under `collection`
+ * for `did` via that DID's PDS, then enqueue each record onto the records
+ * queue for the standard verify-and-write path.
+ *
+ * Granularity is (DID, collection) rather than per-DID because the per-DID
+ * fan-out can exceed Cloudflare's 30s `ctx.waitUntil` wall-clock cap. One
+ * collection's worth of pagination fits comfortably under that ceiling and
+ * gets queue-level retry + concurrency for free.
+ */
+export interface BackfillJob {
+	did: string;
+	collection: string;
+}
diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts
index a3f42d8df..f238c6974 100644
--- a/apps/aggregator/src/index.ts
+++ b/apps/aggregator/src/index.ts
@@ -15,21 +15,16 @@
  * Worker boots.
  */
 
-import {
-	AtprotoWebDidDocumentResolver,
-	CompositeDidDocumentResolver,
-	PlcDidDocumentResolver,
-} from "@atcute/identity-resolver";
-
-import { backfillDids } from "./backfill.js";
-import { createD1DidDocCache, DidResolver } from "./did-resolver.js";
-import type { RecordsJob } from "./env.js";
+import { processBackfillBatch } from "./backfill-consumer.js";
+import { discoverDids, enqueueBackfillJobs } from "./backfill.js";
+import type { BackfillJob, RecordsJob } from "./env.js";
 import { drainDeadLetterBatch, processBatch } from "./records-consumer.js";
 import { RECORDS_DO_NAME } from "./records-do.js";
 import { isPlainObject } from "./utils.js";
 
 const RECORDS_QUEUE_NAME = "emdash-aggregator-records";
 const RECORDS_DLQ_NAME = "emdash-aggregator-records-dlq";
+const BACKFILL_QUEUE_NAME = "emdash-aggregator-backfill";
 
 export { RecordsJetstreamDO } from "./records-do.js";
 
@@ -43,7 +38,11 @@ export { RecordsJetstreamDO } from "./records-do.js";
  *     https://api.emdashcms.com/_admin/start
  *
  * `/_admin/start` spins up the Records DO (idempotent on an already-running DO).
- * `/_admin/backfill` triggers cold-start discovery against an explicit DID list.
+ * `/_admin/backfill` discovers publishers (or accepts an explicit DID list)
+ * and fans (DID, collection) jobs onto `BACKFILL_QUEUE`; the consumer in
+ * `backfill-consumer.ts` does the per-pair listRecords + records-queue
+ * fan-out work asynchronously, getting queue retry + concurrency for free
+ * and sidestepping the 30s `waitUntil` cap on the POST handler.
  *
  * Auth-gating both routes: backfill specifically introduces caller-chosen
  * URLs into the worker's outbound fetches (via `did:web` resolution + the
@@ -54,7 +53,18 @@ export { RecordsJetstreamDO } from "./records-do.js";
 const BOOTSTRAP_PATH = "/_admin/start";
 const BACKFILL_PATH = "/_admin/backfill";
 
-const MAX_BACKFILL_DIDS = 1000;
+/**
+ * Cap on the explicit DID list a single POST may submit. Lower than the
+ * old serial-loop cap (was 1000) because the queue-fan-out path amplifies
+ * a leaked-token attack: each submitted DID becomes
+ * `WANTED_COLLECTIONS.length` queue messages, each consuming a consumer
+ * invocation with its own outbound PDS fetches. 100 × 4 = 400 jobs is
+ * still a meaningful operator-recovery batch but a tractable blast radius.
+ *
+ * Discovery-mode submissions are bounded separately by
+ * `MAX_DISCOVERED_DIDS` inside `discoverDids`.
+ */
+const MAX_BACKFILL_DIDS = 100;
 const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/;
 
 /**
@@ -111,16 +121,28 @@ function requireAdminAuth(request: Request, env: Env): Response | null {
 	return null;
 }
 
-function parseBackfillBody(body: unknown): { dids: string[] } | { error: string } {
+type BackfillRequest = { mode: "explicit"; dids: string[] } | { mode: "discover" };
+
+function parseBackfillBody(body: unknown): BackfillRequest | { error: string } {
 	if (!isPlainObject(body)) {
 		return { error: "request body must be a JSON object" };
 	}
 	const rawDids = body["dids"];
+	// Empty body OR `{ "dids": null }` OR `{ "dids": undefined }` ⇒ discovery
+	// mode. Production cold-start uses this path; the explicit list is the
+	// testing / recovery seam.
+	if (rawDids === undefined || rawDids === null) {
+		return { mode: "discover" };
+	}
 	if (!Array.isArray(rawDids)) {
-		return { error: "`dids` must be an array of DID strings" };
+		return {
+			error: "`dids` must be an array of DID strings, or omitted to discover via the relay",
+		};
 	}
 	if (rawDids.length === 0) {
-		return { error: "`dids` must not be empty" };
+		return {
+			error: "`dids` must not be empty (omit the field to discover via the relay)",
+		};
 	}
 	if (rawDids.length > MAX_BACKFILL_DIDS) {
 		return {
@@ -136,7 +158,7 @@ function parseBackfillBody(body: unknown): { dids: string[] } | { error: string
 	}
 	// Set iteration order matches insertion; dedup preserves first-seen order
 	// so the operator sees jobs run in the order they submitted.
-	return { dids: [...seen] };
+	return { mode: "explicit", dids: [...seen] };
 }
 
 export default {
@@ -168,11 +190,19 @@ export default {
 			}
 			const denied = requireAdminAuth(request, env);
 			if (denied) return denied;
-			let body: unknown;
-			try {
-				body = await request.json();
-			} catch {
-				return new Response("request body must be valid JSON", { status: 400 });
+			// Empty / no body is the production discovery path. JSON-parse
+			// failures with no content (Content-Length: 0) come back as
+			// SyntaxError; we treat that as "discover" rather than 400ing
+			// so `curl -X POST ... /_admin/backfill` (no body) does the
+			// expected thing.
+			let body: unknown = {};
+			const text = await request.text();
+			if (text.length > 0) {
+				try {
+					body = JSON.parse(text);
+				} catch {
+					return new Response("request body must be valid JSON", { status: 400 });
+				}
 			}
 			const parsed = parseBackfillBody(body);
 			if ("error" in parsed) {
@@ -183,7 +213,7 @@ export default {
 			// via Workers logs and the resulting D1 writes; the response body
 			// just confirms the trigger landed. Per-DID errors are logged from
 			// inside backfillDids; callers don't get them in the response.
-			ctx.waitUntil(runBackfill(parsed.dids, env));
+			ctx.waitUntil(runBackfill(parsed, env));
 			return new Response(null, { status: 202 });
 		}
 		return new Response("emdash-aggregator: not yet implemented", {
@@ -192,15 +222,24 @@ export default {
 		});
 	},
 
-	async queue(batch: MessageBatch<RecordsJob>, env: Env, _ctx: ExecutionContext): Promise<void> {
-		// Workerd routes both consumers (records + records-dlq) here; dispatch
-		// by queue name. Adding a third queue requires updating this switch.
+	async queue(batch: MessageBatch, env: Env, _ctx: ExecutionContext): Promise<void> {
+		// Workerd routes every consumer here; dispatch by queue name to the
+		// matching typed handler. The parameter is unparameterised
+		// (`MessageBatch` defaults to `MessageBatch<unknown>`) because the
+		// binding is shared across queues — narrowing happens per-case via
+		// the queue name, which is a runtime tag the compiler can't see.
 		switch (batch.queue) {
 			case RECORDS_QUEUE_NAME:
-				await processBatch(batch, env);
+				// eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name
+				await processBatch(batch as MessageBatch<RecordsJob>, env);
 				return;
 			case RECORDS_DLQ_NAME:
-				await drainDeadLetterBatch(batch, env);
+				// eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name
+				await drainDeadLetterBatch(batch as MessageBatch<RecordsJob>, env);
+				return;
+			case BACKFILL_QUEUE_NAME:
+				// eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name
+				await processBackfillBatch(batch as MessageBatch<BackfillJob>, env);
 				return;
 			default:
 				console.error("[aggregator] unknown queue, acking batch", { queue: batch.queue });
@@ -221,49 +260,64 @@ export default {
 	},
 };
 
+type BackfillRequestParsed = { mode: "explicit"; dids: string[] } | { mode: "discover" };
+
 /**
- * Production wiring for the backfill worker. Constructs a DID resolver
- * pointed at the live PLC + atproto-web methods, and points the queue at
- * the production Records Queue binding. Per-DID progress is logged as the
- * loop runs so an operator watching `wrangler tail` sees live progress and
- * can tell where a backfill stopped if waitUntil exhausts mid-loop.
+ * Backfill orchestrator. Runs inside the POST handler's `ctx.waitUntil`.
+ * Two paths:
+ *
+ *   - "discover" — call `com.atproto.sync.listReposByCollection` against
+ *     `env.RELAY_URL` for each NSID in WANTED_COLLECTIONS, union the DIDs,
+ *     then fan them out as backfill jobs.
+ *   - "explicit" — operator-supplied DID list, used for testing or for
+ *     recovery of a known DID set.
+ *
+ * In both cases the actual per-(DID, collection) work runs in the
+ * `BACKFILL_QUEUE` consumer (see `backfill-consumer.ts`). This orchestrator
+ * just discovers + enqueues — both fast operations that fit well within
+ * Cloudflare's 30s `waitUntil` ceiling regardless of fan-out size, even
+ * after a `MAX_DISCOVERED_DIDS`-sized union.
+ *
+ * Per-pair progress is logged from the consumer; this function only logs
+ * the discovery + fan-out result so an operator watching `wrangler tail`
+ * sees how many jobs were enqueued.
  */
-async function runBackfill(dids: readonly string[], env: Env): Promise<void> {
-	console.log("[aggregator] backfill starting", { didCount: dids.length });
+async function runBackfill(req: BackfillRequestParsed, env: Env): Promise<void> {
 	try {
-		const composite = new CompositeDidDocumentResolver({
-			methods: {
-				plc: new PlcDidDocumentResolver(),
-				web: new AtprotoWebDidDocumentResolver(),
-			},
-		});
-		const resolver = new DidResolver({
-			cache: createD1DidDocCache(env.DB),
-			resolver: composite,
-		});
-		const summary = await backfillDids(dids, {
-			resolver,
-			queue: env.RECORDS_QUEUE,
-			onDidComplete: (result) => {
-				console.log("[aggregator] backfill did", {
-					did: result.did,
-					enqueued: result.enqueued,
-					errors: result.errors,
-				});
-			},
-		});
-		console.log("[aggregator] backfill complete", {
-			didsRequested: dids.length,
-			// Separate from didsRequested so the early-abort case (enqueue
-			// cap fired) is visible in the summary log without the operator
-			// having to correlate against an earlier warn line.
-			didsProcessed: summary.results.length,
-			totalEnqueued: summary.totalEnqueued,
-			didsWithErrors: summary.results.filter((r) => r.errors.length > 0).length,
+		let dids: readonly string[];
+		if (req.mode === "discover") {
+			console.log("[aggregator] backfill discovery starting", {
+				relay: env.RELAY_URL,
+			});
+			dids = await discoverDids(env.RELAY_URL);
+			console.log("[aggregator] backfill discovery complete", {
+				relay: env.RELAY_URL,
+				didCount: dids.length,
+			});
+		} else {
+			dids = req.dids;
+			console.log("[aggregator] backfill enqueue starting", {
+				mode: "explicit",
+				didCount: dids.length,
+			});
+		}
+
+		if (dids.length === 0) {
+			console.warn("[aggregator] backfill produced zero DIDs, nothing to enqueue", {
+				mode: req.mode,
+			});
+			return;
+		}
+
+		const enqueued = await enqueueBackfillJobs(dids, env.BACKFILL_QUEUE);
+		console.log("[aggregator] backfill enqueue complete", {
+			mode: req.mode,
+			didCount: dids.length,
+			jobsEnqueued: enqueued,
 		});
 	} catch (err) {
 		console.error("[aggregator] backfill aborted", {
-			didCount: dids.length,
+			mode: req.mode,
 			error: err instanceof Error ? (err.stack ?? err.message) : String(err),
 		});
 	}
diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts
index 14e93fc4b..5dffcf168 100644
--- a/apps/aggregator/test/backfill.test.ts
+++ b/apps/aggregator/test/backfill.test.ts
@@ -1,9 +1,18 @@
 /**
  * Backfill worker tests.
  *
- * The worker is plain `fetch` + `queue.send`; tests stub both. The DID
+ * The worker is plain `fetch` + queue producers; tests stub both. The DID
  * resolver is also stubbed (in-memory cache + a simple stub upstream) so the
  * tests don't depend on the workers test pool or live PLC.
+ *
+ * Architecture under test (post-restructure):
+ *   - `enqueueBackfillJobs`: synchronous fan-out of (DID × WANTED_COLLECTIONS)
+ *     pairs onto BACKFILL_QUEUE, batched at QUEUE_SEND_BATCH_CAP.
+ *   - `processBackfillJob`: per-pair worker. Resolve PDS, paginate
+ *     `com.atproto.repo.listRecords`, batch-enqueue records onto RECORDS_QUEUE.
+ *   - `processBackfillBatch`: queue consumer that calls `processBackfillJob`
+ *     and translates throws to `message.retry()`.
+ *   - `discoverDids`: relay enumeration via `com.atproto.sync.listReposByCollection`.
  */
 
 import { P256PrivateKeyExportable } from "@atcute/crypto";
@@ -12,7 +21,16 @@ import type { Did } from "@atcute/lexicons/syntax";
 import { applyD1Migrations, env, SELF } from "cloudflare:test";
 import { beforeAll, beforeEach, describe, expect, it } from "vitest";
 
-import { backfillDid, backfillDids, type BackfillQueue } from "../src/backfill.js";
+import { processBackfillBatch } from "../src/backfill-consumer.js";
+import {
+	type BackfillQueueProducer,
+	discoverDids,
+	enqueueBackfillJobs,
+	MAX_DISCOVERED_DIDS,
+	processBackfillJob,
+	QUEUE_SEND_BATCH_CAP,
+	type RecordsQueueProducer,
+} from "../src/backfill.js";
 import { WANTED_COLLECTIONS } from "../src/constants.js";
 import {
 	type CachedDidDoc,
@@ -20,7 +38,8 @@ import {
 	type DidDocumentResolverLike,
 	DidResolver,
 } from "../src/did-resolver.js";
-import type { RecordsJob } from "../src/env.js";
+import type { BackfillJob, RecordsJob } from "../src/env.js";
+import type { MessageController } from "../src/records-consumer.js";
 
 interface TestEnv {
 	DB: D1Database;
@@ -44,7 +63,7 @@ beforeEach(async () => {
 	await testEnv.DB.prepare("DELETE FROM known_publishers").run();
 });
 
-class CapturingQueue implements BackfillQueue {
+class CapturingRecordsQueue implements RecordsQueueProducer {
 	readonly sent: RecordsJob[] = [];
 	sendBatch(messages: ReadonlyArray<{ body: RecordsJob }>): Promise<unknown> {
 		for (const m of messages) this.sent.push(m.body);
@@ -52,6 +71,16 @@ class CapturingQueue implements BackfillQueue {
 	}
 }
 
+class CapturingBackfillQueue implements BackfillQueueProducer {
+	readonly sent: BackfillJob[] = [];
+	readonly batches: number[] = [];
+	sendBatch(messages: ReadonlyArray<{ body: BackfillJob }>): Promise<unknown> {
+		this.batches.push(messages.length);
+		for (const m of messages) this.sent.push(m.body);
+		return Promise.resolve();
+	}
+}
+
 class MapDidDocCache implements DidDocCache {
 	private readonly entries = new Map<string, CachedDidDoc>();
 	read(did: string): Promise<CachedDidDoc | null> {
@@ -68,6 +97,18 @@ class MapDidDocCache implements DidDocCache {
 	}
 }
 
+class FakeMessage<T> implements MessageController {
+	acked = 0;
+	retried = 0;
+	constructor(readonly body: T) {}
+	ack() {
+		this.acked += 1;
+	}
+	retry() {
+		this.retried += 1;
+	}
+}
+
 function buildResolver(): DidResolver {
 	const cache = new MapDidDocCache();
 	const resolver: DidDocumentResolverLike = {
@@ -136,27 +177,34 @@ function makeFetch(
 	};
 }
 
-describe("backfillDid", () => {
+describe("processBackfillJob", () => {
 	it("enqueues each listRecords result as a RecordsJob", async () => {
-		const queue = new CapturingQueue();
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
 		const fetchImpl = makeFetch({
-			[WANTED_COLLECTIONS[0]]: [
+			[collection]: [
 				{
-					uri: `at://${DID_A}/${WANTED_COLLECTIONS[0]}/demo`,
+					uri: `at://${DID_A}/${collection}/demo`,
 					cid: "bafyc1",
 					value: { foo: "bar" },
 				},
 			],
 		});
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
 
-		expect(result.errors).toEqual([]);
+		const result = await processBackfillJob(
+			{ did: DID_A, collection },
+			{ resolver, queue, fetch: fetchImpl },
+		);
+
 		expect(result.enqueued).toBe(1);
+		expect(result.did).toBe(DID_A);
+		expect(result.collection).toBe(collection);
 		expect(queue.sent).toHaveLength(1);
 		expect(queue.sent[0]).toMatchObject({
 			did: DID_A,
-			collection: WANTED_COLLECTIONS[0],
+			collection,
 			rkey: "demo",
 			operation: "create",
 			cid: "bafyc1",
@@ -167,98 +215,58 @@ describe("backfillDid", () => {
 		expect(queue.sent[0]?.jetstreamRecord).toBeUndefined();
 	});
 
-	it("walks every collection in WANTED_COLLECTIONS", async () => {
-		const queue = new CapturingQueue();
-		const resolver = buildResolver();
-		const records: Record<string, MockListRecord[]> = {};
-		for (let i = 0; i < WANTED_COLLECTIONS.length; i++) {
-			const c = WANTED_COLLECTIONS[i];
-			if (c) {
-				records[c] = [{ uri: `at://${DID_A}/${c}/r${i}`, cid: `bafyc${i}`, value: {} }];
-			}
-		}
-		const fetchImpl = makeFetch(records);
-
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
-
-		expect(result.enqueued).toBe(WANTED_COLLECTIONS.length);
-		const collectionsHit = new Set(queue.sent.map((j) => j.collection));
-		expect(collectionsHit.size).toBe(WANTED_COLLECTIONS.length);
-	});
-
 	it("treats 404 from the PDS as 'no records of this collection', not an error", async () => {
-		const queue = new CapturingQueue();
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
-		const fetchImpl = makeFetch(
-			{},
-			{
-				status: WANTED_COLLECTIONS.reduce<Record<string, number>>((acc, c) => {
-					acc[c] = 404;
-					return acc;
-				}, {}),
-			},
-		);
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl = makeFetch({}, { status: { [collection]: 404 } });
 
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+		const result = await processBackfillJob(
+			{ did: DID_A, collection },
+			{ resolver, queue, fetch: fetchImpl },
+		);
 
-		expect(result.errors).toEqual([]);
 		expect(result.enqueued).toBe(0);
 		expect(queue.sent).toHaveLength(0);
 	});
 
-	it("records non-404 PDS errors per collection, continues to the next collection", async () => {
-		const queue = new CapturingQueue();
+	it("throws on non-404 PDS errors so the consumer can retry", async () => {
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
-		const firstCollection = WANTED_COLLECTIONS[0];
-		const secondCollection = WANTED_COLLECTIONS[1];
-		if (!firstCollection || !secondCollection) throw new Error("test assumes ≥2 collections");
-		const fetchImpl = makeFetch(
-			{
-				[secondCollection]: [
-					{
-						uri: `at://${DID_A}/${secondCollection}/r1`,
-						cid: "bafyc",
-						value: {},
-					},
-				],
-			},
-			{ status: { [firstCollection]: 503 } },
-		);
-
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl = makeFetch({}, { status: { [collection]: 503 } });
 
-		expect(result.errors).toHaveLength(1);
-		expect(result.errors[0]).toContain("503");
-		expect(result.errors[0]).toContain(firstCollection);
-		expect(result.enqueued).toBe(1); // the second collection still ran
+		await expect(
+			processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }),
+		).rejects.toThrow(/503/);
 	});
 
-	it("aborts the DID early if the resolver throws (can't enqueue without PDS)", async () => {
-		const queue = new CapturingQueue();
+	it("throws when the resolver fails (queue consumer translates to retry)", async () => {
+		const queue = new CapturingRecordsQueue();
 		const resolver = new DidResolver({
 			cache: new MapDidDocCache(),
 			resolver: {
 				resolve: () => Promise.reject(new Error("PLC unreachable")),
 			},
 		});
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
 		const fetchImpl = makeFetch({});
 
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
-
-		expect(result.enqueued).toBe(0);
+		await expect(
+			processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }),
+		).rejects.toThrow(/PLC unreachable/);
 		expect(queue.sent).toHaveLength(0);
-		expect(result.errors).toHaveLength(1);
-		expect(result.errors[0]).toMatch(/resolve failed.*PLC unreachable/);
 	});
 
 	it("paginates listRecords via cursor", async () => {
-		const queue = new CapturingQueue();
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
 		const collection = WANTED_COLLECTIONS[0];
 		if (!collection) throw new Error("test assumes ≥1 collection");
 
-		// Fetch returns page 1 (cursor "p2") then page 2 (no cursor) for the
-		// first collection; empty pages for the others.
 		let calls = 0;
 		const fetchImpl: typeof fetch = async (input) => {
 			const url =
@@ -267,9 +275,6 @@ describe("backfillDid", () => {
 					: input instanceof URL
 						? input
 						: new URL(input.url);
-			if (url.searchParams.get("collection") !== collection) {
-				return new Response(JSON.stringify({ records: [] }), { status: 200 });
-			}
 			calls += 1;
 			const cursor = url.searchParams.get("cursor");
 			if (!cursor) {
@@ -289,16 +294,18 @@ describe("backfillDid", () => {
 			);
 		};
 
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+		const result = await processBackfillJob(
+			{ did: DID_A, collection },
+			{ resolver, queue, fetch: fetchImpl },
+		);
 
 		expect(calls).toBe(2);
-		const collectionJobs = queue.sent.filter((j) => j.collection === collection);
-		expect(collectionJobs.map((j) => j.rkey)).toEqual(["p1", "p2"]);
-		expect(result.errors).toEqual([]);
+		expect(queue.sent.map((j) => j.rkey)).toEqual(["p1", "p2"]);
+		expect(result.enqueued).toBe(2);
 	});
 
 	it("skips records whose URI doesn't match the expected collection (defensive)", async () => {
-		const queue = new CapturingQueue();
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
 		const collection = WANTED_COLLECTIONS[0];
 		if (!collection) throw new Error("test assumes ≥1 collection");
@@ -311,110 +318,44 @@ describe("backfillDid", () => {
 				{ uri: `at://${DID_A}/${collection}/`, cid: "c3", value: {} },
 			],
 		});
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
 
-		const sentForCollection = queue.sent.filter((j) => j.collection === collection);
-		expect(sentForCollection.map((j) => j.rkey)).toEqual(["legit"]);
+		const result = await processBackfillJob(
+			{ did: DID_A, collection },
+			{ resolver, queue, fetch: fetchImpl },
+		);
+
+		expect(queue.sent.map((j) => j.rkey)).toEqual(["legit"]);
 		expect(result.enqueued).toBe(1);
 	});
-});
 
-describe("backfillDids", () => {
-	it("processes multiple DIDs serially and aggregates the summary", async () => {
-		const queue = new CapturingQueue();
+	it("rejects records with malformed rkey (atproto rkey grammar violation)", async () => {
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
 		const collection = WANTED_COLLECTIONS[0];
 		if (!collection) throw new Error("test assumes ≥1 collection");
-		const fetchImpl: typeof fetch = async (input) => {
-			const url =
-				typeof input === "string"
-					? new URL(input)
-					: input instanceof URL
-						? input
-						: new URL(input.url);
-			if (url.searchParams.get("collection") !== collection) {
-				return new Response(JSON.stringify({ records: [] }), { status: 200 });
-			}
-			const did = url.searchParams.get("repo") ?? "";
-			return new Response(
-				JSON.stringify({
-					records: [{ uri: `at://${did}/${collection}/x`, cid: "c", value: {} }],
-				}),
-				{ status: 200, headers: { "content-type": "application/json" } },
-			);
-		};
-
-		const summary = await backfillDids([DID_A, DID_B], {
-			resolver,
-			queue,
-			fetch: fetchImpl,
-		});
-
-		expect(summary.totalEnqueued).toBe(2);
-		expect(summary.results).toHaveLength(2);
-		expect(summary.results.map((r) => r.did)).toEqual([DID_A, DID_B]);
-		expect(summary.results.every((r) => r.errors.length === 0)).toBe(true);
-	});
-
-	it("doesn't let one DID's failure stop the others", async () => {
-		const queue = new CapturingQueue();
-		// Resolver succeeds for DID_A, fails for DID_B (or any other).
-		const cache = new MapDidDocCache();
-		const resolver = new DidResolver({
-			cache,
-			resolver: {
-				resolve: (did) => {
-					if (did === DID_A) {
-						return Promise.resolve({
-							id: did as `did:${string}:${string}`,
-							verificationMethod: [
-								{
-									id: `${did}#atproto`,
-									type: "Multikey",
-									controller: did as `did:${string}:${string}`,
-									publicKeyMultibase: signingKeyMultibase,
-								},
-							],
-							service: [
-								{
-									id: "#atproto_pds",
-									type: "AtprotoPersonalDataServer",
-									serviceEndpoint: PDS,
-								},
-							],
-						});
-					}
-					return Promise.reject(new Error("DID_B unresolvable"));
-				},
-			},
-			ttlMs: 1_000_000,
-			now: () => new Date(),
-		});
-		const collection = WANTED_COLLECTIONS[0];
-		if (!collection) throw new Error("test assumes ≥1 collection");
 		const fetchImpl = makeFetch({
-			[collection]: [{ uri: `at://${DID_A}/${collection}/x`, cid: "c", value: {} }],
-		});
-
-		const summary = await backfillDids([DID_B, DID_A], {
-			resolver,
-			queue,
-			fetch: fetchImpl,
+			[collection]: [
+				{ uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} },
+				{ uri: `at://${DID_A}/${collection}/has?queryparam`, cid: "c2", value: {} },
+				{ uri: `at://${DID_A}/${collection}/has#fragment`, cid: "c3", value: {} },
+				{ uri: `at://${DID_A}/${collection}/has space`, cid: "c4", value: {} },
+			],
 		});
 
-		expect(summary.totalEnqueued).toBe(1);
-		expect(summary.results).toHaveLength(2);
-		const bResult = summary.results.find((r) => r.did === DID_B);
-		const aResult = summary.results.find((r) => r.did === DID_A);
-		expect(bResult?.errors.length).toBe(1);
-		expect(aResult?.errors).toEqual([]);
-		expect(aResult?.enqueued).toBe(1);
+		const result = await processBackfillJob(
+			{ did: DID_A, collection },
+			{ resolver, queue, fetch: fetchImpl },
+		);
+		expect(queue.sent.map((j) => j.rkey)).toEqual(["legit"]);
+		expect(result.enqueued).toBe(1);
 	});
 
 	it("end-to-end against the production D1 cache: DID is registered in known_publishers", async () => {
-		const queue = new CapturingQueue();
+		const queue = new CapturingRecordsQueue();
 		const { createD1DidDocCache } = await import("../src/did-resolver.js");
 		const cache = createD1DidDocCache(testEnv.DB);
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
 		const resolver = new DidResolver({
 			cache,
 			resolver: {
@@ -441,7 +382,7 @@ describe("backfillDids", () => {
 		});
 		const fetchImpl = makeFetch({});
 
-		await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
+		await processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl });
 
 		const row = await testEnv.DB.prepare(
 			`SELECT did, pds, signing_key, signing_key_id FROM known_publishers WHERE did = ?`,
@@ -453,25 +394,16 @@ describe("backfillDids", () => {
 	});
 });
 
-describe("backfillCollection: defenses against malicious / buggy PDS", () => {
+describe("processBackfillJob: defenses against malicious / buggy PDS", () => {
 	it("aborts after MAX_PAGES_PER_COLLECTION when cursor never empties", async () => {
-		const queue = new CapturingQueue();
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
 		const collection = WANTED_COLLECTIONS[0];
 		if (!collection) throw new Error("test assumes ≥1 collection");
 		// Hostile PDS: returns a different non-empty cursor every call so the
 		// cursor-equality check doesn't fire — only the page cap stops us.
 		let counter = 0;
-		const fetchImpl: typeof fetch = async (input) => {
-			const url =
-				typeof input === "string"
-					? new URL(input)
-					: input instanceof URL
-						? input
-						: new URL(input.url);
-			if (url.searchParams.get("collection") !== collection) {
-				return new Response(JSON.stringify({ records: [] }), { status: 200 });
-			}
+		const fetchImpl: typeof fetch = async () => {
 			counter += 1;
 			return new Response(JSON.stringify({ records: [], cursor: `cursor-${counter}` }), {
 				status: 200,
@@ -479,32 +411,21 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => {
 			});
 		};
 
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
-
-		expect(result.errors.length).toBeGreaterThanOrEqual(1);
-		expect(result.errors.some((e) => e.includes("exceeded"))).toBe(true);
+		await expect(
+			processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }),
+		).rejects.toThrow(/exceeded/);
 		// Loop ran at most MAX_PAGES_PER_COLLECTION times.
-		expect(counter).toBeLessThanOrEqual(1001); // +1 for the throw-on-the-next iteration
+		expect(counter).toBeLessThanOrEqual(1001);
 	});
 
 	it("aborts when the PDS returns the identical cursor twice", async () => {
-		const queue = new CapturingQueue();
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
 		const collection = WANTED_COLLECTIONS[0];
 		if (!collection) throw new Error("test assumes ≥1 collection");
-		// Buggy PDS: echoes the cursor we sent. Cursor-equality should fire
-		// on the second iteration.
+		// Buggy PDS: echoes the cursor we sent.
 		let calls = 0;
-		const fetchImpl: typeof fetch = async (input) => {
-			const url =
-				typeof input === "string"
-					? new URL(input)
-					: input instanceof URL
-						? input
-						: new URL(input.url);
-			if (url.searchParams.get("collection") !== collection) {
-				return new Response(JSON.stringify({ records: [] }), { status: 200 });
-			}
+		const fetchImpl: typeof fetch = async () => {
 			calls += 1;
 			return new Response(JSON.stringify({ records: [], cursor: "stuck" }), {
 				status: 200,
@@ -512,14 +433,14 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => {
 			});
 		};
 
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
-
-		expect(result.errors.some((e) => e.includes("identical cursor"))).toBe(true);
+		await expect(
+			processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }),
+		).rejects.toThrow(/identical cursor/);
 		expect(calls).toBe(2); // first page, then second page caught the dupe
 	});
 
-	it("treats 404 mid-pagination as a partial failure (not silent zero)", async () => {
-		const queue = new CapturingQueue();
+	it("treats 404 mid-pagination as a partial failure (throws)", async () => {
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
 		const collection = WANTED_COLLECTIONS[0];
 		if (!collection) throw new Error("test assumes ≥1 collection");
@@ -530,12 +451,8 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => {
 					: input instanceof URL
 						? input
 						: new URL(input.url);
-			if (url.searchParams.get("collection") !== collection) {
-				return new Response(JSON.stringify({ records: [] }), { status: 200 });
-			}
 			const cursor = url.searchParams.get("cursor");
 			if (!cursor) {
-				// First page succeeds with a cursor.
 				return new Response(
 					JSON.stringify({
 						records: [{ uri: `at://${DID_A}/${collection}/p1`, cid: "c1", value: {} }],
@@ -544,18 +461,16 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => {
 					{ status: 200, headers: { "content-type": "application/json" } },
 				);
 			}
-			// Second page 404.
 			return new Response("not found", { status: 404 });
 		};
 
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
-
-		const error = result.errors.find((e) => e.includes(collection));
-		expect(error).toMatch(/404 mid-pagination/);
+		await expect(
+			processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }),
+		).rejects.toThrow(/404 mid-pagination/);
 	});
 
 	it("rejects pages with > MAX_RECORDS_PER_PAGE records (PDS oversize attack)", async () => {
-		const queue = new CapturingQueue();
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
 		const collection = WANTED_COLLECTIONS[0];
 		if (!collection) throw new Error("test assumes ≥1 collection");
@@ -566,104 +481,316 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => {
 		}));
 		const fetchImpl = makeFetch({ [collection]: records });
 
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
-
-		expect(result.errors.some((e) => e.includes("per-page cap"))).toBe(true);
-		// No records enqueued for the over-capped collection.
-		expect(queue.sent.filter((j) => j.collection === collection)).toHaveLength(0);
+		await expect(
+			processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }),
+		).rejects.toThrow(/per-page cap/);
+		expect(queue.sent).toHaveLength(0);
 	});
 
-	it("rejects records with malformed rkey (atproto rkey grammar violation)", async () => {
-		const queue = new CapturingQueue();
+	it("aborts a hung PDS fetch via the listRecords timeout", async () => {
+		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
 		const collection = WANTED_COLLECTIONS[0];
 		if (!collection) throw new Error("test assumes ≥1 collection");
-		const fetchImpl = makeFetch({
-			[collection]: [
-				{ uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} },
-				{ uri: `at://${DID_A}/${collection}/has?queryparam`, cid: "c2", value: {} },
-				{ uri: `at://${DID_A}/${collection}/has#fragment`, cid: "c3", value: {} },
-				{ uri: `at://${DID_A}/${collection}/has space`, cid: "c4", value: {} },
-			],
-		});
+		const fetchImpl: typeof fetch = (_input, init) =>
+			new Promise((_resolve, reject) => {
+				init?.signal?.addEventListener("abort", () => {
+					reject(new DOMException("aborted", "AbortError"));
+				});
+			});
 
-		const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl });
-		const sent = queue.sent.filter((j) => j.collection === collection);
-		expect(sent.map((j) => j.rkey)).toEqual(["legit"]);
-		expect(result.enqueued).toBe(1);
+		await expect(
+			processBackfillJob(
+				{ did: DID_A, collection },
+				{ resolver, queue, fetch: fetchImpl, listRecordsTimeoutMs: 25 },
+			),
+		).rejects.toThrow(/timed out after 25ms/);
 	});
 });
 
-describe("backfillCollection: enqueue budget cap", () => {
-	it("stops enqueuing when MAX_TOTAL_ENQUEUE is reached and reports the partial count", async () => {
-		// Force the cap by giving a tiny budget. 50 records × 4 collections =
-		// 200 records the worker would otherwise enqueue; budget of 30 should
-		// cap it after the first collection's first page.
-		const queue = new CapturingQueue();
-		const resolver = buildResolver();
-		const recordsByCollection: Record<string, MockListRecord[]> = {};
-		for (const c of WANTED_COLLECTIONS) {
-			recordsByCollection[c] = Array.from({ length: 50 }, (_, i) => ({
-				uri: `at://${DID_A}/${c}/r${i}`,
-				cid: `c${i}`,
-				value: {},
-			}));
+describe("enqueueBackfillJobs", () => {
+	it("emits one job per (DID × WANTED_COLLECTIONS) pair", async () => {
+		const queue = new CapturingBackfillQueue();
+		const enqueued = await enqueueBackfillJobs([DID_A, DID_B], queue);
+
+		const expected = 2 * WANTED_COLLECTIONS.length;
+		expect(enqueued).toBe(expected);
+		expect(queue.sent).toHaveLength(expected);
+
+		// Cartesian shape: every DID appears with every collection exactly once.
+		for (const did of [DID_A, DID_B]) {
+			for (const collection of WANTED_COLLECTIONS) {
+				expect(queue.sent.filter((j) => j.did === did && j.collection === collection)).toHaveLength(
+					1,
+				);
+			}
 		}
-		const fetchImpl = makeFetch(recordsByCollection);
-		const result = await backfillDid(
-			DID_A,
-			{ resolver, queue, fetch: fetchImpl },
-			{ remaining: 30 },
+	});
+
+	it("batches sendBatch calls at QUEUE_SEND_BATCH_CAP", async () => {
+		const queue = new CapturingBackfillQueue();
+		// Pick enough DIDs that the total job count exceeds the cap. With
+		// QUEUE_SEND_BATCH_CAP = 100 and 4 collections, 30 DIDs → 120 jobs
+		// → batches of [100, 20].
+		const dids = Array.from(
+			{ length: 30 },
+			(_, i) => `did:plc:bulk${i.toString().padStart(20, "0")}`,
 		);
-		// Budget=30 + page of 50 records: inner loop pushes records while
-		// `budget.remaining - messages.length > 0`. At iter 31 the
-		// expression (30 - 30) ≤ 0 fires `break`. messages.length === 30,
-		// sendBatch sends 30, budget → 0, post-send check throws
-		// EnqueueLimitReached. Exact answer is 30 — assert the exact
-		// number so an off-by-one regression in the inner break doesn't
-		// pass under a slack `<=`.
-		expect(result.enqueued).toBe(30);
-		expect(queue.sent.length).toBe(30);
-		// Sentinel error appended so the operator can see the cap fired
-		// rather than thinking the DID was a clean 30-record publisher.
-		expect(result.errors.some((e) => e.includes("enqueue cap reached"))).toBe(true);
+		const total = dids.length * WANTED_COLLECTIONS.length;
+		const expectedBatches: number[] = [];
+		for (let i = 0; i < total; i += QUEUE_SEND_BATCH_CAP) {
+			expectedBatches.push(Math.min(QUEUE_SEND_BATCH_CAP, total - i));
+		}
+
+		const enqueued = await enqueueBackfillJobs(dids, queue);
+
+		expect(enqueued).toBe(total);
+		expect(queue.batches).toEqual(expectedBatches);
+		expect(queue.batches.every((n) => n <= QUEUE_SEND_BATCH_CAP)).toBe(true);
+	});
+
+	it("emits no batches for an empty DID list", async () => {
+		const queue = new CapturingBackfillQueue();
+		const enqueued = await enqueueBackfillJobs([], queue);
+		expect(enqueued).toBe(0);
+		expect(queue.batches).toEqual([]);
+		expect(queue.sent).toHaveLength(0);
 	});
 });
 
-describe("backfillCollection: listRecords timeout", () => {
-	it("aborts a hung PDS fetch and surfaces the timeout per-collection", async () => {
-		const queue = new CapturingQueue();
+describe("processBackfillBatch (consumer)", () => {
+	it("acks each message after a successful per-pair run", async () => {
+		const recordsQueue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
 		const collection = WANTED_COLLECTIONS[0];
 		if (!collection) throw new Error("test assumes ≥1 collection");
-		const fetchImpl: typeof fetch = (input, init) => {
+		const fetchImpl = makeFetch({
+			[collection]: [{ uri: `at://${DID_A}/${collection}/r1`, cid: "c", value: {} }],
+		});
+
+		const message = new FakeMessage<BackfillJob>({ did: DID_A, collection });
+		await processBackfillBatch({ messages: [message] }, {} as Env, {
+			resolver,
+			queue: recordsQueue,
+			fetch: fetchImpl,
+		});
+
+		expect(message.acked).toBe(1);
+		expect(message.retried).toBe(0);
+		expect(recordsQueue.sent).toHaveLength(1);
+	});
+
+	it("retries when processBackfillJob throws (transient PDS failure)", async () => {
+		const recordsQueue = new CapturingRecordsQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl = makeFetch({}, { status: { [collection]: 503 } });
+
+		const message = new FakeMessage<BackfillJob>({ did: DID_A, collection });
+		await processBackfillBatch({ messages: [message] }, {} as Env, {
+			resolver,
+			queue: recordsQueue,
+			fetch: fetchImpl,
+		});
+
+		expect(message.retried).toBe(1);
+		expect(message.acked).toBe(0);
+	});
+
+	it("does not let one failed job poison the rest of the batch", async () => {
+		const recordsQueue = new CapturingRecordsQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		const collection2 = WANTED_COLLECTIONS[1];
+		if (!collection || !collection2) throw new Error("test assumes ≥2 collections");
+		// First collection 503s; second succeeds with one record.
+		const fetchImpl: typeof fetch = async (input) => {
 			const url =
 				typeof input === "string"
 					? new URL(input)
 					: input instanceof URL
 						? input
 						: new URL(input.url);
-			if (url.searchParams.get("collection") !== collection) {
-				return Promise.resolve(new Response(JSON.stringify({ records: [] }), { status: 200 }));
+			const c = url.searchParams.get("collection");
+			if (c === collection) return new Response("err", { status: 503 });
+			return new Response(
+				JSON.stringify({
+					records: [{ uri: `at://${DID_A}/${c}/r1`, cid: "c", value: {} }],
+				}),
+				{ status: 200, headers: { "content-type": "application/json" } },
+			);
+		};
+
+		const failing = new FakeMessage<BackfillJob>({ did: DID_A, collection });
+		const succeeding = new FakeMessage<BackfillJob>({ did: DID_A, collection: collection2 });
+		await processBackfillBatch({ messages: [failing, succeeding] }, {} as Env, {
+			resolver,
+			queue: recordsQueue,
+			fetch: fetchImpl,
+		});
+
+		expect(failing.retried).toBe(1);
+		expect(failing.acked).toBe(0);
+		expect(succeeding.acked).toBe(1);
+		expect(succeeding.retried).toBe(0);
+		expect(recordsQueue.sent).toHaveLength(1);
+		expect(recordsQueue.sent[0]?.collection).toBe(collection2);
+	});
+});
+
+describe("discoverDids: listReposByCollection enumeration", () => {
+	const RELAY = "https://relay.test.example";
+
+	function makeRelayFetch(reposByCollection: Record<string, Array<{ did: string }>>): typeof fetch {
+		return async (input) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			if (!url.pathname.endsWith("/xrpc/com.atproto.sync.listReposByCollection")) {
+				return new Response("not stubbed", { status: 599 });
 			}
-			// Hung connection: never resolve, but honour the abort signal so
-			// the AbortController-driven timeout fires.
-			return new Promise((_resolve, reject) => {
+			const collection = url.searchParams.get("collection") ?? "";
+			const repos = reposByCollection[collection] ?? [];
+			return new Response(JSON.stringify({ repos }), {
+				status: 200,
+				headers: { "content-type": "application/json" },
+			});
+		};
+	}
+
+	it("returns the union of distinct DIDs across all WANTED_COLLECTIONS", async () => {
+		const c0 = WANTED_COLLECTIONS[0];
+		const c1 = WANTED_COLLECTIONS[1];
+		if (!c0 || !c1) throw new Error("test assumes ≥2 collections");
+		const fetchImpl = makeRelayFetch({
+			[c0]: [{ did: "did:plc:a" }, { did: "did:plc:b" }],
+			[c1]: [{ did: "did:plc:b" }, { did: "did:plc:c" }],
+		});
+
+		const dids = await discoverDids(RELAY, { fetch: fetchImpl });
+
+		expect(new Set(dids)).toEqual(new Set(["did:plc:a", "did:plc:b", "did:plc:c"]));
+	});
+
+	it("paginates via cursor", async () => {
+		const c0 = WANTED_COLLECTIONS[0];
+		if (!c0) throw new Error("test assumes ≥1 collection");
+		let calls = 0;
+		const fetchImpl: typeof fetch = async (input) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			if (url.searchParams.get("collection") !== c0) {
+				return new Response(JSON.stringify({ repos: [] }), { status: 200 });
+			}
+			calls += 1;
+			const cursor = url.searchParams.get("cursor");
+			if (!cursor) {
+				return new Response(JSON.stringify({ repos: [{ did: "did:plc:a" }], cursor: "next" }), {
+					status: 200,
+					headers: { "content-type": "application/json" },
+				});
+			}
+			return new Response(JSON.stringify({ repos: [{ did: "did:plc:b" }] }), {
+				status: 200,
+				headers: { "content-type": "application/json" },
+			});
+		};
+
+		const dids = await discoverDids(RELAY, { fetch: fetchImpl });
+
+		expect(calls).toBe(2);
+		expect(new Set(dids)).toEqual(new Set(["did:plc:a", "did:plc:b"]));
+	});
+
+	it("logs and continues when one collection's listReposByCollection fails", async () => {
+		const c0 = WANTED_COLLECTIONS[0];
+		const c1 = WANTED_COLLECTIONS[1];
+		if (!c0 || !c1) throw new Error("test assumes ≥2 collections");
+		const fetchImpl: typeof fetch = async (input) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			const collection = url.searchParams.get("collection");
+			if (collection === c0) return new Response("relay broken", { status: 503 });
+			if (collection === c1) {
+				return new Response(JSON.stringify({ repos: [{ did: "did:plc:c1" }] }), {
+					status: 200,
+					headers: { "content-type": "application/json" },
+				});
+			}
+			return new Response(JSON.stringify({ repos: [] }), { status: 200 });
+		};
+
+		const dids = await discoverDids(RELAY, { fetch: fetchImpl });
+
+		expect(dids).toContain("did:plc:c1");
+	});
+
+	it("aborts a hung relay fetch via the timeout", async () => {
+		const fetchImpl: typeof fetch = (_input, init) =>
+			new Promise((_resolve, reject) => {
 				init?.signal?.addEventListener("abort", () => {
 					reject(new DOMException("aborted", "AbortError"));
 				});
 			});
+
+		const dids = await discoverDids(RELAY, { fetch: fetchImpl, timeoutMs: 25 });
+
+		// Every collection's discovery throws "timed out"; the function returns
+		// an empty set rather than propagating the error.
+		expect(dids).toEqual([]);
+	});
+
+	it("stops enumerating once MAX_DISCOVERED_DIDS is hit (defense vs runaway relay)", async () => {
+		const c0 = WANTED_COLLECTIONS[0];
+		if (!c0) throw new Error("test assumes ≥1 collection");
+		// Relay returns MAX_DISCOVERED_DIDS+50 repos in one page for the first
+		// collection. discoverDids should add exactly MAX_DISCOVERED_DIDS DIDs
+		// then stop without paging further or hitting the next collection.
+		let pdsCalls = 0;
+		const fetchImpl: typeof fetch = async (input) => {
+			const url =
+				typeof input === "string"
+					? new URL(input)
+					: input instanceof URL
+						? input
+						: new URL(input.url);
+			pdsCalls += 1;
+			const collection = url.searchParams.get("collection");
+			if (collection !== c0) {
+				// Should never be reached if the cap fires.
+				return new Response(JSON.stringify({ repos: [{ did: "did:plc:later" }] }), {
+					status: 200,
+					headers: { "content-type": "application/json" },
+				});
+			}
+			const repos = Array.from({ length: MAX_DISCOVERED_DIDS + 50 }, (_, i) => ({
+				did: `did:plc:cap${i.toString().padStart(20, "0")}`,
+			}));
+			return new Response(JSON.stringify({ repos }), {
+				status: 200,
+				headers: { "content-type": "application/json" },
+			});
 		};
 
-		const result = await backfillDid(DID_A, {
-			resolver,
-			queue,
-			fetch: fetchImpl,
-			listRecordsTimeoutMs: 25,
-		});
+		const dids = await discoverDids("https://relay.test.example", { fetch: fetchImpl });
 
-		const error = result.errors.find((e) => e.includes(collection));
-		expect(error).toMatch(/timed out after 25ms/);
+		expect(dids).toHaveLength(MAX_DISCOVERED_DIDS);
+		expect(dids).not.toContain("did:plc:later");
+		// Only the first collection was queried; the cap fired before reaching
+		// any subsequent collection.
+		expect(pdsCalls).toBe(1);
 	});
 });
 
@@ -698,7 +825,7 @@ describe("backfill admin route: auth + input validation", () => {
 		expect(res.status).toBe(405);
 	});
 
-	it("returns 400 on missing dids field", async () => {
+	it("accepts an empty body and triggers discovery (production cold-start path)", async () => {
 		const res = await SELF.fetch("https://test/_admin/backfill", {
 			method: "POST",
 			headers: {
@@ -707,11 +834,31 @@ describe("backfill admin route: auth + input validation", () => {
 			},
 			body: JSON.stringify({}),
 		});
+		expect(res.status).toBe(202);
+	});
+
+	it("accepts a literal empty request body (no JSON) and triggers discovery", async () => {
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: { authorization: "Bearer test-admin-token" },
+		});
+		expect(res.status).toBe(202);
+	});
+
+	it("returns 400 on a non-array `dids` value (string instead of array)", async () => {
+		const res = await SELF.fetch("https://test/_admin/backfill", {
+			method: "POST",
+			headers: {
+				"content-type": "application/json",
+				authorization: "Bearer test-admin-token",
+			},
+			body: JSON.stringify({ dids: "did:plc:foo" }),
+		});
 		expect(res.status).toBe(400);
 		expect(await res.text()).toContain("must be an array");
 	});
 
-	it("returns 400 on empty dids array", async () => {
+	it("returns 400 on empty dids array (suggests omitting the field for discovery)", async () => {
 		const res = await SELF.fetch("https://test/_admin/backfill", {
 			method: "POST",
 			headers: {
@@ -721,12 +868,16 @@ describe("backfill admin route: auth + input validation", () => {
 			body: JSON.stringify({ dids: [] }),
 		});
 		expect(res.status).toBe(400);
-		expect(await res.text()).toContain("not be empty");
+		const text = await res.text();
+		expect(text).toContain("not be empty");
+		expect(text).toMatch(/discover|omit/i);
 	});
 
 	it("returns 400 on dids list larger than the cap", async () => {
+		// Cap is currently 100 (lowered from 1000 because the queue-fan-out
+		// path amplifies a leaked-token attack). 101 DIDs → over cap.
 		const dids = Array.from(
-			{ length: 1001 },
+			{ length: 101 },
 			(_, i) => `did:plc:test${i.toString().padStart(20, "0")}`,
 		);
 		const res = await SELF.fetch("https://test/_admin/backfill", {
@@ -738,7 +889,7 @@ describe("backfill admin route: auth + input validation", () => {
 			body: JSON.stringify({ dids }),
 		});
 		expect(res.status).toBe(400);
-		expect(await res.text()).toContain("at most 1000");
+		expect(await res.text()).toContain("at most 100");
 	});
 
 	it("returns 400 on malformed DID (caught by DID_PATTERN)", async () => {
diff --git a/apps/aggregator/worker-configuration.d.ts b/apps/aggregator/worker-configuration.d.ts
index 4af216a0d..b16346f3b 100644
--- a/apps/aggregator/worker-configuration.d.ts
+++ b/apps/aggregator/worker-configuration.d.ts
@@ -1,5 +1,5 @@
 /* eslint-disable */
-// Generated by Wrangler by running `wrangler types` (hash: 1f938082c03573892ce397f54a9ca0b6)
+// Generated by Wrangler by running `wrangler types` (hash: 9052900c7b8ec62b9504e6508846deb9)
 // Runtime types generated with workerd@1.20260507.1 2026-02-24 nodejs_compat
 declare namespace Cloudflare {
 	interface GlobalProps {
@@ -9,8 +9,9 @@ declare namespace Cloudflare {
 	interface Env {
 		DB: D1Database;
 		RECORDS_QUEUE: Queue;
+		BACKFILL_QUEUE: Queue;
 		JETSTREAM_URL: "wss://jetstream2.us-east.bsky.network/subscribe";
-		CONSTELLATION_URL: "https://constellation.microcosm.blue";
+		RELAY_URL: "https://bsky.network";
 		ADMIN_TOKEN: string;
 		RECORDS_DO: DurableObjectNamespace<import("./src/index").RecordsJetstreamDO>;
 	}
@@ -21,7 +22,7 @@ type StringifyValues<EnvType extends Record<string, unknown>> = {
 };
 declare namespace NodeJS {
 	interface ProcessEnv extends StringifyValues<
-		Pick<Cloudflare.Env, "JETSTREAM_URL" | "CONSTELLATION_URL" | "ADMIN_TOKEN">
+		Pick<Cloudflare.Env, "JETSTREAM_URL" | "RELAY_URL" | "ADMIN_TOKEN">
 	> {}
 }
 
diff --git a/apps/aggregator/wrangler.jsonc b/apps/aggregator/wrangler.jsonc
index 6e0fc8287..969e2b95c 100644
--- a/apps/aggregator/wrangler.jsonc
+++ b/apps/aggregator/wrangler.jsonc
@@ -25,6 +25,16 @@
 				"binding": "RECORDS_QUEUE",
 				"queue": "emdash-aggregator-records",
 			},
+			{
+				// Backfill orchestrator (POST /_admin/backfill) fans
+				// (DID, collection) pairs onto this queue. Per-pair work
+				// (resolve PDS → listRecords → sendBatch onto RECORDS_QUEUE)
+				// fits well within a single consumer invocation's waitUntil,
+				// solving the 30s wall-clock cap that would otherwise limit
+				// us to ~15–25 DIDs per backfill POST.
+				"binding": "BACKFILL_QUEUE",
+				"queue": "emdash-aggregator-backfill",
+			},
 		],
 		"consumers": [
 			{
@@ -47,6 +57,23 @@
 				"max_batch_timeout": 30,
 				"max_retries": 3,
 			},
+			{
+				// Backfill (DID, collection) consumer. Each job: resolve
+				// PDS, paginate listRecords for one collection, batch
+				// results onto RECORDS_QUEUE. Smaller batch size than the
+				// records consumer because each job involves outbound PDS
+				// fetches; backpressure from RECORDS_QUEUE flows naturally
+				// via this consumer's runtime budget. Failed jobs land in
+				// a DLQ that's intentionally unbound — backfill is
+				// operator-triggered and operator-monitored, so DLQ
+				// drainage is a follow-up concern rather than an
+				// always-on automation requirement.
+				"queue": "emdash-aggregator-backfill",
+				"max_batch_size": 5,
+				"max_batch_timeout": 5,
+				"max_retries": 3,
+				"dead_letter_queue": "emdash-aggregator-backfill-dlq",
+			},
 		],
 	},
 	"durable_objects": {
@@ -77,8 +104,13 @@
 		// Jetstream endpoint. Override per environment for self-hosted relays
 		// or staging backends.
 		"JETSTREAM_URL": "wss://jetstream2.us-east.bsky.network/subscribe",
-		// Constellation HTTP API for cold-start backfill DID discovery.
-		"CONSTELLATION_URL": "https://constellation.microcosm.blue",
+		// Relay base URL implementing `com.atproto.sync.listReposByCollection`,
+		// used by `/_admin/backfill` to enumerate publishers of our NSIDs at
+		// cold-start time. Defaults to Bluesky's main relay (the canonical
+		// atproto relay; same trust orbit as our Jetstream endpoint).
+		// Override for self-hosted indexers (e.g. Microcosm's Lightrail)
+		// if/when we want a non-bsky.network discovery path.
+		"RELAY_URL": "https://bsky.network",
 	},
 	// Required secrets, declared in config so `wrangler types` generates the
 	// typed binding and `wrangler deploy` validates the secret is set on the

From 6a8145b52044dfba35f45b82fe65d856b0d3e288 Mon Sep 17 00:00:00 2001
From: Matt Kane <m@mk.gg>
Date: Sun, 10 May 2026 18:37:00 +0100
Subject: [PATCH 5/8] fix(aggregator): adversarial review fixes for backfill
 queue restructure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Parallelize `enqueueBackfillJobs` sendBatch calls. Serial awaits over
  ~40 batches (worst case from `MAX_DISCOVERED_DIDS` × `WANTED_COLLECTIONS`)
  ate noticeably into the POST handler's 30s waitUntil budget on top of
  discovery's own fetches.
- Drop backfill consumer `max_batch_size` from 5 → 1. Sequential
  processing inside a batch could chain `LIST_RECORDS_TIMEOUT_MS = 15s`
  per page × multiple messages and blow the consumer wall-clock budget.
  At backfill rates (operator-triggered, not steady-state), throughput
  doesn't matter; the simpler invariant does.
- Add `drainBackfillDeadLetterBatch` consumer for the backfill DLQ.
  Mirrors the records-DLQ pattern (log loud + ack) so messages don't
  silently accumulate. No D1 forensics row — the operator's recovery
  action is "re-trigger backfill for the affected DID", which only
  needs the (did, collection) pair from the log line.
- Fix stale comment in the backfill route handler that referenced the
  deleted `backfillDids` orchestrator.
- Update `MAX_BACKFILL_DIDS` comment to acknowledge that
  `MAX_DISCOVERED_DIDS` is the actually-larger ceiling — the explicit
  path is the tighter "operator types it themselves" bucket by design.
---
 apps/aggregator/src/backfill-consumer.ts | 22 ++++++++++++++++
 apps/aggregator/src/backfill.ts          | 10 ++++++-
 apps/aggregator/src/index.ts             | 33 +++++++++++++++++-------
 apps/aggregator/test/backfill.test.ts    | 20 +++++++++++++-
 apps/aggregator/wrangler.jsonc           | 30 ++++++++++++++-------
 5 files changed, 94 insertions(+), 21 deletions(-)

diff --git a/apps/aggregator/src/backfill-consumer.ts b/apps/aggregator/src/backfill-consumer.ts
index 727b0019e..b2c6cc6fb 100644
--- a/apps/aggregator/src/backfill-consumer.ts
+++ b/apps/aggregator/src/backfill-consumer.ts
@@ -73,6 +73,28 @@ export async function processBackfillBatch(
 	}
 }
 
+/**
+ * Drain the backfill DLQ. Mirror of `records-consumer.drainDeadLetterBatch`
+ * but without the D1 forensics row — the recovery action for a backfill
+ * pair that exhausted retries is "re-run backfill for the affected DID",
+ * which only needs the (did, collection) pair from the log line.
+ *
+ * Logs at error level so operators tailing `wrangler tail` see the message
+ * loud, acks so the DLQ doesn't accumulate unbounded.
+ */
+export function drainBackfillDeadLetterBatch(
+	batch: MessageBatchLike<BackfillJob>,
+	_env: Env,
+): void {
+	for (const message of batch.messages) {
+		console.error("[aggregator] backfill DLQ drain: pair exhausted retries", {
+			did: message.body.did,
+			collection: message.body.collection,
+		});
+		message.ack();
+	}
+}
+
 function createProductionDeps(env: Env): ProcessBackfillJobDeps {
 	const composite = new CompositeDidDocumentResolver({
 		methods: {
diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts
index e27b6b7c1..00823dbb4 100644
--- a/apps/aggregator/src/backfill.ts
+++ b/apps/aggregator/src/backfill.ts
@@ -221,9 +221,17 @@ export async function enqueueBackfillJobs(
 			messages.push({ body: { did, collection } });
 		}
 	}
+	// Fan the sendBatch calls in parallel. Each is an independent outbound
+	// sub-request and the orchestrator runs inside the POST handler's 30s
+	// `waitUntil` budget — serial awaits on a `MAX_DISCOVERED_DIDS`-sized
+	// fan-out (1000 DIDs × 4 collections / 100 = 40 batches) would
+	// noticeably eat into the cold-start budget on top of discovery's own
+	// fetches.
+	const sends: Promise<unknown>[] = [];
 	for (let i = 0; i < messages.length; i += QUEUE_SEND_BATCH_CAP) {
-		await queue.sendBatch(messages.slice(i, i + QUEUE_SEND_BATCH_CAP));
+		sends.push(queue.sendBatch(messages.slice(i, i + QUEUE_SEND_BATCH_CAP)));
 	}
+	await Promise.all(sends);
 	return messages.length;
 }
 
diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts
index f238c6974..ef2e89416 100644
--- a/apps/aggregator/src/index.ts
+++ b/apps/aggregator/src/index.ts
@@ -15,7 +15,7 @@
  * Worker boots.
  */
 
-import { processBackfillBatch } from "./backfill-consumer.js";
+import { drainBackfillDeadLetterBatch, processBackfillBatch } from "./backfill-consumer.js";
 import { discoverDids, enqueueBackfillJobs } from "./backfill.js";
 import type { BackfillJob, RecordsJob } from "./env.js";
 import { drainDeadLetterBatch, processBatch } from "./records-consumer.js";
@@ -25,6 +25,7 @@ import { isPlainObject } from "./utils.js";
 const RECORDS_QUEUE_NAME = "emdash-aggregator-records";
 const RECORDS_DLQ_NAME = "emdash-aggregator-records-dlq";
 const BACKFILL_QUEUE_NAME = "emdash-aggregator-backfill";
+const BACKFILL_DLQ_NAME = "emdash-aggregator-backfill-dlq";
 
 export { RecordsJetstreamDO } from "./records-do.js";
 
@@ -58,11 +59,16 @@ const BACKFILL_PATH = "/_admin/backfill";
  * old serial-loop cap (was 1000) because the queue-fan-out path amplifies
  * a leaked-token attack: each submitted DID becomes
  * `WANTED_COLLECTIONS.length` queue messages, each consuming a consumer
- * invocation with its own outbound PDS fetches. 100 × 4 = 400 jobs is
- * still a meaningful operator-recovery batch but a tractable blast radius.
+ * invocation with its own outbound PDS fetches. 100 × 4 = 400 jobs from
+ * the explicit path is a meaningful operator-recovery batch but a
+ * tractable blast radius for the explicit path.
  *
- * Discovery-mode submissions are bounded separately by
- * `MAX_DISCOVERED_DIDS` inside `discoverDids`.
+ * The discovery path (empty body) has a separate ceiling at
+ * `MAX_DISCOVERED_DIDS = 1000` and is therefore actually the larger
+ * worst-case fan-out source — by design, since legitimate-publisher
+ * enumeration is the primary use case and we want the explicit list to
+ * be the tighter "operator types it themselves" bucket. Both paths share
+ * the same per-pair caps in the consumer.
  */
 const MAX_BACKFILL_DIDS = 100;
 const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/;
@@ -208,11 +214,14 @@ export default {
 			if ("error" in parsed) {
 				return new Response(parsed.error, { status: 400 });
 			}
-			// Fire-and-forget via waitUntil so the route returns 202 quickly
-			// regardless of the DID list size. Backfill progress is observable
-			// via Workers logs and the resulting D1 writes; the response body
-			// just confirms the trigger landed. Per-DID errors are logged from
-			// inside backfillDids; callers don't get them in the response.
+			// Fire-and-forget via waitUntil so the route returns 202 quickly.
+			// `runBackfill` only does discovery + queue fan-out (both fast);
+			// per-pair work runs in the BACKFILL_QUEUE consumer with its own
+			// invocation budget. Operator-facing observability:
+			//   - this handler's logs ([aggregator] backfill discovery/enqueue)
+			//   - per-pair logs from `backfill-consumer.ts`
+			//   - DLQ inspection (`emdash-aggregator-backfill-dlq`) for
+			//     pairs that exhausted retries
 			ctx.waitUntil(runBackfill(parsed, env));
 			return new Response(null, { status: 202 });
 		}
@@ -241,6 +250,10 @@ export default {
 				// eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name
 				await processBackfillBatch(batch as MessageBatch<BackfillJob>, env);
 				return;
+			case BACKFILL_DLQ_NAME:
+				// eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name
+				drainBackfillDeadLetterBatch(batch as MessageBatch<BackfillJob>, env);
+				return;
 			default:
 				console.error("[aggregator] unknown queue, acking batch", { queue: batch.queue });
 				for (const m of batch.messages) m.ack();
diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts
index 5dffcf168..16259342a 100644
--- a/apps/aggregator/test/backfill.test.ts
+++ b/apps/aggregator/test/backfill.test.ts
@@ -21,7 +21,7 @@ import type { Did } from "@atcute/lexicons/syntax";
 import { applyD1Migrations, env, SELF } from "cloudflare:test";
 import { beforeAll, beforeEach, describe, expect, it } from "vitest";
 
-import { processBackfillBatch } from "../src/backfill-consumer.js";
+import { drainBackfillDeadLetterBatch, processBackfillBatch } from "../src/backfill-consumer.js";
 import {
 	type BackfillQueueProducer,
 	discoverDids,
@@ -639,6 +639,24 @@ describe("processBackfillBatch (consumer)", () => {
 	});
 });
 
+describe("drainBackfillDeadLetterBatch", () => {
+	it("acks every dead-lettered job (DLQ doesn't accumulate)", () => {
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const messages = [
+			new FakeMessage<BackfillJob>({ did: DID_A, collection }),
+			new FakeMessage<BackfillJob>({ did: DID_B, collection }),
+		];
+
+		drainBackfillDeadLetterBatch({ messages }, {} as Env);
+
+		for (const m of messages) {
+			expect(m.acked).toBe(1);
+			expect(m.retried).toBe(0);
+		}
+	});
+});
+
 describe("discoverDids: listReposByCollection enumeration", () => {
 	const RELAY = "https://relay.test.example";
 
diff --git a/apps/aggregator/wrangler.jsonc b/apps/aggregator/wrangler.jsonc
index 969e2b95c..dd2b16412 100644
--- a/apps/aggregator/wrangler.jsonc
+++ b/apps/aggregator/wrangler.jsonc
@@ -60,20 +60,32 @@
 			{
 				// Backfill (DID, collection) consumer. Each job: resolve
 				// PDS, paginate listRecords for one collection, batch
-				// results onto RECORDS_QUEUE. Smaller batch size than the
-				// records consumer because each job involves outbound PDS
-				// fetches; backpressure from RECORDS_QUEUE flows naturally
-				// via this consumer's runtime budget. Failed jobs land in
-				// a DLQ that's intentionally unbound — backfill is
-				// operator-triggered and operator-monitored, so DLQ
-				// drainage is a follow-up concern rather than an
-				// always-on automation requirement.
+				// results onto RECORDS_QUEUE. `max_batch_size: 1` so each
+				// pair gets its own consumer invocation — listRecords can
+				// chain ~15s per page × `MAX_PAGES_PER_COLLECTION`, so
+				// batching multiple pairs per invocation risks blowing
+				// the consumer wall-clock budget. Throughput is fine
+				// because backfill is operator-triggered, not steady-state.
 				"queue": "emdash-aggregator-backfill",
-				"max_batch_size": 5,
+				"max_batch_size": 1,
 				"max_batch_timeout": 5,
 				"max_retries": 3,
 				"dead_letter_queue": "emdash-aggregator-backfill-dlq",
 			},
+			{
+				// Drains the backfill DLQ. Logs each dead-lettered pair
+				// loud enough that operators tailing logs see it, then
+				// acks so the DLQ doesn't accumulate unbounded. We don't
+				// write a forensics row to D1 here (no `backfill_dead_letters`
+				// table) because the operator's recovery action is
+				// "re-trigger backfill for the affected DID" — they don't
+				// need the per-pair payload, just the (did, collection)
+				// pair, which is on the log line.
+				"queue": "emdash-aggregator-backfill-dlq",
+				"max_batch_size": 25,
+				"max_batch_timeout": 30,
+				"max_retries": 3,
+			},
 		],
 	},
 	"durable_objects": {

From e463cf73e078405d1c108826a4391ad41088b0f0 Mon Sep 17 00:00:00 2001
From: Matt Kane <m@mk.gg>
Date: Sun, 10 May 2026 18:48:31 +0100
Subject: [PATCH 6/8] fix(aggregator): use library helpers for AT-URI parsing +
 crypto compare
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three Copilot review comments on PR #979 — all pointing at hand-rolled
code that should be using library functions:

- Swap hand-rolled `timingSafeEqual` (XOR loop) for workerd's audited
  `crypto.subtle.timingSafeEqual`. Update the comment to be honest about
  the residual length-leak (the primitive returns false on length
  mismatch, so the strict zero-leak guarantee only holds against
  same-length inputs — acceptable here because ADMIN_TOKEN's length is
  fixed at deploy time and any realistic length-via-timing attack
  requires more requests than other defences would tolerate).

- Swap hand-rolled `parseRkeyFromUri` for `parseCanonicalResourceUri`
  from `@atcute/lexicons/syntax`. Library validates the DID, NSID, and
  rkey grammars in one call. Adds an explicit DID-equality check so a
  buggy/malicious PDS that returns records under another publisher's
  DID is dropped at source rather than churning dead-letters in the
  records consumer.

- Make `extractListRecordsBody`/`extractCursor` throw on malformed
  bodies instead of silently returning `[]`/`undefined`. A 200 response
  with the wrong shape is upstream breakage and the operator-triggered
  integrity path should surface that loudly. Throws propagate out via
  the queue consumer's retry path.

Also adds tests for the new strict validation paths and the
DID-mismatch rejection.
---
 apps/aggregator/src/backfill.ts       | 81 ++++++++++++++++-----------
 apps/aggregator/src/index.ts          | 22 +++++---
 apps/aggregator/test/backfill.test.ts | 72 ++++++++++++++++++++++++
 3 files changed, 134 insertions(+), 41 deletions(-)

diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts
index 00823dbb4..db0878ff8 100644
--- a/apps/aggregator/src/backfill.ts
+++ b/apps/aggregator/src/backfill.ts
@@ -32,6 +32,8 @@
  * no periodic scheduler — see plan §"Why no reconciliation cron".
  */
 
+import { parseCanonicalResourceUri } from "@atcute/lexicons/syntax";
+
 import { WANTED_COLLECTIONS } from "./constants.js";
 import type { DidResolver } from "./did-resolver.js";
 import type { BackfillJob, RecordsJob } from "./env.js";
@@ -68,9 +70,6 @@ if (MAX_RECORDS_PER_PAGE > QUEUE_SEND_BATCH_CAP) {
 		`MAX_RECORDS_PER_PAGE (${MAX_RECORDS_PER_PAGE}) exceeds QUEUE_SEND_BATCH_CAP (${QUEUE_SEND_BATCH_CAP})`,
 	);
 }
-/** Atproto rkey grammar: ALPHA / DIGIT / "." / "-" / "_" / ":" / "~". */
-const RKEY_PATTERN = /^[A-Za-z0-9._:~-]{1,512}$/;
-
 /** Producer-side records queue surface. The production binding
  * `env.RECORDS_QUEUE` satisfies this; tests pass an in-memory implementation. */
 export interface RecordsQueueProducer {
@@ -370,13 +369,22 @@ async function paginateAndEnqueue(opts: PaginateOpts): Promise<number> {
 
 		const messages: { body: RecordsJob }[] = [];
 		for (const record of records) {
-			const rkey = parseRkeyFromUri(record.uri, opts.collection);
-			if (!rkey) continue;
+			const parsed = parseCanonicalResourceUri(record.uri);
+			if (!parsed.ok) continue;
+			// Defence vs. a buggy/malicious PDS that returns records under
+			// a different DID (or a different collection) than the one we
+			// asked for. Such jobs would never verify (signature would be
+			// from a different key) and would just churn dead-letters; drop
+			// at the source. `parseCanonicalResourceUri` already validated
+			// the rkey grammar and the collection NSID for us, so we only
+			// need the cross-checks here.
+			if (parsed.value.repo !== opts.did) continue;
+			if (parsed.value.collection !== opts.collection) continue;
 			messages.push({
 				body: {
 					did: opts.did,
 					collection: opts.collection,
-					rkey,
+					rkey: parsed.value.rkey,
 					operation: "create",
 					cid: record.cid,
 				},
@@ -399,44 +407,51 @@ interface ListRecordEntry {
 	value: unknown;
 }
 
+/**
+ * Parse a `com.atproto.repo.listRecords` response body. Throws on any
+ * structural mismatch — a PDS that 200s with the wrong shape is upstream
+ * breakage, not "no records", and silently treating it as the latter
+ * causes operator-invisible partial backfills. The thrown error
+ * propagates out of `processBackfillJob` and the queue consumer retries
+ * (then DLQs) per the standard policy.
+ */
 function extractListRecordsBody(body: unknown): ListRecordEntry[] {
-	if (!isPlainObject(body)) return [];
+	if (!isPlainObject(body)) {
+		throw new Error("listRecords response was not a JSON object");
+	}
 	const records = body["records"];
-	if (!Array.isArray(records)) return [];
+	if (!Array.isArray(records)) {
+		throw new Error("listRecords response missing `records` array");
+	}
 	const out: ListRecordEntry[] = [];
 	for (const r of records) {
-		if (!isPlainObject(r)) continue;
+		if (!isPlainObject(r)) {
+			throw new Error("listRecords record entry was not a JSON object");
+		}
 		const uri = r["uri"];
 		const cid = r["cid"];
-		if (typeof uri !== "string" || typeof cid !== "string") continue;
+		if (typeof uri !== "string" || typeof cid !== "string") {
+			throw new Error("listRecords record entry missing string `uri` or `cid`");
+		}
 		out.push({ uri, cid, value: r["value"] });
 	}
 	return out;
 }
 
-function extractCursor(body: unknown): string | undefined {
-	if (!isPlainObject(body)) return undefined;
-	const cursor = body["cursor"];
-	return typeof cursor === "string" ? cursor : undefined;
-}
-
 /**
- * Extract the rkey from an AT URI of the shape `at://did/collection/rkey`
- * and validate it against the atproto rkey grammar. Returns null if any
- * step fails; callers treat that as "this isn't a record we recognise"
- * and skip the entry without aborting the page.
+ * Pull the optional `cursor` out of a `listRecords` response. `undefined`
+ * (no key) is the spec-compliant signal for "end of pagination"; any other
+ * non-string value (number, object, etc.) is a PDS bug and throws so the
+ * pagination loop doesn't silently terminate on the wrong page.
  */
-function parseRkeyFromUri(uri: string, expectedCollection: string): string | null {
-	const expectedPrefix = `at://`;
-	if (!uri.startsWith(expectedPrefix)) return null;
-	const tail = uri.slice(expectedPrefix.length);
-	const slash1 = tail.indexOf("/");
-	if (slash1 < 0) return null;
-	const slash2 = tail.indexOf("/", slash1 + 1);
-	if (slash2 < 0) return null;
-	const collection = tail.slice(slash1 + 1, slash2);
-	if (collection !== expectedCollection) return null;
-	const rkey = tail.slice(slash2 + 1);
-	if (!RKEY_PATTERN.test(rkey)) return null;
-	return rkey;
+function extractCursor(body: unknown): string | undefined {
+	if (!isPlainObject(body)) {
+		throw new Error("listRecords response was not a JSON object");
+	}
+	const cursor = body["cursor"];
+	if (cursor === undefined) return undefined;
+	if (typeof cursor !== "string") {
+		throw new Error(`listRecords cursor was not a string (got ${typeof cursor})`);
+	}
+	return cursor;
 }
diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts
index ef2e89416..4b1f150f6 100644
--- a/apps/aggregator/src/index.ts
+++ b/apps/aggregator/src/index.ts
@@ -73,17 +73,23 @@ const BACKFILL_PATH = "/_admin/backfill";
 const MAX_BACKFILL_DIDS = 100;
 const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/;
 
+const tokenEncoder = new TextEncoder();
+
 /**
- * Constant-time string equality. Use over `===` for any compare against a
- * secret to avoid leaking length/prefix information through timing channels.
+ * Constant-time string equality via workerd's audited
+ * `crypto.subtle.timingSafeEqual`. The primitive returns `false` immediately
+ * for length-mismatched buffers, so the *prefix*-comparison is constant-time
+ * but a length difference is still observable via timing — acceptable here
+ * because the protected secret (`ADMIN_TOKEN`) has a fixed configured length
+ * known only to the operator, and any realistic length-via-timing attack
+ * would require so many requests that other defences (rate-limiting,
+ * Cloudflare Bot Management, log review) catch it first.
  */
 function timingSafeEqual(a: string, b: string): boolean {
-	if (a.length !== b.length) return false;
-	let diff = 0;
-	for (let i = 0; i < a.length; i++) {
-		diff |= a.charCodeAt(i) ^ b.charCodeAt(i);
-	}
-	return diff === 0;
+	const aBuf = tokenEncoder.encode(a);
+	const bBuf = tokenEncoder.encode(b);
+	if (aBuf.byteLength !== bBuf.byteLength) return false;
+	return crypto.subtle.timingSafeEqual(aBuf, bBuf);
 }
 
 /**
diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts
index 16259342a..f8da7ff5d 100644
--- a/apps/aggregator/test/backfill.test.ts
+++ b/apps/aggregator/test/backfill.test.ts
@@ -328,6 +328,30 @@ describe("processBackfillJob", () => {
 		expect(result.enqueued).toBe(1);
 	});
 
+	it("skips records whose URI references a different DID than the job (defensive)", async () => {
+		const queue = new CapturingRecordsQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		// Buggy/malicious PDS: returns a record under a *different* repo's DID.
+		// Even if everything else parses, that record's signature would be
+		// from the wrong key — enqueueing it would just churn dead-letters.
+		const fetchImpl = makeFetch({
+			[collection]: [
+				{ uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} },
+				{ uri: `at://${DID_B}/${collection}/imposter`, cid: "c2", value: {} },
+			],
+		});
+
+		const result = await processBackfillJob(
+			{ did: DID_A, collection },
+			{ resolver, queue, fetch: fetchImpl },
+		);
+
+		expect(queue.sent.map((j) => j.rkey)).toEqual(["legit"]);
+		expect(result.enqueued).toBe(1);
+	});
+
 	it("rejects records with malformed rkey (atproto rkey grammar violation)", async () => {
 		const queue = new CapturingRecordsQueue();
 		const resolver = buildResolver();
@@ -506,6 +530,54 @@ describe("processBackfillJob: defenses against malicious / buggy PDS", () => {
 			),
 		).rejects.toThrow(/timed out after 25ms/);
 	});
+
+	it("throws when listRecords body isn't a JSON object (no silent zero)", async () => {
+		const queue = new CapturingRecordsQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl: typeof fetch = async () =>
+			new Response(JSON.stringify(["not", "an", "object"]), {
+				status: 200,
+				headers: { "content-type": "application/json" },
+			});
+
+		await expect(
+			processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }),
+		).rejects.toThrow(/not a JSON object/);
+	});
+
+	it("throws when listRecords body is missing the records array", async () => {
+		const queue = new CapturingRecordsQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl: typeof fetch = async () =>
+			new Response(JSON.stringify({ cursor: "x" }), {
+				status: 200,
+				headers: { "content-type": "application/json" },
+			});
+
+		await expect(
+			processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }),
+		).rejects.toThrow(/missing `records` array/);
+	});
+
+	it("throws when cursor is present but not a string (no silent end-of-pagination)", async () => {
+		const queue = new CapturingRecordsQueue();
+		const resolver = buildResolver();
+		const collection = WANTED_COLLECTIONS[0];
+		if (!collection) throw new Error("test assumes ≥1 collection");
+		const fetchImpl: typeof fetch = async () =>
+			new Response(JSON.stringify({ records: [], cursor: 42 }), {
+				status: 200,
+				headers: { "content-type": "application/json" },
+			});
+
+		await expect(
+			processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }),
+		).rejects.toThrow(/cursor was not a string/);
+	});
 });
 
 describe("enqueueBackfillJobs", () => {

From 6331ec8ad7aa3568fc78cd1468fa733141b50f69 Mon Sep 17 00:00:00 2001
From: Matt Kane <m@mk.gg>
Date: Sun, 10 May 2026 19:06:11 +0100
Subject: [PATCH 7/8] fix(aggregator): use library `isDid` instead of
 duplicated DID regex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`@atcute/lexicons/syntax` exports the canonical `isDid` validator.
Two files in this package were each carrying their own copy of
`/^did:[a-z]+:[A-Za-z0-9._%:-]+$/` — looser than the library
(which is length-bounded 7-2048 and forbids trailing `:` or `%`)
and not type-narrowing.

Replaces both:
- `did-resolver.ts`: drops the local regex + `isDid` wrapper, imports
  the library version.
- `index.ts` (backfill admin route): drops the local regex, calls the
  library directly in `parseBackfillBody`.

Existing tests cover the validation behavior; library is at least as
strict as the prior pattern so nothing legitimate gets newly rejected.
---
 apps/aggregator/src/did-resolver.ts | 7 +------
 apps/aggregator/src/index.ts        | 5 +++--
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/apps/aggregator/src/did-resolver.ts b/apps/aggregator/src/did-resolver.ts
index cbdffefd2..8e4d5c58e 100644
--- a/apps/aggregator/src/did-resolver.ts
+++ b/apps/aggregator/src/did-resolver.ts
@@ -18,10 +18,9 @@ import {
 	type PublicKey,
 } from "@atcute/crypto";
 import { type DidDocument, getAtprotoVerificationMaterial, getPdsEndpoint } from "@atcute/identity";
-import type { Did } from "@atcute/lexicons/syntax";
+import { type Did, isDid } from "@atcute/lexicons/syntax";
 
 const DEFAULT_TTL_MS = 24 * 60 * 60 * 1000;
-const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/;
 
 /** Cache entry shape; the multibase signing key is stored raw so the
  * `PublicKey` instance is reconstructed on each `resolve()`. WebCrypto
@@ -102,10 +101,6 @@ export class DidResolver {
 	}
 }
 
-function isDid(value: string): value is Did {
-	return DID_PATTERN.test(value);
-}
-
 function asDid(did: string): Did {
 	if (!isDid(did)) {
 		throw new Error(`invalid DID: ${did}`);
diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts
index 4b1f150f6..ee364e5c1 100644
--- a/apps/aggregator/src/index.ts
+++ b/apps/aggregator/src/index.ts
@@ -15,6 +15,8 @@
  * Worker boots.
  */
 
+import { isDid } from "@atcute/lexicons/syntax";
+
 import { drainBackfillDeadLetterBatch, processBackfillBatch } from "./backfill-consumer.js";
 import { discoverDids, enqueueBackfillJobs } from "./backfill.js";
 import type { BackfillJob, RecordsJob } from "./env.js";
@@ -71,7 +73,6 @@ const BACKFILL_PATH = "/_admin/backfill";
  * the same per-pair caps in the consumer.
  */
 const MAX_BACKFILL_DIDS = 100;
-const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/;
 
 const tokenEncoder = new TextEncoder();
 
@@ -163,7 +164,7 @@ function parseBackfillBody(body: unknown): BackfillRequest | { error: string } {
 	}
 	const seen = new Set<string>();
 	for (const did of rawDids) {
-		if (typeof did !== "string" || !DID_PATTERN.test(did)) {
+		if (!isDid(did)) {
 			return { error: `invalid DID in list: ${JSON.stringify(did)}` };
 		}
 		seen.add(did);

From 6e2e24a0cd1183d4f07f5e1273bc31afc1e4efac Mon Sep 17 00:00:00 2001
From: Matt Kane <m@mk.gg>
Date: Sun, 10 May 2026 19:12:29 +0100
Subject: [PATCH 8/8] fix(aggregator): tighten isAtprotoDid + sync stale DLQ
 docstring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- `pds-verify.isAtprotoDid` now composes with library `isDid` from
  `@atcute/lexicons/syntax` before checking the method prefix. The
  previous bare `startsWith` check accepted strings like `did:plc:`
  (empty body) — defensive in practice (the resolver upstream rejects
  malformed DIDs first), but the library call carries weight at
  verification time as a second-line check.
- `backfill-consumer.ts` module docstring still claimed the backfill
  DLQ "is intentionally not auto-drained today" even though the
  drain consumer (`drainBackfillDeadLetterBatch`) was added to this
  file in the previous review-fix round. Updated to describe what's
  actually shipped.
---
 apps/aggregator/src/backfill-consumer.ts | 11 ++++++-----
 apps/aggregator/src/pds-verify.ts        |  7 ++++++-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/apps/aggregator/src/backfill-consumer.ts b/apps/aggregator/src/backfill-consumer.ts
index b2c6cc6fb..3c6c9f536 100644
--- a/apps/aggregator/src/backfill-consumer.ts
+++ b/apps/aggregator/src/backfill-consumer.ts
@@ -17,11 +17,12 @@
  *   - Unexpected throws inside the batch loop are caught per-message so one
  *     bad job can't poison the rest of the batch.
  *
- * The DLQ is intentionally not auto-drained today — backfill is operator-
- * triggered, so DLQ inspection is part of the operator's workflow when a
- * backfill POST shows partial completion in `wrangler tail`. A drain
- * consumer can land later when we have a clear ack policy (probably:
- * write a row to D1 and ack, like the records-DLQ drain).
+ * DLQ drain (`drainBackfillDeadLetterBatch`): logs each dead-lettered
+ * pair at error level (so operators tailing `wrangler tail` see it loud)
+ * and acks so the DLQ doesn't accumulate unbounded. No D1 forensics row —
+ * the recovery action for a backfill pair that exhausted retries is
+ * "re-run backfill for the affected DID", which only needs the
+ * (did, collection) pair already on the log line.
  */
 
 import {
diff --git a/apps/aggregator/src/pds-verify.ts b/apps/aggregator/src/pds-verify.ts
index daf7ec234..4b31bb0b0 100644
--- a/apps/aggregator/src/pds-verify.ts
+++ b/apps/aggregator/src/pds-verify.ts
@@ -18,7 +18,7 @@
  */
 
 import type { PublicKey } from "@atcute/crypto";
-import type { AtprotoDid } from "@atcute/lexicons/syntax";
+import { type AtprotoDid, isDid } from "@atcute/lexicons/syntax";
 import { verifyRecord } from "@atcute/repo";
 
 const DEFAULT_TIMEOUT_MS = 15_000;
@@ -215,6 +215,11 @@ async function fetchCar(
  * message.retry()` without re-encoding the policy in the catch block.
  */
 function isAtprotoDid(value: string): value is AtprotoDid {
+	// Library `isDid` enforces the full grammar (length, terminator chars);
+	// the prefix check then narrows to the atproto-supported method subset
+	// (`did:plc:` or `did:web:`). A bare prefix check would accept things
+	// like `did:plc:` (empty body), so the library call carries weight here.
+	if (!isDid(value)) return false;
 	return value.startsWith("did:plc:") || value.startsWith("did:web:");
 }