From f07eba1a3633682b17e92a4968402c5619de8076 Mon Sep 17 00:00:00 2001 From: Matt Kane Date: Sun, 10 May 2026 16:37:15 +0100 Subject: [PATCH 1/8] feat(aggregator): backfill admin route + auth-gated admin surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cold-start discovery primitive. Operator hits POST /_admin/backfill with a JSON list of DIDs; the worker resolves each DID's PDS endpoint, calls com.atproto.repo.listRecords for every NSID in WANTED_COLLECTIONS, and batches the results onto the existing Records Queue. The consumer's verification + write + idempotency machinery handles the rest — same code path as live Jetstream, distinguished only by trigger. Live discovery is still Jetstream's job; this exists for the cold-start gap (publishers who published before the aggregator was listening) and for operator-initiated recovery after a known outage. There is no periodic scheduler — see plan §"Why no reconciliation cron". ADMIN AUTH Both admin routes (/_admin/start and /_admin/backfill) now require `Authorization: Bearer `. Wrangler's `secrets.required` declares the token in config, so: - `wrangler types` generates a typed `env.ADMIN_TOKEN: string` - `wrangler deploy` refuses to ship if the secret isn't set - Production sets via `wrangler secret put ADMIN_TOKEN` - Tests bind a stub via vitest miniflare bindings (test-admin-token) Constant-time compare to avoid leaking length/prefix via timing. Backfill specifically is what makes this auth gate non-optional: unauthenticated, did:web resolution turns the worker into an attacker-controlled HTTP fetcher (caller picks both the DID-doc URL and the resulting PDS endpoint). Same gate on /_admin/start for symmetry — anyone with the token can do operational things. DEFENSES IN BACKFILL Adversarial review (round 1) found bugs that needed fixing before shipping. All addressed: - BLOCKER: pagination loop had no max-page guard. A PDS that echoes the same cursor pinned the worker indefinitely. Now capped at MAX_PAGES_PER_COLLECTION (1000) plus a cursor-equality check that catches the common case in 2 iterations. - MAJOR: dids[] and per-page records had no size cap. Both capped now (MAX_BACKFILL_DIDS=1000, MAX_RECORDS_PER_PAGE=200) so a single POST can't flood the queue. - MAJOR: per-record sequential queue.send() couldn't possibly finish a 4000-record DID inside the waitUntil budget. Switched to sendBatch (one batch per page). - MAJOR: runBackfill summary log only fired at the end; if waitUntil exhausted mid-loop the operator saw nothing. Added per-DID logs via onDidComplete callback + try/catch wrapping the whole runner so any throw surfaces as "[aggregator] backfill aborted". - MINOR: 404 mid-pagination silently truncated; now distinguished from first-page-404 (which legitimately means "no records of this collection") and surfaced as a partial-failure error. - MINOR: parseBackfillBody only checked startsWith("did:"). Now validates against DID_PATTERN and dedupes the list via Set. - MINOR: parseRkeyFromUri accepted any non-empty string after the expected slashes. Now validates against the atproto rkey grammar ([A-Za-z0-9._:~-]{1,512}) so injection-shaped rkeys are skipped. - MINOR: backfill jobs were setting jetstreamRecord, which the consumer's DLQ payload field would have mislabeled as Jetstream-supplied data. Dropped the field on backfill jobs. - NIT: isPlainObject was duplicated 3x; extracted to src/utils.ts. 128 tests pass (was 112). Typecheck clean. Zero lint diagnostics. The remaining adversarial-review items I decided to skip: - Three near-duplicate MapDidDocCache test stubs across 3 files — flagged as NIT, deferred to its own refactor PR. - Constellation-driven discovery (no-explicit-DIDs path). Documented in the plan as intentional Slice 1 scope: explicit DIDs is the primary path; Constellation is a follow-up if needed. --- apps/aggregator/src/backfill.ts | 263 ++++++++ apps/aggregator/src/index.ts | 177 +++++- apps/aggregator/src/records-consumer.ts | 5 +- apps/aggregator/src/utils.ts | 13 + apps/aggregator/test/backfill.test.ts | 725 ++++++++++++++++++++++ apps/aggregator/vitest.config.ts | 10 +- apps/aggregator/worker-configuration.d.ts | 5 +- apps/aggregator/wrangler.jsonc | 7 + 8 files changed, 1191 insertions(+), 14 deletions(-) create mode 100644 apps/aggregator/src/backfill.ts create mode 100644 apps/aggregator/src/utils.ts create mode 100644 apps/aggregator/test/backfill.test.ts diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts new file mode 100644 index 000000000..c91f300df --- /dev/null +++ b/apps/aggregator/src/backfill.ts @@ -0,0 +1,263 @@ +/** + * Cold-start discovery worker. + * + * Operator-triggered (via `POST /_admin/backfill`); takes a list of DIDs, + * calls `com.atproto.repo.listRecords` against each publisher's PDS for every + * collection in `WANTED_COLLECTIONS`, and enqueues each returned record onto + * the existing Records Queue as a `RecordsJob`. The consumer's verification + + * write + idempotency machinery handles the rest — same code path as live + * Jetstream, distinguished only by trigger. + * + * Live discovery is Jetstream's job, not this worker's; it picks up new + * publishers automatically. Backfill exists for the cold-start gap (publishers + * who published before the aggregator was listening) and for operator-triggered + * recovery after a known outage. There is deliberately no periodic scheduler + * — see plan §"Why no reconciliation cron". + */ + +import { WANTED_COLLECTIONS } from "./constants.js"; +import type { DidResolver } from "./did-resolver.js"; +import type { RecordsJob } from "./env.js"; +import { isPlainObject } from "./utils.js"; + +const PAGE_SIZE = 100; +/** Cap on listRecords pagination per collection. A buggy or malicious PDS + * that echoes the same cursor would otherwise loop forever inside one + * `ctx.waitUntil`. 1000 pages × 100 records = 100k records per collection, + * which is past anything we'd legitimately backfill in one shot. */ +const MAX_PAGES_PER_COLLECTION = 1000; +/** Defensive cap on records per page. Real PDSes honour the `limit` query + * param; this guards against a hostile PDS returning an enormous array. */ +const MAX_RECORDS_PER_PAGE = PAGE_SIZE * 2; +/** Atproto rkey grammar: ALPHA / DIGIT / "." / "-" / "_" / ":" / "~". */ +const RKEY_PATTERN = /^[A-Za-z0-9._:~-]{1,512}$/; + +/** Producer-side queue surface. The production binding `env.RECORDS_QUEUE` + * satisfies this; tests pass an in-memory implementation. The return type + * is `unknown` rather than `void` so workerd's `Queue.send` (which returns + * a `QueueSendResponse`) is structurally assignable. */ +export interface BackfillQueue { + sendBatch(messages: ReadonlyArray<{ body: RecordsJob }>): Promise; +} + +export interface BackfillDeps { + resolver: DidResolver; + queue: BackfillQueue; + /** Inject for tests; defaults to `globalThis.fetch`. */ + fetch?: typeof fetch; + /** Optional callback fired after each DID completes (success or failure). + * Used by the production wiring to log per-DID progress so an operator + * watching `wrangler tail` sees where a long backfill is up to. */ + onDidComplete?: (result: BackfillDidResult) => void; +} + +export interface BackfillDidResult { + did: string; + enqueued: number; + /** One entry per failure during this DID's backfill. listRecords failures + * are per-collection; resolution failures abort early. Empty array on + * total success. */ + errors: string[]; +} + +export interface BackfillSummary { + totalEnqueued: number; + results: BackfillDidResult[]; +} + +/** + * Backfill multiple DIDs serially. Per-DID failures don't stop the loop — + * one bad publisher doesn't block the rest. + */ +export async function backfillDids( + dids: readonly string[], + deps: BackfillDeps, +): Promise { + const results: BackfillDidResult[] = []; + let totalEnqueued = 0; + for (const did of dids) { + const result = await backfillDid(did, deps); + results.push(result); + totalEnqueued += result.enqueued; + deps.onDidComplete?.(result); + } + return { totalEnqueued, results }; +} + +/** + * Backfill a single DID across every collection in `WANTED_COLLECTIONS`. + * Resolution failure aborts (we can't enqueue verifiable jobs without + * knowing the PDS); per-collection listRecords failures are recorded and + * the loop continues to the next collection. + */ +export async function backfillDid(did: string, deps: BackfillDeps): Promise { + const errors: string[] = []; + let enqueued = 0; + + // resolve() writes to known_publishers as a side-effect of the cache + // upsert, so this also registers the publisher for any future code path + // that iterates that table. + let pds: string; + try { + const resolved = await deps.resolver.resolve(did); + pds = resolved.pds; + } catch (err) { + errors.push(`resolve failed: ${err instanceof Error ? err.message : String(err)}`); + return { did, enqueued, errors }; + } + + const fetchImpl = deps.fetch ?? fetch; + for (const collection of WANTED_COLLECTIONS) { + try { + enqueued += await backfillCollection({ + did, + pds, + collection, + queue: deps.queue, + fetchImpl, + }); + } catch (err) { + errors.push(`${collection}: ${err instanceof Error ? err.message : String(err)}`); + } + } + + return { did, enqueued, errors }; +} + +interface BackfillCollectionOpts { + did: string; + pds: string; + collection: string; + queue: BackfillQueue; + fetchImpl: typeof fetch; +} + +/** + * Walk one DID's records for a single collection, paginating through + * `listRecords` and enqueuing each result via `sendBatch` (one batch per + * page). Returns the number of records enqueued. + * + * 404 from the PDS on the FIRST page means the repo doesn't host this + * collection — silently treated as zero records, not an error. A 404 + * mid-pagination is a partial-failure signal (the PDS is misrouting one + * page while the rest of the repo is fine) and throws. + * + * Pagination is capped at MAX_PAGES_PER_COLLECTION + cursor-equality check + * to defend against a PDS that echoes the same cursor forever. + */ +async function backfillCollection(opts: BackfillCollectionOpts): Promise { + let enqueued = 0; + let cursor: string | undefined; + let prevCursor: string | undefined; + let pages = 0; + do { + if (++pages > MAX_PAGES_PER_COLLECTION) { + throw new Error(`exceeded ${MAX_PAGES_PER_COLLECTION} pages`); + } + if (cursor !== undefined && cursor === prevCursor) { + throw new Error("PDS returned identical cursor twice"); + } + prevCursor = cursor; + + const url = new URL("/xrpc/com.atproto.repo.listRecords", opts.pds); + url.searchParams.set("repo", opts.did); + url.searchParams.set("collection", opts.collection); + url.searchParams.set("limit", String(PAGE_SIZE)); + if (cursor) url.searchParams.set("cursor", cursor); + + const response = await opts.fetchImpl(url.toString(), { + headers: { accept: "application/json" }, + }); + if (response.status === 404) { + if (cursor === undefined) { + // First-page 404: publisher has no records of this collection. + return enqueued; + } + // Mid-pagination 404 is a partial failure; surface it. + throw new Error(`listRecords returned 404 mid-pagination at cursor=${cursor}`); + } + if (!response.ok) { + throw new Error(`listRecords returned ${response.status}`); + } + + const body: unknown = await response.json(); + const records = extractListRecordsBody(body); + if (records.length > MAX_RECORDS_PER_PAGE) { + throw new Error( + `PDS returned ${records.length} records, exceeding cap of ${MAX_RECORDS_PER_PAGE}`, + ); + } + cursor = extractCursor(body); + + const messages: { body: RecordsJob }[] = []; + for (const record of records) { + const rkey = parseRkeyFromUri(record.uri, opts.collection); + if (!rkey) continue; + messages.push({ + body: { + did: opts.did, + collection: opts.collection, + rkey, + operation: "create", + cid: record.cid, + }, + }); + } + if (messages.length > 0) { + // Single batched send per page — far cheaper than the per-record + // awaits this used to do (which couldn't possibly finish a + // 4000-record DID inside the waitUntil budget). + await opts.queue.sendBatch(messages); + enqueued += messages.length; + } + } while (cursor); + return enqueued; +} + +interface ListRecordEntry { + uri: string; + cid: string; + value: unknown; +} + +function extractListRecordsBody(body: unknown): ListRecordEntry[] { + if (!isPlainObject(body)) return []; + const records = body["records"]; + if (!Array.isArray(records)) return []; + const out: ListRecordEntry[] = []; + for (const r of records) { + if (!isPlainObject(r)) continue; + const uri = r["uri"]; + const cid = r["cid"]; + if (typeof uri !== "string" || typeof cid !== "string") continue; + out.push({ uri, cid, value: r["value"] }); + } + return out; +} + +function extractCursor(body: unknown): string | undefined { + if (!isPlainObject(body)) return undefined; + const cursor = body["cursor"]; + return typeof cursor === "string" ? cursor : undefined; +} + +/** + * Extract the rkey from an AT URI of the shape `at://did/collection/rkey` + * and validate it against the atproto rkey grammar. Returns null if any + * step fails; callers treat that as "this isn't a record we recognise" + * and skip the entry without aborting the page. + */ +function parseRkeyFromUri(uri: string, expectedCollection: string): string | null { + const expectedPrefix = `at://`; + if (!uri.startsWith(expectedPrefix)) return null; + const tail = uri.slice(expectedPrefix.length); + const slash1 = tail.indexOf("/"); + if (slash1 < 0) return null; + const slash2 = tail.indexOf("/", slash1 + 1); + if (slash2 < 0) return null; + const collection = tail.slice(slash1 + 1, slash2); + if (collection !== expectedCollection) return null; + const rkey = tail.slice(slash2 + 1); + if (!RKEY_PATTERN.test(rkey)) return null; + return rkey; +} diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts index 709f7da6a..470f44f71 100644 --- a/apps/aggregator/src/index.ts +++ b/apps/aggregator/src/index.ts @@ -15,9 +15,18 @@ * Worker boots. */ +import { + AtprotoWebDidDocumentResolver, + CompositeDidDocumentResolver, + PlcDidDocumentResolver, +} from "@atcute/identity-resolver"; + +import { backfillDids } from "./backfill.js"; +import { createD1DidDocCache, DidResolver } from "./did-resolver.js"; import type { RecordsJob } from "./env.js"; import { drainDeadLetterBatch, processBatch } from "./records-consumer.js"; import { RECORDS_DO_NAME } from "./records-do.js"; +import { isPlainObject } from "./utils.js"; const RECORDS_QUEUE_NAME = "emdash-aggregator-records"; const RECORDS_DLQ_NAME = "emdash-aggregator-records-dlq"; @@ -25,21 +34,104 @@ const RECORDS_DLQ_NAME = "emdash-aggregator-records-dlq"; export { RecordsJetstreamDO } from "./records-do.js"; /** - * Operational bootstrap route. Hitting `/_admin/start` once after deploy - * spins up the Records DO, which opens its outbound WebSocket and starts - * ingesting. The DO's WebSocket keeps it alive thereafter. The route is - * unauthenticated but returns no operational detail — just a fixed 204 — - * so a probing caller learns nothing useful. The action is idempotent on - * an already-running DO. Recommended deploy hook: + * Operational admin routes. Both gated by the `ADMIN_TOKEN` secret declared + * in `wrangler.jsonc`'s `secrets.required` and validated via constant-time + * compare against the `Authorization: Bearer ` header. Recommended + * deploy hook: + * + * wrangler deploy && curl -X POST -H "Authorization: Bearer $ADMIN_TOKEN" \ + * https://api.emdashcms.com/_admin/start + * + * `/_admin/start` spins up the Records DO (idempotent on an already-running DO). + * `/_admin/backfill` triggers cold-start discovery against an explicit DID list. * - * wrangler deploy && curl -X POST https://api.emdashcms.com/_admin/start + * Auth-gating both routes: backfill specifically introduces caller-chosen + * URLs into the worker's outbound fetches (via `did:web` resolution + the + * publisher's PDS endpoint), so the unauth posture would expose an SSRF-shaped + * surface. Same gate on `/_admin/start` for symmetry — anyone with the token + * can do operational things. */ const BOOTSTRAP_PATH = "/_admin/start"; +const BACKFILL_PATH = "/_admin/backfill"; + +const MAX_BACKFILL_DIDS = 1000; +const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/; + +/** + * Constant-time string equality. Use over `===` for any compare against a + * secret to avoid leaking length/prefix information through timing channels. + */ +function timingSafeEqual(a: string, b: string): boolean { + if (a.length !== b.length) return false; + let diff = 0; + for (let i = 0; i < a.length; i++) { + diff |= a.charCodeAt(i) ^ b.charCodeAt(i); + } + return diff === 0; +} + +/** + * Validate the request's `Authorization: Bearer ` header against + * `env.ADMIN_TOKEN`. Returns null on success, or a 401 Response to return + * directly. Empty/missing token in env fails closed. + */ +function requireAdminAuth(request: Request, env: Env): Response | null { + const expected = env.ADMIN_TOKEN; + if (!expected) { + // Misconfigured production or unset dev — closed by default. + return new Response("admin endpoints not configured", { status: 503 }); + } + const auth = request.headers.get("authorization"); + if (!auth || !auth.startsWith("Bearer ")) { + return new Response("unauthorized", { + status: 401, + headers: { "www-authenticate": "Bearer" }, + }); + } + const token = auth.slice("Bearer ".length); + if (!timingSafeEqual(token, expected)) { + return new Response("unauthorized", { + status: 401, + headers: { "www-authenticate": "Bearer" }, + }); + } + return null; +} + +function parseBackfillBody(body: unknown): { dids: string[] } | { error: string } { + if (!isPlainObject(body)) { + return { error: "request body must be a JSON object" }; + } + const rawDids = body["dids"]; + if (!Array.isArray(rawDids)) { + return { error: "`dids` must be an array of DID strings" }; + } + if (rawDids.length === 0) { + return { error: "`dids` must not be empty" }; + } + if (rawDids.length > MAX_BACKFILL_DIDS) { + return { + error: `\`dids\` must contain at most ${MAX_BACKFILL_DIDS} entries (got ${rawDids.length})`, + }; + } + const seen = new Set(); + for (const did of rawDids) { + if (typeof did !== "string" || !DID_PATTERN.test(did)) { + return { error: `invalid DID in list: ${JSON.stringify(did)}` }; + } + seen.add(did); + } + // Set iteration order matches insertion; dedup preserves first-seen order + // so the operator sees jobs run in the order they submitted. + return { dids: [...seen] }; +} export default { async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise { const url = new URL(request.url); if (url.pathname === BOOTSTRAP_PATH) { + const denied = requireAdminAuth(request, env); + if (denied) return denied; const id = env.RECORDS_DO.idFromName(RECORDS_DO_NAME); const stub = env.RECORDS_DO.get(id); // Fire-and-forget so the response shape doesn't depend on the @@ -48,6 +140,33 @@ export default { ctx.waitUntil(stub.fetch("https://do.internal/bootstrap")); return new Response(null, { status: 204 }); } + if (url.pathname === BACKFILL_PATH) { + if (request.method !== "POST") { + return new Response("method not allowed", { + status: 405, + headers: { allow: "POST" }, + }); + } + const denied = requireAdminAuth(request, env); + if (denied) return denied; + let body: unknown; + try { + body = await request.json(); + } catch { + return new Response("request body must be valid JSON", { status: 400 }); + } + const parsed = parseBackfillBody(body); + if ("error" in parsed) { + return new Response(parsed.error, { status: 400 }); + } + // Fire-and-forget via waitUntil so the route returns 202 quickly + // regardless of the DID list size. Backfill progress is observable + // via Workers logs and the resulting D1 writes; the response body + // just confirms the trigger landed. Per-DID errors are logged from + // inside backfillDids; callers don't get them in the response. + ctx.waitUntil(runBackfill(parsed.dids, env)); + return new Response(null, { status: 202 }); + } return new Response("emdash-aggregator: not yet implemented", { status: 503, headers: { "content-type": "text/plain" }, @@ -82,3 +201,47 @@ export default { ctx.waitUntil(stub.fetch("https://do.internal/liveness")); }, }; + +/** + * Production wiring for the backfill worker. Constructs a DID resolver + * pointed at the live PLC + atproto-web methods, and points the queue at + * the production Records Queue binding. Per-DID progress is logged as the + * loop runs so an operator watching `wrangler tail` sees live progress and + * can tell where a backfill stopped if waitUntil exhausts mid-loop. + */ +async function runBackfill(dids: readonly string[], env: Env): Promise { + console.log("[aggregator] backfill starting", { didCount: dids.length }); + try { + const composite = new CompositeDidDocumentResolver({ + methods: { + plc: new PlcDidDocumentResolver(), + web: new AtprotoWebDidDocumentResolver(), + }, + }); + const resolver = new DidResolver({ + cache: createD1DidDocCache(env.DB), + resolver: composite, + }); + const summary = await backfillDids(dids, { + resolver, + queue: env.RECORDS_QUEUE, + onDidComplete: (result) => { + console.log("[aggregator] backfill did", { + did: result.did, + enqueued: result.enqueued, + errors: result.errors, + }); + }, + }); + console.log("[aggregator] backfill complete", { + didCount: dids.length, + totalEnqueued: summary.totalEnqueued, + didsWithErrors: summary.results.filter((r) => r.errors.length > 0).length, + }); + } catch (err) { + console.error("[aggregator] backfill aborted", { + didCount: dids.length, + error: err instanceof Error ? (err.stack ?? err.message) : String(err), + }); + } +} diff --git a/apps/aggregator/src/records-consumer.ts b/apps/aggregator/src/records-consumer.ts index d52810e43..5470c1340 100644 --- a/apps/aggregator/src/records-consumer.ts +++ b/apps/aggregator/src/records-consumer.ts @@ -53,6 +53,7 @@ import { type VerificationFailureReason, type VerifiedPdsRecord, } from "./pds-verify.js"; +import { isPlainObject } from "./utils.js"; /** * Deps the consumer needs at runtime. Constructed once per `processBatch` call @@ -1072,10 +1073,6 @@ function computeVersionSort(version: string): string | null { return `${pad(major)}.${pad(minor)}.${pad(patch)}.${FINAL_VERSION_SENTINEL}`; } -function isPlainObject(value: unknown): value is Record { - return value !== null && typeof value === "object" && !Array.isArray(value); -} - /** * Pull the verified record's CID out of the JSON-stringified * `signature_metadata` column. Returns null if the column is missing or diff --git a/apps/aggregator/src/utils.ts b/apps/aggregator/src/utils.ts new file mode 100644 index 000000000..ed17cae1a --- /dev/null +++ b/apps/aggregator/src/utils.ts @@ -0,0 +1,13 @@ +/** + * Shared utility helpers used across multiple modules. + */ + +/** + * Type guard for narrowing `unknown` to `Record` so + * subsequent `value["key"]` accesses are typesafe without an `as` cast. + * Excludes arrays (which are also `typeof === "object"`) so consumers + * checking for "JSON-shaped object" get what they expect. + */ +export function isPlainObject(value: unknown): value is Record { + return value !== null && typeof value === "object" && !Array.isArray(value); +} diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts new file mode 100644 index 000000000..fae806e85 --- /dev/null +++ b/apps/aggregator/test/backfill.test.ts @@ -0,0 +1,725 @@ +/** + * Backfill worker tests. + * + * The worker is plain `fetch` + `queue.send`; tests stub both. The DID + * resolver is also stubbed (in-memory cache + a simple stub upstream) so the + * tests don't depend on the workers test pool or live PLC. + */ + +import { P256PrivateKeyExportable } from "@atcute/crypto"; +import type { DidDocument } from "@atcute/identity"; +import type { Did } from "@atcute/lexicons/syntax"; +import { applyD1Migrations, env, SELF } from "cloudflare:test"; +import { beforeAll, beforeEach, describe, expect, it } from "vitest"; + +import { backfillDid, backfillDids, type BackfillQueue } from "../src/backfill.js"; +import { WANTED_COLLECTIONS } from "../src/constants.js"; +import { + type CachedDidDoc, + type DidDocCache, + type DidDocumentResolverLike, + DidResolver, +} from "../src/did-resolver.js"; +import type { RecordsJob } from "../src/env.js"; + +interface TestEnv { + DB: D1Database; + TEST_MIGRATIONS: Parameters[1]; +} +const testEnv = env as unknown as TestEnv; + +const DID_A = "did:plc:test00000000000000000000"; +const DID_B = "did:plc:test00000000000000000001"; +const PDS = "https://pds.test.example"; + +let signingKeyMultibase: string; + +beforeAll(async () => { + const kp = await P256PrivateKeyExportable.createKeypair(); + signingKeyMultibase = await kp.exportPublicKey("multikey"); + await applyD1Migrations(testEnv.DB, testEnv.TEST_MIGRATIONS); +}); + +beforeEach(async () => { + await testEnv.DB.prepare("DELETE FROM known_publishers").run(); +}); + +class CapturingQueue implements BackfillQueue { + readonly sent: RecordsJob[] = []; + sendBatch(messages: ReadonlyArray<{ body: RecordsJob }>): Promise { + for (const m of messages) this.sent.push(m.body); + return Promise.resolve(); + } +} + +class MapDidDocCache implements DidDocCache { + private readonly entries = new Map(); + read(did: string): Promise { + return Promise.resolve(this.entries.get(did) ?? null); + } + upsert(did: string, doc: Omit, now: Date): Promise { + this.entries.set(did, { ...doc, resolvedAt: now }); + return Promise.resolve(); + } + expire(did: string): Promise { + const entry = this.entries.get(did); + if (entry) this.entries.set(did, { ...entry, resolvedAt: new Date(0) }); + return Promise.resolve(); + } +} + +function buildResolver(): DidResolver { + const cache = new MapDidDocCache(); + const resolver: DidDocumentResolverLike = { + resolve(did: Did): Promise { + return Promise.resolve({ + id: did as `did:${string}:${string}`, + verificationMethod: [ + { + id: `${did}#atproto`, + type: "Multikey", + controller: did as `did:${string}:${string}`, + publicKeyMultibase: signingKeyMultibase, + }, + ], + service: [ + { + id: "#atproto_pds", + type: "AtprotoPersonalDataServer", + serviceEndpoint: PDS, + }, + ], + }); + }, + }; + return new DidResolver({ cache, resolver, ttlMs: 1_000_000, now: () => new Date() }); +} + +interface MockListRecord { + uri: string; + cid: string; + value: Record; +} + +/** + * Build a fetch stub that returns canned `listRecords` responses keyed by + * collection. Records arrive as a single page; pagination is exercised in a + * dedicated test by passing pages of records explicitly. + */ +function makeFetch( + recordsByCollection: Record, + overrides?: { status?: Record }, +): typeof fetch { + return async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + if (!url.pathname.endsWith("/xrpc/com.atproto.repo.listRecords")) { + return new Response("not stubbed", { status: 599 }); + } + const collection = url.searchParams.get("collection") ?? ""; + const status = overrides?.status?.[collection]; + if (status !== undefined) { + return new Response(JSON.stringify({ error: "X" }), { + status, + headers: { "content-type": "application/json" }, + }); + } + const records = recordsByCollection[collection] ?? []; + return new Response(JSON.stringify({ records }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + }; +} + +describe("backfillDid", () => { + it("enqueues each listRecords result as a RecordsJob", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const fetchImpl = makeFetch({ + [WANTED_COLLECTIONS[0]]: [ + { + uri: `at://${DID_A}/${WANTED_COLLECTIONS[0]}/demo`, + cid: "bafyc1", + value: { foo: "bar" }, + }, + ], + }); + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + expect(result.errors).toEqual([]); + expect(result.enqueued).toBe(1); + expect(queue.sent).toHaveLength(1); + expect(queue.sent[0]).toMatchObject({ + did: DID_A, + collection: WANTED_COLLECTIONS[0], + rkey: "demo", + operation: "create", + cid: "bafyc1", + }); + // jetstreamRecord intentionally not set on backfill jobs — the + // consumer's DLQ payload field would otherwise mislabel + // `listRecords` data as Jetstream-supplied data. + expect(queue.sent[0]?.jetstreamRecord).toBeUndefined(); + }); + + it("walks every collection in WANTED_COLLECTIONS", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const records: Record = {}; + for (let i = 0; i < WANTED_COLLECTIONS.length; i++) { + const c = WANTED_COLLECTIONS[i]; + if (c) { + records[c] = [{ uri: `at://${DID_A}/${c}/r${i}`, cid: `bafyc${i}`, value: {} }]; + } + } + const fetchImpl = makeFetch(records); + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + expect(result.enqueued).toBe(WANTED_COLLECTIONS.length); + const collectionsHit = new Set(queue.sent.map((j) => j.collection)); + expect(collectionsHit.size).toBe(WANTED_COLLECTIONS.length); + }); + + it("treats 404 from the PDS as 'no records of this collection', not an error", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const fetchImpl = makeFetch( + {}, + { + status: WANTED_COLLECTIONS.reduce>((acc, c) => { + acc[c] = 404; + return acc; + }, {}), + }, + ); + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + expect(result.errors).toEqual([]); + expect(result.enqueued).toBe(0); + expect(queue.sent).toHaveLength(0); + }); + + it("records non-404 PDS errors per collection, continues to the next collection", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const firstCollection = WANTED_COLLECTIONS[0]; + const secondCollection = WANTED_COLLECTIONS[1]; + if (!firstCollection || !secondCollection) throw new Error("test assumes ≥2 collections"); + const fetchImpl = makeFetch( + { + [secondCollection]: [ + { + uri: `at://${DID_A}/${secondCollection}/r1`, + cid: "bafyc", + value: {}, + }, + ], + }, + { status: { [firstCollection]: 503 } }, + ); + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + expect(result.errors).toHaveLength(1); + expect(result.errors[0]).toContain("503"); + expect(result.errors[0]).toContain(firstCollection); + expect(result.enqueued).toBe(1); // the second collection still ran + }); + + it("aborts the DID early if the resolver throws (can't enqueue without PDS)", async () => { + const queue = new CapturingQueue(); + const resolver = new DidResolver({ + cache: new MapDidDocCache(), + resolver: { + resolve: () => Promise.reject(new Error("PLC unreachable")), + }, + }); + const fetchImpl = makeFetch({}); + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + expect(result.enqueued).toBe(0); + expect(queue.sent).toHaveLength(0); + expect(result.errors).toHaveLength(1); + expect(result.errors[0]).toMatch(/resolve failed.*PLC unreachable/); + }); + + it("paginates listRecords via cursor", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + + // Fetch returns page 1 (cursor "p2") then page 2 (no cursor) for the + // first collection; empty pages for the others. + let calls = 0; + const fetchImpl: typeof fetch = async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + if (url.searchParams.get("collection") !== collection) { + return new Response(JSON.stringify({ records: [] }), { status: 200 }); + } + calls += 1; + const cursor = url.searchParams.get("cursor"); + if (!cursor) { + return new Response( + JSON.stringify({ + records: [{ uri: `at://${DID_A}/${collection}/p1`, cid: "c1", value: {} }], + cursor: "p2", + }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + } + return new Response( + JSON.stringify({ + records: [{ uri: `at://${DID_A}/${collection}/p2`, cid: "c2", value: {} }], + }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + }; + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + expect(calls).toBe(2); + const collectionJobs = queue.sent.filter((j) => j.collection === collection); + expect(collectionJobs.map((j) => j.rkey)).toEqual(["p1", "p2"]); + expect(result.errors).toEqual([]); + }); + + it("skips records whose URI doesn't match the expected collection (defensive)", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl = makeFetch({ + [collection]: [ + { uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} }, + // Buggy PDS: returns a record under the wrong collection. + { uri: `at://${DID_A}/wrong.collection/x`, cid: "c2", value: {} }, + // Buggy URI shape (missing rkey). + { uri: `at://${DID_A}/${collection}/`, cid: "c3", value: {} }, + ], + }); + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + const sentForCollection = queue.sent.filter((j) => j.collection === collection); + expect(sentForCollection.map((j) => j.rkey)).toEqual(["legit"]); + expect(result.enqueued).toBe(1); + }); +}); + +describe("backfillDids", () => { + it("processes multiple DIDs serially and aggregates the summary", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl: typeof fetch = async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + if (url.searchParams.get("collection") !== collection) { + return new Response(JSON.stringify({ records: [] }), { status: 200 }); + } + const did = url.searchParams.get("repo") ?? ""; + return new Response( + JSON.stringify({ + records: [{ uri: `at://${did}/${collection}/x`, cid: "c", value: {} }], + }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + }; + + const summary = await backfillDids([DID_A, DID_B], { + resolver, + queue, + fetch: fetchImpl, + }); + + expect(summary.totalEnqueued).toBe(2); + expect(summary.results).toHaveLength(2); + expect(summary.results.map((r) => r.did)).toEqual([DID_A, DID_B]); + expect(summary.results.every((r) => r.errors.length === 0)).toBe(true); + }); + + it("doesn't let one DID's failure stop the others", async () => { + const queue = new CapturingQueue(); + // Resolver succeeds for DID_A, fails for DID_B (or any other). + const cache = new MapDidDocCache(); + const resolver = new DidResolver({ + cache, + resolver: { + resolve: (did) => { + if (did === DID_A) { + return Promise.resolve({ + id: did as `did:${string}:${string}`, + verificationMethod: [ + { + id: `${did}#atproto`, + type: "Multikey", + controller: did as `did:${string}:${string}`, + publicKeyMultibase: signingKeyMultibase, + }, + ], + service: [ + { + id: "#atproto_pds", + type: "AtprotoPersonalDataServer", + serviceEndpoint: PDS, + }, + ], + }); + } + return Promise.reject(new Error("DID_B unresolvable")); + }, + }, + ttlMs: 1_000_000, + now: () => new Date(), + }); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl = makeFetch({ + [collection]: [{ uri: `at://${DID_A}/${collection}/x`, cid: "c", value: {} }], + }); + + const summary = await backfillDids([DID_B, DID_A], { + resolver, + queue, + fetch: fetchImpl, + }); + + expect(summary.totalEnqueued).toBe(1); + expect(summary.results).toHaveLength(2); + const bResult = summary.results.find((r) => r.did === DID_B); + const aResult = summary.results.find((r) => r.did === DID_A); + expect(bResult?.errors.length).toBe(1); + expect(aResult?.errors).toEqual([]); + expect(aResult?.enqueued).toBe(1); + }); + + it("end-to-end against the production D1 cache: DID is registered in known_publishers", async () => { + const queue = new CapturingQueue(); + const { createD1DidDocCache } = await import("../src/did-resolver.js"); + const cache = createD1DidDocCache(testEnv.DB); + const resolver = new DidResolver({ + cache, + resolver: { + resolve: (did) => + Promise.resolve({ + id: did as `did:${string}:${string}`, + verificationMethod: [ + { + id: `${did}#atproto`, + type: "Multikey", + controller: did as `did:${string}:${string}`, + publicKeyMultibase: signingKeyMultibase, + }, + ], + service: [ + { + id: "#atproto_pds", + type: "AtprotoPersonalDataServer", + serviceEndpoint: PDS, + }, + ], + }), + }, + }); + const fetchImpl = makeFetch({}); + + await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + const row = await testEnv.DB.prepare( + `SELECT did, pds, signing_key, signing_key_id FROM known_publishers WHERE did = ?`, + ) + .bind(DID_A) + .first<{ did: string; pds: string; signing_key: string; signing_key_id: string }>(); + expect(row).toMatchObject({ did: DID_A, pds: PDS }); + expect(row?.signing_key).toBe(signingKeyMultibase); + }); +}); + +describe("backfillCollection: defenses against malicious / buggy PDS", () => { + it("aborts after MAX_PAGES_PER_COLLECTION when cursor never empties", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + // Hostile PDS: returns a different non-empty cursor every call so the + // cursor-equality check doesn't fire — only the page cap stops us. + let counter = 0; + const fetchImpl: typeof fetch = async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + if (url.searchParams.get("collection") !== collection) { + return new Response(JSON.stringify({ records: [] }), { status: 200 }); + } + counter += 1; + return new Response(JSON.stringify({ records: [], cursor: `cursor-${counter}` }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + }; + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + expect(result.errors.length).toBeGreaterThanOrEqual(1); + expect(result.errors.some((e) => e.includes("exceeded"))).toBe(true); + // Loop ran at most MAX_PAGES_PER_COLLECTION times. + expect(counter).toBeLessThanOrEqual(1001); // +1 for the throw-on-the-next iteration + }); + + it("aborts when the PDS returns the identical cursor twice", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + // Buggy PDS: echoes the cursor we sent. Cursor-equality should fire + // on the second iteration. + let calls = 0; + const fetchImpl: typeof fetch = async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + if (url.searchParams.get("collection") !== collection) { + return new Response(JSON.stringify({ records: [] }), { status: 200 }); + } + calls += 1; + return new Response(JSON.stringify({ records: [], cursor: "stuck" }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + }; + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + expect(result.errors.some((e) => e.includes("identical cursor"))).toBe(true); + expect(calls).toBe(2); // first page, then second page caught the dupe + }); + + it("treats 404 mid-pagination as a partial failure (not silent zero)", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl: typeof fetch = async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + if (url.searchParams.get("collection") !== collection) { + return new Response(JSON.stringify({ records: [] }), { status: 200 }); + } + const cursor = url.searchParams.get("cursor"); + if (!cursor) { + // First page succeeds with a cursor. + return new Response( + JSON.stringify({ + records: [{ uri: `at://${DID_A}/${collection}/p1`, cid: "c1", value: {} }], + cursor: "p2", + }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + } + // Second page 404. + return new Response("not found", { status: 404 }); + }; + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + const error = result.errors.find((e) => e.includes(collection)); + expect(error).toMatch(/404 mid-pagination/); + }); + + it("rejects pages with > MAX_RECORDS_PER_PAGE records (PDS oversize attack)", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const records = Array.from({ length: 250 }, (_, i) => ({ + uri: `at://${DID_A}/${collection}/r${i}`, + cid: `c${i}`, + value: {}, + })); + const fetchImpl = makeFetch({ [collection]: records }); + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + + expect(result.errors.some((e) => e.includes("exceeding cap"))).toBe(true); + // No records enqueued for the over-capped collection. + expect(queue.sent.filter((j) => j.collection === collection)).toHaveLength(0); + }); + + it("rejects records with malformed rkey (atproto rkey grammar violation)", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl = makeFetch({ + [collection]: [ + { uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} }, + { uri: `at://${DID_A}/${collection}/has?queryparam`, cid: "c2", value: {} }, + { uri: `at://${DID_A}/${collection}/has#fragment`, cid: "c3", value: {} }, + { uri: `at://${DID_A}/${collection}/has space`, cid: "c4", value: {} }, + ], + }); + + const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + const sent = queue.sent.filter((j) => j.collection === collection); + expect(sent.map((j) => j.rkey)).toEqual(["legit"]); + expect(result.enqueued).toBe(1); + }); +}); + +describe("backfill admin route: auth + input validation", () => { + it("returns 401 when Authorization header is missing", async () => { + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ dids: [DID_A] }), + }); + expect(res.status).toBe(401); + expect(res.headers.get("www-authenticate")).toBe("Bearer"); + }); + + it("returns 401 with wrong token", async () => { + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { + "content-type": "application/json", + authorization: "Bearer wrong-token", + }, + body: JSON.stringify({ dids: [DID_A] }), + }); + expect(res.status).toBe(401); + }); + + it("returns 405 on GET", async () => { + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "GET", + headers: { authorization: "Bearer test-admin-token" }, + }); + expect(res.status).toBe(405); + }); + + it("returns 400 on missing dids field", async () => { + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { + "content-type": "application/json", + authorization: "Bearer test-admin-token", + }, + body: JSON.stringify({}), + }); + expect(res.status).toBe(400); + expect(await res.text()).toContain("must be an array"); + }); + + it("returns 400 on empty dids array", async () => { + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { + "content-type": "application/json", + authorization: "Bearer test-admin-token", + }, + body: JSON.stringify({ dids: [] }), + }); + expect(res.status).toBe(400); + expect(await res.text()).toContain("not be empty"); + }); + + it("returns 400 on dids list larger than the cap", async () => { + const dids = Array.from( + { length: 1001 }, + (_, i) => `did:plc:test${i.toString().padStart(20, "0")}`, + ); + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { + "content-type": "application/json", + authorization: "Bearer test-admin-token", + }, + body: JSON.stringify({ dids }), + }); + expect(res.status).toBe(400); + expect(await res.text()).toContain("at most 1000"); + }); + + it("returns 400 on malformed DID (caught by DID_PATTERN)", async () => { + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { + "content-type": "application/json", + authorization: "Bearer test-admin-token", + }, + body: JSON.stringify({ dids: ["did:plc:has space"] }), + }); + expect(res.status).toBe(400); + expect(await res.text()).toContain("invalid DID"); + }); + + it("returns 202 with a valid token + body (fires backfill in waitUntil)", async () => { + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { + "content-type": "application/json", + authorization: "Bearer test-admin-token", + }, + body: JSON.stringify({ dids: [DID_A] }), + }); + expect(res.status).toBe(202); + }); + + it("dedupes duplicate DIDs in input", async () => { + // We can't assert the dedup directly through SELF without race-y waits, + // but the route accepts the body and returns 202 — the dedup is exercised + // in parseBackfillBody, which is unit-tested via the 'invalid DID' path + // (a duplicate doesn't surface as an error). Smoke test only. + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { + "content-type": "application/json", + authorization: "Bearer test-admin-token", + }, + body: JSON.stringify({ dids: [DID_A, DID_A, DID_A] }), + }); + expect(res.status).toBe(202); + }); +}); + +describe("admin start route: auth", () => { + it("returns 401 without token", async () => { + const res = await SELF.fetch("https://test/_admin/start"); + expect(res.status).toBe(401); + }); + + it("returns 204 with valid token", async () => { + const res = await SELF.fetch("https://test/_admin/start", { + headers: { authorization: "Bearer test-admin-token" }, + }); + expect(res.status).toBe(204); + }); +}); diff --git a/apps/aggregator/vitest.config.ts b/apps/aggregator/vitest.config.ts index 0958ca0b5..c44bb0200 100644 --- a/apps/aggregator/vitest.config.ts +++ b/apps/aggregator/vitest.config.ts @@ -33,7 +33,15 @@ export default defineConfig({ cloudflareTest({ wrangler: { configPath: "./wrangler.jsonc" }, miniflare: { - bindings: { TEST_MIGRATIONS: migrations }, + bindings: { + TEST_MIGRATIONS: migrations, + // Stub admin auth token so tests can exercise the auth-gated + // admin routes without needing a real secret in the test + // environment. Production deploys pull from + // `wrangler secret put ADMIN_TOKEN`; the value below only + // applies inside the workers test pool. + ADMIN_TOKEN: "test-admin-token", + }, }, }), ], diff --git a/apps/aggregator/worker-configuration.d.ts b/apps/aggregator/worker-configuration.d.ts index 732bb5f4e..4af216a0d 100644 --- a/apps/aggregator/worker-configuration.d.ts +++ b/apps/aggregator/worker-configuration.d.ts @@ -1,5 +1,5 @@ /* eslint-disable */ -// Generated by Wrangler by running `wrangler types` (hash: e774431debb87e9b7f5a2fa40582abe1) +// Generated by Wrangler by running `wrangler types` (hash: 1f938082c03573892ce397f54a9ca0b6) // Runtime types generated with workerd@1.20260507.1 2026-02-24 nodejs_compat declare namespace Cloudflare { interface GlobalProps { @@ -11,6 +11,7 @@ declare namespace Cloudflare { RECORDS_QUEUE: Queue; JETSTREAM_URL: "wss://jetstream2.us-east.bsky.network/subscribe"; CONSTELLATION_URL: "https://constellation.microcosm.blue"; + ADMIN_TOKEN: string; RECORDS_DO: DurableObjectNamespace; } } @@ -20,7 +21,7 @@ type StringifyValues> = { }; declare namespace NodeJS { interface ProcessEnv extends StringifyValues< - Pick + Pick > {} } diff --git a/apps/aggregator/wrangler.jsonc b/apps/aggregator/wrangler.jsonc index 416856c22..42b24ae0a 100644 --- a/apps/aggregator/wrangler.jsonc +++ b/apps/aggregator/wrangler.jsonc @@ -80,6 +80,13 @@ // Constellation HTTP API for cold-start backfill DID discovery. "CONSTELLATION_URL": "https://constellation.microcosm.blue", }, + // Required secrets, declared in config so `wrangler types` generates the + // typed binding and `wrangler deploy` refuses to ship without them set. + // Set in production with `wrangler secret put ADMIN_TOKEN`; tests bind + // a stub value via miniflare in vitest.config.ts. + "secrets": { + "required": ["ADMIN_TOKEN"], + }, "observability": { "enabled": true, }, From 94e8146f13335a2378a20c9373e3c37f09def87b Mon Sep 17 00:00:00 2001 From: Matt Kane Date: Sun, 10 May 2026 16:57:22 +0100 Subject: [PATCH 2/8] fix(aggregator): round 2 review fixes for backfill + admin auth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 found a HIGH (sendBatch cap mismatch), 2 MEDIUM (Bearer case-sensitivity, unbounded enqueue cost), and several LOWs. All addressed. HIGH - MAX_RECORDS_PER_PAGE was 200 (PAGE_SIZE * 2) but Cloudflare Queues' sendBatch hard-caps at 100. A real PDS that loosely interpreted the ?limit= query and returned 101+ records would have made our sendBatch throw, dropping the entire collection's records via the per-collection try/catch. The in-memory CapturingQueue test stub accepted any size, so the test suite didn't catch the production divergence. Lowered the cap to 100 so a compliant page maps 1:1 to one batch send; oversize pages throw with a clear "exceeding per-page cap" message rather than the queue swallowing them. Added a defense-in-depth check at the sendBatch site too. MEDIUM - Bearer scheme was matched case-sensitively. RFC 6750 §2.1 makes the auth scheme case-insensitive; `curl -H "authorization: bearer ..."` with lowercase scheme would have got a 401 with a valid token. Now lowercase-compare the prefix. - Unbounded enqueue cost from a single authenticated POST. Worst case theoretical 400M queue messages per request (1000 DIDs × 4 collections × 1000 pages × 100 records = ~$160 in queue billing). Added MAX_TOTAL_ENQUEUE = 50_000 as a per-invocation cap, shared across DIDs via a budget object. Reaching the cap throws EnqueueLimitReached; the outer loop catches and stops processing further DIDs, logs a warning so operators see partial completion. Bounds blast radius if ADMIN_TOKEN ever leaks. LOW - `/_admin/start` accepted any HTTP method, contradicting its own documented POST usage. Now requires POST (returns 405 with `allow: POST` on other methods), matching `/_admin/backfill`. - Per-collection enqueued counter undercounted on mid-pagination throws. Switched from `enqueued += await backfillCollection(...)` (which loses the partial count when the await rejects) to tracking via the shared budget object's `before - after` delta, computed in both the success and the error paths. - wrangler.jsonc `secrets.required` comment overstated the enforcement guarantees. Updated to be accurate about the `wrangler dev` warning + `wrangler deploy` block + the per-named-environment inheritance gotcha. Documents the runtime 503 fail-closed behavior so misconfigured deploys can't leave admin routes unauth-passable. - Added .dev.vars.example for new contributors. Local devs who don't create .dev.vars otherwise hit a confusing 503 on admin routes; example file makes the requirement obvious. NEW TESTS - Enqueue budget cap: budget of 30 stops the loop after 30 records enqueued. - POST-only on /_admin/start: GET returns 405; POST + valid token returns 204. - Case-insensitive Bearer: lowercase scheme accepted. 131 tests pass (was 128). Typecheck clean. Zero lint diagnostics in apps/aggregator. NOT FIXED - worker-configuration.d.ts is committed (round-2 LOW). Repo-wide convention; left for a separate Discussion rather than changed here. --- apps/aggregator/.dev.vars.example | 8 ++ apps/aggregator/src/backfill.ts | 113 ++++++++++++++++++++++---- apps/aggregator/src/index.ts | 19 ++++- apps/aggregator/test/backfill.test.ts | 53 +++++++++++- apps/aggregator/wrangler.jsonc | 16 +++- 5 files changed, 183 insertions(+), 26 deletions(-) create mode 100644 apps/aggregator/.dev.vars.example diff --git a/apps/aggregator/.dev.vars.example b/apps/aggregator/.dev.vars.example new file mode 100644 index 000000000..f093bab7a --- /dev/null +++ b/apps/aggregator/.dev.vars.example @@ -0,0 +1,8 @@ +# Local-dev secrets for the aggregator Worker. Copy to `.dev.vars` (which is +# gitignored) and customise. `wrangler dev` reads this file and binds the +# values into `env`. Production secrets are managed via +# `wrangler secret put ` and never appear in the repo. + +# Bearer token for the /_admin/* routes. Any non-empty string works locally; +# pick something you'd recognise in `wrangler tail` logs. +ADMIN_TOKEN=dev-only-not-for-production diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts index c91f300df..973eb6fce 100644 --- a/apps/aggregator/src/backfill.ts +++ b/apps/aggregator/src/backfill.ts @@ -27,11 +27,36 @@ const PAGE_SIZE = 100; * which is past anything we'd legitimately backfill in one shot. */ const MAX_PAGES_PER_COLLECTION = 1000; /** Defensive cap on records per page. Real PDSes honour the `limit` query - * param; this guards against a hostile PDS returning an enormous array. */ -const MAX_RECORDS_PER_PAGE = PAGE_SIZE * 2; + * param; this guards against a hostile PDS returning an enormous array. + * Capped at the same width as Cloudflare Queues' sendBatch (100) so a + * compliant page maps 1:1 to one batch send; oversize pages are rejected + * rather than chunked, surfacing the PDS's spec violation as a partial + * failure the operator can investigate. */ +const MAX_RECORDS_PER_PAGE = PAGE_SIZE; +/** Cloudflare Queues' hard cap on `sendBatch` size. Per-page enqueues are + * always ≤ this thanks to MAX_RECORDS_PER_PAGE; documented here so the + * relationship is visible at the call site. */ +const QUEUE_SEND_BATCH_CAP = 100; +/** Cap on the total number of records a single backfill invocation may + * enqueue. Defends against a leaked admin token being weaponised to flood + * the queue at near-zero cost to the attacker. The blast radius of one + * compromised POST is bounded by this number × per-message billing. + * + * 50k = ~50 plausible publishers × ~250 records each, well past v1 scale. + * Worker waitUntil exhausts long before this in any honest workload. */ +const MAX_TOTAL_ENQUEUE = 50_000; /** Atproto rkey grammar: ALPHA / DIGIT / "." / "-" / "_" / ":" / "~". */ const RKEY_PATTERN = /^[A-Za-z0-9._:~-]{1,512}$/; +/** Thrown when the per-invocation enqueue cap is reached. Caller catches + * and stops processing further DIDs / collections; partial work already + * committed to the queue is what it is (consumer is idempotent on retry). */ +export class EnqueueLimitReached extends Error { + constructor(public readonly limit: number) { + super(`backfill enqueue cap of ${limit} reached`); + } +} + /** Producer-side queue surface. The production binding `env.RECORDS_QUEUE` * satisfies this; tests pass an in-memory implementation. The return type * is `unknown` rather than `void` so workerd's `Queue.send` (which returns @@ -68,29 +93,55 @@ export interface BackfillSummary { /** * Backfill multiple DIDs serially. Per-DID failures don't stop the loop — * one bad publisher doesn't block the rest. + * + * Aborts the loop early if the per-invocation enqueue cap is reached. The + * partial summary still reflects what got through; the operator can re-run + * with the unfinished tail of the DID list. */ export async function backfillDids( dids: readonly string[], deps: BackfillDeps, ): Promise { const results: BackfillDidResult[] = []; - let totalEnqueued = 0; + const budget: EnqueueBudget = { remaining: MAX_TOTAL_ENQUEUE }; for (const did of dids) { - const result = await backfillDid(did, deps); + const result = await backfillDid(did, deps, budget); results.push(result); - totalEnqueued += result.enqueued; deps.onDidComplete?.(result); + if (budget.remaining <= 0) { + console.warn("[aggregator] backfill enqueue cap reached, aborting remaining DIDs", { + cap: MAX_TOTAL_ENQUEUE, + didsProcessed: results.length, + didsRemaining: dids.length - results.length, + }); + break; + } } + const totalEnqueued = results.reduce((sum, r) => sum + r.enqueued, 0); return { totalEnqueued, results }; } +/** Mutable counter shared across the per-DID / per-collection loops so the + * cap is global to one backfill invocation. */ +interface EnqueueBudget { + remaining: number; +} + /** * Backfill a single DID across every collection in `WANTED_COLLECTIONS`. * Resolution failure aborts (we can't enqueue verifiable jobs without * knowing the PDS); per-collection listRecords failures are recorded and * the loop continues to the next collection. + * + * Optional `budget` shares the per-invocation enqueue cap across DIDs. + * Tests that call this directly without a budget get an effectively + * unbounded one. */ -export async function backfillDid(did: string, deps: BackfillDeps): Promise { +export async function backfillDid( + did: string, + deps: BackfillDeps, + budget: EnqueueBudget = { remaining: MAX_TOTAL_ENQUEUE }, +): Promise { const errors: string[] = []; let enqueued = 0; @@ -108,17 +159,29 @@ export async function backfillDid(did: string, deps: BackfillDeps): Promise { - let enqueued = 0; +async function backfillCollection(opts: BackfillCollectionOpts): Promise { let cursor: string | undefined; let prevCursor: string | undefined; let pages = 0; do { + if (opts.budget.remaining <= 0) throw new EnqueueLimitReached(MAX_TOTAL_ENQUEUE); if (++pages > MAX_PAGES_PER_COLLECTION) { throw new Error(`exceeded ${MAX_PAGES_PER_COLLECTION} pages`); } @@ -171,7 +242,7 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise if (response.status === 404) { if (cursor === undefined) { // First-page 404: publisher has no records of this collection. - return enqueued; + return; } // Mid-pagination 404 is a partial failure; surface it. throw new Error(`listRecords returned 404 mid-pagination at cursor=${cursor}`); @@ -184,13 +255,14 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise const records = extractListRecordsBody(body); if (records.length > MAX_RECORDS_PER_PAGE) { throw new Error( - `PDS returned ${records.length} records, exceeding cap of ${MAX_RECORDS_PER_PAGE}`, + `PDS returned ${records.length} records, exceeding per-page cap of ${MAX_RECORDS_PER_PAGE}`, ); } cursor = extractCursor(body); const messages: { body: RecordsJob }[] = []; for (const record of records) { + if (opts.budget.remaining - messages.length <= 0) break; const rkey = parseRkeyFromUri(record.uri, opts.collection); if (!rkey) continue; messages.push({ @@ -204,14 +276,21 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise }); } if (messages.length > 0) { - // Single batched send per page — far cheaper than the per-record - // awaits this used to do (which couldn't possibly finish a - // 4000-record DID inside the waitUntil budget). + // Page size is capped at QUEUE_SEND_BATCH_CAP, so this single + // sendBatch never exceeds Cloudflare's 100-message limit. + if (messages.length > QUEUE_SEND_BATCH_CAP) { + // Defense in depth — should be unreachable given the cap + // above, but throwing loudly here beats a runtime error from + // the queue binding silently dropping the batch. + throw new Error( + `sendBatch payload of ${messages.length} exceeds Queue cap of ${QUEUE_SEND_BATCH_CAP}`, + ); + } await opts.queue.sendBatch(messages); - enqueued += messages.length; + opts.budget.remaining -= messages.length; } + if (opts.budget.remaining <= 0) throw new EnqueueLimitReached(MAX_TOTAL_ENQUEUE); } while (cursor); - return enqueued; } interface ListRecordEntry { diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts index 470f44f71..f06aa7d05 100644 --- a/apps/aggregator/src/index.ts +++ b/apps/aggregator/src/index.ts @@ -82,13 +82,22 @@ function requireAdminAuth(request: Request, env: Env): Response | null { return new Response("admin endpoints not configured", { status: 503 }); } const auth = request.headers.get("authorization"); - if (!auth || !auth.startsWith("Bearer ")) { + const SCHEME_PREFIX = "bearer "; + // RFC 6750 §2.1: the auth scheme is case-insensitive. `curl -H + // "authorization: bearer ..."` and SDKs that don't canonicalise the + // scheme would otherwise fail with a confusing 401 even though the + // token is correct. + if ( + !auth || + auth.length < SCHEME_PREFIX.length || + auth.slice(0, SCHEME_PREFIX.length).toLowerCase() !== SCHEME_PREFIX + ) { return new Response("unauthorized", { status: 401, headers: { "www-authenticate": "Bearer" }, }); } - const token = auth.slice("Bearer ".length); + const token = auth.slice(SCHEME_PREFIX.length); if (!timingSafeEqual(token, expected)) { return new Response("unauthorized", { status: 401, @@ -130,6 +139,12 @@ export default { async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise { const url = new URL(request.url); if (url.pathname === BOOTSTRAP_PATH) { + if (request.method !== "POST") { + return new Response("method not allowed", { + status: 405, + headers: { allow: "POST" }, + }); + } const denied = requireAdminAuth(request, env); if (denied) return denied; const id = env.RECORDS_DO.idFromName(RECORDS_DO_NAME); diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts index fae806e85..b65d6a2e7 100644 --- a/apps/aggregator/test/backfill.test.ts +++ b/apps/aggregator/test/backfill.test.ts @@ -568,7 +568,7 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => { const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - expect(result.errors.some((e) => e.includes("exceeding cap"))).toBe(true); + expect(result.errors.some((e) => e.includes("per-page cap"))).toBe(true); // No records enqueued for the over-capped collection. expect(queue.sent.filter((j) => j.collection === collection)).toHaveLength(0); }); @@ -594,6 +594,36 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => { }); }); +describe("backfillCollection: enqueue budget cap", () => { + it("stops enqueuing when MAX_TOTAL_ENQUEUE is reached and reports the partial count", async () => { + // Force the cap by giving a tiny budget. 50 records × 4 collections = + // 200 records the worker would otherwise enqueue; budget of 30 should + // cap it after the first collection's first page. + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const recordsByCollection: Record = {}; + for (const c of WANTED_COLLECTIONS) { + recordsByCollection[c] = Array.from({ length: 50 }, (_, i) => ({ + uri: `at://${DID_A}/${c}/r${i}`, + cid: `c${i}`, + value: {}, + })); + } + const fetchImpl = makeFetch(recordsByCollection); + const result = await backfillDid( + DID_A, + { resolver, queue, fetch: fetchImpl }, + { remaining: 30 }, + ); + // The first collection's full page (50 records) was within the cap? + // No — after 30 records the budget hits 0, the inner break fires, + // the page's batch send carries 30 records, then the post-send + // throw aborts further pages. + expect(result.enqueued).toBeLessThanOrEqual(30); + expect(queue.sent.length).toBeLessThanOrEqual(30); + }); +}); + describe("backfill admin route: auth + input validation", () => { it("returns 401 when Authorization header is missing", async () => { const res = await SELF.fetch("https://test/_admin/backfill", { @@ -710,16 +740,31 @@ describe("backfill admin route: auth + input validation", () => { }); }); -describe("admin start route: auth", () => { - it("returns 401 without token", async () => { +describe("admin start route: auth + method", () => { + it("returns 405 on GET (POST-only route)", async () => { const res = await SELF.fetch("https://test/_admin/start"); + expect(res.status).toBe(405); + expect(res.headers.get("allow")).toBe("POST"); + }); + + it("returns 401 on POST without token", async () => { + const res = await SELF.fetch("https://test/_admin/start", { method: "POST" }); expect(res.status).toBe(401); }); - it("returns 204 with valid token", async () => { + it("returns 204 on POST with valid token", async () => { const res = await SELF.fetch("https://test/_admin/start", { + method: "POST", headers: { authorization: "Bearer test-admin-token" }, }); expect(res.status).toBe(204); }); + + it("accepts case-insensitive Bearer scheme (RFC 6750 §2.1)", async () => { + const res = await SELF.fetch("https://test/_admin/start", { + method: "POST", + headers: { authorization: "bearer test-admin-token" }, + }); + expect(res.status).toBe(204); + }); }); diff --git a/apps/aggregator/wrangler.jsonc b/apps/aggregator/wrangler.jsonc index 42b24ae0a..6e0fc8287 100644 --- a/apps/aggregator/wrangler.jsonc +++ b/apps/aggregator/wrangler.jsonc @@ -81,9 +81,19 @@ "CONSTELLATION_URL": "https://constellation.microcosm.blue", }, // Required secrets, declared in config so `wrangler types` generates the - // typed binding and `wrangler deploy` refuses to ship without them set. - // Set in production with `wrangler secret put ADMIN_TOKEN`; tests bind - // a stub value via miniflare in vitest.config.ts. + // typed binding and `wrangler deploy` validates the secret is set on the + // target Worker before publishing. `wrangler dev` warns at startup when a + // required secret isn't in `.dev.vars` / `.env`. + // + // Set in production with `wrangler secret put ADMIN_TOKEN`. Tests bind + // a stub via miniflare in vitest.config.ts. Local dev pulls from + // `.dev.vars` (gitignored; see `.dev.vars.example`). + // + // The `requireAdminAuth` runtime guard fails closed (503) if the binding + // is somehow empty at request time, so even a misconfigured deploy can't + // leave the admin routes unauth-passable. NOTE: this `secrets` block is + // not inherited by named environments — repeat it under each `env.` + // you add or that env's deploys won't enforce the secret. "secrets": { "required": ["ADMIN_TOKEN"], }, From 5895450e582fac48839312a535fe15e28ae6c4c8 Mon Sep 17 00:00:00 2001 From: Matt Kane Date: Sun, 10 May 2026 17:18:25 +0100 Subject: [PATCH 3/8] fix(aggregator): round 3 review fixes for backfill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 3 found 1 HIGH (no listRecords timeout), 2 MEDIUM (EnqueueLimitReached observability + slack budget-cap test), 2 LOW (whitespace-only ADMIN_TOKEN, log discrepancy on early-abort), 1 NIT (unreachable defense-in-depth check that drops records when triggered). All addressed. HIGH - listRecords fetch had no timeout. A hung publisher PDS that accepted the connection but stalled the body would have blocked `await fetchImpl(...)` until workerd's overall sub-request budget exhausted, starving every DID after it in the serial loop. pds-verify uses an AbortController for the same kind of upstream call; inconsistent. Now wraps the listRecords fetch in an AbortController + 15s default timeout (override via `BackfillDeps.listRecordsTimeoutMs` for tests). On AbortError, throws with cause attached so per-collection error log includes the underlying signal. MEDIUM - EnqueueLimitReached returned without recording an error in `BackfillDidResult.errors`. Operator's per-DID log showed `enqueued: 47, errors: []` for a half-processed DID — looked like a successful 47-record publisher. Now appends a sentinel error ("enqueue cap reached mid-DID at collection X — partial backfill, re-run to complete") so partial completion is visible. Also added triggerDid to the warn log in `backfillDids` so operators can see WHICH DID drained the budget. - Budget-cap test asserted `<= 30`, which would silently accept off-by-one regressions in the inner break (capping at 29 or 0 would still pass). The exact answer is 30 (budget=30 + page of 50 → inner break at iter 31, sendBatch sends 30, post-send throw fires). Now asserts `toBe(30)`. Also asserts the sentinel error appears. LOW - Whitespace-only ADMIN_TOKEN was accepted. `if (!expected)` catches "" and undefined but accepts " " (operator pastes a blank line into `wrangler secret put`). Now `expected.trim().length === 0` also fails closed. - `runBackfill` summary log conflated didCount (requested) with didsProcessed (actual). When the cap fires and the loop breaks early, the log said `didCount: 1000, totalEnqueued: 50000` even if only 200 DIDs ran. Now logs both `didsRequested` and `didsProcessed` separately so the early-abort case is visible in the summary line without correlating against an earlier warn. NIT - Defense-in-depth `if (messages.length > QUEUE_SEND_BATCH_CAP)` inside the page loop was unreachable given `MAX_RECORDS_PER_PAGE === QUEUE_SEND_BATCH_CAP === 100`, AND if it ever fired (future bump of MAX_RECORDS_PER_PAGE), it would drop the entire page silently into errors[]. Replaced with a module-load static guard that throws at boot if the invariant is violated — better failure mode (boot crash vs silent data loss) and the runtime check disappears. NEW TESTS - listRecords timeout: hung-fetch stub never resolves, abort fires at 25ms, error message matches "timed out after 25ms". - Case-insensitive Bearer regression guards: canonical "Bearer", uppercase "BEARER", non-Bearer "Basic" rejected, empty token after prefix rejected. 136 tests pass (was 131). Typecheck clean. Zero lint diagnostics in apps/aggregator. CONVERGENCE NOTE Round 1 → 12 bugs. Round 2 → 1 HIGH + 2 MED + 5 LOW = 8 bugs. Round 3 → 1 HIGH + 2 MED + 2 LOW + 1 NIT = 6 findings, only the HIGH was a behavior bug; rest were quality / observability / test-strength. Reviewer assessed convergence is real but not complete; only the HIGH would have blocked. The slice has now had three rounds with diminishing severity and no new bugs introduced by prior fix rounds. --- apps/aggregator/src/backfill.ts | 60 ++++++++++++++---- apps/aggregator/src/index.ts | 12 +++- apps/aggregator/test/backfill.test.ts | 89 +++++++++++++++++++++++++-- 3 files changed, 140 insertions(+), 21 deletions(-) diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts index 973eb6fce..a61475b47 100644 --- a/apps/aggregator/src/backfill.ts +++ b/apps/aggregator/src/backfill.ts @@ -21,6 +21,12 @@ import type { RecordsJob } from "./env.js"; import { isPlainObject } from "./utils.js"; const PAGE_SIZE = 100; +/** Per-listRecords-page timeout. A hostile or hung publisher PDS that + * accepts the connection but stalls the body would otherwise block the + * fetch until workerd's overall sub-request budget exhausts — starving + * every DID after this one in the serial loop. Same shape as + * `pds-verify.ts`'s fetchCar timeout. */ +const LIST_RECORDS_TIMEOUT_MS = 15_000; /** Cap on listRecords pagination per collection. A buggy or malicious PDS * that echoes the same cursor would otherwise loop forever inside one * `ctx.waitUntil`. 1000 pages × 100 records = 100k records per collection, @@ -37,6 +43,14 @@ const MAX_RECORDS_PER_PAGE = PAGE_SIZE; * always ≤ this thanks to MAX_RECORDS_PER_PAGE; documented here so the * relationship is visible at the call site. */ const QUEUE_SEND_BATCH_CAP = 100; +// Static guard: bumping MAX_RECORDS_PER_PAGE above the queue's batch cap +// would silently break sendBatch in production. Surface the violation at +// module load rather than at the first batch send. +if (MAX_RECORDS_PER_PAGE > QUEUE_SEND_BATCH_CAP) { + throw new Error( + `MAX_RECORDS_PER_PAGE (${MAX_RECORDS_PER_PAGE}) exceeds QUEUE_SEND_BATCH_CAP (${QUEUE_SEND_BATCH_CAP})`, + ); +} /** Cap on the total number of records a single backfill invocation may * enqueue. Defends against a leaked admin token being weaponised to flood * the queue at near-zero cost to the attacker. The blast radius of one @@ -70,6 +84,10 @@ export interface BackfillDeps { queue: BackfillQueue; /** Inject for tests; defaults to `globalThis.fetch`. */ fetch?: typeof fetch; + /** Override for the per-listRecords-page timeout. Defaults to + * `LIST_RECORDS_TIMEOUT_MS`. Tests use a small value to exercise the + * abort path without burning the production budget. */ + listRecordsTimeoutMs?: number; /** Optional callback fired after each DID completes (success or failure). * Used by the production wiring to log per-DID progress so an operator * watching `wrangler tail` sees where a long backfill is up to. */ @@ -111,6 +129,7 @@ export async function backfillDids( if (budget.remaining <= 0) { console.warn("[aggregator] backfill enqueue cap reached, aborting remaining DIDs", { cap: MAX_TOTAL_ENQUEUE, + triggerDid: result.did, didsProcessed: results.length, didsRemaining: dids.length - results.length, }); @@ -158,6 +177,7 @@ export async function backfillDid( } const fetchImpl = deps.fetch ?? fetch; + const timeoutMs = deps.listRecordsTimeoutMs ?? LIST_RECORDS_TIMEOUT_MS; for (const collection of WANTED_COLLECTIONS) { // Track this collection's enqueues separately so a mid-pagination // throw still reports the partial count instead of swallowing the @@ -170,13 +190,20 @@ export async function backfillDid( collection, queue: deps.queue, fetchImpl, + timeoutMs, budget, }); } catch (err) { if (err instanceof EnqueueLimitReached) { // Stop processing further collections for this DID; outer // loop will catch the budget exhaustion and stop entirely. + // Append a sentinel error so the operator's per-DID log + // shows the cap fired (otherwise `enqueued: 47, errors: []` + // looks like a successful 47-record DID). enqueued += before - budget.remaining; + errors.push( + `enqueue cap reached mid-DID at collection ${collection} — partial backfill, re-run to complete`, + ); return { did, enqueued, errors }; } errors.push(`${collection}: ${err instanceof Error ? err.message : String(err)}`); @@ -193,6 +220,7 @@ interface BackfillCollectionOpts { collection: string; queue: BackfillQueue; fetchImpl: typeof fetch; + timeoutMs: number; budget: EnqueueBudget; } @@ -236,9 +264,22 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise { url.searchParams.set("limit", String(PAGE_SIZE)); if (cursor) url.searchParams.set("cursor", cursor); - const response = await opts.fetchImpl(url.toString(), { - headers: { accept: "application/json" }, - }); + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), opts.timeoutMs); + let response: Response; + try { + response = await opts.fetchImpl(url.toString(), { + headers: { accept: "application/json" }, + signal: controller.signal, + }); + } catch (err) { + if (err instanceof Error && err.name === "AbortError") { + throw new Error(`listRecords timed out after ${opts.timeoutMs}ms`, { cause: err }); + } + throw err; + } finally { + clearTimeout(timer); + } if (response.status === 404) { if (cursor === undefined) { // First-page 404: publisher has no records of this collection. @@ -276,16 +317,9 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise { }); } if (messages.length > 0) { - // Page size is capped at QUEUE_SEND_BATCH_CAP, so this single - // sendBatch never exceeds Cloudflare's 100-message limit. - if (messages.length > QUEUE_SEND_BATCH_CAP) { - // Defense in depth — should be unreachable given the cap - // above, but throwing loudly here beats a runtime error from - // the queue binding silently dropping the batch. - throw new Error( - `sendBatch payload of ${messages.length} exceeds Queue cap of ${QUEUE_SEND_BATCH_CAP}`, - ); - } + // Page size is capped at QUEUE_SEND_BATCH_CAP at module load via + // the static assertion above, so this sendBatch never exceeds + // Cloudflare's 100-message limit by construction. await opts.queue.sendBatch(messages); opts.budget.remaining -= messages.length; } diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts index f06aa7d05..a3f42d8df 100644 --- a/apps/aggregator/src/index.ts +++ b/apps/aggregator/src/index.ts @@ -77,7 +77,11 @@ function timingSafeEqual(a: string, b: string): boolean { */ function requireAdminAuth(request: Request, env: Env): Response | null { const expected = env.ADMIN_TOKEN; - if (!expected) { + // `trim()` defends against a whitespace-only secret slipping past + // `secrets.required`'s presence check — `wrangler secret put ADMIN_TOKEN` + // followed by an accidental Enter would otherwise produce a working + // endpoint with a trivially guessable token. + if (!expected || expected.trim().length === 0) { // Misconfigured production or unset dev — closed by default. return new Response("admin endpoints not configured", { status: 503 }); } @@ -249,7 +253,11 @@ async function runBackfill(dids: readonly string[], env: Env): Promise { }, }); console.log("[aggregator] backfill complete", { - didCount: dids.length, + didsRequested: dids.length, + // Separate from didsRequested so the early-abort case (enqueue + // cap fired) is visible in the summary log without the operator + // having to correlate against an earlier warn line. + didsProcessed: summary.results.length, totalEnqueued: summary.totalEnqueued, didsWithErrors: summary.results.filter((r) => r.errors.length > 0).length, }); diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts index b65d6a2e7..14e93fc4b 100644 --- a/apps/aggregator/test/backfill.test.ts +++ b/apps/aggregator/test/backfill.test.ts @@ -615,12 +615,55 @@ describe("backfillCollection: enqueue budget cap", () => { { resolver, queue, fetch: fetchImpl }, { remaining: 30 }, ); - // The first collection's full page (50 records) was within the cap? - // No — after 30 records the budget hits 0, the inner break fires, - // the page's batch send carries 30 records, then the post-send - // throw aborts further pages. - expect(result.enqueued).toBeLessThanOrEqual(30); - expect(queue.sent.length).toBeLessThanOrEqual(30); + // Budget=30 + page of 50 records: inner loop pushes records while + // `budget.remaining - messages.length > 0`. At iter 31 the + // expression (30 - 30) ≤ 0 fires `break`. messages.length === 30, + // sendBatch sends 30, budget → 0, post-send check throws + // EnqueueLimitReached. Exact answer is 30 — assert the exact + // number so an off-by-one regression in the inner break doesn't + // pass under a slack `<=`. + expect(result.enqueued).toBe(30); + expect(queue.sent.length).toBe(30); + // Sentinel error appended so the operator can see the cap fired + // rather than thinking the DID was a clean 30-record publisher. + expect(result.errors.some((e) => e.includes("enqueue cap reached"))).toBe(true); + }); +}); + +describe("backfillCollection: listRecords timeout", () => { + it("aborts a hung PDS fetch and surfaces the timeout per-collection", async () => { + const queue = new CapturingQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl: typeof fetch = (input, init) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + if (url.searchParams.get("collection") !== collection) { + return Promise.resolve(new Response(JSON.stringify({ records: [] }), { status: 200 })); + } + // Hung connection: never resolve, but honour the abort signal so + // the AbortController-driven timeout fires. + return new Promise((_resolve, reject) => { + init?.signal?.addEventListener("abort", () => { + reject(new DOMException("aborted", "AbortError")); + }); + }); + }; + + const result = await backfillDid(DID_A, { + resolver, + queue, + fetch: fetchImpl, + listRecordsTimeoutMs: 25, + }); + + const error = result.errors.find((e) => e.includes(collection)); + expect(error).toMatch(/timed out after 25ms/); }); }); @@ -740,6 +783,40 @@ describe("backfill admin route: auth + input validation", () => { }); }); +describe("admin auth: scheme + token edge cases", () => { + it("accepts canonical mixed-case Bearer (regression guard)", async () => { + const res = await SELF.fetch("https://test/_admin/start", { + method: "POST", + headers: { authorization: "Bearer test-admin-token" }, + }); + expect(res.status).toBe(204); + }); + + it("accepts uppercase BEARER scheme (RFC 6750 case-insensitive)", async () => { + const res = await SELF.fetch("https://test/_admin/start", { + method: "POST", + headers: { authorization: "BEARER test-admin-token" }, + }); + expect(res.status).toBe(204); + }); + + it("rejects an Authorization header with a non-Bearer scheme", async () => { + const res = await SELF.fetch("https://test/_admin/start", { + method: "POST", + headers: { authorization: "Basic dGVzdC1hZG1pbi10b2tlbjo=" }, + }); + expect(res.status).toBe(401); + }); + + it("rejects empty token after Bearer prefix", async () => { + const res = await SELF.fetch("https://test/_admin/start", { + method: "POST", + headers: { authorization: "Bearer " }, + }); + expect(res.status).toBe(401); + }); +}); + describe("admin start route: auth + method", () => { it("returns 405 on GET (POST-only route)", async () => { const res = await SELF.fetch("https://test/_admin/start"); From 31eebc752a6ad13ee8e1a499d25dd85b22dbea09 Mon Sep 17 00:00:00 2001 From: Matt Kane Date: Sun, 10 May 2026 18:28:32 +0100 Subject: [PATCH 4/8] refactor(aggregator): fan backfill into a queue to escape the 30s waitUntil cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous backfill ran one giant serial loop inside the POST handler's `ctx.waitUntil`, which Cloudflare hard-caps at 30 seconds wall-clock. Realistically that limited a single backfill POST to ~15-25 DIDs before in-flight work was cancelled, regardless of per-DID work being well-bounded. Restructured into a dedicated `BACKFILL_QUEUE` of `{did, collection}` jobs. The POST handler now just discovers DIDs (via relay or explicit list) and fans pairs onto the queue via `sendBatch` — fast and trivially fits in waitUntil. A new per-pair consumer (`backfill-consumer.ts`) does the listRecords pagination + records-queue fan-out work asynchronously, with queue retry/backoff/concurrency for free. Cap changes for the new fan-out threat model: - `MAX_BACKFILL_DIDS` lowered 1000 → 100 (each submitted DID becomes `WANTED_COLLECTIONS.length` consumer invocations, so the explicit-list ceiling needs to be much lower than the old serial-loop cap). - New `MAX_DISCOVERED_DIDS = 1000` defends discovery against a runaway relay flooding the queue. - Dropped the global `MAX_TOTAL_ENQUEUE` budget — it doesn't translate across consumer invocations. Per-pair work is still bounded by `MAX_PAGES_PER_COLLECTION` × `MAX_RECORDS_PER_PAGE`. --- apps/aggregator/src/backfill-consumer.ts | 90 +++ apps/aggregator/src/backfill.ts | 384 +++++++----- apps/aggregator/src/env.ts | 15 + apps/aggregator/src/index.ts | 180 ++++-- apps/aggregator/test/backfill.test.ts | 731 +++++++++++++--------- apps/aggregator/worker-configuration.d.ts | 7 +- apps/aggregator/wrangler.jsonc | 36 +- 7 files changed, 922 insertions(+), 521 deletions(-) create mode 100644 apps/aggregator/src/backfill-consumer.ts diff --git a/apps/aggregator/src/backfill-consumer.ts b/apps/aggregator/src/backfill-consumer.ts new file mode 100644 index 000000000..727b0019e --- /dev/null +++ b/apps/aggregator/src/backfill-consumer.ts @@ -0,0 +1,90 @@ +/** + * Backfill queue consumer. Pulls one `BackfillJob` at a time and walks + * `com.atproto.repo.listRecords` for that (DID, collection) pair, batching + * results onto the records queue for the standard verify-and-write path. + * + * Why a separate queue from records: per-pair work (PDS resolution + + * paginated listRecords + sendBatch onto the records queue) is bounded but + * non-trivial — running it inside the records-queue consumer would burn the + * sub-request budget for jobs that should just be writing to D1. Keeping + * the queues separate also lets the operator throttle backfill work + * independently of live ingest. + * + * Error policy: + * - Per-pair `processBackfillJob` throw → `message.retry()`. Cloudflare + * Queues backs off and retries; after `max_retries` (3, configured in + * wrangler.jsonc) the message lands in `emdash-aggregator-backfill-dlq`. + * - Unexpected throws inside the batch loop are caught per-message so one + * bad job can't poison the rest of the batch. + * + * The DLQ is intentionally not auto-drained today — backfill is operator- + * triggered, so DLQ inspection is part of the operator's workflow when a + * backfill POST shows partial completion in `wrangler tail`. A drain + * consumer can land later when we have a clear ack policy (probably: + * write a row to D1 and ack, like the records-DLQ drain). + */ + +import { + AtprotoWebDidDocumentResolver, + CompositeDidDocumentResolver, + PlcDidDocumentResolver, +} from "@atcute/identity-resolver"; + +import { processBackfillJob, type ProcessBackfillJobDeps } from "./backfill.js"; +import { createD1DidDocCache, DidResolver } from "./did-resolver.js"; +import type { BackfillJob } from "./env.js"; +import type { MessageBatchLike } from "./records-consumer.js"; + +/** + * Process one batch of backfill jobs. Mirrors `records-consumer.processBatch`'s + * shape: per-message try/catch, ack on success, retry on throw. + * + * `depsOverride` is the test seam — production calls without it and gets + * the standard composite resolver wired against `env.DB`. + */ +export async function processBackfillBatch( + batch: MessageBatchLike, + env: Env, + depsOverride?: ProcessBackfillJobDeps, +): Promise { + const deps = depsOverride ?? createProductionDeps(env); + for (const message of batch.messages) { + const job = message.body; + try { + const result = await processBackfillJob(job, deps); + console.log("[aggregator] backfill job complete", { + did: result.did, + collection: result.collection, + enqueued: result.enqueued, + }); + message.ack(); + } catch (err) { + // Resolution failures, listRecords 5xx, timeouts, and pagination + // runaway all land here. Retry — Cloudflare Queues backoff handles + // transient PDS failures; permanently broken DIDs land in the DLQ + // after max_retries. + console.error("[aggregator] backfill job failed, retrying", { + did: job.did, + collection: job.collection, + error: err instanceof Error ? err.message : String(err), + }); + message.retry(); + } + } +} + +function createProductionDeps(env: Env): ProcessBackfillJobDeps { + const composite = new CompositeDidDocumentResolver({ + methods: { + plc: new PlcDidDocumentResolver(), + web: new AtprotoWebDidDocumentResolver(), + }, + }); + return { + resolver: new DidResolver({ + cache: createD1DidDocCache(env.DB), + resolver: composite, + }), + queue: env.RECORDS_QUEUE, + }; +} diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts index a61475b47..e27b6b7c1 100644 --- a/apps/aggregator/src/backfill.ts +++ b/apps/aggregator/src/backfill.ts @@ -1,36 +1,53 @@ /** * Cold-start discovery worker. * - * Operator-triggered (via `POST /_admin/backfill`); takes a list of DIDs, - * calls `com.atproto.repo.listRecords` against each publisher's PDS for every - * collection in `WANTED_COLLECTIONS`, and enqueues each returned record onto - * the existing Records Queue as a `RecordsJob`. The consumer's verification + - * write + idempotency machinery handles the rest — same code path as live - * Jetstream, distinguished only by trigger. + * Operator-triggered (via `POST /_admin/backfill`). Two trigger shapes: * - * Live discovery is Jetstream's job, not this worker's; it picks up new - * publishers automatically. Backfill exists for the cold-start gap (publishers - * who published before the aggregator was listening) and for operator-triggered - * recovery after a known outage. There is deliberately no periodic scheduler - * — see plan §"Why no reconciliation cron". + * - `{ "dids": [...] }` — explicit list, primarily for testing or recovery + * of a known DID set. + * - `{}` (empty body) — production cold-start. Calls + * `com.atproto.sync.listReposByCollection` against the configured relay + * for each NSID in `WANTED_COLLECTIONS`, paginates the full DID set, + * dedupes, and feeds the union into the same backfill loop. + * + * Architecture: the POST handler synchronously discovers DIDs (or accepts an + * explicit list), then fans out one `BackfillJob = { did, collection }` per + * (DID × WANTED_COLLECTIONS) pair onto the dedicated `BACKFILL_QUEUE` via + * `sendBatch`. A separate consumer (`backfill-consumer.ts`) processes one + * pair at a time: resolve PDS, paginate `com.atproto.repo.listRecords`, + * batch-enqueue each returned record onto the existing Records Queue. + * + * Why a separate queue rather than running the per-DID loop inside the + * `ctx.waitUntil` of the POST handler: Cloudflare's hard 30-second + * wall-clock cap on `waitUntil` would limit a single backfill POST to + * ~15–25 DIDs before in-flight work was cancelled. The queue gives us + * automatic retry, concurrency, and per-pair invocation budgets that each + * fit comfortably under the sub-request ceiling. + * + * Live discovery (post-cold-start) is Jetstream's job, not this worker's; + * the consumer writes `known_publishers` opportunistically on any record + * event for an unseen DID. Backfill exists for the cold-start gap + * (publishers who published before the aggregator was listening) and for + * operator-triggered recovery after a known outage. There is deliberately + * no periodic scheduler — see plan §"Why no reconciliation cron". */ import { WANTED_COLLECTIONS } from "./constants.js"; import type { DidResolver } from "./did-resolver.js"; -import type { RecordsJob } from "./env.js"; +import type { BackfillJob, RecordsJob } from "./env.js"; import { isPlainObject } from "./utils.js"; const PAGE_SIZE = 100; /** Per-listRecords-page timeout. A hostile or hung publisher PDS that * accepts the connection but stalls the body would otherwise block the * fetch until workerd's overall sub-request budget exhausts — starving - * every DID after this one in the serial loop. Same shape as + * every later page in the same consumer invocation. Same shape as * `pds-verify.ts`'s fetchCar timeout. */ const LIST_RECORDS_TIMEOUT_MS = 15_000; -/** Cap on listRecords pagination per collection. A buggy or malicious PDS - * that echoes the same cursor would otherwise loop forever inside one - * `ctx.waitUntil`. 1000 pages × 100 records = 100k records per collection, - * which is past anything we'd legitimately backfill in one shot. */ +/** Cap on listRecords pagination per (DID, collection) pair. A buggy or + * malicious PDS that echoes the same cursor would otherwise loop forever + * inside one consumer invocation. 1000 pages × 100 records = 100k records + * per pair, which is past anything we'd legitimately backfill in one shot. */ const MAX_PAGES_PER_COLLECTION = 1000; /** Defensive cap on records per page. Real PDSes honour the `limit` query * param; this guards against a hostile PDS returning an enormous array. @@ -42,7 +59,7 @@ const MAX_RECORDS_PER_PAGE = PAGE_SIZE; /** Cloudflare Queues' hard cap on `sendBatch` size. Per-page enqueues are * always ≤ this thanks to MAX_RECORDS_PER_PAGE; documented here so the * relationship is visible at the call site. */ -const QUEUE_SEND_BATCH_CAP = 100; +export const QUEUE_SEND_BATCH_CAP = 100; // Static guard: bumping MAX_RECORDS_PER_PAGE above the queue's batch cap // would silently break sendBatch in production. Surface the violation at // module load rather than at the first batch send. @@ -51,185 +68,227 @@ if (MAX_RECORDS_PER_PAGE > QUEUE_SEND_BATCH_CAP) { `MAX_RECORDS_PER_PAGE (${MAX_RECORDS_PER_PAGE}) exceeds QUEUE_SEND_BATCH_CAP (${QUEUE_SEND_BATCH_CAP})`, ); } -/** Cap on the total number of records a single backfill invocation may - * enqueue. Defends against a leaked admin token being weaponised to flood - * the queue at near-zero cost to the attacker. The blast radius of one - * compromised POST is bounded by this number × per-message billing. - * - * 50k = ~50 plausible publishers × ~250 records each, well past v1 scale. - * Worker waitUntil exhausts long before this in any honest workload. */ -const MAX_TOTAL_ENQUEUE = 50_000; /** Atproto rkey grammar: ALPHA / DIGIT / "." / "-" / "_" / ":" / "~". */ const RKEY_PATTERN = /^[A-Za-z0-9._:~-]{1,512}$/; -/** Thrown when the per-invocation enqueue cap is reached. Caller catches - * and stops processing further DIDs / collections; partial work already - * committed to the queue is what it is (consumer is idempotent on retry). */ -export class EnqueueLimitReached extends Error { - constructor(public readonly limit: number) { - super(`backfill enqueue cap of ${limit} reached`); - } -} - -/** Producer-side queue surface. The production binding `env.RECORDS_QUEUE` - * satisfies this; tests pass an in-memory implementation. The return type - * is `unknown` rather than `void` so workerd's `Queue.send` (which returns - * a `QueueSendResponse`) is structurally assignable. */ -export interface BackfillQueue { +/** Producer-side records queue surface. The production binding + * `env.RECORDS_QUEUE` satisfies this; tests pass an in-memory implementation. */ +export interface RecordsQueueProducer { sendBatch(messages: ReadonlyArray<{ body: RecordsJob }>): Promise; } -export interface BackfillDeps { - resolver: DidResolver; - queue: BackfillQueue; - /** Inject for tests; defaults to `globalThis.fetch`. */ - fetch?: typeof fetch; - /** Override for the per-listRecords-page timeout. Defaults to - * `LIST_RECORDS_TIMEOUT_MS`. Tests use a small value to exercise the - * abort path without burning the production budget. */ - listRecordsTimeoutMs?: number; - /** Optional callback fired after each DID completes (success or failure). - * Used by the production wiring to log per-DID progress so an operator - * watching `wrangler tail` sees where a long backfill is up to. */ - onDidComplete?: (result: BackfillDidResult) => void; +/** Producer-side backfill-jobs queue surface. The production binding + * `env.BACKFILL_QUEUE` satisfies this; tests pass an in-memory + * implementation. Same shape as `RecordsQueueProducer`, separated so the + * type system catches accidental cross-wiring. */ +export interface BackfillQueueProducer { + sendBatch(messages: ReadonlyArray<{ body: BackfillJob }>): Promise; } -export interface BackfillDidResult { - did: string; - enqueued: number; - /** One entry per failure during this DID's backfill. listRecords failures - * are per-collection; resolution failures abort early. Empty array on - * total success. */ - errors: string[]; +/** Cap on pages walked per relay collection. Same shape as + * MAX_PAGES_PER_COLLECTION but for the discovery side. At 100 repos/page, + * 100 pages = 10k publishers — past anything we'd legitimately discover at + * Slice 1 scale. */ +const MAX_DISCOVERY_PAGES_PER_COLLECTION = 100; +const DISCOVERY_PAGE_SIZE = 100; +const DISCOVERY_TIMEOUT_MS = 15_000; +/** Defensive cap on the union of discovered DIDs. A relay returning a + * runaway list (bug or hostile mirror) would otherwise let one POST fan + * out millions of jobs onto BACKFILL_QUEUE. At Slice 1 scale we expect a + * handful to a few hundred publishers. */ +export const MAX_DISCOVERED_DIDS = 1000; + +/** + * Discover all DIDs publishing any of `WANTED_COLLECTIONS` by querying the + * relay's `com.atproto.sync.listReposByCollection`. Returns the union of + * unique DIDs across all collections, in arbitrary order. + * + * Uses the same defenses as the per-pair listRecords loop: per-page + * timeout, max-page cap, cursor-equality check. Per-collection failures + * are logged and the loop continues — discovery via the relay is + * best-effort; a partial discovery list is better than none. Stops early + * once `MAX_DISCOVERED_DIDS` is reached so a runaway relay can't pump + * arbitrary fan-out into the queue. + */ +export async function discoverDids( + relayUrl: string, + opts: { fetch?: typeof fetch; timeoutMs?: number } = {}, +): Promise { + const fetchImpl = opts.fetch ?? fetch; + const timeoutMs = opts.timeoutMs ?? DISCOVERY_TIMEOUT_MS; + const dids = new Set(); + for (const collection of WANTED_COLLECTIONS) { + try { + await discoverCollection(relayUrl, collection, fetchImpl, timeoutMs, dids); + } catch (err) { + console.error("[aggregator] backfill discovery failed for collection", { + collection, + error: err instanceof Error ? err.message : String(err), + }); + } + if (dids.size >= MAX_DISCOVERED_DIDS) { + console.warn("[aggregator] backfill discovery hit DID cap, stopping early", { + cap: MAX_DISCOVERED_DIDS, + stoppedAfterCollection: collection, + }); + break; + } + } + return [...dids]; } -export interface BackfillSummary { - totalEnqueued: number; - results: BackfillDidResult[]; +async function discoverCollection( + relayUrl: string, + collection: string, + fetchImpl: typeof fetch, + timeoutMs: number, + dids: Set, +): Promise { + let cursor: string | undefined; + let prevCursor: string | undefined; + let pages = 0; + do { + if (++pages > MAX_DISCOVERY_PAGES_PER_COLLECTION) { + throw new Error(`exceeded ${MAX_DISCOVERY_PAGES_PER_COLLECTION} discovery pages`); + } + if (cursor !== undefined && cursor === prevCursor) { + throw new Error("relay returned identical cursor twice"); + } + prevCursor = cursor; + + const url = new URL("/xrpc/com.atproto.sync.listReposByCollection", relayUrl); + url.searchParams.set("collection", collection); + url.searchParams.set("limit", String(DISCOVERY_PAGE_SIZE)); + if (cursor) url.searchParams.set("cursor", cursor); + + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + let response: Response; + try { + response = await fetchImpl(url.toString(), { + headers: { accept: "application/json" }, + signal: controller.signal, + }); + } catch (err) { + if (err instanceof Error && err.name === "AbortError") { + throw new Error(`listReposByCollection timed out after ${timeoutMs}ms`, { cause: err }); + } + throw err; + } finally { + clearTimeout(timer); + } + if (!response.ok) { + throw new Error(`listReposByCollection returned ${response.status}`); + } + const body: unknown = await response.json(); + if (!isPlainObject(body)) throw new Error("listReposByCollection returned non-object body"); + const repos = body["repos"]; + if (!Array.isArray(repos)) { + throw new Error("listReposByCollection response missing `repos` array"); + } + for (const repo of repos) { + if (!isPlainObject(repo)) continue; + const did = repo["did"]; + if (typeof did === "string") { + dids.add(did); + if (dids.size >= MAX_DISCOVERED_DIDS) return; + } + } + const nextCursor = body["cursor"]; + cursor = typeof nextCursor === "string" && nextCursor.length > 0 ? nextCursor : undefined; + } while (cursor); } /** - * Backfill multiple DIDs serially. Per-DID failures don't stop the loop — - * one bad publisher doesn't block the rest. + * Fan out backfill work for a list of DIDs onto `BACKFILL_QUEUE`. Produces + * one `BackfillJob` per (DID × WANTED_COLLECTIONS) pair, sent in batches + * of up to `QUEUE_SEND_BATCH_CAP` to honour Cloudflare Queues' sendBatch + * limit. Returns the total number of jobs enqueued. * - * Aborts the loop early if the per-invocation enqueue cap is reached. The - * partial summary still reflects what got through; the operator can re-run - * with the unfinished tail of the DID list. + * Caller is responsible for capping `dids.length` (admin route enforces + * `MAX_BACKFILL_DIDS` for the explicit path; `discoverDids` enforces + * `MAX_DISCOVERED_DIDS` for the discovery path). This function trusts its + * input and just fans out — the per-call work is bounded by + * `dids.length * WANTED_COLLECTIONS.length` enqueues. */ -export async function backfillDids( +export async function enqueueBackfillJobs( dids: readonly string[], - deps: BackfillDeps, -): Promise { - const results: BackfillDidResult[] = []; - const budget: EnqueueBudget = { remaining: MAX_TOTAL_ENQUEUE }; + queue: BackfillQueueProducer, +): Promise { + const messages: { body: BackfillJob }[] = []; for (const did of dids) { - const result = await backfillDid(did, deps, budget); - results.push(result); - deps.onDidComplete?.(result); - if (budget.remaining <= 0) { - console.warn("[aggregator] backfill enqueue cap reached, aborting remaining DIDs", { - cap: MAX_TOTAL_ENQUEUE, - triggerDid: result.did, - didsProcessed: results.length, - didsRemaining: dids.length - results.length, - }); - break; + for (const collection of WANTED_COLLECTIONS) { + messages.push({ body: { did, collection } }); } } - const totalEnqueued = results.reduce((sum, r) => sum + r.enqueued, 0); - return { totalEnqueued, results }; + for (let i = 0; i < messages.length; i += QUEUE_SEND_BATCH_CAP) { + await queue.sendBatch(messages.slice(i, i + QUEUE_SEND_BATCH_CAP)); + } + return messages.length; +} + +export interface ProcessBackfillJobDeps { + resolver: DidResolver; + queue: RecordsQueueProducer; + /** Inject for tests; defaults to `globalThis.fetch`. */ + fetch?: typeof fetch; + /** Override for the per-listRecords-page timeout. Defaults to + * `LIST_RECORDS_TIMEOUT_MS`. Tests use a small value to exercise the + * abort path without burning the production budget. */ + listRecordsTimeoutMs?: number; } -/** Mutable counter shared across the per-DID / per-collection loops so the - * cap is global to one backfill invocation. */ -interface EnqueueBudget { - remaining: number; +export interface ProcessBackfillJobResult { + did: string; + collection: string; + enqueued: number; } /** - * Backfill a single DID across every collection in `WANTED_COLLECTIONS`. - * Resolution failure aborts (we can't enqueue verifiable jobs without - * knowing the PDS); per-collection listRecords failures are recorded and - * the loop continues to the next collection. + * Process one (DID, collection) pair: resolve the DID's PDS, paginate + * `com.atproto.repo.listRecords` for the collection, and batch-enqueue + * each returned record onto the records queue. * - * Optional `budget` shares the per-invocation enqueue cap across DIDs. - * Tests that call this directly without a budget get an effectively - * unbounded one. + * Throws on any failure (resolution, listRecords status, pagination + * runaway). The queue consumer translates a thrown result into + * `message.retry()`; messages that exhaust max_retries land in the + * backfill DLQ for the operator to inspect. + * + * 404 from the PDS on the first page is treated as "publisher does not + * host this collection" and returns 0 enqueues without throwing — same + * shape as the previous serial-loop semantics. */ -export async function backfillDid( - did: string, - deps: BackfillDeps, - budget: EnqueueBudget = { remaining: MAX_TOTAL_ENQUEUE }, -): Promise { - const errors: string[] = []; - let enqueued = 0; - - // resolve() writes to known_publishers as a side-effect of the cache - // upsert, so this also registers the publisher for any future code path - // that iterates that table. - let pds: string; - try { - const resolved = await deps.resolver.resolve(did); - pds = resolved.pds; - } catch (err) { - errors.push(`resolve failed: ${err instanceof Error ? err.message : String(err)}`); - return { did, enqueued, errors }; - } - +export async function processBackfillJob( + job: BackfillJob, + deps: ProcessBackfillJobDeps, +): Promise { + const resolved = await deps.resolver.resolve(job.did); const fetchImpl = deps.fetch ?? fetch; const timeoutMs = deps.listRecordsTimeoutMs ?? LIST_RECORDS_TIMEOUT_MS; - for (const collection of WANTED_COLLECTIONS) { - // Track this collection's enqueues separately so a mid-pagination - // throw still reports the partial count instead of swallowing the - // records that already landed in the queue. - const before = budget.remaining; - try { - await backfillCollection({ - did, - pds, - collection, - queue: deps.queue, - fetchImpl, - timeoutMs, - budget, - }); - } catch (err) { - if (err instanceof EnqueueLimitReached) { - // Stop processing further collections for this DID; outer - // loop will catch the budget exhaustion and stop entirely. - // Append a sentinel error so the operator's per-DID log - // shows the cap fired (otherwise `enqueued: 47, errors: []` - // looks like a successful 47-record DID). - enqueued += before - budget.remaining; - errors.push( - `enqueue cap reached mid-DID at collection ${collection} — partial backfill, re-run to complete`, - ); - return { did, enqueued, errors }; - } - errors.push(`${collection}: ${err instanceof Error ? err.message : String(err)}`); - } - enqueued += before - budget.remaining; - } - - return { did, enqueued, errors }; + const enqueued = await paginateAndEnqueue({ + did: job.did, + pds: resolved.pds, + collection: job.collection, + queue: deps.queue, + fetchImpl, + timeoutMs, + }); + return { did: job.did, collection: job.collection, enqueued }; } -interface BackfillCollectionOpts { +interface PaginateOpts { did: string; pds: string; collection: string; - queue: BackfillQueue; + queue: RecordsQueueProducer; fetchImpl: typeof fetch; timeoutMs: number; - budget: EnqueueBudget; } /** * Walk one DID's records for a single collection, paginating through * `listRecords` and enqueuing each result via `sendBatch` (one batch per - * page). Decrements `opts.budget.remaining` per record enqueued; throws - * `EnqueueLimitReached` when the budget hits zero so the caller can stop - * processing further collections / DIDs. + * page). Returns the total records enqueued. * * 404 from the PDS on the FIRST page means the repo doesn't host this * collection — silently treated as zero records, not an error. A 404 @@ -244,12 +303,12 @@ interface BackfillCollectionOpts { * `?limit=` query and returns more records than that throws — we'd rather * surface the spec violation than silently chunk and hide the upstream bug. */ -async function backfillCollection(opts: BackfillCollectionOpts): Promise { +async function paginateAndEnqueue(opts: PaginateOpts): Promise { let cursor: string | undefined; let prevCursor: string | undefined; let pages = 0; + let totalEnqueued = 0; do { - if (opts.budget.remaining <= 0) throw new EnqueueLimitReached(MAX_TOTAL_ENQUEUE); if (++pages > MAX_PAGES_PER_COLLECTION) { throw new Error(`exceeded ${MAX_PAGES_PER_COLLECTION} pages`); } @@ -283,7 +342,7 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise { if (response.status === 404) { if (cursor === undefined) { // First-page 404: publisher has no records of this collection. - return; + return totalEnqueued; } // Mid-pagination 404 is a partial failure; surface it. throw new Error(`listRecords returned 404 mid-pagination at cursor=${cursor}`); @@ -303,7 +362,6 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise { const messages: { body: RecordsJob }[] = []; for (const record of records) { - if (opts.budget.remaining - messages.length <= 0) break; const rkey = parseRkeyFromUri(record.uri, opts.collection); if (!rkey) continue; messages.push({ @@ -321,10 +379,10 @@ async function backfillCollection(opts: BackfillCollectionOpts): Promise { // the static assertion above, so this sendBatch never exceeds // Cloudflare's 100-message limit by construction. await opts.queue.sendBatch(messages); - opts.budget.remaining -= messages.length; + totalEnqueued += messages.length; } - if (opts.budget.remaining <= 0) throw new EnqueueLimitReached(MAX_TOTAL_ENQUEUE); } while (cursor); + return totalEnqueued; } interface ListRecordEntry { diff --git a/apps/aggregator/src/env.ts b/apps/aggregator/src/env.ts index 3053c6d33..f85caadb5 100644 --- a/apps/aggregator/src/env.ts +++ b/apps/aggregator/src/env.ts @@ -20,3 +20,18 @@ export interface RecordsJob { */ jetstreamRecord?: unknown; } + +/** + * One unit of cold-start backfill work: walk every record under `collection` + * for `did` via that DID's PDS, then enqueue each record onto the records + * queue for the standard verify-and-write path. + * + * Granularity is (DID, collection) rather than per-DID because the per-DID + * fan-out can exceed Cloudflare's 30s `ctx.waitUntil` wall-clock cap. One + * collection's worth of pagination fits comfortably under that ceiling and + * gets queue-level retry + concurrency for free. + */ +export interface BackfillJob { + did: string; + collection: string; +} diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts index a3f42d8df..f238c6974 100644 --- a/apps/aggregator/src/index.ts +++ b/apps/aggregator/src/index.ts @@ -15,21 +15,16 @@ * Worker boots. */ -import { - AtprotoWebDidDocumentResolver, - CompositeDidDocumentResolver, - PlcDidDocumentResolver, -} from "@atcute/identity-resolver"; - -import { backfillDids } from "./backfill.js"; -import { createD1DidDocCache, DidResolver } from "./did-resolver.js"; -import type { RecordsJob } from "./env.js"; +import { processBackfillBatch } from "./backfill-consumer.js"; +import { discoverDids, enqueueBackfillJobs } from "./backfill.js"; +import type { BackfillJob, RecordsJob } from "./env.js"; import { drainDeadLetterBatch, processBatch } from "./records-consumer.js"; import { RECORDS_DO_NAME } from "./records-do.js"; import { isPlainObject } from "./utils.js"; const RECORDS_QUEUE_NAME = "emdash-aggregator-records"; const RECORDS_DLQ_NAME = "emdash-aggregator-records-dlq"; +const BACKFILL_QUEUE_NAME = "emdash-aggregator-backfill"; export { RecordsJetstreamDO } from "./records-do.js"; @@ -43,7 +38,11 @@ export { RecordsJetstreamDO } from "./records-do.js"; * https://api.emdashcms.com/_admin/start * * `/_admin/start` spins up the Records DO (idempotent on an already-running DO). - * `/_admin/backfill` triggers cold-start discovery against an explicit DID list. + * `/_admin/backfill` discovers publishers (or accepts an explicit DID list) + * and fans (DID, collection) jobs onto `BACKFILL_QUEUE`; the consumer in + * `backfill-consumer.ts` does the per-pair listRecords + records-queue + * fan-out work asynchronously, getting queue retry + concurrency for free + * and sidestepping the 30s `waitUntil` cap on the POST handler. * * Auth-gating both routes: backfill specifically introduces caller-chosen * URLs into the worker's outbound fetches (via `did:web` resolution + the @@ -54,7 +53,18 @@ export { RecordsJetstreamDO } from "./records-do.js"; const BOOTSTRAP_PATH = "/_admin/start"; const BACKFILL_PATH = "/_admin/backfill"; -const MAX_BACKFILL_DIDS = 1000; +/** + * Cap on the explicit DID list a single POST may submit. Lower than the + * old serial-loop cap (was 1000) because the queue-fan-out path amplifies + * a leaked-token attack: each submitted DID becomes + * `WANTED_COLLECTIONS.length` queue messages, each consuming a consumer + * invocation with its own outbound PDS fetches. 100 × 4 = 400 jobs is + * still a meaningful operator-recovery batch but a tractable blast radius. + * + * Discovery-mode submissions are bounded separately by + * `MAX_DISCOVERED_DIDS` inside `discoverDids`. + */ +const MAX_BACKFILL_DIDS = 100; const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/; /** @@ -111,16 +121,28 @@ function requireAdminAuth(request: Request, env: Env): Response | null { return null; } -function parseBackfillBody(body: unknown): { dids: string[] } | { error: string } { +type BackfillRequest = { mode: "explicit"; dids: string[] } | { mode: "discover" }; + +function parseBackfillBody(body: unknown): BackfillRequest | { error: string } { if (!isPlainObject(body)) { return { error: "request body must be a JSON object" }; } const rawDids = body["dids"]; + // Empty body OR `{ "dids": null }` OR `{ "dids": undefined }` ⇒ discovery + // mode. Production cold-start uses this path; the explicit list is the + // testing / recovery seam. + if (rawDids === undefined || rawDids === null) { + return { mode: "discover" }; + } if (!Array.isArray(rawDids)) { - return { error: "`dids` must be an array of DID strings" }; + return { + error: "`dids` must be an array of DID strings, or omitted to discover via the relay", + }; } if (rawDids.length === 0) { - return { error: "`dids` must not be empty" }; + return { + error: "`dids` must not be empty (omit the field to discover via the relay)", + }; } if (rawDids.length > MAX_BACKFILL_DIDS) { return { @@ -136,7 +158,7 @@ function parseBackfillBody(body: unknown): { dids: string[] } | { error: string } // Set iteration order matches insertion; dedup preserves first-seen order // so the operator sees jobs run in the order they submitted. - return { dids: [...seen] }; + return { mode: "explicit", dids: [...seen] }; } export default { @@ -168,11 +190,19 @@ export default { } const denied = requireAdminAuth(request, env); if (denied) return denied; - let body: unknown; - try { - body = await request.json(); - } catch { - return new Response("request body must be valid JSON", { status: 400 }); + // Empty / no body is the production discovery path. JSON-parse + // failures with no content (Content-Length: 0) come back as + // SyntaxError; we treat that as "discover" rather than 400ing + // so `curl -X POST ... /_admin/backfill` (no body) does the + // expected thing. + let body: unknown = {}; + const text = await request.text(); + if (text.length > 0) { + try { + body = JSON.parse(text); + } catch { + return new Response("request body must be valid JSON", { status: 400 }); + } } const parsed = parseBackfillBody(body); if ("error" in parsed) { @@ -183,7 +213,7 @@ export default { // via Workers logs and the resulting D1 writes; the response body // just confirms the trigger landed. Per-DID errors are logged from // inside backfillDids; callers don't get them in the response. - ctx.waitUntil(runBackfill(parsed.dids, env)); + ctx.waitUntil(runBackfill(parsed, env)); return new Response(null, { status: 202 }); } return new Response("emdash-aggregator: not yet implemented", { @@ -192,15 +222,24 @@ export default { }); }, - async queue(batch: MessageBatch, env: Env, _ctx: ExecutionContext): Promise { - // Workerd routes both consumers (records + records-dlq) here; dispatch - // by queue name. Adding a third queue requires updating this switch. + async queue(batch: MessageBatch, env: Env, _ctx: ExecutionContext): Promise { + // Workerd routes every consumer here; dispatch by queue name to the + // matching typed handler. The parameter is unparameterised + // (`MessageBatch` defaults to `MessageBatch`) because the + // binding is shared across queues — narrowing happens per-case via + // the queue name, which is a runtime tag the compiler can't see. switch (batch.queue) { case RECORDS_QUEUE_NAME: - await processBatch(batch, env); + // eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name + await processBatch(batch as MessageBatch, env); return; case RECORDS_DLQ_NAME: - await drainDeadLetterBatch(batch, env); + // eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name + await drainDeadLetterBatch(batch as MessageBatch, env); + return; + case BACKFILL_QUEUE_NAME: + // eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name + await processBackfillBatch(batch as MessageBatch, env); return; default: console.error("[aggregator] unknown queue, acking batch", { queue: batch.queue }); @@ -221,49 +260,64 @@ export default { }, }; +type BackfillRequestParsed = { mode: "explicit"; dids: string[] } | { mode: "discover" }; + /** - * Production wiring for the backfill worker. Constructs a DID resolver - * pointed at the live PLC + atproto-web methods, and points the queue at - * the production Records Queue binding. Per-DID progress is logged as the - * loop runs so an operator watching `wrangler tail` sees live progress and - * can tell where a backfill stopped if waitUntil exhausts mid-loop. + * Backfill orchestrator. Runs inside the POST handler's `ctx.waitUntil`. + * Two paths: + * + * - "discover" — call `com.atproto.sync.listReposByCollection` against + * `env.RELAY_URL` for each NSID in WANTED_COLLECTIONS, union the DIDs, + * then fan them out as backfill jobs. + * - "explicit" — operator-supplied DID list, used for testing or for + * recovery of a known DID set. + * + * In both cases the actual per-(DID, collection) work runs in the + * `BACKFILL_QUEUE` consumer (see `backfill-consumer.ts`). This orchestrator + * just discovers + enqueues — both fast operations that fit well within + * Cloudflare's 30s `waitUntil` ceiling regardless of fan-out size, even + * after a `MAX_DISCOVERED_DIDS`-sized union. + * + * Per-pair progress is logged from the consumer; this function only logs + * the discovery + fan-out result so an operator watching `wrangler tail` + * sees how many jobs were enqueued. */ -async function runBackfill(dids: readonly string[], env: Env): Promise { - console.log("[aggregator] backfill starting", { didCount: dids.length }); +async function runBackfill(req: BackfillRequestParsed, env: Env): Promise { try { - const composite = new CompositeDidDocumentResolver({ - methods: { - plc: new PlcDidDocumentResolver(), - web: new AtprotoWebDidDocumentResolver(), - }, - }); - const resolver = new DidResolver({ - cache: createD1DidDocCache(env.DB), - resolver: composite, - }); - const summary = await backfillDids(dids, { - resolver, - queue: env.RECORDS_QUEUE, - onDidComplete: (result) => { - console.log("[aggregator] backfill did", { - did: result.did, - enqueued: result.enqueued, - errors: result.errors, - }); - }, - }); - console.log("[aggregator] backfill complete", { - didsRequested: dids.length, - // Separate from didsRequested so the early-abort case (enqueue - // cap fired) is visible in the summary log without the operator - // having to correlate against an earlier warn line. - didsProcessed: summary.results.length, - totalEnqueued: summary.totalEnqueued, - didsWithErrors: summary.results.filter((r) => r.errors.length > 0).length, + let dids: readonly string[]; + if (req.mode === "discover") { + console.log("[aggregator] backfill discovery starting", { + relay: env.RELAY_URL, + }); + dids = await discoverDids(env.RELAY_URL); + console.log("[aggregator] backfill discovery complete", { + relay: env.RELAY_URL, + didCount: dids.length, + }); + } else { + dids = req.dids; + console.log("[aggregator] backfill enqueue starting", { + mode: "explicit", + didCount: dids.length, + }); + } + + if (dids.length === 0) { + console.warn("[aggregator] backfill produced zero DIDs, nothing to enqueue", { + mode: req.mode, + }); + return; + } + + const enqueued = await enqueueBackfillJobs(dids, env.BACKFILL_QUEUE); + console.log("[aggregator] backfill enqueue complete", { + mode: req.mode, + didCount: dids.length, + jobsEnqueued: enqueued, }); } catch (err) { console.error("[aggregator] backfill aborted", { - didCount: dids.length, + mode: req.mode, error: err instanceof Error ? (err.stack ?? err.message) : String(err), }); } diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts index 14e93fc4b..5dffcf168 100644 --- a/apps/aggregator/test/backfill.test.ts +++ b/apps/aggregator/test/backfill.test.ts @@ -1,9 +1,18 @@ /** * Backfill worker tests. * - * The worker is plain `fetch` + `queue.send`; tests stub both. The DID + * The worker is plain `fetch` + queue producers; tests stub both. The DID * resolver is also stubbed (in-memory cache + a simple stub upstream) so the * tests don't depend on the workers test pool or live PLC. + * + * Architecture under test (post-restructure): + * - `enqueueBackfillJobs`: synchronous fan-out of (DID × WANTED_COLLECTIONS) + * pairs onto BACKFILL_QUEUE, batched at QUEUE_SEND_BATCH_CAP. + * - `processBackfillJob`: per-pair worker. Resolve PDS, paginate + * `com.atproto.repo.listRecords`, batch-enqueue records onto RECORDS_QUEUE. + * - `processBackfillBatch`: queue consumer that calls `processBackfillJob` + * and translates throws to `message.retry()`. + * - `discoverDids`: relay enumeration via `com.atproto.sync.listReposByCollection`. */ import { P256PrivateKeyExportable } from "@atcute/crypto"; @@ -12,7 +21,16 @@ import type { Did } from "@atcute/lexicons/syntax"; import { applyD1Migrations, env, SELF } from "cloudflare:test"; import { beforeAll, beforeEach, describe, expect, it } from "vitest"; -import { backfillDid, backfillDids, type BackfillQueue } from "../src/backfill.js"; +import { processBackfillBatch } from "../src/backfill-consumer.js"; +import { + type BackfillQueueProducer, + discoverDids, + enqueueBackfillJobs, + MAX_DISCOVERED_DIDS, + processBackfillJob, + QUEUE_SEND_BATCH_CAP, + type RecordsQueueProducer, +} from "../src/backfill.js"; import { WANTED_COLLECTIONS } from "../src/constants.js"; import { type CachedDidDoc, @@ -20,7 +38,8 @@ import { type DidDocumentResolverLike, DidResolver, } from "../src/did-resolver.js"; -import type { RecordsJob } from "../src/env.js"; +import type { BackfillJob, RecordsJob } from "../src/env.js"; +import type { MessageController } from "../src/records-consumer.js"; interface TestEnv { DB: D1Database; @@ -44,7 +63,7 @@ beforeEach(async () => { await testEnv.DB.prepare("DELETE FROM known_publishers").run(); }); -class CapturingQueue implements BackfillQueue { +class CapturingRecordsQueue implements RecordsQueueProducer { readonly sent: RecordsJob[] = []; sendBatch(messages: ReadonlyArray<{ body: RecordsJob }>): Promise { for (const m of messages) this.sent.push(m.body); @@ -52,6 +71,16 @@ class CapturingQueue implements BackfillQueue { } } +class CapturingBackfillQueue implements BackfillQueueProducer { + readonly sent: BackfillJob[] = []; + readonly batches: number[] = []; + sendBatch(messages: ReadonlyArray<{ body: BackfillJob }>): Promise { + this.batches.push(messages.length); + for (const m of messages) this.sent.push(m.body); + return Promise.resolve(); + } +} + class MapDidDocCache implements DidDocCache { private readonly entries = new Map(); read(did: string): Promise { @@ -68,6 +97,18 @@ class MapDidDocCache implements DidDocCache { } } +class FakeMessage implements MessageController { + acked = 0; + retried = 0; + constructor(readonly body: T) {} + ack() { + this.acked += 1; + } + retry() { + this.retried += 1; + } +} + function buildResolver(): DidResolver { const cache = new MapDidDocCache(); const resolver: DidDocumentResolverLike = { @@ -136,27 +177,34 @@ function makeFetch( }; } -describe("backfillDid", () => { +describe("processBackfillJob", () => { it("enqueues each listRecords result as a RecordsJob", async () => { - const queue = new CapturingQueue(); + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); const fetchImpl = makeFetch({ - [WANTED_COLLECTIONS[0]]: [ + [collection]: [ { - uri: `at://${DID_A}/${WANTED_COLLECTIONS[0]}/demo`, + uri: `at://${DID_A}/${collection}/demo`, cid: "bafyc1", value: { foo: "bar" }, }, ], }); - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - expect(result.errors).toEqual([]); + const result = await processBackfillJob( + { did: DID_A, collection }, + { resolver, queue, fetch: fetchImpl }, + ); + expect(result.enqueued).toBe(1); + expect(result.did).toBe(DID_A); + expect(result.collection).toBe(collection); expect(queue.sent).toHaveLength(1); expect(queue.sent[0]).toMatchObject({ did: DID_A, - collection: WANTED_COLLECTIONS[0], + collection, rkey: "demo", operation: "create", cid: "bafyc1", @@ -167,98 +215,58 @@ describe("backfillDid", () => { expect(queue.sent[0]?.jetstreamRecord).toBeUndefined(); }); - it("walks every collection in WANTED_COLLECTIONS", async () => { - const queue = new CapturingQueue(); - const resolver = buildResolver(); - const records: Record = {}; - for (let i = 0; i < WANTED_COLLECTIONS.length; i++) { - const c = WANTED_COLLECTIONS[i]; - if (c) { - records[c] = [{ uri: `at://${DID_A}/${c}/r${i}`, cid: `bafyc${i}`, value: {} }]; - } - } - const fetchImpl = makeFetch(records); - - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - - expect(result.enqueued).toBe(WANTED_COLLECTIONS.length); - const collectionsHit = new Set(queue.sent.map((j) => j.collection)); - expect(collectionsHit.size).toBe(WANTED_COLLECTIONS.length); - }); - it("treats 404 from the PDS as 'no records of this collection', not an error", async () => { - const queue = new CapturingQueue(); + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); - const fetchImpl = makeFetch( - {}, - { - status: WANTED_COLLECTIONS.reduce>((acc, c) => { - acc[c] = 404; - return acc; - }, {}), - }, - ); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl = makeFetch({}, { status: { [collection]: 404 } }); - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + const result = await processBackfillJob( + { did: DID_A, collection }, + { resolver, queue, fetch: fetchImpl }, + ); - expect(result.errors).toEqual([]); expect(result.enqueued).toBe(0); expect(queue.sent).toHaveLength(0); }); - it("records non-404 PDS errors per collection, continues to the next collection", async () => { - const queue = new CapturingQueue(); + it("throws on non-404 PDS errors so the consumer can retry", async () => { + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); - const firstCollection = WANTED_COLLECTIONS[0]; - const secondCollection = WANTED_COLLECTIONS[1]; - if (!firstCollection || !secondCollection) throw new Error("test assumes ≥2 collections"); - const fetchImpl = makeFetch( - { - [secondCollection]: [ - { - uri: `at://${DID_A}/${secondCollection}/r1`, - cid: "bafyc", - value: {}, - }, - ], - }, - { status: { [firstCollection]: 503 } }, - ); - - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl = makeFetch({}, { status: { [collection]: 503 } }); - expect(result.errors).toHaveLength(1); - expect(result.errors[0]).toContain("503"); - expect(result.errors[0]).toContain(firstCollection); - expect(result.enqueued).toBe(1); // the second collection still ran + await expect( + processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }), + ).rejects.toThrow(/503/); }); - it("aborts the DID early if the resolver throws (can't enqueue without PDS)", async () => { - const queue = new CapturingQueue(); + it("throws when the resolver fails (queue consumer translates to retry)", async () => { + const queue = new CapturingRecordsQueue(); const resolver = new DidResolver({ cache: new MapDidDocCache(), resolver: { resolve: () => Promise.reject(new Error("PLC unreachable")), }, }); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); const fetchImpl = makeFetch({}); - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - - expect(result.enqueued).toBe(0); + await expect( + processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }), + ).rejects.toThrow(/PLC unreachable/); expect(queue.sent).toHaveLength(0); - expect(result.errors).toHaveLength(1); - expect(result.errors[0]).toMatch(/resolve failed.*PLC unreachable/); }); it("paginates listRecords via cursor", async () => { - const queue = new CapturingQueue(); + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); const collection = WANTED_COLLECTIONS[0]; if (!collection) throw new Error("test assumes ≥1 collection"); - // Fetch returns page 1 (cursor "p2") then page 2 (no cursor) for the - // first collection; empty pages for the others. let calls = 0; const fetchImpl: typeof fetch = async (input) => { const url = @@ -267,9 +275,6 @@ describe("backfillDid", () => { : input instanceof URL ? input : new URL(input.url); - if (url.searchParams.get("collection") !== collection) { - return new Response(JSON.stringify({ records: [] }), { status: 200 }); - } calls += 1; const cursor = url.searchParams.get("cursor"); if (!cursor) { @@ -289,16 +294,18 @@ describe("backfillDid", () => { ); }; - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + const result = await processBackfillJob( + { did: DID_A, collection }, + { resolver, queue, fetch: fetchImpl }, + ); expect(calls).toBe(2); - const collectionJobs = queue.sent.filter((j) => j.collection === collection); - expect(collectionJobs.map((j) => j.rkey)).toEqual(["p1", "p2"]); - expect(result.errors).toEqual([]); + expect(queue.sent.map((j) => j.rkey)).toEqual(["p1", "p2"]); + expect(result.enqueued).toBe(2); }); it("skips records whose URI doesn't match the expected collection (defensive)", async () => { - const queue = new CapturingQueue(); + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); const collection = WANTED_COLLECTIONS[0]; if (!collection) throw new Error("test assumes ≥1 collection"); @@ -311,110 +318,44 @@ describe("backfillDid", () => { { uri: `at://${DID_A}/${collection}/`, cid: "c3", value: {} }, ], }); - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - const sentForCollection = queue.sent.filter((j) => j.collection === collection); - expect(sentForCollection.map((j) => j.rkey)).toEqual(["legit"]); + const result = await processBackfillJob( + { did: DID_A, collection }, + { resolver, queue, fetch: fetchImpl }, + ); + + expect(queue.sent.map((j) => j.rkey)).toEqual(["legit"]); expect(result.enqueued).toBe(1); }); -}); -describe("backfillDids", () => { - it("processes multiple DIDs serially and aggregates the summary", async () => { - const queue = new CapturingQueue(); + it("rejects records with malformed rkey (atproto rkey grammar violation)", async () => { + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); const collection = WANTED_COLLECTIONS[0]; if (!collection) throw new Error("test assumes ≥1 collection"); - const fetchImpl: typeof fetch = async (input) => { - const url = - typeof input === "string" - ? new URL(input) - : input instanceof URL - ? input - : new URL(input.url); - if (url.searchParams.get("collection") !== collection) { - return new Response(JSON.stringify({ records: [] }), { status: 200 }); - } - const did = url.searchParams.get("repo") ?? ""; - return new Response( - JSON.stringify({ - records: [{ uri: `at://${did}/${collection}/x`, cid: "c", value: {} }], - }), - { status: 200, headers: { "content-type": "application/json" } }, - ); - }; - - const summary = await backfillDids([DID_A, DID_B], { - resolver, - queue, - fetch: fetchImpl, - }); - - expect(summary.totalEnqueued).toBe(2); - expect(summary.results).toHaveLength(2); - expect(summary.results.map((r) => r.did)).toEqual([DID_A, DID_B]); - expect(summary.results.every((r) => r.errors.length === 0)).toBe(true); - }); - - it("doesn't let one DID's failure stop the others", async () => { - const queue = new CapturingQueue(); - // Resolver succeeds for DID_A, fails for DID_B (or any other). - const cache = new MapDidDocCache(); - const resolver = new DidResolver({ - cache, - resolver: { - resolve: (did) => { - if (did === DID_A) { - return Promise.resolve({ - id: did as `did:${string}:${string}`, - verificationMethod: [ - { - id: `${did}#atproto`, - type: "Multikey", - controller: did as `did:${string}:${string}`, - publicKeyMultibase: signingKeyMultibase, - }, - ], - service: [ - { - id: "#atproto_pds", - type: "AtprotoPersonalDataServer", - serviceEndpoint: PDS, - }, - ], - }); - } - return Promise.reject(new Error("DID_B unresolvable")); - }, - }, - ttlMs: 1_000_000, - now: () => new Date(), - }); - const collection = WANTED_COLLECTIONS[0]; - if (!collection) throw new Error("test assumes ≥1 collection"); const fetchImpl = makeFetch({ - [collection]: [{ uri: `at://${DID_A}/${collection}/x`, cid: "c", value: {} }], - }); - - const summary = await backfillDids([DID_B, DID_A], { - resolver, - queue, - fetch: fetchImpl, + [collection]: [ + { uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} }, + { uri: `at://${DID_A}/${collection}/has?queryparam`, cid: "c2", value: {} }, + { uri: `at://${DID_A}/${collection}/has#fragment`, cid: "c3", value: {} }, + { uri: `at://${DID_A}/${collection}/has space`, cid: "c4", value: {} }, + ], }); - expect(summary.totalEnqueued).toBe(1); - expect(summary.results).toHaveLength(2); - const bResult = summary.results.find((r) => r.did === DID_B); - const aResult = summary.results.find((r) => r.did === DID_A); - expect(bResult?.errors.length).toBe(1); - expect(aResult?.errors).toEqual([]); - expect(aResult?.enqueued).toBe(1); + const result = await processBackfillJob( + { did: DID_A, collection }, + { resolver, queue, fetch: fetchImpl }, + ); + expect(queue.sent.map((j) => j.rkey)).toEqual(["legit"]); + expect(result.enqueued).toBe(1); }); it("end-to-end against the production D1 cache: DID is registered in known_publishers", async () => { - const queue = new CapturingQueue(); + const queue = new CapturingRecordsQueue(); const { createD1DidDocCache } = await import("../src/did-resolver.js"); const cache = createD1DidDocCache(testEnv.DB); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); const resolver = new DidResolver({ cache, resolver: { @@ -441,7 +382,7 @@ describe("backfillDids", () => { }); const fetchImpl = makeFetch({}); - await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); + await processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }); const row = await testEnv.DB.prepare( `SELECT did, pds, signing_key, signing_key_id FROM known_publishers WHERE did = ?`, @@ -453,25 +394,16 @@ describe("backfillDids", () => { }); }); -describe("backfillCollection: defenses against malicious / buggy PDS", () => { +describe("processBackfillJob: defenses against malicious / buggy PDS", () => { it("aborts after MAX_PAGES_PER_COLLECTION when cursor never empties", async () => { - const queue = new CapturingQueue(); + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); const collection = WANTED_COLLECTIONS[0]; if (!collection) throw new Error("test assumes ≥1 collection"); // Hostile PDS: returns a different non-empty cursor every call so the // cursor-equality check doesn't fire — only the page cap stops us. let counter = 0; - const fetchImpl: typeof fetch = async (input) => { - const url = - typeof input === "string" - ? new URL(input) - : input instanceof URL - ? input - : new URL(input.url); - if (url.searchParams.get("collection") !== collection) { - return new Response(JSON.stringify({ records: [] }), { status: 200 }); - } + const fetchImpl: typeof fetch = async () => { counter += 1; return new Response(JSON.stringify({ records: [], cursor: `cursor-${counter}` }), { status: 200, @@ -479,32 +411,21 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => { }); }; - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - - expect(result.errors.length).toBeGreaterThanOrEqual(1); - expect(result.errors.some((e) => e.includes("exceeded"))).toBe(true); + await expect( + processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }), + ).rejects.toThrow(/exceeded/); // Loop ran at most MAX_PAGES_PER_COLLECTION times. - expect(counter).toBeLessThanOrEqual(1001); // +1 for the throw-on-the-next iteration + expect(counter).toBeLessThanOrEqual(1001); }); it("aborts when the PDS returns the identical cursor twice", async () => { - const queue = new CapturingQueue(); + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); const collection = WANTED_COLLECTIONS[0]; if (!collection) throw new Error("test assumes ≥1 collection"); - // Buggy PDS: echoes the cursor we sent. Cursor-equality should fire - // on the second iteration. + // Buggy PDS: echoes the cursor we sent. let calls = 0; - const fetchImpl: typeof fetch = async (input) => { - const url = - typeof input === "string" - ? new URL(input) - : input instanceof URL - ? input - : new URL(input.url); - if (url.searchParams.get("collection") !== collection) { - return new Response(JSON.stringify({ records: [] }), { status: 200 }); - } + const fetchImpl: typeof fetch = async () => { calls += 1; return new Response(JSON.stringify({ records: [], cursor: "stuck" }), { status: 200, @@ -512,14 +433,14 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => { }); }; - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - - expect(result.errors.some((e) => e.includes("identical cursor"))).toBe(true); + await expect( + processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }), + ).rejects.toThrow(/identical cursor/); expect(calls).toBe(2); // first page, then second page caught the dupe }); - it("treats 404 mid-pagination as a partial failure (not silent zero)", async () => { - const queue = new CapturingQueue(); + it("treats 404 mid-pagination as a partial failure (throws)", async () => { + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); const collection = WANTED_COLLECTIONS[0]; if (!collection) throw new Error("test assumes ≥1 collection"); @@ -530,12 +451,8 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => { : input instanceof URL ? input : new URL(input.url); - if (url.searchParams.get("collection") !== collection) { - return new Response(JSON.stringify({ records: [] }), { status: 200 }); - } const cursor = url.searchParams.get("cursor"); if (!cursor) { - // First page succeeds with a cursor. return new Response( JSON.stringify({ records: [{ uri: `at://${DID_A}/${collection}/p1`, cid: "c1", value: {} }], @@ -544,18 +461,16 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => { { status: 200, headers: { "content-type": "application/json" } }, ); } - // Second page 404. return new Response("not found", { status: 404 }); }; - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - - const error = result.errors.find((e) => e.includes(collection)); - expect(error).toMatch(/404 mid-pagination/); + await expect( + processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }), + ).rejects.toThrow(/404 mid-pagination/); }); it("rejects pages with > MAX_RECORDS_PER_PAGE records (PDS oversize attack)", async () => { - const queue = new CapturingQueue(); + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); const collection = WANTED_COLLECTIONS[0]; if (!collection) throw new Error("test assumes ≥1 collection"); @@ -566,104 +481,316 @@ describe("backfillCollection: defenses against malicious / buggy PDS", () => { })); const fetchImpl = makeFetch({ [collection]: records }); - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - - expect(result.errors.some((e) => e.includes("per-page cap"))).toBe(true); - // No records enqueued for the over-capped collection. - expect(queue.sent.filter((j) => j.collection === collection)).toHaveLength(0); + await expect( + processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }), + ).rejects.toThrow(/per-page cap/); + expect(queue.sent).toHaveLength(0); }); - it("rejects records with malformed rkey (atproto rkey grammar violation)", async () => { - const queue = new CapturingQueue(); + it("aborts a hung PDS fetch via the listRecords timeout", async () => { + const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); const collection = WANTED_COLLECTIONS[0]; if (!collection) throw new Error("test assumes ≥1 collection"); - const fetchImpl = makeFetch({ - [collection]: [ - { uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} }, - { uri: `at://${DID_A}/${collection}/has?queryparam`, cid: "c2", value: {} }, - { uri: `at://${DID_A}/${collection}/has#fragment`, cid: "c3", value: {} }, - { uri: `at://${DID_A}/${collection}/has space`, cid: "c4", value: {} }, - ], - }); + const fetchImpl: typeof fetch = (_input, init) => + new Promise((_resolve, reject) => { + init?.signal?.addEventListener("abort", () => { + reject(new DOMException("aborted", "AbortError")); + }); + }); - const result = await backfillDid(DID_A, { resolver, queue, fetch: fetchImpl }); - const sent = queue.sent.filter((j) => j.collection === collection); - expect(sent.map((j) => j.rkey)).toEqual(["legit"]); - expect(result.enqueued).toBe(1); + await expect( + processBackfillJob( + { did: DID_A, collection }, + { resolver, queue, fetch: fetchImpl, listRecordsTimeoutMs: 25 }, + ), + ).rejects.toThrow(/timed out after 25ms/); }); }); -describe("backfillCollection: enqueue budget cap", () => { - it("stops enqueuing when MAX_TOTAL_ENQUEUE is reached and reports the partial count", async () => { - // Force the cap by giving a tiny budget. 50 records × 4 collections = - // 200 records the worker would otherwise enqueue; budget of 30 should - // cap it after the first collection's first page. - const queue = new CapturingQueue(); - const resolver = buildResolver(); - const recordsByCollection: Record = {}; - for (const c of WANTED_COLLECTIONS) { - recordsByCollection[c] = Array.from({ length: 50 }, (_, i) => ({ - uri: `at://${DID_A}/${c}/r${i}`, - cid: `c${i}`, - value: {}, - })); +describe("enqueueBackfillJobs", () => { + it("emits one job per (DID × WANTED_COLLECTIONS) pair", async () => { + const queue = new CapturingBackfillQueue(); + const enqueued = await enqueueBackfillJobs([DID_A, DID_B], queue); + + const expected = 2 * WANTED_COLLECTIONS.length; + expect(enqueued).toBe(expected); + expect(queue.sent).toHaveLength(expected); + + // Cartesian shape: every DID appears with every collection exactly once. + for (const did of [DID_A, DID_B]) { + for (const collection of WANTED_COLLECTIONS) { + expect(queue.sent.filter((j) => j.did === did && j.collection === collection)).toHaveLength( + 1, + ); + } } - const fetchImpl = makeFetch(recordsByCollection); - const result = await backfillDid( - DID_A, - { resolver, queue, fetch: fetchImpl }, - { remaining: 30 }, + }); + + it("batches sendBatch calls at QUEUE_SEND_BATCH_CAP", async () => { + const queue = new CapturingBackfillQueue(); + // Pick enough DIDs that the total job count exceeds the cap. With + // QUEUE_SEND_BATCH_CAP = 100 and 4 collections, 30 DIDs → 120 jobs + // → batches of [100, 20]. + const dids = Array.from( + { length: 30 }, + (_, i) => `did:plc:bulk${i.toString().padStart(20, "0")}`, ); - // Budget=30 + page of 50 records: inner loop pushes records while - // `budget.remaining - messages.length > 0`. At iter 31 the - // expression (30 - 30) ≤ 0 fires `break`. messages.length === 30, - // sendBatch sends 30, budget → 0, post-send check throws - // EnqueueLimitReached. Exact answer is 30 — assert the exact - // number so an off-by-one regression in the inner break doesn't - // pass under a slack `<=`. - expect(result.enqueued).toBe(30); - expect(queue.sent.length).toBe(30); - // Sentinel error appended so the operator can see the cap fired - // rather than thinking the DID was a clean 30-record publisher. - expect(result.errors.some((e) => e.includes("enqueue cap reached"))).toBe(true); + const total = dids.length * WANTED_COLLECTIONS.length; + const expectedBatches: number[] = []; + for (let i = 0; i < total; i += QUEUE_SEND_BATCH_CAP) { + expectedBatches.push(Math.min(QUEUE_SEND_BATCH_CAP, total - i)); + } + + const enqueued = await enqueueBackfillJobs(dids, queue); + + expect(enqueued).toBe(total); + expect(queue.batches).toEqual(expectedBatches); + expect(queue.batches.every((n) => n <= QUEUE_SEND_BATCH_CAP)).toBe(true); + }); + + it("emits no batches for an empty DID list", async () => { + const queue = new CapturingBackfillQueue(); + const enqueued = await enqueueBackfillJobs([], queue); + expect(enqueued).toBe(0); + expect(queue.batches).toEqual([]); + expect(queue.sent).toHaveLength(0); }); }); -describe("backfillCollection: listRecords timeout", () => { - it("aborts a hung PDS fetch and surfaces the timeout per-collection", async () => { - const queue = new CapturingQueue(); +describe("processBackfillBatch (consumer)", () => { + it("acks each message after a successful per-pair run", async () => { + const recordsQueue = new CapturingRecordsQueue(); const resolver = buildResolver(); const collection = WANTED_COLLECTIONS[0]; if (!collection) throw new Error("test assumes ≥1 collection"); - const fetchImpl: typeof fetch = (input, init) => { + const fetchImpl = makeFetch({ + [collection]: [{ uri: `at://${DID_A}/${collection}/r1`, cid: "c", value: {} }], + }); + + const message = new FakeMessage({ did: DID_A, collection }); + await processBackfillBatch({ messages: [message] }, {} as Env, { + resolver, + queue: recordsQueue, + fetch: fetchImpl, + }); + + expect(message.acked).toBe(1); + expect(message.retried).toBe(0); + expect(recordsQueue.sent).toHaveLength(1); + }); + + it("retries when processBackfillJob throws (transient PDS failure)", async () => { + const recordsQueue = new CapturingRecordsQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl = makeFetch({}, { status: { [collection]: 503 } }); + + const message = new FakeMessage({ did: DID_A, collection }); + await processBackfillBatch({ messages: [message] }, {} as Env, { + resolver, + queue: recordsQueue, + fetch: fetchImpl, + }); + + expect(message.retried).toBe(1); + expect(message.acked).toBe(0); + }); + + it("does not let one failed job poison the rest of the batch", async () => { + const recordsQueue = new CapturingRecordsQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + const collection2 = WANTED_COLLECTIONS[1]; + if (!collection || !collection2) throw new Error("test assumes ≥2 collections"); + // First collection 503s; second succeeds with one record. + const fetchImpl: typeof fetch = async (input) => { const url = typeof input === "string" ? new URL(input) : input instanceof URL ? input : new URL(input.url); - if (url.searchParams.get("collection") !== collection) { - return Promise.resolve(new Response(JSON.stringify({ records: [] }), { status: 200 })); + const c = url.searchParams.get("collection"); + if (c === collection) return new Response("err", { status: 503 }); + return new Response( + JSON.stringify({ + records: [{ uri: `at://${DID_A}/${c}/r1`, cid: "c", value: {} }], + }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + }; + + const failing = new FakeMessage({ did: DID_A, collection }); + const succeeding = new FakeMessage({ did: DID_A, collection: collection2 }); + await processBackfillBatch({ messages: [failing, succeeding] }, {} as Env, { + resolver, + queue: recordsQueue, + fetch: fetchImpl, + }); + + expect(failing.retried).toBe(1); + expect(failing.acked).toBe(0); + expect(succeeding.acked).toBe(1); + expect(succeeding.retried).toBe(0); + expect(recordsQueue.sent).toHaveLength(1); + expect(recordsQueue.sent[0]?.collection).toBe(collection2); + }); +}); + +describe("discoverDids: listReposByCollection enumeration", () => { + const RELAY = "https://relay.test.example"; + + function makeRelayFetch(reposByCollection: Record>): typeof fetch { + return async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + if (!url.pathname.endsWith("/xrpc/com.atproto.sync.listReposByCollection")) { + return new Response("not stubbed", { status: 599 }); } - // Hung connection: never resolve, but honour the abort signal so - // the AbortController-driven timeout fires. - return new Promise((_resolve, reject) => { + const collection = url.searchParams.get("collection") ?? ""; + const repos = reposByCollection[collection] ?? []; + return new Response(JSON.stringify({ repos }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + }; + } + + it("returns the union of distinct DIDs across all WANTED_COLLECTIONS", async () => { + const c0 = WANTED_COLLECTIONS[0]; + const c1 = WANTED_COLLECTIONS[1]; + if (!c0 || !c1) throw new Error("test assumes ≥2 collections"); + const fetchImpl = makeRelayFetch({ + [c0]: [{ did: "did:plc:a" }, { did: "did:plc:b" }], + [c1]: [{ did: "did:plc:b" }, { did: "did:plc:c" }], + }); + + const dids = await discoverDids(RELAY, { fetch: fetchImpl }); + + expect(new Set(dids)).toEqual(new Set(["did:plc:a", "did:plc:b", "did:plc:c"])); + }); + + it("paginates via cursor", async () => { + const c0 = WANTED_COLLECTIONS[0]; + if (!c0) throw new Error("test assumes ≥1 collection"); + let calls = 0; + const fetchImpl: typeof fetch = async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + if (url.searchParams.get("collection") !== c0) { + return new Response(JSON.stringify({ repos: [] }), { status: 200 }); + } + calls += 1; + const cursor = url.searchParams.get("cursor"); + if (!cursor) { + return new Response(JSON.stringify({ repos: [{ did: "did:plc:a" }], cursor: "next" }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + return new Response(JSON.stringify({ repos: [{ did: "did:plc:b" }] }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + }; + + const dids = await discoverDids(RELAY, { fetch: fetchImpl }); + + expect(calls).toBe(2); + expect(new Set(dids)).toEqual(new Set(["did:plc:a", "did:plc:b"])); + }); + + it("logs and continues when one collection's listReposByCollection fails", async () => { + const c0 = WANTED_COLLECTIONS[0]; + const c1 = WANTED_COLLECTIONS[1]; + if (!c0 || !c1) throw new Error("test assumes ≥2 collections"); + const fetchImpl: typeof fetch = async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + const collection = url.searchParams.get("collection"); + if (collection === c0) return new Response("relay broken", { status: 503 }); + if (collection === c1) { + return new Response(JSON.stringify({ repos: [{ did: "did:plc:c1" }] }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + return new Response(JSON.stringify({ repos: [] }), { status: 200 }); + }; + + const dids = await discoverDids(RELAY, { fetch: fetchImpl }); + + expect(dids).toContain("did:plc:c1"); + }); + + it("aborts a hung relay fetch via the timeout", async () => { + const fetchImpl: typeof fetch = (_input, init) => + new Promise((_resolve, reject) => { init?.signal?.addEventListener("abort", () => { reject(new DOMException("aborted", "AbortError")); }); }); + + const dids = await discoverDids(RELAY, { fetch: fetchImpl, timeoutMs: 25 }); + + // Every collection's discovery throws "timed out"; the function returns + // an empty set rather than propagating the error. + expect(dids).toEqual([]); + }); + + it("stops enumerating once MAX_DISCOVERED_DIDS is hit (defense vs runaway relay)", async () => { + const c0 = WANTED_COLLECTIONS[0]; + if (!c0) throw new Error("test assumes ≥1 collection"); + // Relay returns MAX_DISCOVERED_DIDS+50 repos in one page for the first + // collection. discoverDids should add exactly MAX_DISCOVERED_DIDS DIDs + // then stop without paging further or hitting the next collection. + let pdsCalls = 0; + const fetchImpl: typeof fetch = async (input) => { + const url = + typeof input === "string" + ? new URL(input) + : input instanceof URL + ? input + : new URL(input.url); + pdsCalls += 1; + const collection = url.searchParams.get("collection"); + if (collection !== c0) { + // Should never be reached if the cap fires. + return new Response(JSON.stringify({ repos: [{ did: "did:plc:later" }] }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + const repos = Array.from({ length: MAX_DISCOVERED_DIDS + 50 }, (_, i) => ({ + did: `did:plc:cap${i.toString().padStart(20, "0")}`, + })); + return new Response(JSON.stringify({ repos }), { + status: 200, + headers: { "content-type": "application/json" }, + }); }; - const result = await backfillDid(DID_A, { - resolver, - queue, - fetch: fetchImpl, - listRecordsTimeoutMs: 25, - }); + const dids = await discoverDids("https://relay.test.example", { fetch: fetchImpl }); - const error = result.errors.find((e) => e.includes(collection)); - expect(error).toMatch(/timed out after 25ms/); + expect(dids).toHaveLength(MAX_DISCOVERED_DIDS); + expect(dids).not.toContain("did:plc:later"); + // Only the first collection was queried; the cap fired before reaching + // any subsequent collection. + expect(pdsCalls).toBe(1); }); }); @@ -698,7 +825,7 @@ describe("backfill admin route: auth + input validation", () => { expect(res.status).toBe(405); }); - it("returns 400 on missing dids field", async () => { + it("accepts an empty body and triggers discovery (production cold-start path)", async () => { const res = await SELF.fetch("https://test/_admin/backfill", { method: "POST", headers: { @@ -707,11 +834,31 @@ describe("backfill admin route: auth + input validation", () => { }, body: JSON.stringify({}), }); + expect(res.status).toBe(202); + }); + + it("accepts a literal empty request body (no JSON) and triggers discovery", async () => { + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { authorization: "Bearer test-admin-token" }, + }); + expect(res.status).toBe(202); + }); + + it("returns 400 on a non-array `dids` value (string instead of array)", async () => { + const res = await SELF.fetch("https://test/_admin/backfill", { + method: "POST", + headers: { + "content-type": "application/json", + authorization: "Bearer test-admin-token", + }, + body: JSON.stringify({ dids: "did:plc:foo" }), + }); expect(res.status).toBe(400); expect(await res.text()).toContain("must be an array"); }); - it("returns 400 on empty dids array", async () => { + it("returns 400 on empty dids array (suggests omitting the field for discovery)", async () => { const res = await SELF.fetch("https://test/_admin/backfill", { method: "POST", headers: { @@ -721,12 +868,16 @@ describe("backfill admin route: auth + input validation", () => { body: JSON.stringify({ dids: [] }), }); expect(res.status).toBe(400); - expect(await res.text()).toContain("not be empty"); + const text = await res.text(); + expect(text).toContain("not be empty"); + expect(text).toMatch(/discover|omit/i); }); it("returns 400 on dids list larger than the cap", async () => { + // Cap is currently 100 (lowered from 1000 because the queue-fan-out + // path amplifies a leaked-token attack). 101 DIDs → over cap. const dids = Array.from( - { length: 1001 }, + { length: 101 }, (_, i) => `did:plc:test${i.toString().padStart(20, "0")}`, ); const res = await SELF.fetch("https://test/_admin/backfill", { @@ -738,7 +889,7 @@ describe("backfill admin route: auth + input validation", () => { body: JSON.stringify({ dids }), }); expect(res.status).toBe(400); - expect(await res.text()).toContain("at most 1000"); + expect(await res.text()).toContain("at most 100"); }); it("returns 400 on malformed DID (caught by DID_PATTERN)", async () => { diff --git a/apps/aggregator/worker-configuration.d.ts b/apps/aggregator/worker-configuration.d.ts index 4af216a0d..b16346f3b 100644 --- a/apps/aggregator/worker-configuration.d.ts +++ b/apps/aggregator/worker-configuration.d.ts @@ -1,5 +1,5 @@ /* eslint-disable */ -// Generated by Wrangler by running `wrangler types` (hash: 1f938082c03573892ce397f54a9ca0b6) +// Generated by Wrangler by running `wrangler types` (hash: 9052900c7b8ec62b9504e6508846deb9) // Runtime types generated with workerd@1.20260507.1 2026-02-24 nodejs_compat declare namespace Cloudflare { interface GlobalProps { @@ -9,8 +9,9 @@ declare namespace Cloudflare { interface Env { DB: D1Database; RECORDS_QUEUE: Queue; + BACKFILL_QUEUE: Queue; JETSTREAM_URL: "wss://jetstream2.us-east.bsky.network/subscribe"; - CONSTELLATION_URL: "https://constellation.microcosm.blue"; + RELAY_URL: "https://bsky.network"; ADMIN_TOKEN: string; RECORDS_DO: DurableObjectNamespace; } @@ -21,7 +22,7 @@ type StringifyValues> = { }; declare namespace NodeJS { interface ProcessEnv extends StringifyValues< - Pick + Pick > {} } diff --git a/apps/aggregator/wrangler.jsonc b/apps/aggregator/wrangler.jsonc index 6e0fc8287..969e2b95c 100644 --- a/apps/aggregator/wrangler.jsonc +++ b/apps/aggregator/wrangler.jsonc @@ -25,6 +25,16 @@ "binding": "RECORDS_QUEUE", "queue": "emdash-aggregator-records", }, + { + // Backfill orchestrator (POST /_admin/backfill) fans + // (DID, collection) pairs onto this queue. Per-pair work + // (resolve PDS → listRecords → sendBatch onto RECORDS_QUEUE) + // fits well within a single consumer invocation's waitUntil, + // solving the 30s wall-clock cap that would otherwise limit + // us to ~15–25 DIDs per backfill POST. + "binding": "BACKFILL_QUEUE", + "queue": "emdash-aggregator-backfill", + }, ], "consumers": [ { @@ -47,6 +57,23 @@ "max_batch_timeout": 30, "max_retries": 3, }, + { + // Backfill (DID, collection) consumer. Each job: resolve + // PDS, paginate listRecords for one collection, batch + // results onto RECORDS_QUEUE. Smaller batch size than the + // records consumer because each job involves outbound PDS + // fetches; backpressure from RECORDS_QUEUE flows naturally + // via this consumer's runtime budget. Failed jobs land in + // a DLQ that's intentionally unbound — backfill is + // operator-triggered and operator-monitored, so DLQ + // drainage is a follow-up concern rather than an + // always-on automation requirement. + "queue": "emdash-aggregator-backfill", + "max_batch_size": 5, + "max_batch_timeout": 5, + "max_retries": 3, + "dead_letter_queue": "emdash-aggregator-backfill-dlq", + }, ], }, "durable_objects": { @@ -77,8 +104,13 @@ // Jetstream endpoint. Override per environment for self-hosted relays // or staging backends. "JETSTREAM_URL": "wss://jetstream2.us-east.bsky.network/subscribe", - // Constellation HTTP API for cold-start backfill DID discovery. - "CONSTELLATION_URL": "https://constellation.microcosm.blue", + // Relay base URL implementing `com.atproto.sync.listReposByCollection`, + // used by `/_admin/backfill` to enumerate publishers of our NSIDs at + // cold-start time. Defaults to Bluesky's main relay (the canonical + // atproto relay; same trust orbit as our Jetstream endpoint). + // Override for self-hosted indexers (e.g. Microcosm's Lightrail) + // if/when we want a non-bsky.network discovery path. + "RELAY_URL": "https://bsky.network", }, // Required secrets, declared in config so `wrangler types` generates the // typed binding and `wrangler deploy` validates the secret is set on the From 6a8145b52044dfba35f45b82fe65d856b0d3e288 Mon Sep 17 00:00:00 2001 From: Matt Kane Date: Sun, 10 May 2026 18:37:00 +0100 Subject: [PATCH 5/8] fix(aggregator): adversarial review fixes for backfill queue restructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Parallelize `enqueueBackfillJobs` sendBatch calls. Serial awaits over ~40 batches (worst case from `MAX_DISCOVERED_DIDS` × `WANTED_COLLECTIONS`) ate noticeably into the POST handler's 30s waitUntil budget on top of discovery's own fetches. - Drop backfill consumer `max_batch_size` from 5 → 1. Sequential processing inside a batch could chain `LIST_RECORDS_TIMEOUT_MS = 15s` per page × multiple messages and blow the consumer wall-clock budget. At backfill rates (operator-triggered, not steady-state), throughput doesn't matter; the simpler invariant does. - Add `drainBackfillDeadLetterBatch` consumer for the backfill DLQ. Mirrors the records-DLQ pattern (log loud + ack) so messages don't silently accumulate. No D1 forensics row — the operator's recovery action is "re-trigger backfill for the affected DID", which only needs the (did, collection) pair from the log line. - Fix stale comment in the backfill route handler that referenced the deleted `backfillDids` orchestrator. - Update `MAX_BACKFILL_DIDS` comment to acknowledge that `MAX_DISCOVERED_DIDS` is the actually-larger ceiling — the explicit path is the tighter "operator types it themselves" bucket by design. --- apps/aggregator/src/backfill-consumer.ts | 22 ++++++++++++++++ apps/aggregator/src/backfill.ts | 10 ++++++- apps/aggregator/src/index.ts | 33 +++++++++++++++++------- apps/aggregator/test/backfill.test.ts | 20 +++++++++++++- apps/aggregator/wrangler.jsonc | 30 ++++++++++++++------- 5 files changed, 94 insertions(+), 21 deletions(-) diff --git a/apps/aggregator/src/backfill-consumer.ts b/apps/aggregator/src/backfill-consumer.ts index 727b0019e..b2c6cc6fb 100644 --- a/apps/aggregator/src/backfill-consumer.ts +++ b/apps/aggregator/src/backfill-consumer.ts @@ -73,6 +73,28 @@ export async function processBackfillBatch( } } +/** + * Drain the backfill DLQ. Mirror of `records-consumer.drainDeadLetterBatch` + * but without the D1 forensics row — the recovery action for a backfill + * pair that exhausted retries is "re-run backfill for the affected DID", + * which only needs the (did, collection) pair from the log line. + * + * Logs at error level so operators tailing `wrangler tail` see the message + * loud, acks so the DLQ doesn't accumulate unbounded. + */ +export function drainBackfillDeadLetterBatch( + batch: MessageBatchLike, + _env: Env, +): void { + for (const message of batch.messages) { + console.error("[aggregator] backfill DLQ drain: pair exhausted retries", { + did: message.body.did, + collection: message.body.collection, + }); + message.ack(); + } +} + function createProductionDeps(env: Env): ProcessBackfillJobDeps { const composite = new CompositeDidDocumentResolver({ methods: { diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts index e27b6b7c1..00823dbb4 100644 --- a/apps/aggregator/src/backfill.ts +++ b/apps/aggregator/src/backfill.ts @@ -221,9 +221,17 @@ export async function enqueueBackfillJobs( messages.push({ body: { did, collection } }); } } + // Fan the sendBatch calls in parallel. Each is an independent outbound + // sub-request and the orchestrator runs inside the POST handler's 30s + // `waitUntil` budget — serial awaits on a `MAX_DISCOVERED_DIDS`-sized + // fan-out (1000 DIDs × 4 collections / 100 = 40 batches) would + // noticeably eat into the cold-start budget on top of discovery's own + // fetches. + const sends: Promise[] = []; for (let i = 0; i < messages.length; i += QUEUE_SEND_BATCH_CAP) { - await queue.sendBatch(messages.slice(i, i + QUEUE_SEND_BATCH_CAP)); + sends.push(queue.sendBatch(messages.slice(i, i + QUEUE_SEND_BATCH_CAP))); } + await Promise.all(sends); return messages.length; } diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts index f238c6974..ef2e89416 100644 --- a/apps/aggregator/src/index.ts +++ b/apps/aggregator/src/index.ts @@ -15,7 +15,7 @@ * Worker boots. */ -import { processBackfillBatch } from "./backfill-consumer.js"; +import { drainBackfillDeadLetterBatch, processBackfillBatch } from "./backfill-consumer.js"; import { discoverDids, enqueueBackfillJobs } from "./backfill.js"; import type { BackfillJob, RecordsJob } from "./env.js"; import { drainDeadLetterBatch, processBatch } from "./records-consumer.js"; @@ -25,6 +25,7 @@ import { isPlainObject } from "./utils.js"; const RECORDS_QUEUE_NAME = "emdash-aggregator-records"; const RECORDS_DLQ_NAME = "emdash-aggregator-records-dlq"; const BACKFILL_QUEUE_NAME = "emdash-aggregator-backfill"; +const BACKFILL_DLQ_NAME = "emdash-aggregator-backfill-dlq"; export { RecordsJetstreamDO } from "./records-do.js"; @@ -58,11 +59,16 @@ const BACKFILL_PATH = "/_admin/backfill"; * old serial-loop cap (was 1000) because the queue-fan-out path amplifies * a leaked-token attack: each submitted DID becomes * `WANTED_COLLECTIONS.length` queue messages, each consuming a consumer - * invocation with its own outbound PDS fetches. 100 × 4 = 400 jobs is - * still a meaningful operator-recovery batch but a tractable blast radius. + * invocation with its own outbound PDS fetches. 100 × 4 = 400 jobs from + * the explicit path is a meaningful operator-recovery batch but a + * tractable blast radius for the explicit path. * - * Discovery-mode submissions are bounded separately by - * `MAX_DISCOVERED_DIDS` inside `discoverDids`. + * The discovery path (empty body) has a separate ceiling at + * `MAX_DISCOVERED_DIDS = 1000` and is therefore actually the larger + * worst-case fan-out source — by design, since legitimate-publisher + * enumeration is the primary use case and we want the explicit list to + * be the tighter "operator types it themselves" bucket. Both paths share + * the same per-pair caps in the consumer. */ const MAX_BACKFILL_DIDS = 100; const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/; @@ -208,11 +214,14 @@ export default { if ("error" in parsed) { return new Response(parsed.error, { status: 400 }); } - // Fire-and-forget via waitUntil so the route returns 202 quickly - // regardless of the DID list size. Backfill progress is observable - // via Workers logs and the resulting D1 writes; the response body - // just confirms the trigger landed. Per-DID errors are logged from - // inside backfillDids; callers don't get them in the response. + // Fire-and-forget via waitUntil so the route returns 202 quickly. + // `runBackfill` only does discovery + queue fan-out (both fast); + // per-pair work runs in the BACKFILL_QUEUE consumer with its own + // invocation budget. Operator-facing observability: + // - this handler's logs ([aggregator] backfill discovery/enqueue) + // - per-pair logs from `backfill-consumer.ts` + // - DLQ inspection (`emdash-aggregator-backfill-dlq`) for + // pairs that exhausted retries ctx.waitUntil(runBackfill(parsed, env)); return new Response(null, { status: 202 }); } @@ -241,6 +250,10 @@ export default { // eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name await processBackfillBatch(batch as MessageBatch, env); return; + case BACKFILL_DLQ_NAME: + // eslint-disable-next-line typescript-eslint(no-unsafe-type-assertion) -- narrowed by queue name + drainBackfillDeadLetterBatch(batch as MessageBatch, env); + return; default: console.error("[aggregator] unknown queue, acking batch", { queue: batch.queue }); for (const m of batch.messages) m.ack(); diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts index 5dffcf168..16259342a 100644 --- a/apps/aggregator/test/backfill.test.ts +++ b/apps/aggregator/test/backfill.test.ts @@ -21,7 +21,7 @@ import type { Did } from "@atcute/lexicons/syntax"; import { applyD1Migrations, env, SELF } from "cloudflare:test"; import { beforeAll, beforeEach, describe, expect, it } from "vitest"; -import { processBackfillBatch } from "../src/backfill-consumer.js"; +import { drainBackfillDeadLetterBatch, processBackfillBatch } from "../src/backfill-consumer.js"; import { type BackfillQueueProducer, discoverDids, @@ -639,6 +639,24 @@ describe("processBackfillBatch (consumer)", () => { }); }); +describe("drainBackfillDeadLetterBatch", () => { + it("acks every dead-lettered job (DLQ doesn't accumulate)", () => { + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const messages = [ + new FakeMessage({ did: DID_A, collection }), + new FakeMessage({ did: DID_B, collection }), + ]; + + drainBackfillDeadLetterBatch({ messages }, {} as Env); + + for (const m of messages) { + expect(m.acked).toBe(1); + expect(m.retried).toBe(0); + } + }); +}); + describe("discoverDids: listReposByCollection enumeration", () => { const RELAY = "https://relay.test.example"; diff --git a/apps/aggregator/wrangler.jsonc b/apps/aggregator/wrangler.jsonc index 969e2b95c..dd2b16412 100644 --- a/apps/aggregator/wrangler.jsonc +++ b/apps/aggregator/wrangler.jsonc @@ -60,20 +60,32 @@ { // Backfill (DID, collection) consumer. Each job: resolve // PDS, paginate listRecords for one collection, batch - // results onto RECORDS_QUEUE. Smaller batch size than the - // records consumer because each job involves outbound PDS - // fetches; backpressure from RECORDS_QUEUE flows naturally - // via this consumer's runtime budget. Failed jobs land in - // a DLQ that's intentionally unbound — backfill is - // operator-triggered and operator-monitored, so DLQ - // drainage is a follow-up concern rather than an - // always-on automation requirement. + // results onto RECORDS_QUEUE. `max_batch_size: 1` so each + // pair gets its own consumer invocation — listRecords can + // chain ~15s per page × `MAX_PAGES_PER_COLLECTION`, so + // batching multiple pairs per invocation risks blowing + // the consumer wall-clock budget. Throughput is fine + // because backfill is operator-triggered, not steady-state. "queue": "emdash-aggregator-backfill", - "max_batch_size": 5, + "max_batch_size": 1, "max_batch_timeout": 5, "max_retries": 3, "dead_letter_queue": "emdash-aggregator-backfill-dlq", }, + { + // Drains the backfill DLQ. Logs each dead-lettered pair + // loud enough that operators tailing logs see it, then + // acks so the DLQ doesn't accumulate unbounded. We don't + // write a forensics row to D1 here (no `backfill_dead_letters` + // table) because the operator's recovery action is + // "re-trigger backfill for the affected DID" — they don't + // need the per-pair payload, just the (did, collection) + // pair, which is on the log line. + "queue": "emdash-aggregator-backfill-dlq", + "max_batch_size": 25, + "max_batch_timeout": 30, + "max_retries": 3, + }, ], }, "durable_objects": { From e463cf73e078405d1c108826a4391ad41088b0f0 Mon Sep 17 00:00:00 2001 From: Matt Kane Date: Sun, 10 May 2026 18:48:31 +0100 Subject: [PATCH 6/8] fix(aggregator): use library helpers for AT-URI parsing + crypto compare MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three Copilot review comments on PR #979 — all pointing at hand-rolled code that should be using library functions: - Swap hand-rolled `timingSafeEqual` (XOR loop) for workerd's audited `crypto.subtle.timingSafeEqual`. Update the comment to be honest about the residual length-leak (the primitive returns false on length mismatch, so the strict zero-leak guarantee only holds against same-length inputs — acceptable here because ADMIN_TOKEN's length is fixed at deploy time and any realistic length-via-timing attack requires more requests than other defences would tolerate). - Swap hand-rolled `parseRkeyFromUri` for `parseCanonicalResourceUri` from `@atcute/lexicons/syntax`. Library validates the DID, NSID, and rkey grammars in one call. Adds an explicit DID-equality check so a buggy/malicious PDS that returns records under another publisher's DID is dropped at source rather than churning dead-letters in the records consumer. - Make `extractListRecordsBody`/`extractCursor` throw on malformed bodies instead of silently returning `[]`/`undefined`. A 200 response with the wrong shape is upstream breakage and the operator-triggered integrity path should surface that loudly. Throws propagate out via the queue consumer's retry path. Also adds tests for the new strict validation paths and the DID-mismatch rejection. --- apps/aggregator/src/backfill.ts | 81 ++++++++++++++++----------- apps/aggregator/src/index.ts | 22 +++++--- apps/aggregator/test/backfill.test.ts | 72 ++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 41 deletions(-) diff --git a/apps/aggregator/src/backfill.ts b/apps/aggregator/src/backfill.ts index 00823dbb4..db0878ff8 100644 --- a/apps/aggregator/src/backfill.ts +++ b/apps/aggregator/src/backfill.ts @@ -32,6 +32,8 @@ * no periodic scheduler — see plan §"Why no reconciliation cron". */ +import { parseCanonicalResourceUri } from "@atcute/lexicons/syntax"; + import { WANTED_COLLECTIONS } from "./constants.js"; import type { DidResolver } from "./did-resolver.js"; import type { BackfillJob, RecordsJob } from "./env.js"; @@ -68,9 +70,6 @@ if (MAX_RECORDS_PER_PAGE > QUEUE_SEND_BATCH_CAP) { `MAX_RECORDS_PER_PAGE (${MAX_RECORDS_PER_PAGE}) exceeds QUEUE_SEND_BATCH_CAP (${QUEUE_SEND_BATCH_CAP})`, ); } -/** Atproto rkey grammar: ALPHA / DIGIT / "." / "-" / "_" / ":" / "~". */ -const RKEY_PATTERN = /^[A-Za-z0-9._:~-]{1,512}$/; - /** Producer-side records queue surface. The production binding * `env.RECORDS_QUEUE` satisfies this; tests pass an in-memory implementation. */ export interface RecordsQueueProducer { @@ -370,13 +369,22 @@ async function paginateAndEnqueue(opts: PaginateOpts): Promise { const messages: { body: RecordsJob }[] = []; for (const record of records) { - const rkey = parseRkeyFromUri(record.uri, opts.collection); - if (!rkey) continue; + const parsed = parseCanonicalResourceUri(record.uri); + if (!parsed.ok) continue; + // Defence vs. a buggy/malicious PDS that returns records under + // a different DID (or a different collection) than the one we + // asked for. Such jobs would never verify (signature would be + // from a different key) and would just churn dead-letters; drop + // at the source. `parseCanonicalResourceUri` already validated + // the rkey grammar and the collection NSID for us, so we only + // need the cross-checks here. + if (parsed.value.repo !== opts.did) continue; + if (parsed.value.collection !== opts.collection) continue; messages.push({ body: { did: opts.did, collection: opts.collection, - rkey, + rkey: parsed.value.rkey, operation: "create", cid: record.cid, }, @@ -399,44 +407,51 @@ interface ListRecordEntry { value: unknown; } +/** + * Parse a `com.atproto.repo.listRecords` response body. Throws on any + * structural mismatch — a PDS that 200s with the wrong shape is upstream + * breakage, not "no records", and silently treating it as the latter + * causes operator-invisible partial backfills. The thrown error + * propagates out of `processBackfillJob` and the queue consumer retries + * (then DLQs) per the standard policy. + */ function extractListRecordsBody(body: unknown): ListRecordEntry[] { - if (!isPlainObject(body)) return []; + if (!isPlainObject(body)) { + throw new Error("listRecords response was not a JSON object"); + } const records = body["records"]; - if (!Array.isArray(records)) return []; + if (!Array.isArray(records)) { + throw new Error("listRecords response missing `records` array"); + } const out: ListRecordEntry[] = []; for (const r of records) { - if (!isPlainObject(r)) continue; + if (!isPlainObject(r)) { + throw new Error("listRecords record entry was not a JSON object"); + } const uri = r["uri"]; const cid = r["cid"]; - if (typeof uri !== "string" || typeof cid !== "string") continue; + if (typeof uri !== "string" || typeof cid !== "string") { + throw new Error("listRecords record entry missing string `uri` or `cid`"); + } out.push({ uri, cid, value: r["value"] }); } return out; } -function extractCursor(body: unknown): string | undefined { - if (!isPlainObject(body)) return undefined; - const cursor = body["cursor"]; - return typeof cursor === "string" ? cursor : undefined; -} - /** - * Extract the rkey from an AT URI of the shape `at://did/collection/rkey` - * and validate it against the atproto rkey grammar. Returns null if any - * step fails; callers treat that as "this isn't a record we recognise" - * and skip the entry without aborting the page. + * Pull the optional `cursor` out of a `listRecords` response. `undefined` + * (no key) is the spec-compliant signal for "end of pagination"; any other + * non-string value (number, object, etc.) is a PDS bug and throws so the + * pagination loop doesn't silently terminate on the wrong page. */ -function parseRkeyFromUri(uri: string, expectedCollection: string): string | null { - const expectedPrefix = `at://`; - if (!uri.startsWith(expectedPrefix)) return null; - const tail = uri.slice(expectedPrefix.length); - const slash1 = tail.indexOf("/"); - if (slash1 < 0) return null; - const slash2 = tail.indexOf("/", slash1 + 1); - if (slash2 < 0) return null; - const collection = tail.slice(slash1 + 1, slash2); - if (collection !== expectedCollection) return null; - const rkey = tail.slice(slash2 + 1); - if (!RKEY_PATTERN.test(rkey)) return null; - return rkey; +function extractCursor(body: unknown): string | undefined { + if (!isPlainObject(body)) { + throw new Error("listRecords response was not a JSON object"); + } + const cursor = body["cursor"]; + if (cursor === undefined) return undefined; + if (typeof cursor !== "string") { + throw new Error(`listRecords cursor was not a string (got ${typeof cursor})`); + } + return cursor; } diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts index ef2e89416..4b1f150f6 100644 --- a/apps/aggregator/src/index.ts +++ b/apps/aggregator/src/index.ts @@ -73,17 +73,23 @@ const BACKFILL_PATH = "/_admin/backfill"; const MAX_BACKFILL_DIDS = 100; const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/; +const tokenEncoder = new TextEncoder(); + /** - * Constant-time string equality. Use over `===` for any compare against a - * secret to avoid leaking length/prefix information through timing channels. + * Constant-time string equality via workerd's audited + * `crypto.subtle.timingSafeEqual`. The primitive returns `false` immediately + * for length-mismatched buffers, so the *prefix*-comparison is constant-time + * but a length difference is still observable via timing — acceptable here + * because the protected secret (`ADMIN_TOKEN`) has a fixed configured length + * known only to the operator, and any realistic length-via-timing attack + * would require so many requests that other defences (rate-limiting, + * Cloudflare Bot Management, log review) catch it first. */ function timingSafeEqual(a: string, b: string): boolean { - if (a.length !== b.length) return false; - let diff = 0; - for (let i = 0; i < a.length; i++) { - diff |= a.charCodeAt(i) ^ b.charCodeAt(i); - } - return diff === 0; + const aBuf = tokenEncoder.encode(a); + const bBuf = tokenEncoder.encode(b); + if (aBuf.byteLength !== bBuf.byteLength) return false; + return crypto.subtle.timingSafeEqual(aBuf, bBuf); } /** diff --git a/apps/aggregator/test/backfill.test.ts b/apps/aggregator/test/backfill.test.ts index 16259342a..f8da7ff5d 100644 --- a/apps/aggregator/test/backfill.test.ts +++ b/apps/aggregator/test/backfill.test.ts @@ -328,6 +328,30 @@ describe("processBackfillJob", () => { expect(result.enqueued).toBe(1); }); + it("skips records whose URI references a different DID than the job (defensive)", async () => { + const queue = new CapturingRecordsQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + // Buggy/malicious PDS: returns a record under a *different* repo's DID. + // Even if everything else parses, that record's signature would be + // from the wrong key — enqueueing it would just churn dead-letters. + const fetchImpl = makeFetch({ + [collection]: [ + { uri: `at://${DID_A}/${collection}/legit`, cid: "c1", value: {} }, + { uri: `at://${DID_B}/${collection}/imposter`, cid: "c2", value: {} }, + ], + }); + + const result = await processBackfillJob( + { did: DID_A, collection }, + { resolver, queue, fetch: fetchImpl }, + ); + + expect(queue.sent.map((j) => j.rkey)).toEqual(["legit"]); + expect(result.enqueued).toBe(1); + }); + it("rejects records with malformed rkey (atproto rkey grammar violation)", async () => { const queue = new CapturingRecordsQueue(); const resolver = buildResolver(); @@ -506,6 +530,54 @@ describe("processBackfillJob: defenses against malicious / buggy PDS", () => { ), ).rejects.toThrow(/timed out after 25ms/); }); + + it("throws when listRecords body isn't a JSON object (no silent zero)", async () => { + const queue = new CapturingRecordsQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl: typeof fetch = async () => + new Response(JSON.stringify(["not", "an", "object"]), { + status: 200, + headers: { "content-type": "application/json" }, + }); + + await expect( + processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }), + ).rejects.toThrow(/not a JSON object/); + }); + + it("throws when listRecords body is missing the records array", async () => { + const queue = new CapturingRecordsQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl: typeof fetch = async () => + new Response(JSON.stringify({ cursor: "x" }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + + await expect( + processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }), + ).rejects.toThrow(/missing `records` array/); + }); + + it("throws when cursor is present but not a string (no silent end-of-pagination)", async () => { + const queue = new CapturingRecordsQueue(); + const resolver = buildResolver(); + const collection = WANTED_COLLECTIONS[0]; + if (!collection) throw new Error("test assumes ≥1 collection"); + const fetchImpl: typeof fetch = async () => + new Response(JSON.stringify({ records: [], cursor: 42 }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + + await expect( + processBackfillJob({ did: DID_A, collection }, { resolver, queue, fetch: fetchImpl }), + ).rejects.toThrow(/cursor was not a string/); + }); }); describe("enqueueBackfillJobs", () => { From 6331ec8ad7aa3568fc78cd1468fa733141b50f69 Mon Sep 17 00:00:00 2001 From: Matt Kane Date: Sun, 10 May 2026 19:06:11 +0100 Subject: [PATCH 7/8] fix(aggregator): use library `isDid` instead of duplicated DID regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `@atcute/lexicons/syntax` exports the canonical `isDid` validator. Two files in this package were each carrying their own copy of `/^did:[a-z]+:[A-Za-z0-9._%:-]+$/` — looser than the library (which is length-bounded 7-2048 and forbids trailing `:` or `%`) and not type-narrowing. Replaces both: - `did-resolver.ts`: drops the local regex + `isDid` wrapper, imports the library version. - `index.ts` (backfill admin route): drops the local regex, calls the library directly in `parseBackfillBody`. Existing tests cover the validation behavior; library is at least as strict as the prior pattern so nothing legitimate gets newly rejected. --- apps/aggregator/src/did-resolver.ts | 7 +------ apps/aggregator/src/index.ts | 5 +++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/apps/aggregator/src/did-resolver.ts b/apps/aggregator/src/did-resolver.ts index cbdffefd2..8e4d5c58e 100644 --- a/apps/aggregator/src/did-resolver.ts +++ b/apps/aggregator/src/did-resolver.ts @@ -18,10 +18,9 @@ import { type PublicKey, } from "@atcute/crypto"; import { type DidDocument, getAtprotoVerificationMaterial, getPdsEndpoint } from "@atcute/identity"; -import type { Did } from "@atcute/lexicons/syntax"; +import { type Did, isDid } from "@atcute/lexicons/syntax"; const DEFAULT_TTL_MS = 24 * 60 * 60 * 1000; -const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/; /** Cache entry shape; the multibase signing key is stored raw so the * `PublicKey` instance is reconstructed on each `resolve()`. WebCrypto @@ -102,10 +101,6 @@ export class DidResolver { } } -function isDid(value: string): value is Did { - return DID_PATTERN.test(value); -} - function asDid(did: string): Did { if (!isDid(did)) { throw new Error(`invalid DID: ${did}`); diff --git a/apps/aggregator/src/index.ts b/apps/aggregator/src/index.ts index 4b1f150f6..ee364e5c1 100644 --- a/apps/aggregator/src/index.ts +++ b/apps/aggregator/src/index.ts @@ -15,6 +15,8 @@ * Worker boots. */ +import { isDid } from "@atcute/lexicons/syntax"; + import { drainBackfillDeadLetterBatch, processBackfillBatch } from "./backfill-consumer.js"; import { discoverDids, enqueueBackfillJobs } from "./backfill.js"; import type { BackfillJob, RecordsJob } from "./env.js"; @@ -71,7 +73,6 @@ const BACKFILL_PATH = "/_admin/backfill"; * the same per-pair caps in the consumer. */ const MAX_BACKFILL_DIDS = 100; -const DID_PATTERN = /^did:[a-z]+:[A-Za-z0-9._%:-]+$/; const tokenEncoder = new TextEncoder(); @@ -163,7 +164,7 @@ function parseBackfillBody(body: unknown): BackfillRequest | { error: string } { } const seen = new Set(); for (const did of rawDids) { - if (typeof did !== "string" || !DID_PATTERN.test(did)) { + if (!isDid(did)) { return { error: `invalid DID in list: ${JSON.stringify(did)}` }; } seen.add(did); From 6e2e24a0cd1183d4f07f5e1273bc31afc1e4efac Mon Sep 17 00:00:00 2001 From: Matt Kane Date: Sun, 10 May 2026 19:12:29 +0100 Subject: [PATCH 8/8] fix(aggregator): tighten isAtprotoDid + sync stale DLQ docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `pds-verify.isAtprotoDid` now composes with library `isDid` from `@atcute/lexicons/syntax` before checking the method prefix. The previous bare `startsWith` check accepted strings like `did:plc:` (empty body) — defensive in practice (the resolver upstream rejects malformed DIDs first), but the library call carries weight at verification time as a second-line check. - `backfill-consumer.ts` module docstring still claimed the backfill DLQ "is intentionally not auto-drained today" even though the drain consumer (`drainBackfillDeadLetterBatch`) was added to this file in the previous review-fix round. Updated to describe what's actually shipped. --- apps/aggregator/src/backfill-consumer.ts | 11 ++++++----- apps/aggregator/src/pds-verify.ts | 7 ++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/apps/aggregator/src/backfill-consumer.ts b/apps/aggregator/src/backfill-consumer.ts index b2c6cc6fb..3c6c9f536 100644 --- a/apps/aggregator/src/backfill-consumer.ts +++ b/apps/aggregator/src/backfill-consumer.ts @@ -17,11 +17,12 @@ * - Unexpected throws inside the batch loop are caught per-message so one * bad job can't poison the rest of the batch. * - * The DLQ is intentionally not auto-drained today — backfill is operator- - * triggered, so DLQ inspection is part of the operator's workflow when a - * backfill POST shows partial completion in `wrangler tail`. A drain - * consumer can land later when we have a clear ack policy (probably: - * write a row to D1 and ack, like the records-DLQ drain). + * DLQ drain (`drainBackfillDeadLetterBatch`): logs each dead-lettered + * pair at error level (so operators tailing `wrangler tail` see it loud) + * and acks so the DLQ doesn't accumulate unbounded. No D1 forensics row — + * the recovery action for a backfill pair that exhausted retries is + * "re-run backfill for the affected DID", which only needs the + * (did, collection) pair already on the log line. */ import { diff --git a/apps/aggregator/src/pds-verify.ts b/apps/aggregator/src/pds-verify.ts index daf7ec234..4b31bb0b0 100644 --- a/apps/aggregator/src/pds-verify.ts +++ b/apps/aggregator/src/pds-verify.ts @@ -18,7 +18,7 @@ */ import type { PublicKey } from "@atcute/crypto"; -import type { AtprotoDid } from "@atcute/lexicons/syntax"; +import { type AtprotoDid, isDid } from "@atcute/lexicons/syntax"; import { verifyRecord } from "@atcute/repo"; const DEFAULT_TIMEOUT_MS = 15_000; @@ -215,6 +215,11 @@ async function fetchCar( * message.retry()` without re-encoding the policy in the catch block. */ function isAtprotoDid(value: string): value is AtprotoDid { + // Library `isDid` enforces the full grammar (length, terminator chars); + // the prefix check then narrows to the atproto-supported method subset + // (`did:plc:` or `did:web:`). A bare prefix check would accept things + // like `did:plc:` (empty body), so the library call carries weight here. + if (!isDid(value)) return false; return value.startsWith("did:plc:") || value.startsWith("did:web:"); }