diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..8cc59a019 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,73 @@ +name: ci + +# Skip CI for docs-only changes. Note: if you later add required status +# checks, switch to per-job `if:` conditionals since paths-ignore will +# leave the checks in a "pending" state instead of passing them. +on: + push: + branches: [main] + paths-ignore: + - "docs/**" + - "**.md" + pull_request: + paths-ignore: + - "docs/**" + - "**.md" + +# Cancel in-progress runs for the same branch when a new commit lands. +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +env: + # Bumped to 1.25.9 for the April 7, 2026 security release (10 CVEs: + # os symlink, html/template, crypto/x509, crypto/tls, etc.). + # go.mod still declares `go 1.25.8` — 1.25.9 satisfies that constraint; + # Dockerfiles get bumped separately. + GO_VERSION: "1.25.9" + # Pin golangci-lint to a known-good version so a new release can't break + # CI without a config update. Bump deliberately. + GOLANGCI_LINT_VERSION: "v2.11.4" + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: ${{ env.GO_VERSION }} + cache: true + - uses: golangci/golangci-lint-action@v9 + with: + version: ${{ env.GOLANGCI_LINT_VERSION }} + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: ${{ env.GO_VERSION }} + cache: true + - name: Unit tests (race detector) + run: make test + + # Integration tests run the full testcontainers-go stack (Elasticsearch, + # NATS JetStream, MongoDB, Cassandra as applicable per service). Docker is + # pre-installed on ubuntu-latest runners. Bump the timeout if container + # image pulls are slow on cold cache. + test-integration: + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: ${{ env.GO_VERSION }} + cache: true + - name: Integration tests (search-sync-worker) + run: make test-integration SERVICE=search-sync-worker diff --git a/.golangci.yml b/.golangci.yml index d5fb71df1..eecc8847b 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -10,9 +10,6 @@ linters: - bodyclose - exhaustive settings: - goimports: - local-prefixes: - - github.com/hmchangw/chat exhaustive: default-signifies-exhaustive: true gocritic: diff --git a/docs/superpowers/plans/2026-04-20-search-sync-inbox-recovery.md b/docs/superpowers/plans/2026-04-20-search-sync-inbox-recovery.md new file mode 100644 index 000000000..47a63f616 --- /dev/null +++ b/docs/superpowers/plans/2026-04-20-search-sync-inbox-recovery.md @@ -0,0 +1,316 @@ +# search-sync-worker INBOX Recovery & Payload Migration Plan + +**Goal:** Recover the search-sync-worker spotlight + user-room collections from +PR #78's pre-force-push state (INBOX-based architecture) and adapt them to +main's new bulk member event format (`Accounts []string` + event-level +`HistorySharedSince`). + +**Branch:** `claude/recover-search-sync-worker-dCjyV` + +**Architecture:** Two collections consume the INBOX stream and maintain +Elasticsearch indexes for room typeahead (spotlight) and per-user message- +search access control (user-room). Cross-site federation flows via existing +`OUTBOX_{remote}` → `INBOX_{local}` stream sourcing with SubjectTransforms — +no new streams. Collections are payload-driven only; publishers (room-worker) +are explicitly out of scope for this PR. + +**Tech Stack:** Go 1.25, NATS JetStream (v2.10+ for SubjectTransforms), +Elasticsearch (bulk API + external versioning + painless scripts). + +--- + +## Context: why recovery + +PR #78 was force-pushed on 2026-04-16 from `3026f46` → `61a3cf9`, then +rewritten further to a ROOMS-stream architecture (`0fc366a`). The +ROOMS-stream approach duplicates what the existing OUTBOX/INBOX federation +already does (Sources + SubjectTransforms) and creates a third parallel +federation stream just for search indexing. + +The original INBOX-based design (commits `c906357` → `3026f46`) is the +correct architectural fit: one federation pipe per site, many local consumers +off INBOX. This plan recovers that work and adapts it to main's newer event +format. + +## Scope + +**In scope (this PR):** +- `pkg/model/event.go` — new `InboxMemberEvent` payload type +- `search-sync-worker/inbox_stream.go` — `parseMemberEvent` helper + cross- + site bootstrap stream config +- `search-sync-worker/spotlight.go` + tests — per-(user, room) typeahead docs +- `search-sync-worker/user_room.go` + tests — per-user rooms array +- `search-sync-worker/inbox_integration_test.go` — testcontainers-go + integration coverage +- `pkg/subject/subject.go` — `InboxMember*` builders (already landed in + recovered commits) +- `pkg/stream/stream.go` — `Inbox(siteID)` with two subject patterns + (already landed) + +**Out of scope (follow-up PRs):** +- `room-worker` publisher migration — publishing `InboxMemberEvent` to local + `chat.inbox.{site}.member_added/removed` for same-site members + to OUTBOX + for cross-site. The integration tests hand-publish to INBOX to prove the + collections work end-to-end without the publisher change. +- `inbox-worker` migration — owns INBOX stream creation with cross-site + Sources in production (currently search-sync-worker's bootstrap does it + behind `BOOTSTRAP_STREAMS=true`). + +## Key design decisions + +1. **Event-level `HistorySharedSince`.** The new `InboxMemberEvent` carries + ONE `HistorySharedSince int64` for the whole bulk — all accounts in one + event share the same restricted-or-not flag. When non-zero, the entire + event is skipped (empty actions slice, handler acks without touching ES). + The search service handles restricted rooms via DB+cache at query time. + +2. **Synthesized spotlight DocID = `{account}_{roomID}`.** The new payload + drops subscription IDs (only `Accounts []string`), so spotlight docs are + keyed by the synthesized ID. Safe because the ID is ES-internal and has + no external contract. User-room DocID is unchanged (`{account}`). + +3. **Fan-out by account.** Each inbox event produces N bulk actions (one per + account). Handler already supports this via `BuildAction` returning + `[]BulkAction`. + +4. **Single `InboxMemberEvent` type for add + remove.** The `OutboxEvent.Type` + field discriminates; `HistorySharedSince` and `JoinedAt` are omitted on + removes via `omitempty`. Keeps the payload shape consistent across the + two event types. + +5. **`int64` millis, not `*time.Time`.** `HistorySharedSince` is `int64` + (zero = not restricted) matching main's existing `MemberAddEvent`. Avoids + a nullable pointer field in JSON wire format. + +--- + +## File structure + +### Files modified in Phase 1 (committed as `023aadb`) + +- `pkg/model/event.go` — replaced `MemberAddedPayload` struct with + `InboxMemberEvent`. Fields: `RoomID, RoomName, RoomType, SiteID, + Accounts, HistorySharedSince (omitempty), JoinedAt (omitempty), Timestamp`. +- `pkg/model/model_test.go` — replaced `TestMemberAddedPayloadJSON` with + `TestInboxMemberEventJSON` (3 subtests: unrestricted add, restricted add, + remove-with-zeros-omitted). +- `search-sync-worker/inbox_stream.go` — `parseMemberEvent` now returns + `(*model.OutboxEvent, *model.InboxMemberEvent, error)`. Validation: envelope + unmarshal, positive timestamp, payload unmarshal. Event-level HSS short- + circuit is caller's responsibility. +- `search-sync-worker/spotlight.go` — `BuildAction` fans out by account with + DocID `fmt.Sprintf("%s_%s", account, payload.RoomID)`; `SpotlightSearchIndex` + now has `UserAccount, RoomID, RoomName, RoomType, SiteID, JoinedAt` + (dropped `SubscriptionID`, `UserID`). +- `search-sync-worker/user_room.go` — `BuildAction` fans out by account; docID + = account; event-level HSS short-circuit. +- `search-sync-worker/spotlight_test.go` + `user_room_test.go` — updated + helpers (`baseInboxMemberEvent`, `makeInboxMemberEvent`) and all assertions. + +### Files migrated in Phase 2 (commit `c7a303b`) + +- `search-sync-worker/inbox_integration_test.go` — full migration to new + helpers (`buildInboxMemberEvent`, `publishInboxMemberEvent`), synthesized + `{account}_{roomID}` DocID assertions, and all-or-nothing restricted-room + coverage. + +### Files NOT modified (confirmed unchanged on this branch) + +- `pkg/subject/subject.go` — `InboxMemberAdded`, `InboxMemberRemoved`, + `InboxMemberAddedAggregate`, `InboxMemberRemovedAggregate`, + `InboxMemberEventSubjects` already present. +- `pkg/stream/stream.go` — `Inbox(siteID)` already returns two subject + patterns (`chat.inbox.{site}.*` + `chat.inbox.{site}.aggregate.>`). +- `search-sync-worker/main.go` — bootstrap flow + INBOX stream creation + already in place. No changes needed for payload migration. +- `search-sync-worker/handler.go` + `handler_test.go` — handler is payload- + agnostic; fan-out path already supported via `BuildAction` returning + `[]BulkAction`. + +--- + +## Task list + +### Phase 1: payload + collections (DONE — commit `023aadb`) + +- [x] Replace `MemberAddedPayload` with `InboxMemberEvent` in `pkg/model` +- [x] Update `parseMemberEvent` for new shape +- [x] Update `spotlightCollection.BuildAction` for `Accounts` fan-out +- [x] Update `userRoomCollection.BuildAction` for `Accounts` fan-out +- [x] Update unit tests + helpers for new shape +- [x] `make lint` clean + `make test` green +- [x] Commit + push + +### Phase 2: integration tests (DONE — commit `c7a303b`) + +Historical note — the task breakdown below is kept for traceability. All +items landed in `c7a303b`. + +- [ ] **Task 2.1: Update shared helpers in `inbox_integration_test.go`** + + Replace these functions/types: + + ```go + // OLD + type memberFixture struct { + SubID string + Account string + Restricted bool + HistorySharedSince *time.Time + } + + func buildMemberEventPayload(subID, account, roomID, roomName, siteID string, + joinedAt time.Time, historyShared *time.Time) model.MemberAddedPayload + + func buildBulkMemberEventPayload(roomID, roomName, siteID string, + joinedAt time.Time, members []memberFixture) model.MemberAddedPayload + + func publishMemberOutboxEvent(t *testing.T, ctx context.Context, + js jetstream.JetStream, subj, eventType string, + payload model.MemberAddedPayload, timestamp int64) + ``` + + With: + + ```go + // NEW + func buildInboxMemberEvent(roomID, roomName, siteID string, + joinedAt int64, accounts []string, historySharedSince int64, + timestamp int64) model.InboxMemberEvent { + return model.InboxMemberEvent{ + RoomID: roomID, + RoomName: roomName, + RoomType: model.RoomTypeGroup, + SiteID: siteID, + Accounts: accounts, + HistorySharedSince: historySharedSince, + JoinedAt: joinedAt, + Timestamp: timestamp, + } + } + + func publishInboxMemberEvent(t *testing.T, ctx context.Context, + js jetstream.JetStream, subj, eventType string, + payload model.InboxMemberEvent) { + t.Helper() + payloadData, err := json.Marshal(payload) + require.NoError(t, err) + evt := model.OutboxEvent{ + Type: eventType, + SiteID: payload.SiteID, + DestSiteID: payload.SiteID, + Payload: payloadData, + Timestamp: payload.Timestamp, + } + data, err := json.Marshal(evt) + require.NoError(t, err) + _, err = js.Publish(ctx, subj, data) + require.NoError(t, err, "publish to %s", subj) + } + ``` + + Drop the `memberFixture` type entirely — bulk tests now just pass + `[]string` for accounts. Drop per-fixture `Restricted` — restricted-room + testing becomes a separate scenario (homogeneous event with non-zero + HSS). + +- [ ] **Task 2.2: Update `TestSpotlightSyncIntegration`** + + Same 4 published events (local alice-eng, local alice-platform, federated + bob-eng, federated alice-platform remove). Changes: + - `countDocs` assertion unchanged (2) + - DocID lookups: `sub-alice-eng` → `alice_r-eng`, `sub-bob-eng` → + `bob_r-eng`, `sub-alice-platform` → `alice_r-platform` + - Drop `assert.Equal(t, "sub-alice-eng", doc["subscriptionId"])` and + `assert.Equal(t, "u-alice", doc["userId"])` — those fields are gone. + - Keep: `userAccount`, `roomId`, `roomName`, `roomType`, `siteId` + assertions. + +- [ ] **Task 2.3: Update `TestSpotlightSync_BulkInvite`** + + Bulk event of 3 users. Changes: + - `buildBulkMemberEventPayload` → `buildInboxMemberEvent` with + `accounts = []string{"dave", "erin", "frank"}` + - DocIDs: `sub-dave-platform` → `dave_r-platform`, etc. + - Drop `subscriptionId` assertion + - `countDocs` still 3 before remove, 0 after. + +- [ ] **Task 2.4: Update `TestUserRoomSyncIntegration`** + + Same 6 published events. Changes: + - Replace `buildMemberEventPayload` calls throughout. + - The restricted event: `historySharedSince = 1735689500000` (any + non-zero value); the test proves the WHOLE event is skipped (alice not + indexed for r-restricted, no timestamp entry). + - `roomTimestamps` assertions unchanged (r1, r2, r3 timestamps, no + r-restricted). + +- [ ] **Task 2.5: Replace `TestUserRoomSync_BulkInvite`** + + Old test had mixed restricted/unrestricted within one bulk — impossible + under event-level HSS. Replace with two scenarios: + + 1. **Bulk unrestricted**: 3 accounts, HSS=0 → 3 user-room docs with + correct `rooms` arrays. + 2. **Bulk restricted (all-or-nothing)**: 3 accounts, HSS=12345 → no user- + room docs (whole event skipped). + +- [ ] **Task 2.6: Update `TestUserRoomSync_LWWGuard`** + + Linear sequence unchanged (6 steps: initial add → stale add → stale + remove → newer remove → re-add → stale add after re-add). Only change: + the per-publish `buildMemberEventPayload` call → `buildInboxMemberEvent` + with `accounts = []string{"charlie"}`. + +- [ ] **Task 2.7: Run integration tests** + + Requires Docker for testcontainers-go. + + ```shell + make test-integration SERVICE=search-sync-worker + ``` + + Expected: all 4 tests pass. + +- [ ] **Task 2.8: Commit + push** + + ```shell + git add search-sync-worker/inbox_integration_test.go + git commit -m "test(search-sync-worker): migrate integration tests to InboxMemberEvent" + git push + ``` + +### Phase 3: PR (TODO) + +- [ ] **Task 3.1: Open new PR** + + Base: `main` (e871010), head: `claude/recover-search-sync-worker-dCjyV`. + Title: `feat(search-sync-worker): add spotlight + user-room sync via INBOX` + + PR description highlights: + - Architecture: INBOX-based, reuses existing OUTBOX→INBOX federation + - New `InboxMemberEvent` payload (account-list + event-level HSS) + - Spotlight typeahead + user-room access-control collections + - room-worker publisher migration is a separate, follow-up PR + - Links to this plan doc + +- [ ] **Task 3.2: Close PR #78** + + Comment: "Superseded by #{new} — architectural direction changed to + INBOX-based per earlier discussion (ROOMS-stream approach duplicates + existing OUTBOX/INBOX federation)." + +--- + +## Follow-up (separate PR, separate plan) + +**room-worker publisher migration** — add a second publish alongside the +existing `chat.room.{roomID}.event.member` publish: +- Same-site members → `chat.inbox.{site}.member_added/removed` +- Cross-site members → `outbox.{site}.to.{dest}.member_added/removed` + (existing behavior, payload shape changes to `InboxMemberEvent`) + +Coordinate with `inbox-worker` to own INBOX stream creation with +`Sources + SubjectTransforms` in production (currently +`inboxBootstrapStreamConfig` in search-sync-worker is behind a +`BOOTSTRAP_STREAMS` dev-only toggle). diff --git a/go.mod b/go.mod index 13daa0f5e..814bf417f 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/caarlos0/env/v11 v11.4.0 github.com/coreos/go-oidc/v3 v3.17.0 github.com/docker/docker v27.1.1+incompatible + github.com/elastic/go-elasticsearch/v8 v8.19.3 github.com/gin-gonic/gin v1.12.0 github.com/gocql/gocql v1.7.0 github.com/google/uuid v1.6.0 @@ -52,7 +53,6 @@ require ( github.com/docker/go-connections v0.5.0 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/elastic/elastic-transport-go/v8 v8.8.0 // indirect - github.com/elastic/go-elasticsearch/v8 v8.19.3 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/gabriel-vasile/mimetype v1.4.13 // indirect github.com/gin-contrib/sse v1.1.1 // indirect diff --git a/go.sum b/go.sum index c48f0615f..55dd2dccc 100644 --- a/go.sum +++ b/go.sum @@ -8,8 +8,6 @@ github.com/Marz32onE/instrumentation-go/otel-nats v0.2.0 h1:J+S/NmcUf+dSXQMzNkNV github.com/Marz32onE/instrumentation-go/otel-nats v0.2.0/go.mod h1:xgj7JbYX3qHLZ8X7A6Hvc1yeE+t4L+KAgeo9h0JWJ1o= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= -github.com/antithesishq/antithesis-sdk-go v0.4.3-default-no-op h1:+OSa/t11TFhqfrX0EOSqQBDJ0YlpmK0rDSiB19dg9M0= -github.com/antithesishq/antithesis-sdk-go v0.4.3-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E= github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op h1:kpBdlEPbRvff0mDD1gk7o9BhI16b9p5yYAXRlidpqJE= github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -110,8 +108,6 @@ github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= -github.com/google/go-tpm v0.9.3 h1:+yx0/anQuGzi+ssRqeD6WpXjW2L/V0dItUayO0i9sRc= -github.com/google/go-tpm v0.9.3/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY= github.com/google/go-tpm v0.9.8 h1:slArAR9Ft+1ybZu0lBwpSmpwhRXaa85hWtMinMyRAWo= github.com/google/go-tpm v0.9.8/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -146,8 +142,6 @@ github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0V github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/minio/highwayhash v1.0.3 h1:kbnuUMoHYyVl7szWjSxJnxw11k2U709jqFPPmIUyD6Q= -github.com/minio/highwayhash v1.0.3/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk= github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= @@ -173,8 +167,6 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/nats-io/jwt/v2 v2.8.1 h1:V0xpGuD/N8Mi+fQNDynXohVvp7ZztevW5io8CUWlPmU= github.com/nats-io/jwt/v2 v2.8.1/go.mod h1:nWnOEEiVMiKHQpnAy4eXlizVEtSfzacZ1Q43LIRavZg= -github.com/nats-io/nats-server/v2 v2.11.0 h1:fdwAT1d6DZW/4LUz5rkvQUe5leGEwjjOQYntzVRKvjE= -github.com/nats-io/nats-server/v2 v2.11.0/go.mod h1:leXySghbdtXSUmWem8K9McnJ6xbJOb0t9+NQ5HTRZjI= github.com/nats-io/nats-server/v2 v2.12.6 h1:Egbx9Vl7Ch8wTtpXPGqbehkZ+IncKqShUxvrt1+Enc8= github.com/nats-io/nats-server/v2 v2.12.6/go.mod h1:4HPlrvtmSO3yd7KcElDNMx9kv5EBJBnJJzQPptXlheo= github.com/nats-io/nats.go v1.50.0 h1:5zAeQrTvyrKrWLJ0fu02W3br8ym57qf7csDzgLOpcds= @@ -277,8 +269,6 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bT go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 h1:IeMeyr1aBvBiPVYihXIaeIZba6b8E1bYp7lbdxK8CQg= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0/go.mod h1:oVdCUtjq9MK9BlS7TtucsQwUcXcymNiEDjgDD2jMtZU= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak= go.opentelemetry.io/otel/exporters/prometheus v0.65.0 h1:jOveH/b4lU9HT7y+Gfamf18BqlOuz2PWEvs8yM7Q6XE= @@ -356,8 +346,6 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= -golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= -golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/pkg/model/event.go b/pkg/model/event.go index 2ea43e345..9d7fd2767 100644 --- a/pkg/model/event.go +++ b/pkg/model/event.go @@ -39,6 +39,27 @@ type UpdateRoleRequest struct { NewRole Role `json:"newRole" bson:"newRole"` } +// InboxMemberEvent is the payload of an OutboxEvent{Type: "member_added" | +// "member_removed"} carried on the INBOX stream for local consumers like +// search-sync-worker. One event represents a bulk add/remove of N Accounts +// against a single room; downstream consumers fan out per-account. +// +// HistorySharedSince is a single event-level flag shared by all accounts in +// the bulk: when non-zero, the room is history-restricted and consumers MUST +// skip indexing the entire event (the search service handles restricted rooms +// via DB+cache at query time). JoinedAt is only meaningful on add events and +// omitted on removes. +type InboxMemberEvent struct { + RoomID string `json:"roomId"` + RoomName string `json:"roomName"` + RoomType RoomType `json:"roomType"` + SiteID string `json:"siteId"` + Accounts []string `json:"accounts"` + HistorySharedSince int64 `json:"historySharedSince,omitempty"` + JoinedAt int64 `json:"joinedAt,omitempty"` + Timestamp int64 `json:"timestamp" bson:"timestamp"` +} + type InviteMemberRequest struct { InviterID string `json:"inviterId"` InviteeID string `json:"inviteeId"` @@ -55,12 +76,21 @@ type NotificationEvent struct { Timestamp int64 `json:"timestamp" bson:"timestamp"` } +// OutboxEventType is the type tag on an OutboxEvent used to route it to the +// correct handler on the destination site. +type OutboxEventType = string + +const ( + OutboxMemberAdded OutboxEventType = "member_added" + OutboxMemberRemoved OutboxEventType = "member_removed" +) + type OutboxEvent struct { - Type string `json:"type"` // "member_added", "room_sync" - SiteID string `json:"siteId"` - DestSiteID string `json:"destSiteId"` - Payload []byte `json:"payload"` // JSON-encoded inner event - Timestamp int64 `json:"timestamp" bson:"timestamp"` + Type OutboxEventType `json:"type"` + SiteID string `json:"siteId"` + DestSiteID string `json:"destSiteId"` + Payload []byte `json:"payload"` // JSON-encoded inner event + Timestamp int64 `json:"timestamp" bson:"timestamp"` } type MemberAddEvent struct { diff --git a/pkg/model/model_test.go b/pkg/model/model_test.go index 415859708..c88d52c9c 100644 --- a/pkg/model/model_test.go +++ b/pkg/model/model_test.go @@ -598,6 +598,62 @@ func TestOutboxEventJSON(t *testing.T) { } } +func TestInboxMemberEventJSON(t *testing.T) { + t.Run("add event, unrestricted room", func(t *testing.T) { + src := model.InboxMemberEvent{ + RoomID: "r1", + RoomName: "engineering", + RoomType: model.RoomTypeGroup, + SiteID: "site-a", + Accounts: []string{"alice", "bob"}, + JoinedAt: 1735689600000, + Timestamp: 1735689600000, + } + data, err := json.Marshal(&src) + require.NoError(t, err) + var dst model.InboxMemberEvent + require.NoError(t, json.Unmarshal(data, &dst)) + assert.Equal(t, src, dst) + }) + + t.Run("add event, restricted room carries HistorySharedSince", func(t *testing.T) { + src := model.InboxMemberEvent{ + RoomID: "r1", + RoomName: "engineering", + RoomType: model.RoomTypeGroup, + SiteID: "site-a", + Accounts: []string{"alice"}, + HistorySharedSince: 1735689500000, + JoinedAt: 1735689600000, + Timestamp: 1735689600000, + } + data, err := json.Marshal(&src) + require.NoError(t, err) + var dst model.InboxMemberEvent + require.NoError(t, json.Unmarshal(data, &dst)) + assert.Equal(t, src, dst) + }) + + t.Run("remove event omits HistorySharedSince and JoinedAt when zero", func(t *testing.T) { + src := model.InboxMemberEvent{ + RoomID: "r1", + RoomName: "engineering", + RoomType: model.RoomTypeGroup, + SiteID: "site-a", + Accounts: []string{"alice", "bob"}, + Timestamp: 1735689600000, + } + data, err := json.Marshal(&src) + require.NoError(t, err) + var raw map[string]any + require.NoError(t, json.Unmarshal(data, &raw)) + _, hasHSS := raw["historySharedSince"] + assert.False(t, hasHSS, "historySharedSince should be omitted when zero") + _, hasJoinedAt := raw["joinedAt"] + assert.False(t, hasJoinedAt, "joinedAt should be omitted when zero") + }) +} + func TestRoomMetadataUpdateEventJSON(t *testing.T) { src := model.RoomMetadataUpdateEvent{ RoomID: "r1", diff --git a/pkg/natsutil/ack.go b/pkg/natsutil/ack.go new file mode 100644 index 000000000..d074658a9 --- /dev/null +++ b/pkg/natsutil/ack.go @@ -0,0 +1,42 @@ +package natsutil + +import "log/slog" + +// Acker is the minimal JetStream message interface the Ack helper needs. +// Both `jetstream.Msg` (nats.go) and otel-wrapped variants +// (e.g. `oteljetstream.Msg`) satisfy it. +type Acker interface { + Ack() error +} + +// Naker is the minimal JetStream message interface the Nak helper needs. +// Same compatibility story as Acker. +type Naker interface { + Nak() error +} + +// Ack acks `msg` and logs any failure under a consistent structured-log +// shape (`reason` + `error`). `reason` is a short label describing WHY +// the message is being acked — e.g. "handler succeeded", "filtered", +// "malformed payload" — so operators can query logs by cause without +// parsing free-text phrases. +// +// Use this from every JetStream consumer in the repo rather than hand-rolling +// an `if err := msg.Ack(); err != nil { slog.Error(...) }` block. Consolidating +// the pattern gives us one place to add tracing spans, metrics counters, or +// delivery-context fields later, and keeps log keys consistent across services. +func Ack(msg Acker, reason string) { + if err := msg.Ack(); err != nil { + slog.Error("ack failed", "reason", reason, "error", err) + } +} + +// Nak naks `msg` for redelivery and logs any failure under the same +// structured-log shape as Ack. `reason` describes WHY the message is being +// redelivered — e.g. "handler error", "bulk index failure", "transient +// downstream error". +func Nak(msg Naker, reason string) { + if err := msg.Nak(); err != nil { + slog.Error("nak failed", "reason", reason, "error", err) + } +} diff --git a/pkg/natsutil/ack_test.go b/pkg/natsutil/ack_test.go new file mode 100644 index 000000000..85d560ae9 --- /dev/null +++ b/pkg/natsutil/ack_test.go @@ -0,0 +1,64 @@ +package natsutil_test + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/hmchangw/chat/pkg/natsutil" +) + +// stubMsg is a tiny test double for the Acker/Naker interfaces. Records +// whether Ack/Nak was called and returns a configurable error. +type stubMsg struct { + ackCalled bool + nakCalled bool + ackErr error + nakErr error +} + +func (s *stubMsg) Ack() error { + s.ackCalled = true + return s.ackErr +} + +func (s *stubMsg) Nak() error { + s.nakCalled = true + return s.nakErr +} + +func TestAck_Success(t *testing.T) { + msg := &stubMsg{} + natsutil.Ack(msg, "handler succeeded") + assert.True(t, msg.ackCalled, "Ack() should be invoked on the message") +} + +func TestAck_ErrorIsLoggedNotReturned(t *testing.T) { + // Ack is fire-and-forget by design — any error from msg.Ack() is logged + // and swallowed so callers don't have to branch on it. The helper's + // contract is "try to ack; if it fails, log it and move on." + msg := &stubMsg{ackErr: errors.New("connection closed")} + natsutil.Ack(msg, "filtered") + assert.True(t, msg.ackCalled) +} + +func TestNak_Success(t *testing.T) { + msg := &stubMsg{} + natsutil.Nak(msg, "handler error") + assert.True(t, msg.nakCalled, "Nak() should be invoked on the message") +} + +func TestNak_ErrorIsLoggedNotReturned(t *testing.T) { + msg := &stubMsg{nakErr: errors.New("consumer deleted")} + natsutil.Nak(msg, "bulk failure") + assert.True(t, msg.nakCalled) +} + +// Compile-time checks that the stub implements the interfaces the helpers +// require — this is what lets production code pass `jetstream.Msg` and +// `oteljetstream.Msg` to the helpers without a wrapper. +var ( + _ natsutil.Acker = (*stubMsg)(nil) + _ natsutil.Naker = (*stubMsg)(nil) +) diff --git a/pkg/searchengine/adapter.go b/pkg/searchengine/adapter.go index 992a83822..8bb9592fe 100644 --- a/pkg/searchengine/adapter.go +++ b/pkg/searchengine/adapter.go @@ -72,6 +72,20 @@ func (a *httpAdapter) Bulk(ctx context.Context, actions []BulkAction) ([]BulkRes line, _ := json.Marshal(map[string]bulkActionMeta{"delete": meta}) buf.Write(line) buf.WriteByte('\n') + case ActionUpdate: + // ES 8.x / OpenSearch 2.x bulk _update DOES accept version + + // version_type=external. We intentionally omit them because + // user-room scripted updates already enforce ordering via the + // painless LWW guard (`params.ts > stored` against the stored + // roomTimestamps), so layering external versioning on top would + // be redundant and complicate 409-handling semantics. Omit Version + // even when the caller sets it. + updateMeta := bulkActionMeta{Index: action.Index, ID: action.DocID} + line, _ := json.Marshal(map[string]bulkActionMeta{"update": updateMeta}) + buf.Write(line) + buf.WriteByte('\n') + buf.Write(action.Doc) + buf.WriteByte('\n') } } @@ -101,8 +115,9 @@ func (a *httpAdapter) Bulk(ctx context.Context, actions []BulkAction) ([]BulkRes for i, item := range bulkResp.Items { for _, detail := range item { results[i] = BulkResult{ - Status: detail.Status, - Error: detail.Error.Reason, + Status: detail.Status, + ErrorType: detail.Error.Type, + Error: detail.Error.Reason, } } } diff --git a/pkg/searchengine/adapter_test.go b/pkg/searchengine/adapter_test.go index 676d45a78..87ac9bbf4 100644 --- a/pkg/searchengine/adapter_test.go +++ b/pkg/searchengine/adapter_test.go @@ -95,6 +95,37 @@ func TestAdapter_Bulk(t *testing.T) { assert.Equal(t, "m2", del["_id"]) }) + t.Run("update action uses update meta without version", func(t *testing.T) { + var capturedBody string + ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { + body, _ := io.ReadAll(req.Body) + capturedBody = string(body) + return jsonResponse(200, `{"items":[{"update":{"status":200}}]}`), nil + }} + a := newAdapter(ft) + updateBody := json.RawMessage(`{"script":{"source":"ctx._source.rooms.add(params.rid)","params":{"rid":"r1"}},"upsert":{"userAccount":"alice","rooms":["r1"]}}`) + results, err := a.Bulk(context.Background(), []BulkAction{ + {Action: ActionUpdate, Index: "user-room-site1", DocID: "alice", Doc: updateBody}, + }) + require.NoError(t, err) + require.Len(t, results, 1) + assert.Equal(t, 200, results[0].Status) + + lines := strings.Split(strings.TrimSpace(capturedBody), "\n") + require.Len(t, lines, 2) + + var updateMeta map[string]any + require.NoError(t, json.Unmarshal([]byte(lines[0]), &updateMeta)) + upd := updateMeta["update"].(map[string]any) + assert.Equal(t, "user-room-site1", upd["_index"]) + assert.Equal(t, "alice", upd["_id"]) + // external versioning must NOT be set on update actions + assert.NotContains(t, upd, "version") + assert.NotContains(t, upd, "version_type") + + assert.JSONEq(t, string(updateBody), lines[1]) + }) + t.Run("version conflict treated as result not error", func(t *testing.T) { ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { return jsonResponse(200, `{ @@ -108,9 +139,36 @@ func TestAdapter_Bulk(t *testing.T) { require.NoError(t, err) require.Len(t, results, 1) assert.Equal(t, 409, results[0].Status) + assert.Equal(t, "version_conflict_engine_exception", results[0].ErrorType) assert.Equal(t, "stale", results[0].Error) }) + t.Run("bulk error types propagate (document_missing, index_not_found)", func(t *testing.T) { + // Exercises both benign and fatal 404 shapes so the handler's + // isBulkItemSuccess has a real ErrorType to key on. + ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { + return jsonResponse(200, `{ + "items": [ + {"update": {"status": 404, "error": {"type": "document_missing_exception", "reason": "[alice]: document missing"}}}, + {"update": {"status": 404, "error": {"type": "index_not_found_exception", "reason": "no such index [user-room-site-a]"}}} + ] + }`), nil + }} + a := newAdapter(ft) + results, err := a.Bulk(context.Background(), []BulkAction{ + {Action: ActionUpdate, Index: "user-room-site-a", DocID: "alice", Doc: json.RawMessage(`{}`)}, + {Action: ActionUpdate, Index: "user-room-site-a", DocID: "bob", Doc: json.RawMessage(`{}`)}, + }) + require.NoError(t, err) + require.Len(t, results, 2) + + assert.Equal(t, 404, results[0].Status) + assert.Equal(t, "document_missing_exception", results[0].ErrorType) + + assert.Equal(t, 404, results[1].Status) + assert.Equal(t, "index_not_found_exception", results[1].ErrorType) + }) + t.Run("HTTP error returns error", func(t *testing.T) { ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { return jsonResponse(503, `service unavailable`), nil diff --git a/pkg/searchengine/searchengine.go b/pkg/searchengine/searchengine.go index dad971428..1c8553d0e 100644 --- a/pkg/searchengine/searchengine.go +++ b/pkg/searchengine/searchengine.go @@ -18,21 +18,38 @@ type ActionType string const ( ActionIndex ActionType = "index" ActionDelete ActionType = "delete" + ActionUpdate ActionType = "update" ) // BulkAction represents a single action in a bulk request. +// +// For ActionUpdate, Doc contains the full ES update body (doc / script / +// upsert) and Version is ignored. The _update operation is read-modify-write +// on the ES side and does not accept `version`/`version_type=external`; that +// parameter pair is only valid for `index` (full-document replacement). type BulkAction struct { Action ActionType Index string DocID string - Version int64 // used as ES external version - Doc json.RawMessage // nil for delete actions + Version int64 // used as ES external version (ignored for ActionUpdate) + Doc json.RawMessage // index: full doc; update: update body; delete: nil } // BulkResult represents the result of a single bulk action item. +// +// ErrorType is the ES error type string (e.g., `document_missing_exception`, +// `index_not_found_exception`, `version_conflict_engine_exception`) when the +// item failed with an error block. Empty on 2xx success and on delete-404 +// responses (delete of a missing doc sets `result:"not_found"` without an +// error block). +// +// Callers that need to classify 4xx outcomes (e.g., deciding whether a 404 +// is a benign "doc already absent" or a fatal "index missing") should match +// on ErrorType rather than parsing the human-readable Error string. type BulkResult struct { - Status int - Error string + Status int + ErrorType string + Error string } // SearchEngine defines domain operations for search indexing. diff --git a/pkg/stream/stream.go b/pkg/stream/stream.go index 76256ccae..cddd7ed09 100644 --- a/pkg/stream/stream.go +++ b/pkg/stream/stream.go @@ -40,9 +40,40 @@ func Outbox(siteID string) Config { } } -// Inbox uses JetStream Sources from other sites' OUTBOX streams (no local subjects). +// Inbox returns the canonical config for the `INBOX_{siteID}` stream that +// carries subscription lifecycle events (member_added, member_removed) +// plus any other aggregated events federated in from other sites. +// +// The stream declares TWO non-overlapping subject patterns so the local vs. +// federated split is explicit in the stream schema itself: +// +// - `chat.inbox.{siteID}.*` +// Local direct publishes from same-site services (e.g., room-worker +// publishing `chat.inbox.{siteID}.member_added`). Single-token suffix, +// so it matches local event names only. +// +// - `chat.inbox.{siteID}.aggregate.>` +// Federated events sourced from remote OUTBOX streams. These land here +// via a JetStream SubjectTransform that rewrites +// `outbox.{remote}.to.{siteID}.>` → `chat.inbox.{siteID}.aggregate.>` +// on the way into this stream. Multi-token-safe so the transform can +// preserve any event shape. +// +// Together the two patterns carry exactly the four member events consumers +// care about today — local member_added / member_removed and federated +// aggregate.member_added / aggregate.member_removed — without an overly +// broad catch-all that would silently accept typos. +// +// Cross-site Sources + SubjectTransforms themselves are a deployment-time +// concern layered on by the service that owns stream creation, so this +// baseline leaves Sources empty. Consumers only need Name + Subjects to +// bind. func Inbox(siteID string) Config { return Config{ Name: fmt.Sprintf("INBOX_%s", siteID), + Subjects: []string{ + fmt.Sprintf("chat.inbox.%s.*", siteID), + fmt.Sprintf("chat.inbox.%s.aggregate.>", siteID), + }, } } diff --git a/pkg/stream/stream_test.go b/pkg/stream/stream_test.go index 232389911..b52d71e1d 100644 --- a/pkg/stream/stream_test.go +++ b/pkg/stream/stream_test.go @@ -3,9 +3,15 @@ package stream_test import ( "testing" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/stream" ) +// TestStreamConfigs covers single-subject streams where one pattern is the +// full story. Multi-subject streams (currently just Inbox) get their own +// dedicated test so both patterns are asserted explicitly. func TestStreamConfigs(t *testing.T) { siteID := "site-a" @@ -19,18 +25,23 @@ func TestStreamConfigs(t *testing.T) { {"MessagesCanonical", stream.MessagesCanonical(siteID), "MESSAGES_CANONICAL_site-a", "chat.msg.canonical.site-a.>"}, {"Rooms", stream.Rooms(siteID), "ROOMS_site-a", "chat.room.canonical.site-a.>"}, {"Outbox", stream.Outbox(siteID), "OUTBOX_site-a", "outbox.site-a.>"}, - {"Inbox", stream.Inbox(siteID), "INBOX_site-a", ""}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if tt.cfg.Name != tt.wantName { - t.Errorf("Name = %q, want %q", tt.cfg.Name, tt.wantName) - } - if tt.wantSubj != "" { - if len(tt.cfg.Subjects) == 0 || tt.cfg.Subjects[0] != tt.wantSubj { - t.Errorf("Subjects = %v, want [%q]", tt.cfg.Subjects, tt.wantSubj) - } - } + assert.Equal(t, tt.wantName, tt.cfg.Name) + require.Len(t, tt.cfg.Subjects, 1) + assert.Equal(t, tt.wantSubj, tt.cfg.Subjects[0]) }) } } + +func TestInboxConfig(t *testing.T) { + cfg := stream.Inbox("site-a") + + assert.Equal(t, "INBOX_site-a", cfg.Name) + // Two non-overlapping patterns: local (`*`) and federated (`aggregate.>`). + assert.Equal(t, []string{ + "chat.inbox.site-a.*", + "chat.inbox.site-a.aggregate.>", + }, cfg.Subjects) +} diff --git a/pkg/subject/subject.go b/pkg/subject/subject.go index 8ac820a65..3413158c5 100644 --- a/pkg/subject/subject.go +++ b/pkg/subject/subject.go @@ -93,6 +93,44 @@ func Outbox(siteID, destSiteID, eventType string) string { return fmt.Sprintf("outbox.%s.to.%s.%s", siteID, destSiteID, eventType) } +// InboxMemberAdded is the local-publish subject for a same-site member_added +// event. It lands in the local INBOX stream without the `aggregate` segment. +func InboxMemberAdded(siteID string) string { + return fmt.Sprintf("chat.inbox.%s.member_added", siteID) +} + +// InboxMemberRemoved is the local-publish subject for a same-site +// member_removed event. It lands in the local INBOX stream without the +// `aggregate` segment. +func InboxMemberRemoved(siteID string) string { + return fmt.Sprintf("chat.inbox.%s.member_removed", siteID) +} + +// InboxMemberAddedAggregate is the transformed subject for a federated +// member_added event after INBOX SubjectTransform rewrites +// `outbox.{src}.to.{siteID}.member_added` to this form. +func InboxMemberAddedAggregate(siteID string) string { + return fmt.Sprintf("chat.inbox.%s.aggregate.member_added", siteID) +} + +// InboxMemberRemovedAggregate is the transformed subject for a federated +// member_removed event. +func InboxMemberRemovedAggregate(siteID string) string { + return fmt.Sprintf("chat.inbox.%s.aggregate.member_removed", siteID) +} + +// InboxMemberEventSubjects returns the subject filters a consumer should use +// to receive both local and federated member_added/member_removed events for +// the given site. Use with jetstream.ConsumerConfig.FilterSubjects (NATS 2.10+). +func InboxMemberEventSubjects(siteID string) []string { + return []string{ + InboxMemberAdded(siteID), + InboxMemberRemoved(siteID), + InboxMemberAddedAggregate(siteID), + InboxMemberRemovedAggregate(siteID), + } +} + func MsgCanonicalCreated(siteID string) string { return fmt.Sprintf("chat.msg.canonical.%s.created", siteID) } diff --git a/pkg/subject/subject_test.go b/pkg/subject/subject_test.go index cdc8168fd..3d07ef02a 100644 --- a/pkg/subject/subject_test.go +++ b/pkg/subject/subject_test.go @@ -38,6 +38,14 @@ func TestSubjectBuilders(t *testing.T) { "chat.user.alice.notification"}, {"Outbox", subject.Outbox("site-a", "site-b", "member_added"), "outbox.site-a.to.site-b.member_added"}, + {"InboxMemberAdded", subject.InboxMemberAdded("site-a"), + "chat.inbox.site-a.member_added"}, + {"InboxMemberRemoved", subject.InboxMemberRemoved("site-a"), + "chat.inbox.site-a.member_removed"}, + {"InboxMemberAddedAggregate", subject.InboxMemberAddedAggregate("site-a"), + "chat.inbox.site-a.aggregate.member_added"}, + {"InboxMemberRemovedAggregate", subject.InboxMemberRemovedAggregate("site-a"), + "chat.inbox.site-a.aggregate.member_removed"}, {"MsgCanonicalCreated", subject.MsgCanonicalCreated("site-a"), "chat.msg.canonical.site-a.created"}, {"MsgCanonicalUpdated", subject.MsgCanonicalUpdated("site-a"), @@ -68,6 +76,24 @@ func TestSubjectBuilders(t *testing.T) { } }) } + + t.Run("InboxMemberEventSubjects", func(t *testing.T) { + got := subject.InboxMemberEventSubjects("site-a") + want := []string{ + "chat.inbox.site-a.member_added", + "chat.inbox.site-a.member_removed", + "chat.inbox.site-a.aggregate.member_added", + "chat.inbox.site-a.aggregate.member_removed", + } + if len(got) != len(want) { + t.Fatalf("got %d subjects, want %d", len(got), len(want)) + } + for i := range want { + if got[i] != want[i] { + t.Errorf("[%d] = %q, want %q", i, got[i], want[i]) + } + } + }) } func TestParseUserRoomSubject(t *testing.T) { diff --git a/search-sync-worker/collection.go b/search-sync-worker/collection.go index 70589e563..f46ef29ff 100644 --- a/search-sync-worker/collection.go +++ b/search-sync-worker/collection.go @@ -3,22 +3,31 @@ package main import ( "encoding/json" + "github.com/nats-io/nats.go/jetstream" + "github.com/hmchangw/chat/pkg/searchengine" - "github.com/hmchangw/chat/pkg/stream" ) // Collection defines a search-indexable data source. Each collection // encapsulates its own stream config, ES template, and document mapping. // To add a new collection (e.g., room search), implement this interface. type Collection interface { - // StreamConfig returns the JetStream stream to consume from. - StreamConfig(siteID string) stream.Config + // StreamConfig returns the JetStream stream to create/update and consume + // from. Each collection supplies a native jetstream.StreamConfig so it + // can configure Subjects, Sources, SubjectTransforms, etc. as needed. + StreamConfig(siteID string) jetstream.StreamConfig // ConsumerName returns the durable consumer name for this collection. ConsumerName() string - // TemplateName returns the ES index template name. + // FilterSubjects returns the list of subjects this collection's consumer + // should subscribe to. An empty slice means "all subjects in the stream". + FilterSubjects(siteID string) []string + // TemplateName returns the ES index template name. Empty string means the + // collection has no index template to upsert (e.g., single-index targets). TemplateName() string - // TemplateBody returns the ES index template JSON. + // TemplateBody returns the ES index template JSON. nil means no template. TemplateBody() json.RawMessage - // BuildAction converts raw JetStream message data into a BulkAction. - BuildAction(data []byte) (searchengine.BulkAction, error) + // BuildAction converts raw JetStream message data into one or more + // BulkActions. An empty slice means the event should be acked without + // any ES write (e.g., filtered out). + BuildAction(data []byte) ([]searchengine.BulkAction, error) } diff --git a/search-sync-worker/handler.go b/search-sync-worker/handler.go index 0010fc094..6d1f07467 100644 --- a/search-sync-worker/handler.go +++ b/search-sync-worker/handler.go @@ -7,113 +7,242 @@ import ( "github.com/nats-io/nats.go/jetstream" + "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/searchengine" ) -type bufferedMsg struct { - action searchengine.BulkAction - jsMsg jetstream.Msg +// pendingMsg tracks a JetStream message and the range of bulk actions it +// produced. A single JetStream message may fan out into zero, one, or multiple +// actions. The message is acked once ALL of its actions succeed; if any action +// fails the whole message is nakked for redelivery. +type pendingMsg struct { + jsMsg jetstream.Msg + actionStart int // starting index into Handler.actions + actionCount int // number of actions contributed by this message } -// Handler buffers JetStream messages and flushes them as ES bulk requests. +// Handler buffers JetStream messages and the ES bulk actions they produce, +// then flushes the actions as a single ES bulk request. +// +// Two counts are tracked separately because they can diverge for fan-out +// collections (one JetStream message producing N ES actions): +// +// - MessageCount() reports buffered source messages. Used for per-source +// ack/nak accounting at flush time. +// - ActionCount() reports buffered ES bulk actions. This is what bounds +// the size of the next ES bulk request and should drive the flush +// decision in the consumer loop. +// +// For 1:1 collections (messages, and the single-subscription path of +// spotlight/user-room) MessageCount() == ActionCount(). For fan-out +// collections (bulk-invite spotlight/user-room) ActionCount() >= +// MessageCount(). type Handler struct { store Store collection Collection - batchSize int + bulkSize int // soft cap on buffered actions; callers drive flush via ActionCount() mu sync.Mutex - buffer []bufferedMsg + pending []pendingMsg + actions []searchengine.BulkAction } -// NewHandler creates a Handler with the given store, collection, and batch size. -func NewHandler(store Store, collection Collection, batchSize int) *Handler { +// NewHandler creates a Handler with the given store, collection, and bulk +// batch size. `bulkSize` is the soft cap on buffered actions before a flush +// is triggered — the consumer loop compares it against `ActionCount()` to +// decide when to call `Flush`. +func NewHandler(store Store, collection Collection, bulkSize int) *Handler { return &Handler{ store: store, collection: collection, - batchSize: batchSize, - buffer: make([]bufferedMsg, 0, batchSize), + bulkSize: bulkSize, + pending: make([]pendingMsg, 0, bulkSize), + actions: make([]searchengine.BulkAction, 0, bulkSize), } } -// Add parses a JetStream message via the collection and adds it to the buffer. +// Add parses a JetStream message via the collection and adds its actions to +// the buffer. If the collection produces zero actions (e.g., a filtered +// event), the message is immediately acked without touching the buffer. func (h *Handler) Add(msg jetstream.Msg) { - action, err := h.collection.BuildAction(msg.Data()) + actions, err := h.collection.BuildAction(msg.Data()) if err != nil { slog.Error("build action", "error", err) - if ackErr := msg.Ack(); ackErr != nil { - slog.Error("ack malformed message", "error", ackErr) - } + natsutil.Ack(msg, "build action failed") + return + } + + if len(actions) == 0 { + natsutil.Ack(msg, "filtered, no actions") return } h.mu.Lock() - h.buffer = append(h.buffer, bufferedMsg{action: action, jsMsg: msg}) + h.pending = append(h.pending, pendingMsg{ + jsMsg: msg, + actionStart: len(h.actions), + actionCount: len(actions), + }) + h.actions = append(h.actions, actions...) h.mu.Unlock() } -// Flush sends all buffered actions to ES and acks/naks per item. +// Flush sends all buffered actions to ES and acks/naks per source message. func (h *Handler) Flush(ctx context.Context) { h.mu.Lock() - if len(h.buffer) == 0 { + if len(h.pending) == 0 { h.mu.Unlock() return } - items := h.buffer - h.buffer = make([]bufferedMsg, 0, h.batchSize) + pending := h.pending + actions := h.actions + h.pending = make([]pendingMsg, 0, h.bulkSize) + h.actions = make([]searchengine.BulkAction, 0, h.bulkSize) h.mu.Unlock() - actions := make([]searchengine.BulkAction, len(items)) - for i, item := range items { - actions[i] = item.action - } - results, err := h.store.Bulk(ctx, actions) if err != nil { - slog.Error("bulk request failed", "error", err, "count", len(items)) - for _, item := range items { - if nakErr := item.jsMsg.Nak(); nakErr != nil { - slog.Error("nak failed", "error", nakErr) - } - } + slog.Error("bulk request failed", "error", err, "actions", len(actions)) + nakAll(pending, "bulk request failed") return } - if len(results) != len(items) { + if len(results) != len(actions) { // Defensive guard for a protocol-level anomaly: ES bulk API normally // returns one result per input action in input order. Nak-all is safe - // because of external versioning — on redelivery, any items already - // indexed return 409 (handled as ack below), and failed items get - // re-indexed. No duplicate processing, no lost events. - slog.Error("bulk result count mismatch", "expected", len(items), "actual", len(results)) - for _, item := range items { - if nakErr := item.jsMsg.Nak(); nakErr != nil { - slog.Error("nak failed", "error", nakErr) - } - } + // because every action type we emit is idempotent on redelivery: + // - ActionIndex / ActionDelete: external versioning makes a stale + // redelivery return 409 (handled as ack below); a successful + // redelivery is identical to the original write. + // - ActionUpdate: the painless scripts in user_room.go check a + // per-room timestamp guard (params.ts > stored) and short-circuit + // via ctx.op = 'none' on a redelivery, so a redelivered update + // is at worst a no-op. + // No duplicate processing, no lost events. + slog.Error("bulk result count mismatch", "expected", len(actions), "actual", len(results)) + nakAll(pending, "bulk result count mismatch") return } - for i, result := range results { - if result.Status == 409 || (result.Status >= 200 && result.Status < 300) { - if ackErr := items[i].jsMsg.Ack(); ackErr != nil { - slog.Error("ack failed", "error", ackErr) + for _, p := range pending { + allOK := true + for i := p.actionStart; i < p.actionStart+p.actionCount; i++ { + if isBulkItemSuccess(actions[i].Action, results[i]) { + continue } + allOK = false + slog.Error("bulk item failed", + "status", results[i].Status, + "error", results[i].Error, + "docID", actions[i].DocID, + "index", actions[i].Index, + ) + break + } + if allOK { + natsutil.Ack(p.jsMsg, "bulk actions succeeded") } else { - slog.Error("bulk item failed", "status", result.Status, "error", result.Error, "docID", items[i].action.DocID) - if nakErr := items[i].jsMsg.Nak(); nakErr != nil { - slog.Error("nak failed", "error", nakErr) - } + natsutil.Nak(p.jsMsg, "bulk action failed") + } + } +} + +// ES error type strings we match against on 404 responses to distinguish +// benign idempotent outcomes from fatal config errors. These come straight +// from the Elasticsearch `_bulk` response `error.type` field. +const ( + esErrDocumentMissing = "document_missing_exception" // update on missing doc + esErrIndexNotFound = "index_not_found_exception" // index doesn't exist — config error +) + +// isBulkItemSuccess maps an ES bulk item result to a logical success/failure +// per action type. +// +// - 2xx is always success. +// - 409 is success ONLY for externally-versioned writes (ActionIndex, +// ActionDelete): it means external versioning rejected a stale write and +// the desired state is already reached or newer. ActionUpdate does NOT +// use external versioning (the adapter omits version/version_type on +// _update); idempotency there comes from the painless LWW guard via +// `params.ts > stored`. A 409 on an update means an internal +// version_conflict_engine_exception from concurrent writers — the script +// did NOT execute, so the update was dropped. We NAK so JetStream +// redelivers and retries. +// - 404 is success ONLY for specific idempotent outcomes: +// - ActionDelete with no error type set: delete of a missing doc +// sets `result:"not_found"` and no error block — desired state +// already reached. +// - ActionUpdate with ErrorType == "document_missing_exception": the +// user-room remove path emits a scriptless update which 404s if the +// user doc doesn't exist yet — desired state already reached. +// 404 with any other error type (notably `index_not_found_exception`, +// which means the target ES index is missing / misconfigured) is +// treated as a real failure so we don't silently drop messages when the +// backing index/template is wrong. +// - ActionIndex 404 is always a failure because indexing is supposed to +// create the doc; a 404 there only happens when the index itself is +// missing. +func isBulkItemSuccess(action searchengine.ActionType, result searchengine.BulkResult) bool { + if result.Status >= 200 && result.Status < 300 { + return true + } + if result.Status == 409 { + switch action { + case searchengine.ActionIndex, searchengine.ActionDelete: + return true + case searchengine.ActionUpdate: + return false } } + if result.Status == 404 { + switch action { + case searchengine.ActionDelete: + // Delete on a missing doc returns status=404 with result=not_found + // and NO error block (ErrorType is empty). Any other error type + // at 404 (e.g., index_not_found_exception) is a real failure. + return result.ErrorType == "" + case searchengine.ActionUpdate: + // Update on a missing doc reports `document_missing_exception`. + // Update on a missing INDEX reports `index_not_found_exception` + // — we want that to fail loudly, not get silently acked. + return result.ErrorType == esErrDocumentMissing + case searchengine.ActionIndex: + // Index is supposed to CREATE the doc; a 404 here only happens + // when the index itself is missing (config error). Always fail. + return false + } + } + return false +} + +// nakAll naks every buffered source message for redelivery. Used on the +// two defensive paths in Flush where the whole batch can't be processed +// (bulk request failed, or the ES response item count didn't match the +// request). The shared `reason` is logged against every message so an +// operator grepping by cause sees all of them together. +func nakAll(pending []pendingMsg, reason string) { + for _, p := range pending { + natsutil.Nak(p.jsMsg, reason) + } } -// BufferLen returns the current buffer size. -func (h *Handler) BufferLen() int { +// MessageCount returns the number of buffered source JetStream messages. +// This is used for diagnostics and for the per-source ack/nak accounting at +// flush time; it is NOT the quantity that should drive the flush decision +// for fan-out collections — use ActionCount() for that. +func (h *Handler) MessageCount() int { h.mu.Lock() defer h.mu.Unlock() - return len(h.buffer) + return len(h.pending) } -// BufferFull returns true if the buffer has reached batch size. -func (h *Handler) BufferFull() bool { - return h.BufferLen() >= h.batchSize +// ActionCount returns the number of buffered ES bulk actions. For 1:1 +// collections this equals MessageCount(); for fan-out collections (bulk +// invites producing N actions per event) it is ≥ MessageCount(). The +// consumer loop compares this against the configured bulk batch size to +// decide when to flush so ES bulk requests stay bounded regardless of +// fan-out. +func (h *Handler) ActionCount() int { + h.mu.Lock() + defer h.mu.Unlock() + return len(h.actions) } diff --git a/search-sync-worker/handler_test.go b/search-sync-worker/handler_test.go index 93367a50d..2ca98b613 100644 --- a/search-sync-worker/handler_test.go +++ b/search-sync-worker/handler_test.go @@ -60,7 +60,7 @@ func TestHandler_Add(t *testing.T) { msg := makeStubMsg(t, &evt) h.Add(msg) - assert.Equal(t, 1, h.BufferLen()) + assert.Equal(t, 1, h.MessageCount()) } func TestHandler_Add_MalformedJSON(t *testing.T) { @@ -70,7 +70,7 @@ func TestHandler_Add_MalformedJSON(t *testing.T) { msg := &stubMsg{data: []byte("{invalid")} h.Add(msg) - assert.Equal(t, 0, h.BufferLen()) + assert.Equal(t, 0, h.MessageCount()) assert.True(t, msg.acked) } @@ -99,7 +99,7 @@ func TestHandler_Flush(t *testing.T) { assert.True(t, msg.acked) assert.False(t, msg.nacked) - assert.Equal(t, 0, h.BufferLen()) + assert.Equal(t, 0, h.MessageCount()) }) t.Run("version conflict (409) — acked not nacked", func(t *testing.T) { @@ -153,7 +153,7 @@ func TestHandler_Flush(t *testing.T) { assert.True(t, msg1.nacked) assert.True(t, msg2.nacked) - assert.Equal(t, 0, h.BufferLen()) + assert.Equal(t, 0, h.MessageCount()) }) t.Run("empty flush is no-op", func(t *testing.T) { @@ -161,7 +161,7 @@ func TestHandler_Flush(t *testing.T) { store := NewMockStore(ctrl) h := NewHandler(store, newMessageCollection("msgs-v1"), 500) h.Flush(context.Background()) - assert.Equal(t, 0, h.BufferLen()) + assert.Equal(t, 0, h.MessageCount()) }) t.Run("mixed results — per-item ack/nak", func(t *testing.T) { @@ -190,3 +190,313 @@ func TestHandler_Flush(t *testing.T) { assert.True(t, msgs[2].nacked, "500 should be nacked") }) } + +func TestIsBulkItemSuccess(t *testing.T) { + tests := []struct { + name string + action searchengine.ActionType + status int + errorType string + want bool + }{ + // 2xx is always success. + {"index 200", searchengine.ActionIndex, 200, "", true}, + {"index 201", searchengine.ActionIndex, 201, "", true}, + {"delete 200", searchengine.ActionDelete, 200, "", true}, + {"update 200", searchengine.ActionUpdate, 200, "", true}, + + // 409 is success only for externally-versioned writes (index, delete) — + // external versioning rejected a stale write and the desired state is + // already reached. Update 409 is a version_conflict_engine_exception + // from concurrent writers; the painless script did NOT execute, so we + // NAK to let JetStream redeliver and retry the scripted update. + {"index 409", searchengine.ActionIndex, 409, "version_conflict_engine_exception", true}, + {"delete 409", searchengine.ActionDelete, 409, "version_conflict_engine_exception", true}, + {"update 409", searchengine.ActionUpdate, 409, "version_conflict_engine_exception", false}, + + // Delete 404 benign path: doc already absent, no error block. + {"delete 404 not_found (empty errorType)", searchengine.ActionDelete, 404, "", true}, + // Delete 404 fatal path: the target INDEX is missing — config error. + {"delete 404 index_not_found", searchengine.ActionDelete, 404, "index_not_found_exception", false}, + + // Update 404 benign path: doc missing (user-room remove on empty doc). + {"update 404 document_missing", searchengine.ActionUpdate, 404, "document_missing_exception", true}, + // Update 404 fatal path: the target INDEX is missing — config error. + {"update 404 index_not_found", searchengine.ActionUpdate, 404, "index_not_found_exception", false}, + // Update 404 with an unfamiliar error type → fail closed. + {"update 404 unknown error type", searchengine.ActionUpdate, 404, "some_other_exception", false}, + + // Index 404 is always a failure (indexing should create the doc). + {"index 404 index_not_found", searchengine.ActionIndex, 404, "index_not_found_exception", false}, + {"index 404 empty error", searchengine.ActionIndex, 404, "", false}, + + // Server errors are failures on every action. + {"index 500", searchengine.ActionIndex, 500, "", false}, + {"delete 500", searchengine.ActionDelete, 500, "", false}, + {"update 500", searchengine.ActionUpdate, 500, "", false}, + {"index 503", searchengine.ActionIndex, 503, "", false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isBulkItemSuccess(tt.action, searchengine.BulkResult{ + Status: tt.status, + ErrorType: tt.errorType, + }) + assert.Equal(t, tt.want, got) + }) + } +} + +// stubCollection is a minimal Collection that emits a single action of the +// configured type. Only BuildAction is exercised by the handler tests; the +// rest of the interface methods return zero values. +type stubCollection struct { + action searchengine.ActionType +} + +func (c stubCollection) StreamConfig(string) jetstream.StreamConfig { return jetstream.StreamConfig{} } +func (c stubCollection) ConsumerName() string { return "stub" } +func (c stubCollection) FilterSubjects(string) []string { return nil } +func (c stubCollection) TemplateName() string { return "" } +func (c stubCollection) TemplateBody() json.RawMessage { return nil } +func (c stubCollection) BuildAction([]byte) ([]searchengine.BulkAction, error) { + return []searchengine.BulkAction{{Action: c.action, Index: "stub", DocID: "id-1"}}, nil +} + +type ( + stubDeleteCollection struct{ stubCollection } + stubUpdateCollection struct{ stubCollection } + stubIndexCollection struct{ stubCollection } +) + +func newStubDeleteCollection() stubDeleteCollection { + return stubDeleteCollection{stubCollection{action: searchengine.ActionDelete}} +} +func newStubUpdateCollection() stubUpdateCollection { + return stubUpdateCollection{stubCollection{action: searchengine.ActionUpdate}} +} +func newStubIndexCollection() stubIndexCollection { + return stubIndexCollection{stubCollection{action: searchengine.ActionIndex}} +} + +// fanOutCollection emits N ActionIndex actions per source message, +// simulating a bulk-invite style fan-out collection. +type fanOutCollection struct { + actionsPerMessage int +} + +func (c fanOutCollection) StreamConfig(string) jetstream.StreamConfig { + return jetstream.StreamConfig{} +} +func (c fanOutCollection) ConsumerName() string { return "fanout" } +func (c fanOutCollection) FilterSubjects(string) []string { return nil } +func (c fanOutCollection) TemplateName() string { return "" } +func (c fanOutCollection) TemplateBody() json.RawMessage { return nil } +func (c fanOutCollection) BuildAction([]byte) ([]searchengine.BulkAction, error) { + actions := make([]searchengine.BulkAction, 0, c.actionsPerMessage) + for i := 0; i < c.actionsPerMessage; i++ { + actions = append(actions, searchengine.BulkAction{ + Action: searchengine.ActionIndex, + Index: "fanout", + DocID: fmt.Sprintf("id-%d", i), + Doc: json.RawMessage(`{}`), + }) + } + return actions, nil +} + +func TestHandler_Flush_404OnDeleteAndUpdate(t *testing.T) { + t.Run("404 on delete with no error block (doc missing) is acked", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + // ES delete-on-missing-doc sets status=404 + result=not_found with + // NO error block — ErrorType stays empty in our adapter mapping. + store.EXPECT(). + Bulk(gomock.Any(), gomock.Len(1)). + Return([]searchengine.BulkResult{{Status: 404, ErrorType: "", Error: ""}}, nil) + + coll := newStubDeleteCollection() + h := NewHandler(store, coll, 500) + msg := &stubMsg{data: []byte(`{}`)} + h.Add(msg) + h.Flush(context.Background()) + + assert.True(t, msg.acked, "404 on delete with no error block should be acked (already deleted)") + assert.False(t, msg.nacked) + }) + + t.Run("404 on delete with index_not_found is NACKED", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + store.EXPECT(). + Bulk(gomock.Any(), gomock.Len(1)). + Return([]searchengine.BulkResult{{ + Status: 404, + ErrorType: "index_not_found_exception", + Error: "no such index [spotlight-site-a-v1-chat]", + }}, nil) + + coll := newStubDeleteCollection() + h := NewHandler(store, coll, 500) + msg := &stubMsg{data: []byte(`{}`)} + h.Add(msg) + h.Flush(context.Background()) + + assert.False(t, msg.acked, "404 on delete with index_not_found is a fatal config error") + assert.True(t, msg.nacked) + }) + + t.Run("404 on update with document_missing_exception is acked", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + store.EXPECT(). + Bulk(gomock.Any(), gomock.Len(1)). + Return([]searchengine.BulkResult{{ + Status: 404, + ErrorType: "document_missing_exception", + Error: "[charlie]: document missing", + }}, nil) + + coll := newStubUpdateCollection() + h := NewHandler(store, coll, 500) + msg := &stubMsg{data: []byte(`{}`)} + h.Add(msg) + h.Flush(context.Background()) + + assert.True(t, msg.acked, "404 on update with document_missing_exception should be acked") + assert.False(t, msg.nacked) + }) + + t.Run("404 on update with index_not_found is NACKED", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + store.EXPECT(). + Bulk(gomock.Any(), gomock.Len(1)). + Return([]searchengine.BulkResult{{ + Status: 404, + ErrorType: "index_not_found_exception", + Error: "no such index [user-room-site-a]", + }}, nil) + + coll := newStubUpdateCollection() + h := NewHandler(store, coll, 500) + msg := &stubMsg{data: []byte(`{}`)} + h.Add(msg) + h.Flush(context.Background()) + + assert.False(t, msg.acked, "404 on update with index_not_found is a fatal config error") + assert.True(t, msg.nacked) + }) + + t.Run("404 on index is nacked", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + store.EXPECT(). + Bulk(gomock.Any(), gomock.Len(1)). + Return([]searchengine.BulkResult{{ + Status: 404, + ErrorType: "index_not_found_exception", + Error: "no such index", + }}, nil) + + coll := newStubIndexCollection() + h := NewHandler(store, coll, 500) + msg := &stubMsg{data: []byte(`{}`)} + h.Add(msg) + h.Flush(context.Background()) + + assert.False(t, msg.acked, "404 on index should be nacked") + assert.True(t, msg.nacked) + }) +} + +// TestHandler_FanOut exercises the per-message action-range bookkeeping with +// a fan-out collection (one message → N actions). The handler must: +// - track message count and action count independently +// - ack the source message only when ALL of its actions succeeded +// - nak the source message if ANY action failed +func TestHandler_FanOut(t *testing.T) { + coll := fanOutCollection{actionsPerMessage: 3} + + t.Run("message count and action count diverge", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + // One message produces 3 actions. + h := NewHandler(store, coll, 500) + msg := &stubMsg{data: []byte(`{}`)} + h.Add(msg) + + assert.Equal(t, 1, h.MessageCount(), "one source message buffered") + assert.Equal(t, 3, h.ActionCount(), "three actions produced by fan-out") + }) + + t.Run("all fan-out actions succeed → source message acked", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + store.EXPECT(). + Bulk(gomock.Any(), gomock.Len(3)). + Return([]searchengine.BulkResult{ + {Status: 201}, + {Status: 201}, + {Status: 201}, + }, nil) + + h := NewHandler(store, coll, 500) + msg := &stubMsg{data: []byte(`{}`)} + h.Add(msg) + h.Flush(context.Background()) + + assert.True(t, msg.acked, "all 3 fan-out actions succeeded → source message acked") + assert.False(t, msg.nacked) + assert.Equal(t, 0, h.MessageCount()) + assert.Equal(t, 0, h.ActionCount()) + }) + + t.Run("any fan-out action fails → source message nakked", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + store.EXPECT(). + Bulk(gomock.Any(), gomock.Len(3)). + Return([]searchengine.BulkResult{ + {Status: 201}, // success + {Status: 500}, // failure — second action in the fan-out + {Status: 201}, // success (wouldn't matter, one failure is enough) + }, nil) + + h := NewHandler(store, coll, 500) + msg := &stubMsg{data: []byte(`{}`)} + h.Add(msg) + h.Flush(context.Background()) + + assert.False(t, msg.acked) + assert.True(t, msg.nacked, "one failed fan-out action should nak the whole source message") + }) + + t.Run("multiple messages, one fan-out fails → only that source nakked", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + // Two messages × 3 actions/message = 6 bulk actions. + // Message 0 actions (index 0-2) all succeed. + // Message 1 actions (index 3-5) — middle one fails. + store.EXPECT(). + Bulk(gomock.Any(), gomock.Len(6)). + Return([]searchengine.BulkResult{ + {Status: 201}, {Status: 201}, {Status: 201}, // msg 0 + {Status: 201}, {Status: 500}, {Status: 201}, // msg 1 + }, nil) + + h := NewHandler(store, coll, 500) + msg0 := &stubMsg{data: []byte(`{}`)} + msg1 := &stubMsg{data: []byte(`{}`)} + h.Add(msg0) + h.Add(msg1) + require.Equal(t, 2, h.MessageCount()) + require.Equal(t, 6, h.ActionCount()) + h.Flush(context.Background()) + + assert.True(t, msg0.acked, "msg 0's 3 actions all succeeded → acked") + assert.False(t, msg0.nacked) + assert.False(t, msg1.acked) + assert.True(t, msg1.nacked, "msg 1 had one failing action → nakked") + }) +} diff --git a/search-sync-worker/inbox_integration_test.go b/search-sync-worker/inbox_integration_test.go new file mode 100644 index 000000000..663d3056f --- /dev/null +++ b/search-sync-worker/inbox_integration_test.go @@ -0,0 +1,612 @@ +//go:build integration + +package main + +import ( + "context" + "encoding/json" + "testing" + "time" + + "github.com/nats-io/nats.go/jetstream" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/searchengine" + "github.com/hmchangw/chat/pkg/stream" + "github.com/hmchangw/chat/pkg/subject" +) + +// --- Shared helpers for inbox-based collection integration tests --- + +// createInboxStream creates the INBOX_{siteID} stream using pkg/stream.Inbox +// as the canonical baseline (name + local/aggregate subject patterns), with +// no cross-site Sources. Cross-site Sources are a production deployment +// concern owned by inbox-worker; tests simulate federated events by +// publishing directly to the aggregate subject instead. +func createInboxStream(t *testing.T, ctx context.Context, js jetstream.JetStream, siteID string) { + t.Helper() + cfg := stream.Inbox(siteID) + _, err := js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{ + Name: cfg.Name, + Subjects: cfg.Subjects, + }) + require.NoError(t, err, "create INBOX stream for %s", siteID) +} + +// buildInboxMemberEvent constructs an InboxMemberEvent payload for tests. +// `historySharedSince` of 0 means unrestricted; any non-zero value marks the +// entire bulk as restricted and search-sync should skip the whole event. +// `joinedAt` is only meaningful on add events; pass 0 for removes. +func buildInboxMemberEvent( + roomID, roomName, siteID string, + accounts []string, + historySharedSince int64, + joinedAt int64, + timestamp int64, +) model.InboxMemberEvent { + return model.InboxMemberEvent{ + RoomID: roomID, + RoomName: roomName, + RoomType: model.RoomTypeGroup, + SiteID: siteID, + Accounts: accounts, + HistorySharedSince: historySharedSince, + JoinedAt: joinedAt, + Timestamp: timestamp, + } +} + +// publishInboxMemberEvent wraps an InboxMemberEvent inside an OutboxEvent +// with the given Type and publishes it to `subj`. Caller picks `subj` +// (local vs. aggregate, added vs. removed) via pkg/subject builders. +func publishInboxMemberEvent( + t *testing.T, + ctx context.Context, + js jetstream.JetStream, + subj, eventType string, + payload model.InboxMemberEvent, +) { + t.Helper() + payloadData, err := json.Marshal(payload) + require.NoError(t, err) + + evt := model.OutboxEvent{ + Type: eventType, + SiteID: payload.SiteID, + DestSiteID: payload.SiteID, + Payload: payloadData, + Timestamp: payload.Timestamp, + } + data, err := json.Marshal(evt) + require.NoError(t, err) + + _, err = js.Publish(ctx, subj, data) + require.NoError(t, err, "publish to %s", subj) +} + +// drainConsumer fetches exactly `expected` JetStream messages from the +// consumer and feeds them through the handler (Add + Flush). Fails if fewer +// messages are delivered within the fetch timeout. +func drainConsumer( + t *testing.T, + ctx context.Context, + cons jetstream.Consumer, + handler *Handler, + expected int, +) { + t.Helper() + if expected == 0 { + handler.Flush(ctx) + return + } + + received := 0 + for attempts := 0; attempts < 5 && received < expected; attempts++ { + batch, err := cons.Fetch(expected-received, jetstream.FetchMaxWait(5*time.Second)) + require.NoError(t, err) + for msg := range batch.Messages() { + handler.Add(msg) + received++ + } + // Surface any mid-batch error (consumer deleted, leader change, + // transient connection). Without this, batch.Messages() just closes + // silently and only the outer Equal would fail — losing the root cause. + require.NoError(t, batch.Error(), "batch error after draining (attempt %d, received %d of %d)", attempts, received, expected) + } + require.Equal(t, expected, received, "drained %d of %d expected messages", received, expected) + handler.Flush(ctx) +} + +// toStringSlice converts a JSON-decoded array (`[]any`) to `[]string`. +// Fails the test if any element is not a string. +func toStringSlice(t *testing.T, v any) []string { + t.Helper() + if v == nil { + return nil + } + slice, ok := v.([]any) + require.True(t, ok, "expected []any, got %T", v) + out := make([]string, 0, len(slice)) + for _, item := range slice { + s, ok := item.(string) + require.True(t, ok, "expected string element, got %T", item) + out = append(out, s) + } + return out +} + +// --- Spotlight integration test --- + +func TestSpotlightSync_Integration(t *testing.T) { + esURL := setupElasticsearch(t) + js, _ := setupNATSJetStream(t) + ctx := context.Background() + + siteID := "site-spot" + indexName := "spotlight-site-spot-v1-chat" + + // --- ES template + index --- + engine, err := searchengine.New(ctx, "elasticsearch", esURL) + require.NoError(t, err) + waitForClusterGreen(t, esURL, 120*time.Second) + + coll := newSpotlightCollection(indexName) + require.NoError(t, engine.UpsertTemplate(ctx, coll.TemplateName(), overrideIndexSettings(spotlightTemplateBody(indexName)))) + preCreateIndex(t, esURL, indexName) + waitForClusterGreen(t, esURL, 120*time.Second) + + // --- NATS INBOX stream + consumer --- + createInboxStream(t, ctx, js, siteID) + cons, err := js.CreateOrUpdateConsumer(ctx, stream.Inbox(siteID).Name, jetstream.ConsumerConfig{ + Durable: "spotlight-sync-inttest", + AckPolicy: jetstream.AckExplicitPolicy, + FilterSubjects: coll.FilterSubjects(siteID), + }) + require.NoError(t, err, "create spotlight consumer") + + handler := NewHandler(&engineAdapter{engine: engine}, coll, 100) + + const joinedAt int64 = 1744286400000 // 2026-04-10 12:00 UTC + + // --- Publish events covering local + federated + remove --- + + // Local member_added: alice joins engineering + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r-eng", "engineering", siteID, []string{"alice"}, 0, joinedAt, 1000), + ) + + // Local member_added: alice joins platform + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r-platform", "platform", siteID, []string{"alice"}, 0, joinedAt, 1100), + ) + + // Federated (aggregate) member_added: bob joins engineering via a cross-site event + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAddedAggregate(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r-eng", "engineering", siteID, []string{"bob"}, 0, joinedAt, 1200), + ) + + // Federated (aggregate) member_removed: alice leaves platform + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberRemovedAggregate(siteID), model.OutboxMemberRemoved, + buildInboxMemberEvent("r-platform", "platform", siteID, []string{"alice"}, 0, 0, 1300), + ) + + drainConsumer(t, ctx, cons, handler, 4) + refreshIndex(t, esURL, indexName) + + // --- Verify --- + + t.Run("two subscriptions remain after one removal", func(t *testing.T) { + // Added 3 (alice-eng, alice-platform, bob-eng), removed 1 (alice-platform) + assert.Equal(t, 2, countDocs(t, esURL, indexName)) + }) + + t.Run("alice engineering doc shape", func(t *testing.T) { + doc := getDoc(t, esURL, indexName, "alice_r-eng") + require.NotNil(t, doc, "alice_r-eng should be indexed") + assert.Equal(t, "alice", doc["userAccount"]) + assert.Equal(t, "r-eng", doc["roomId"]) + assert.Equal(t, "engineering", doc["roomName"]) + assert.Equal(t, "group", doc["roomType"]) + assert.Equal(t, siteID, doc["siteId"]) + }) + + t.Run("federated bob doc was indexed", func(t *testing.T) { + doc := getDoc(t, esURL, indexName, "bob_r-eng") + require.NotNil(t, doc, "bob's federated subscription should be indexed via aggregate filter") + assert.Equal(t, "bob", doc["userAccount"]) + assert.Equal(t, "r-eng", doc["roomId"]) + }) + + t.Run("removed alice-platform doc is gone", func(t *testing.T) { + doc := getDoc(t, esURL, indexName, "alice_r-platform") + assert.Nil(t, doc, "removed subscription should not exist in the index") + }) +} + +// TestSpotlightSync_BulkInvite verifies the fan-out path end-to-end: a single +// JetStream message carrying N accounts must produce N spotlight docs in one +// ES bulk request. +func TestSpotlightSync_BulkInvite(t *testing.T) { + esURL := setupElasticsearch(t) + js, _ := setupNATSJetStream(t) + ctx := context.Background() + + siteID := "site-spot-bulk" + indexName := "spotlight-site-spot-bulk-v1-chat" + + engine, err := searchengine.New(ctx, "elasticsearch", esURL) + require.NoError(t, err) + waitForClusterGreen(t, esURL, 120*time.Second) + + coll := newSpotlightCollection(indexName) + require.NoError(t, engine.UpsertTemplate(ctx, coll.TemplateName(), overrideIndexSettings(spotlightTemplateBody(indexName)))) + preCreateIndex(t, esURL, indexName) + waitForClusterGreen(t, esURL, 120*time.Second) + + createInboxStream(t, ctx, js, siteID) + cons, err := js.CreateOrUpdateConsumer(ctx, stream.Inbox(siteID).Name, jetstream.ConsumerConfig{ + Durable: "spotlight-sync-bulk-inttest", + AckPolicy: jetstream.AckExplicitPolicy, + FilterSubjects: coll.FilterSubjects(siteID), + }) + require.NoError(t, err, "create spotlight consumer") + + handler := NewHandler(&engineAdapter{engine: engine}, coll, 100) + + const joinedAt int64 = 1744286400000 + + // One bulk-invite event adds 3 users to r-platform at once. + payload := buildInboxMemberEvent("r-platform", "platform", siteID, + []string{"dave", "erin", "frank"}, 0, joinedAt, 5000) + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, payload) + + // Only ONE JetStream message is drained, but it produces THREE ES index + // actions — handler.ActionCount() > MessageCount() is the whole point + // of the fan-out path. + drainConsumer(t, ctx, cons, handler, 1) + refreshIndex(t, esURL, indexName) + + t.Run("all three accounts landed in the index", func(t *testing.T) { + assert.Equal(t, 3, countDocs(t, esURL, indexName), + "1 message × 3 accounts = 3 spotlight docs") + }) + + t.Run("each account has the correct doc shape", func(t *testing.T) { + for _, account := range []string{"dave", "erin", "frank"} { + docID := account + "_r-platform" + doc := getDoc(t, esURL, indexName, docID) + require.NotNil(t, doc, "%s should be indexed", docID) + assert.Equal(t, account, doc["userAccount"]) + assert.Equal(t, "r-platform", doc["roomId"]) + assert.Equal(t, "platform", doc["roomName"]) + } + }) + + t.Run("bulk remove evicts all three docs", func(t *testing.T) { + // Same 3 accounts, now removed in one event. + remove := buildInboxMemberEvent("r-platform", "platform", siteID, + []string{"dave", "erin", "frank"}, 0, 0, 6000) + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberRemoved(siteID), model.OutboxMemberRemoved, remove) + drainConsumer(t, ctx, cons, handler, 1) + refreshIndex(t, esURL, indexName) + + assert.Equal(t, 0, countDocs(t, esURL, indexName), + "1 message × 3 account deletes = 0 docs remaining") + }) +} + +// --- User-room integration test --- + +func TestUserRoomSync_Integration(t *testing.T) { + esURL := setupElasticsearch(t) + js, _ := setupNATSJetStream(t) + ctx := context.Background() + + siteID := "site-ur" + indexName := "user-room-site-ur" + + engine, err := searchengine.New(ctx, "elasticsearch", esURL) + require.NoError(t, err) + waitForClusterGreen(t, esURL, 120*time.Second) + + coll := newUserRoomCollection(indexName) + require.NoError(t, engine.UpsertTemplate(ctx, coll.TemplateName(), overrideIndexSettings(userRoomTemplateBody(indexName)))) + preCreateIndex(t, esURL, indexName) + waitForClusterGreen(t, esURL, 120*time.Second) + + createInboxStream(t, ctx, js, siteID) + cons, err := js.CreateOrUpdateConsumer(ctx, stream.Inbox(siteID).Name, jetstream.ConsumerConfig{ + Durable: "user-room-sync-inttest", + AckPolicy: jetstream.AckExplicitPolicy, + FilterSubjects: coll.FilterSubjects(siteID), + }) + require.NoError(t, err, "create user-room consumer") + + handler := NewHandler(&engineAdapter{engine: engine}, coll, 100) + + const joinedAt int64 = 1744286400000 + + // --- Publish --- + + // alice joins 3 rooms via local events + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r1", "general", siteID, []string{"alice"}, 0, joinedAt, 1000)) + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r2", "random", siteID, []string{"alice"}, 0, joinedAt, 1100)) + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r3", "eng", siteID, []string{"alice"}, 0, joinedAt, 1200)) + + // bob joins r1 via a federated (aggregate) event + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAddedAggregate(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r1", "general", siteID, []string{"bob"}, 0, joinedAt, 1300)) + + // alice leaves r2 via a local event + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberRemoved(siteID), model.OutboxMemberRemoved, + buildInboxMemberEvent("r2", "random", siteID, []string{"alice"}, 0, 0, 1400)) + + // alice joins a restricted room → whole event should be SKIPPED by user- + // room-sync (event-level HistorySharedSince != 0). The message still + // arrives at the consumer; the handler acks it without buffering. + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r-restricted", "archives", siteID, []string{"alice"}, 1743984000000, joinedAt, 1500)) + + drainConsumer(t, ctx, cons, handler, 6) + refreshIndex(t, esURL, indexName) + + // --- Verify --- + + t.Run("alice rooms reflect adds and one remove", func(t *testing.T) { + doc := getDoc(t, esURL, indexName, "alice") + require.NotNil(t, doc, "alice user-room doc should exist") + rooms := toStringSlice(t, doc["rooms"]) + assert.ElementsMatch(t, []string{"r1", "r3"}, rooms, + "alice should be in r1, r3 (r2 removed, r-restricted skipped)") + }) + + t.Run("bob created via federated event", func(t *testing.T) { + doc := getDoc(t, esURL, indexName, "bob") + require.NotNil(t, doc, "bob should be upserted from aggregate.member_added") + rooms := toStringSlice(t, doc["rooms"]) + assert.ElementsMatch(t, []string{"r1"}, rooms) + }) + + t.Run("roomTimestamps retained after remove", func(t *testing.T) { + doc := getDoc(t, esURL, indexName, "alice") + require.NotNil(t, doc) + rts, ok := doc["roomTimestamps"].(map[string]any) + require.True(t, ok, "roomTimestamps should be a flattened map") + + // r1, r2, r3 all get their last-seen timestamps stored. r2's + // entry is KEPT after the remove (preserves LWW monotonicity so a + // late-arriving stale add can't re-insert r2). + assert.Equal(t, float64(1000), rts["r1"]) + assert.Equal(t, float64(1400), rts["r2"], + "r2 timestamp should be bumped to the remove's event timestamp, not deleted") + assert.Equal(t, float64(1200), rts["r3"]) + + // Restricted room was never applied → no timestamp entry. + assert.NotContains(t, rts, "r-restricted") + }) + + t.Run("restricted room not present in rooms array", func(t *testing.T) { + doc := getDoc(t, esURL, indexName, "alice") + require.NotNil(t, doc) + rooms := toStringSlice(t, doc["rooms"]) + assert.NotContains(t, rooms, "r-restricted", + "restricted rooms are skipped by user-room-sync") + }) + + t.Run("createdAt and updatedAt stamped from upsert", func(t *testing.T) { + doc := getDoc(t, esURL, indexName, "alice") + require.NotNil(t, doc) + assert.NotEmpty(t, doc["createdAt"], "upsert should seed createdAt") + assert.NotEmpty(t, doc["updatedAt"], "add path should stamp updatedAt") + }) +} + +// TestUserRoomSync_BulkInvite verifies the fan-out path for user-room: a +// single JetStream message with N accounts produces N distinct user-room +// updates (different DocIDs since each account targets a different user). +// Also covers the all-restricted event case where the whole bulk is skipped. +func TestUserRoomSync_BulkInvite(t *testing.T) { + esURL := setupElasticsearch(t) + js, _ := setupNATSJetStream(t) + ctx := context.Background() + + siteID := "site-ur-bulk" + indexName := "user-room-site-ur-bulk" + + engine, err := searchengine.New(ctx, "elasticsearch", esURL) + require.NoError(t, err) + waitForClusterGreen(t, esURL, 120*time.Second) + + coll := newUserRoomCollection(indexName) + require.NoError(t, engine.UpsertTemplate(ctx, coll.TemplateName(), overrideIndexSettings(userRoomTemplateBody(indexName)))) + preCreateIndex(t, esURL, indexName) + waitForClusterGreen(t, esURL, 120*time.Second) + + createInboxStream(t, ctx, js, siteID) + cons, err := js.CreateOrUpdateConsumer(ctx, stream.Inbox(siteID).Name, jetstream.ConsumerConfig{ + Durable: "user-room-sync-bulk-inttest", + AckPolicy: jetstream.AckExplicitPolicy, + FilterSubjects: coll.FilterSubjects(siteID), + }) + require.NoError(t, err, "create user-room consumer") + + handler := NewHandler(&engineAdapter{engine: engine}, coll, 100) + + const joinedAt int64 = 1744286400000 + + // Unrestricted bulk: 3 users to r-platform in one event → 3 user-room docs. + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r-platform", "platform", siteID, + []string{"dave", "erin", "frank"}, 0, joinedAt, 5000)) + + // Restricted bulk: 3 users to r-archives with non-zero HistorySharedSince + // → whole event skipped, no docs written. The message still arrives at + // the consumer (it matches the FilterSubjects) so drainConsumer below + // counts it. + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, + buildInboxMemberEvent("r-archives", "archives", siteID, + []string{"heidi", "ivan", "judy"}, 1743984000000, joinedAt, 5100)) + + drainConsumer(t, ctx, cons, handler, 2) + refreshIndex(t, esURL, indexName) + + t.Run("unrestricted bulk upserts all three users", func(t *testing.T) { + for _, account := range []string{"dave", "erin", "frank"} { + doc := getDoc(t, esURL, indexName, account) + require.NotNil(t, doc, "%s should be upserted", account) + rooms := toStringSlice(t, doc["rooms"]) + assert.ElementsMatch(t, []string{"r-platform"}, rooms) + } + }) + + t.Run("restricted bulk writes no docs", func(t *testing.T) { + for _, account := range []string{"heidi", "ivan", "judy"} { + assert.Nil(t, getDoc(t, esURL, indexName, account), + "restricted %s must not be upserted", account) + } + }) + + t.Run("bulk remove evicts rooms from unrestricted users", func(t *testing.T) { + publishInboxMemberEvent(t, ctx, js, + subject.InboxMemberRemoved(siteID), model.OutboxMemberRemoved, + buildInboxMemberEvent("r-platform", "platform", siteID, + []string{"dave", "erin", "frank"}, 0, 0, 6000)) + drainConsumer(t, ctx, cons, handler, 1) + refreshIndex(t, esURL, indexName) + + for _, account := range []string{"dave", "erin", "frank"} { + doc := getDoc(t, esURL, indexName, account) + require.NotNil(t, doc, "%s user doc should still exist (ghost)", account) + assert.Empty(t, toStringSlice(t, doc["rooms"]), + "%s rooms should be empty after bulk remove", account) + } + }) +} + +// --- User-room LWW guard integration test --- + +// TestUserRoomSync_LWWGuard drives a single user doc through a sequence of +// in-order and out-of-order events to prove the per-room timestamp guard +// converges on highest-event-timestamp-wins state regardless of physical +// arrival order. +// +// Implemented as one linear test body (not split into t.Run subtests) +// because the scenario is inherently stateful — each step depends on the +// prior ES state. +func TestUserRoomSync_LWWGuard(t *testing.T) { + esURL := setupElasticsearch(t) + js, _ := setupNATSJetStream(t) + ctx := context.Background() + + siteID := "site-lww" + indexName := "user-room-site-lww" + + engine, err := searchengine.New(ctx, "elasticsearch", esURL) + require.NoError(t, err) + waitForClusterGreen(t, esURL, 120*time.Second) + + coll := newUserRoomCollection(indexName) + require.NoError(t, engine.UpsertTemplate(ctx, coll.TemplateName(), overrideIndexSettings(userRoomTemplateBody(indexName)))) + preCreateIndex(t, esURL, indexName) + waitForClusterGreen(t, esURL, 120*time.Second) + + createInboxStream(t, ctx, js, siteID) + cons, err := js.CreateOrUpdateConsumer(ctx, stream.Inbox(siteID).Name, jetstream.ConsumerConfig{ + Durable: "user-room-sync-lww-inttest", + AckPolicy: jetstream.AckExplicitPolicy, + FilterSubjects: coll.FilterSubjects(siteID), + }) + require.NoError(t, err) + + handler := NewHandler(&engineAdapter{engine: engine}, coll, 100) + + const joinedAt int64 = 1744286400000 + + publish := func(subj, eventType, roomID string, ts int64) { + var j int64 + if eventType == model.OutboxMemberAdded { + j = joinedAt + } + publishInboxMemberEvent(t, ctx, js, subj, eventType, + buildInboxMemberEvent(roomID, "room "+roomID, siteID, []string{"charlie"}, 0, j, ts)) + } + + getCharlieState := func() ([]string, map[string]any) { + refreshIndex(t, esURL, indexName) + doc := getDoc(t, esURL, indexName, "charlie") + require.NotNil(t, doc, "charlie user-room doc should exist") + rooms := toStringSlice(t, doc["rooms"]) + rts, _ := doc["roomTimestamps"].(map[string]any) + return rooms, rts + } + + // Step 1: initial add at ts=2000 creates the doc via upsert. + publish(subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, "rA", 2000) + drainConsumer(t, ctx, cons, handler, 1) + rooms, rts := getCharlieState() + assert.Contains(t, rooms, "rA", "step 1: initial add should put rA in rooms") + assert.Equal(t, float64(2000), rts["rA"], "step 1: stored ts should be 2000") + + // Step 2: stale add at ts=1000 should be a no-op via ctx.op='none'. + publish(subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, "rA", 1000) + drainConsumer(t, ctx, cons, handler, 1) + rooms, rts = getCharlieState() + assert.Contains(t, rooms, "rA", "step 2: rA should still be in rooms after stale add") + assert.Equal(t, float64(2000), rts["rA"], + "step 2: stale add must not overwrite a newer stored timestamp") + + // Step 3: stale remove at ts=1500 should also be a no-op. + publish(subject.InboxMemberRemoved(siteID), model.OutboxMemberRemoved, "rA", 1500) + drainConsumer(t, ctx, cons, handler, 1) + rooms, rts = getCharlieState() + assert.Contains(t, rooms, "rA", "step 3: rA should survive stale remove") + assert.Equal(t, float64(2000), rts["rA"], "step 3: stored timestamp unchanged after stale remove") + + // Step 4: newer remove at ts=3000 evicts the room. + publish(subject.InboxMemberRemoved(siteID), model.OutboxMemberRemoved, "rA", 3000) + drainConsumer(t, ctx, cons, handler, 1) + rooms, rts = getCharlieState() + assert.NotContains(t, rooms, "rA", "step 4: newer remove should evict rA") + assert.Equal(t, float64(3000), rts["rA"], + "step 4: remove must bump stored timestamp to the remove's ts") + + // Step 5: re-add with newer ts=4000 puts the room back. + publish(subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, "rA", 4000) + drainConsumer(t, ctx, cons, handler, 1) + rooms, rts = getCharlieState() + assert.Contains(t, rooms, "rA", "step 5: re-add with newer ts should restore rA") + assert.Equal(t, float64(4000), rts["rA"], "step 5: stored ts should bump to 4000") + + // Step 6: stale add at ts=2500 after the re-add is still a no-op. + publish(subject.InboxMemberAdded(siteID), model.OutboxMemberAdded, "rA", 2500) + drainConsumer(t, ctx, cons, handler, 1) + rooms, rts = getCharlieState() + assert.Contains(t, rooms, "rA", "step 6: rA should still be present") + assert.Equal(t, float64(4000), rts["rA"], "step 6: stored ts unchanged after stale add") + rACount := 0 + for _, r := range rooms { + if r == "rA" { + rACount++ + } + } + assert.Equal(t, 1, rACount, "step 6: rA should not be duplicated by stale add") +} diff --git a/search-sync-worker/inbox_stream.go b/search-sync-worker/inbox_stream.go new file mode 100644 index 000000000..528a0e3cf --- /dev/null +++ b/search-sync-worker/inbox_stream.go @@ -0,0 +1,100 @@ +package main + +import ( + "encoding/json" + "fmt" + + "github.com/nats-io/nats.go/jetstream" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/stream" + "github.com/hmchangw/chat/pkg/subject" +) + +// inboxBootstrapStreamConfig returns the INBOX stream config with cross-site +// Sources + SubjectTransforms layered on top of the canonical baseline from +// pkg/stream.Inbox. Only used from the bootstrap path in main.go when +// BOOTSTRAP_STREAMS=true. In production, inbox-worker owns INBOX creation +// and search-sync-worker never calls this. +// +// Federation mechanics: for each remote site we add a StreamSource pointing +// at `OUTBOX_{remote}` with a SubjectTransform whose `Source` field acts +// as both the filter and the rewrite source — `outbox.{remote}.to.{siteID}.>` +// is matched against the upstream OUTBOX and rewritten to +// `chat.inbox.{siteID}.aggregate.>` on ingest. (NATS JetStream forbids +// setting `FilterSubject` and `SubjectTransforms` together on the same +// source — they are mutually exclusive options; the transform's `Source` +// covers the filter responsibility.) This lets consumers tell local events +// apart from federated ones by the presence of the `aggregate` segment. +// +// This helper stays local to search-sync-worker because it's bootstrap-only +// — inbox-worker will own an equivalent construction (as a proper feature, +// not a test toggle) when it migrates in its own PR. +// +// Requires NATS Server 2.10+ for SubjectTransforms support. +func inboxBootstrapStreamConfig(siteID string, remoteSiteIDs []string) jetstream.StreamConfig { + baseline := stream.Inbox(siteID) + destPattern := fmt.Sprintf("chat.inbox.%s.aggregate.>", siteID) + sources := make([]*jetstream.StreamSource, 0, len(remoteSiteIDs)) + for _, remote := range remoteSiteIDs { + sourcePattern := fmt.Sprintf("outbox.%s.to.%s.>", remote, siteID) + sources = append(sources, &jetstream.StreamSource{ + Name: fmt.Sprintf("OUTBOX_%s", remote), + SubjectTransforms: []jetstream.SubjectTransformConfig{ + {Source: sourcePattern, Destination: destPattern}, + }, + }) + } + return jetstream.StreamConfig{ + Name: baseline.Name, + Subjects: baseline.Subjects, + Sources: sources, + } +} + +// inboxMemberCollection is the shared base for collections that index +// subscription lifecycle events (member_added, member_removed) off the +// INBOX stream. It centralizes stream config and subject filters so +// spotlight and user-room collections only need to implement the +// index-specific parts. +// +// The stream name + local subject pattern come straight from pkg/stream.Inbox +// so there's one canonical definition for every consumer of INBOX. +// Cross-site federation (Sources + SubjectTransforms) is a deployment +// concern owned by whichever service creates the INBOX stream and is +// layered on separately — see inboxBootstrapStreamConfig. +type inboxMemberCollection struct{} + +func (b *inboxMemberCollection) StreamConfig(siteID string) jetstream.StreamConfig { + c := stream.Inbox(siteID) + return jetstream.StreamConfig{ + Name: c.Name, + Subjects: c.Subjects, + } +} + +func (b *inboxMemberCollection) FilterSubjects(siteID string) []string { + return subject.InboxMemberEventSubjects(siteID) +} + +// parseMemberEvent decodes an INBOX message into an OutboxEvent + its +// InboxMemberEvent payload and validates the common preconditions shared by +// all inbox-member collections. +// +// Callers are responsible for the event-level restricted-room short-circuit: +// when payload.HistorySharedSince != 0 the entire event should be skipped +// (the search service handles restricted rooms via DB+cache at query time). +func parseMemberEvent(data []byte) (*model.OutboxEvent, *model.InboxMemberEvent, error) { + var evt model.OutboxEvent + if err := json.Unmarshal(data, &evt); err != nil { + return nil, nil, fmt.Errorf("unmarshal outbox event: %w", err) + } + if evt.Timestamp <= 0 { + return nil, nil, fmt.Errorf("parse member event: missing timestamp") + } + var payload model.InboxMemberEvent + if err := json.Unmarshal(evt.Payload, &payload); err != nil { + return nil, nil, fmt.Errorf("unmarshal inbox member event: %w", err) + } + return &evt, &payload, nil +} diff --git a/search-sync-worker/inbox_stream_test.go b/search-sync-worker/inbox_stream_test.go new file mode 100644 index 000000000..bf0cc0305 --- /dev/null +++ b/search-sync-worker/inbox_stream_test.go @@ -0,0 +1,47 @@ +package main + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestInboxBootstrapStreamConfig(t *testing.T) { + t.Run("baseline name and subjects come from pkg/stream.Inbox", func(t *testing.T) { + cfg := inboxBootstrapStreamConfig("site-a", nil) + assert.Equal(t, "INBOX_site-a", cfg.Name) + assert.Equal(t, []string{ + "chat.inbox.site-a.*", + "chat.inbox.site-a.aggregate.>", + }, cfg.Subjects) + assert.Empty(t, cfg.Sources, "no remote sites → no sources") + }) + + t.Run("one source per remote site with subject transform", func(t *testing.T) { + cfg := inboxBootstrapStreamConfig("site-a", []string{"site-b", "site-c"}) + + assert.Equal(t, "INBOX_site-a", cfg.Name) + assert.Equal(t, []string{ + "chat.inbox.site-a.*", + "chat.inbox.site-a.aggregate.>", + }, cfg.Subjects) + require.Len(t, cfg.Sources, 2) + + // site-b source — FilterSubject is intentionally empty because it's + // mutually exclusive with SubjectTransforms; the transform's Source + // field acts as the filter. + assert.Equal(t, "OUTBOX_site-b", cfg.Sources[0].Name) + assert.Empty(t, cfg.Sources[0].FilterSubject) + require.Len(t, cfg.Sources[0].SubjectTransforms, 1) + assert.Equal(t, "outbox.site-b.to.site-a.>", cfg.Sources[0].SubjectTransforms[0].Source) + assert.Equal(t, "chat.inbox.site-a.aggregate.>", cfg.Sources[0].SubjectTransforms[0].Destination) + + // site-c source + assert.Equal(t, "OUTBOX_site-c", cfg.Sources[1].Name) + assert.Empty(t, cfg.Sources[1].FilterSubject) + require.Len(t, cfg.Sources[1].SubjectTransforms, 1) + assert.Equal(t, "outbox.site-c.to.site-a.>", cfg.Sources[1].SubjectTransforms[0].Source) + assert.Equal(t, "chat.inbox.site-a.aggregate.>", cfg.Sources[1].SubjectTransforms[0].Destination) + }) +} diff --git a/search-sync-worker/integration_test.go b/search-sync-worker/integration_test.go index 2cd113417..b59f37bdd 100644 --- a/search-sync-worker/integration_test.go +++ b/search-sync-worker/integration_test.go @@ -173,13 +173,14 @@ func preCreateIndex(t *testing.T, esURL, index string) { require.Equal(t, http.StatusOK, resp.StatusCode, "pre-create index %s: %s", index, body) } -// testTemplateBody returns an index template with single-node-friendly settings -// (1 shard, 0 replicas) so indices can be created in the test ES container. -func testTemplateBody(prefix string) json.RawMessage { - body := messageTemplateBody(prefix) +// overrideIndexSettings replaces `template.settings.index` on a marshaled ES +// index template with single-node-friendly values (1 shard, 0 replicas, 1s +// refresh) while leaving analysis + mappings intact. Shared by message, +// spotlight, and user-room integration tests so a single ES test container +// can host all three. +func overrideIndexSettings(body json.RawMessage) json.RawMessage { var tmpl map[string]any _ = json.Unmarshal(body, &tmpl) - template := tmpl["template"].(map[string]any) settings := template["settings"].(map[string]any) settings["index"] = map[string]any{ @@ -227,7 +228,7 @@ func TestSearchSyncIntegration(t *testing.T) { waitForClusterGreen(t, esURL, 120*time.Second) coll := newMessageCollection(prefix) - err = engine.UpsertTemplate(ctx, coll.TemplateName(), testTemplateBody(prefix)) + err = engine.UpsertTemplate(ctx, coll.TemplateName(), overrideIndexSettings(messageTemplateBody(prefix))) require.NoError(t, err, "upsert template") // Pre-create indices so shard allocation completes before bulk indexing. @@ -410,7 +411,7 @@ func TestCustomAnalyzer(t *testing.T) { waitForClusterGreen(t, esURL, 120*time.Second) coll := newMessageCollection(prefix) - err = engine.UpsertTemplate(ctx, coll.TemplateName(), testTemplateBody(prefix)) + err = engine.UpsertTemplate(ctx, coll.TemplateName(), overrideIndexSettings(messageTemplateBody(prefix))) require.NoError(t, err, "upsert template") preCreateIndex(t, esURL, prefix+"-2026-03") diff --git a/search-sync-worker/main.go b/search-sync-worker/main.go index 2880c72a0..a6629989f 100644 --- a/search-sync-worker/main.go +++ b/search-sync-worker/main.go @@ -16,8 +16,31 @@ import ( "github.com/hmchangw/chat/pkg/otelutil" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/shutdown" + "github.com/hmchangw/chat/pkg/stream" ) +// bootstrapConfig groups every field that is ONLY meaningful when the worker +// is being stood up in dev or integration tests without its normal upstream +// services. In production none of these fields should be set — streams are +// owned by their publisher services (message-gatekeeper for +// MESSAGES_CANONICAL, inbox-worker for INBOX) and search-sync-worker only +// manages its own durable consumers. +// +// Env vars in this group are all prefixed `BOOTSTRAP_` so they're easy to +// spot in deployment manifests and obvious to grep. +type bootstrapConfig struct { + // Enabled (BOOTSTRAP_STREAMS) toggles whether the worker calls + // CreateOrUpdateStream at startup for each collection's stream. Leave + // false in production. + Enabled bool `env:"STREAMS" envDefault:"false"` + // RemoteSiteIDs (BOOTSTRAP_REMOTE_SITE_IDS) lists the other sites whose + // OUTBOX streams should be sourced into this site's INBOX when the + // worker is creating it itself. Used to build the cross-site Sources + + // SubjectTransforms config during bootstrap. Only consulted when + // Enabled is true; unused in production. + RemoteSiteIDs []string `env:"REMOTE_SITE_IDS" envSeparator:","` +} + type config struct { NatsURL string `env:"NATS_URL,required"` NatsCredsFile string `env:"NATS_CREDS_FILE" envDefault:""` @@ -25,8 +48,33 @@ type config struct { SearchURL string `env:"SEARCH_URL,required"` SearchBackend string `env:"SEARCH_BACKEND" envDefault:"elasticsearch"` MsgIndexPrefix string `env:"MSG_INDEX_PREFIX,required"` - BatchSize int `env:"BATCH_SIZE" envDefault:"500"` - FlushInterval int `env:"FLUSH_INTERVAL" envDefault:"5"` + SpotlightIndex string `env:"SPOTLIGHT_INDEX" envDefault:""` + UserRoomIndex string `env:"USER_ROOM_INDEX" envDefault:""` + + // FetchBatchSize is the maximum number of JetStream messages to pull + // per Fetch() round-trip. Smaller values give lower latency per message + // but more round-trips; larger values amortize the per-Fetch overhead. + // This is a JetStream-client concern — it does NOT bound ES bulk + // request size. + FetchBatchSize int `env:"FETCH_BATCH_SIZE" envDefault:"100"` + + // BulkBatchSize is the soft cap on buffered ES bulk actions before the + // worker flushes to Elasticsearch. This is counted in actions, not + // messages: fan-out collections (bulk invites producing N actions per + // JetStream message) can reach this threshold with far fewer messages + // than the count suggests. The consumer loop checks handler.ActionCount() + // against this value and triggers a flush mid-Fetch if a single fat + // message pushes the buffer over the cap. + BulkBatchSize int `env:"BULK_BATCH_SIZE" envDefault:"500"` + + // BulkFlushInterval is the maximum seconds between ES bulk flushes, even + // if the action buffer hasn't hit BulkBatchSize. It's the time-based + // counterpart to the size-based BulkBatchSize trigger — either + // condition can fire a flush. Keeps write latency bounded during + // idle / low-traffic periods. + BulkFlushInterval int `env:"BULK_FLUSH_INTERVAL" envDefault:"5"` + + Bootstrap bootstrapConfig `envPrefix:"BOOTSTRAP_"` } func main() { @@ -38,6 +86,32 @@ func main() { os.Exit(1) } + if cfg.SpotlightIndex == "" { + cfg.SpotlightIndex = fmt.Sprintf("spotlight-%s-v1-chat", cfg.SiteID) + } + if cfg.UserRoomIndex == "" { + cfg.UserRoomIndex = fmt.Sprintf("user-room-%s", cfg.SiteID) + } + + // Fail fast on non-positive batch/interval settings. Zero or negative + // values degenerate runConsumer into busy loops (`Fetch(0)`, constant + // flush checks) or stall it forever (`remaining <= 0` on every + // iteration). Reject at startup so an operator gets a clear signal + // instead of silent misbehavior. Matches the repo-wide "fail fast on + // bad config" rule in CLAUDE.md. + if cfg.FetchBatchSize <= 0 { + slog.Error("invalid config", "name", "FETCH_BATCH_SIZE", "value", cfg.FetchBatchSize, "reason", "must be > 0") + os.Exit(1) + } + if cfg.BulkBatchSize <= 0 { + slog.Error("invalid config", "name", "BULK_BATCH_SIZE", "value", cfg.BulkBatchSize, "reason", "must be > 0") + os.Exit(1) + } + if cfg.BulkFlushInterval <= 0 { + slog.Error("invalid config", "name", "BULK_FLUSH_INTERVAL", "value", cfg.BulkFlushInterval, "reason", "must be > 0") + os.Exit(1) + } + ctx := context.Background() tracerShutdown, err := otelutil.InitTracer(ctx, "search-sync-worker") @@ -52,15 +126,24 @@ func main() { os.Exit(1) } - coll := newMessageCollection(cfg.MsgIndexPrefix) + collections := []Collection{ + newMessageCollection(cfg.MsgIndexPrefix), + newSpotlightCollection(cfg.SpotlightIndex), + newUserRoomCollection(cfg.UserRoomIndex), + } - tmplName := coll.TemplateName() - tmplBody := coll.TemplateBody() - if err := engine.UpsertTemplate(ctx, tmplName, tmplBody); err != nil { - slog.Error("upsert index template failed", "error", err) - os.Exit(1) + for _, coll := range collections { + name := coll.TemplateName() + body := coll.TemplateBody() + if name == "" || body == nil { + continue + } + if err := engine.UpsertTemplate(ctx, name, body); err != nil { + slog.Error("upsert index template failed", "template", name, "error", err) + os.Exit(1) + } + slog.Info("index template upserted", "name", name) } - slog.Info("index template upserted", "name", tmplName) nc, err := natsutil.Connect(cfg.NatsURL, cfg.NatsCredsFile) if err != nil { @@ -73,34 +156,79 @@ func main() { os.Exit(1) } - canonicalCfg := coll.StreamConfig(cfg.SiteID) - if _, err = js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{ - Name: canonicalCfg.Name, - Subjects: canonicalCfg.Subjects, - }); err != nil { - slog.Error("create MESSAGES_CANONICAL stream failed", "error", err) - os.Exit(1) - } + bulkFlushInterval := time.Duration(cfg.BulkFlushInterval) * time.Second + stopCh := make(chan struct{}) + doneChs := make([]chan struct{}, 0, len(collections)) - cons, err := js.CreateOrUpdateConsumer(ctx, canonicalCfg.Name, jetstream.ConsumerConfig{ - Durable: coll.ConsumerName(), - AckPolicy: jetstream.AckExplicitPolicy, - BackOff: []time.Duration{1 * time.Second, 5 * time.Second, 30 * time.Second}, - }) - if err != nil { - slog.Error("create consumer failed", "error", err) - os.Exit(1) - } + // Multiple collections can share the same stream (spotlight + user-room + // both consume INBOX). Track which streams have already been created so + // we don't redundantly call CreateOrUpdateStream per collection. + createdStreams := make(map[string]struct{}, len(collections)) - handler := NewHandler(&engineAdapter{engine: engine}, coll, cfg.BatchSize) + // Canonical INBOX stream name, used below to decide when to layer on + // cross-site Sources + SubjectTransforms during bootstrap. + inboxName := stream.Inbox(cfg.SiteID).Name - flushInterval := time.Duration(cfg.FlushInterval) * time.Second - stopCh := make(chan struct{}) - doneCh := make(chan struct{}) + for _, coll := range collections { + streamCfg := coll.StreamConfig(cfg.SiteID) + if cfg.Bootstrap.Enabled { + bootstrapCfg := streamCfg + // The INBOX stream is the only one that needs cross-site Sources + // + SubjectTransforms. Collections return a minimal baseline + // (name + local subjects from pkg/stream.Inbox) and the + // bootstrap path layers on the federation config here, keeping + // the cross-site topology out of the Collection type entirely. + if streamCfg.Name == inboxName { + bootstrapCfg = inboxBootstrapStreamConfig(cfg.SiteID, cfg.Bootstrap.RemoteSiteIDs) + } + if _, alreadyCreated := createdStreams[bootstrapCfg.Name]; !alreadyCreated { + if _, err := js.CreateOrUpdateStream(ctx, bootstrapCfg); err != nil { + slog.Error("create stream failed", "stream", bootstrapCfg.Name, "error", err) + os.Exit(1) + } + createdStreams[bootstrapCfg.Name] = struct{}{} + slog.Info("stream bootstrapped", "stream", bootstrapCfg.Name) + } + } + + consumerCfg := jetstream.ConsumerConfig{ + Durable: coll.ConsumerName(), + AckPolicy: jetstream.AckExplicitPolicy, + BackOff: []time.Duration{1 * time.Second, 5 * time.Second, 30 * time.Second}, + } + if filters := coll.FilterSubjects(cfg.SiteID); len(filters) > 0 { + consumerCfg.FilterSubjects = filters + } + cons, err := js.CreateOrUpdateConsumer(ctx, streamCfg.Name, consumerCfg) + if err != nil { + slog.Error("create consumer failed", + "stream", streamCfg.Name, + "consumer", coll.ConsumerName(), + "error", err, + ) + os.Exit(1) + } + + handler := NewHandler(&engineAdapter{engine: engine}, coll, cfg.BulkBatchSize) + doneCh := make(chan struct{}) + doneChs = append(doneChs, doneCh) - go runConsumer(ctx, cons, handler, cfg.BatchSize, flushInterval, stopCh, doneCh) + slog.Info("collection wired", + "stream", streamCfg.Name, + "consumer", coll.ConsumerName(), + "filters", consumerCfg.FilterSubjects, + ) - slog.Info("search-sync-worker running", "site", cfg.SiteID, "prefix", cfg.MsgIndexPrefix) + go runConsumer(ctx, cons, handler, cfg.FetchBatchSize, cfg.BulkBatchSize, bulkFlushInterval, stopCh, doneCh) + } + + slog.Info("search-sync-worker running", + "site", cfg.SiteID, + "msgPrefix", cfg.MsgIndexPrefix, + "spotlightIndex", cfg.SpotlightIndex, + "userRoomIndex", cfg.UserRoomIndex, + "collections", len(collections), + ) shutdown.Wait(ctx, 25*time.Second, func(ctx context.Context) error { @@ -108,12 +236,14 @@ func main() { return nil }, func(ctx context.Context) error { - select { - case <-doneCh: - return nil - case <-ctx.Done(): - return fmt.Errorf("consumer loop drain timed out: %w", ctx.Err()) + for _, ch := range doneChs { + select { + case <-ch: + case <-ctx.Done(): + return fmt.Errorf("consumer loop drain timed out: %w", ctx.Err()) + } } + return nil }, func(ctx context.Context) error { return tracerShutdown(ctx) }, func(ctx context.Context) error { return nc.Drain() }, @@ -121,9 +251,40 @@ func main() { } // runConsumer is the batch-flush consumer loop for a single collection. -// It fetches messages from the JetStream consumer, adds them to the handler buffer, -// and flushes when the buffer is full or the flush interval elapses. -func runConsumer(ctx context.Context, cons oteljetstream.Consumer, handler *Handler, batchSize int, flushInterval time.Duration, stopCh <-chan struct{}, doneCh chan<- struct{}) { +// +// Two batch sizes apply at different layers: +// +// - fetchBatchSize bounds how many JetStream messages are pulled per +// `cons.Fetch(...)` round-trip. This is purely a JetStream-client tuning +// knob — larger = fewer round-trips, smaller = lower per-message latency. +// +// - bulkBatchSize is the soft cap on buffered ES bulk actions before a +// flush is triggered. This is the real ES-side bound: a fan-out +// collection (bulk invite producing N actions per message) can hit it +// with far fewer messages than the count suggests, so the loop checks +// handler.ActionCount() — not message count — against it. +// +// The two caps interact: the loop clamps the per-Fetch count to +// `min(fetchBatchSize, bulkBatchSize - ActionCount())` so we never pull +// more messages than the remaining bulk capacity can absorb under a 1:1 +// assumption. Fan-out messages can still push the buffer past bulkBatchSize +// mid-loop (a single N-subscription event produces N actions on its own), +// which is handled by a mid-batch flush inside the message loop. +// +// Flushes happen on three triggers: +// 1. `stopCh` signalled (graceful shutdown): drain whatever is buffered. +// 2. `handler.ActionCount() >= bulkBatchSize`: size-based flush. +// 3. `time.Since(lastFlush) >= bulkFlushInterval` with a non-empty buffer: +// time-based flush to bound write latency during idle periods. +func runConsumer( + ctx context.Context, + cons oteljetstream.Consumer, + handler *Handler, + fetchBatchSize, bulkBatchSize int, + bulkFlushInterval time.Duration, + stopCh <-chan struct{}, + doneCh chan<- struct{}, +) { defer close(doneCh) lastFlush := time.Now() @@ -135,12 +296,21 @@ func runConsumer(ctx context.Context, cons oteljetstream.Consumer, handler *Hand default: } - fetchSize := batchSize - handler.BufferLen() - if fetchSize <= 0 { - fetchSize = 1 + // Bound the next Fetch by remaining bulk capacity so a steady stream + // of 1:1 messages can't overshoot bulkBatchSize. Fan-out messages + // may still push us over — that's handled mid-loop below. + remaining := bulkBatchSize - handler.ActionCount() + if remaining <= 0 { + handler.Flush(ctx) + lastFlush = time.Now() + continue + } + fetchCount := fetchBatchSize + if fetchCount > remaining { + fetchCount = remaining } - batch, err := cons.Fetch(fetchSize, jetstream.FetchMaxWait(time.Second)) + batch, err := cons.Fetch(fetchCount, jetstream.FetchMaxWait(time.Second)) if err != nil { select { case <-stopCh: @@ -148,7 +318,7 @@ func runConsumer(ctx context.Context, cons oteljetstream.Consumer, handler *Hand return default: } - if handler.BufferLen() > 0 && time.Since(lastFlush) >= flushInterval { + if handler.ActionCount() > 0 && time.Since(lastFlush) >= bulkFlushInterval { handler.Flush(ctx) lastFlush = time.Now() } @@ -157,12 +327,20 @@ func runConsumer(ctx context.Context, cons oteljetstream.Consumer, handler *Hand for msg := range batch.Messages() { handler.Add(msg.Msg) + // Mid-batch flush: if a single fan-out message just pushed the + // buffer over the bulk cap, flush immediately instead of waiting + // for the outer loop — otherwise the next message's actions + // would add to an already-oversized bulk request. + if handler.ActionCount() >= bulkBatchSize { + handler.Flush(ctx) + lastFlush = time.Now() + } } - if handler.BufferFull() { + if handler.ActionCount() >= bulkBatchSize { handler.Flush(ctx) lastFlush = time.Now() - } else if handler.BufferLen() > 0 && time.Since(lastFlush) >= flushInterval { + } else if handler.ActionCount() > 0 && time.Since(lastFlush) >= bulkFlushInterval { handler.Flush(ctx) lastFlush = time.Now() } diff --git a/search-sync-worker/messages.go b/search-sync-worker/messages.go index a05085182..aada5e1f7 100644 --- a/search-sync-worker/messages.go +++ b/search-sync-worker/messages.go @@ -3,10 +3,10 @@ package main import ( "encoding/json" "fmt" - "reflect" - "strings" "time" + "github.com/nats-io/nats.go/jetstream" + "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/stream" @@ -21,12 +21,21 @@ func newMessageCollection(indexPrefix string) *messageCollection { return &messageCollection{indexPrefix: indexPrefix} } -func (c *messageCollection) StreamConfig(siteID string) stream.Config { - return stream.MessagesCanonical(siteID) +func (c *messageCollection) StreamConfig(siteID string) jetstream.StreamConfig { + cfg := stream.MessagesCanonical(siteID) + return jetstream.StreamConfig{ + Name: cfg.Name, + Subjects: cfg.Subjects, + } } func (c *messageCollection) ConsumerName() string { - return "search-sync-worker" + return "message-sync" +} + +func (c *messageCollection) FilterSubjects(_ string) []string { + // Stream has a single subject pattern — no extra filtering needed. + return nil } func (c *messageCollection) TemplateName() string { @@ -37,21 +46,21 @@ func (c *messageCollection) TemplateBody() json.RawMessage { return messageTemplateBody(c.indexPrefix) } -func (c *messageCollection) BuildAction(data []byte) (searchengine.BulkAction, error) { +func (c *messageCollection) BuildAction(data []byte) ([]searchengine.BulkAction, error) { var evt model.MessageEvent if err := json.Unmarshal(data, &evt); err != nil { - return searchengine.BulkAction{}, fmt.Errorf("unmarshal message event: %w", err) + return nil, fmt.Errorf("unmarshal message event: %w", err) } if evt.Message.ID == "" { - return searchengine.BulkAction{}, fmt.Errorf("build message action: missing message id") + return nil, fmt.Errorf("build message action: missing message id") } if evt.Message.CreatedAt.IsZero() { - return searchengine.BulkAction{}, fmt.Errorf("build message action: missing createdAt") + return nil, fmt.Errorf("build message action: missing createdAt") } if evt.Timestamp <= 0 { - return searchengine.BulkAction{}, fmt.Errorf("build message action: missing timestamp") + return nil, fmt.Errorf("build message action: missing timestamp") } - return buildMessageAction(&evt, c.indexPrefix), nil + return []searchengine.BulkAction{buildMessageAction(&evt, c.indexPrefix)}, nil } // --- Message-specific internals --- @@ -130,25 +139,7 @@ func buildDocument(evt *model.MessageEvent) json.RawMessage { // messageTemplateProperties generates ES mapping properties from // MessageSearchIndex struct tags. The `es` tag is the source of truth. func messageTemplateProperties() map[string]any { - props := make(map[string]any) - t := reflect.TypeOf(MessageSearchIndex{}) - for i := range t.NumField() { - field := t.Field(i) - esTag := field.Tag.Get("es") - if esTag == "" || esTag == "-" { - continue - } - jsonTag := field.Tag.Get("json") - name, _, _ := strings.Cut(jsonTag, ",") - - esType, analyzer, _ := strings.Cut(esTag, ",") - prop := map[string]any{"type": esType} - if analyzer != "" { - prop["analyzer"] = analyzer - } - props[name] = prop - } - return props + return esPropertiesFromStruct[MessageSearchIndex]() } func messageTemplateBody(prefix string) json.RawMessage { diff --git a/search-sync-worker/messages_test.go b/search-sync-worker/messages_test.go index 84619973e..b683b2f0e 100644 --- a/search-sync-worker/messages_test.go +++ b/search-sync-worker/messages_test.go @@ -57,7 +57,7 @@ func TestMessageCollection_StreamConfig(t *testing.T) { func TestMessageCollection_ConsumerName(t *testing.T) { coll := newMessageCollection("msgs-v1") - assert.Equal(t, "search-sync-worker", coll.ConsumerName()) + assert.Equal(t, "message-sync", coll.ConsumerName()) } func TestIndexName(t *testing.T) { @@ -219,11 +219,12 @@ func TestMessageCollection_BuildAction(t *testing.T) { } data, _ := json.Marshal(evt) - action, err := coll.BuildAction(data) + actions, err := coll.BuildAction(data) require.NoError(t, err) - assert.Equal(t, searchengine.ActionIndex, action.Action) - assert.Equal(t, "msgs-v1-2026-01", action.Index) - assert.Equal(t, "m1", action.DocID) + require.Len(t, actions, 1) + assert.Equal(t, searchengine.ActionIndex, actions[0].Action) + assert.Equal(t, "msgs-v1-2026-01", actions[0].Index) + assert.Equal(t, "m1", actions[0].DocID) t.Run("malformed JSON returns error", func(t *testing.T) { _, err := coll.BuildAction([]byte("{invalid")) diff --git a/search-sync-worker/spotlight.go b/search-sync-worker/spotlight.go new file mode 100644 index 000000000..7e0f8509f --- /dev/null +++ b/search-sync-worker/spotlight.go @@ -0,0 +1,172 @@ +package main + +import ( + "encoding/json" + "fmt" + "time" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/searchengine" +) + +// spotlightCollection implements Collection for spotlight room-typeahead +// search. Documents are per (user, room) pair — one doc for every account +// that holds a subscription to a given room — so the search service can +// filter by userAccount and match on roomName. Doc IDs are synthesized as +// `{account}_{roomID}` since the INBOX payload doesn't carry subscription IDs. +type spotlightCollection struct { + inboxMemberCollection + indexName string +} + +func newSpotlightCollection(indexName string) *spotlightCollection { + return &spotlightCollection{indexName: indexName} +} + +func (c *spotlightCollection) ConsumerName() string { + return "spotlight-sync" +} + +func (c *spotlightCollection) TemplateName() string { + return "spotlight_template" +} + +func (c *spotlightCollection) TemplateBody() json.RawMessage { + return spotlightTemplateBody(c.indexName) +} + +// BuildAction fans a member_added / member_removed event out into one ES +// action per account in the payload. Bulk invites produce N spotlight docs +// from a single event; single-user invites produce one. +// +// All actions in the returned slice carry the same external Version +// (evt.Timestamp) because they all represent the same logical event — if the +// event is redelivered, every action 409s uniformly and is treated as a +// successful idempotent replay. +// +// Restricted rooms (HistorySharedSince != 0 on the event) short-circuit the +// entire bulk and return an empty slice — the search service handles +// restricted rooms via DB+cache at query time, so nothing needs to be indexed. +func (c *spotlightCollection) BuildAction(data []byte) ([]searchengine.BulkAction, error) { + evt, payload, err := parseMemberEvent(data) + if err != nil { + return nil, err + } + if payload.RoomID == "" { + return nil, fmt.Errorf("build spotlight action: missing roomId") + } + // Event-level restricted-room short-circuit runs BEFORE account + // validation so restricted events with any payload shape (including an + // empty Accounts slice) are uniformly skipped per the InboxMemberEvent + // contract — no surprise error on an otherwise-valid "skip me" event. + if payload.HistorySharedSince != 0 { + return nil, nil + } + if len(payload.Accounts) == 0 { + return nil, fmt.Errorf("build spotlight action: empty accounts") + } + + actions := make([]searchengine.BulkAction, 0, len(payload.Accounts)) + for i, account := range payload.Accounts { + if account == "" { + return nil, fmt.Errorf("build spotlight action: empty account at index %d", i) + } + docID := fmt.Sprintf("%s_%s", account, payload.RoomID) + + switch evt.Type { + case model.OutboxMemberAdded: + doc := newSpotlightSearchIndex(account, payload) + body, err := json.Marshal(doc) + if err != nil { + return nil, fmt.Errorf("marshal spotlight doc: %w", err) + } + actions = append(actions, searchengine.BulkAction{ + Action: searchengine.ActionIndex, + Index: c.indexName, + DocID: docID, + Version: evt.Timestamp, + Doc: body, + }) + case model.OutboxMemberRemoved: + actions = append(actions, searchengine.BulkAction{ + Action: searchengine.ActionDelete, + Index: c.indexName, + DocID: docID, + Version: evt.Timestamp, + }) + default: + return nil, fmt.Errorf("build spotlight action: unsupported event type %q", evt.Type) + } + } + return actions, nil +} + +// SpotlightSearchIndex defines the Elasticsearch document structure for the +// spotlight index. One doc per (user, room) pair. +type SpotlightSearchIndex struct { + UserAccount string `json:"userAccount" es:"keyword"` + RoomID string `json:"roomId" es:"keyword"` + RoomName string `json:"roomName" es:"search_as_you_type,custom_analyzer"` + RoomType string `json:"roomType" es:"keyword"` + SiteID string `json:"siteId" es:"keyword"` + JoinedAt time.Time `json:"joinedAt" es:"date"` +} + +func newSpotlightSearchIndex(account string, evt *model.InboxMemberEvent) SpotlightSearchIndex { + var joinedAt time.Time + if evt.JoinedAt > 0 { + joinedAt = time.UnixMilli(evt.JoinedAt).UTC() + } + return SpotlightSearchIndex{ + UserAccount: account, + RoomID: evt.RoomID, + RoomName: evt.RoomName, + RoomType: string(evt.RoomType), + SiteID: evt.SiteID, + JoinedAt: joinedAt, + } +} + +// spotlightTemplateBody builds the ES index template for the spotlight +// collection. The `index_patterns` field is set to the exact configured +// index name so a custom SPOTLIGHT_INDEX value still receives the correct +// mapping (no broad wildcard that might catch unrelated indices). +func spotlightTemplateBody(indexName string) json.RawMessage { + tmpl := map[string]any{ + "index_patterns": []string{indexName}, + "template": map[string]any{ + "settings": map[string]any{ + "index": map[string]any{ + "number_of_shards": 3, + "number_of_replicas": 1, + }, + "analysis": map[string]any{ + "analyzer": map[string]any{ + "custom_analyzer": map[string]any{ + "type": "custom", + "tokenizer": "custom_tokenizer", + "filter": []string{"lowercase"}, + }, + }, + "tokenizer": map[string]any{ + // Whitespace tokenizer only supports max_token_length + // (default 255). `token_chars` is valid on ngram / + // edge_ngram tokenizers, not whitespace — sending it + // here would reject the UpsertTemplate request. + "custom_tokenizer": map[string]any{ + "type": "whitespace", + }, + }, + }, + }, + "mappings": map[string]any{ + "dynamic": false, + "properties": esPropertiesFromStruct[SpotlightSearchIndex](), + }, + }, + } + // tmpl is built entirely from map/slice/string/int literals that are + // always JSON-marshalable, so the error cannot occur in practice. + data, _ := json.Marshal(tmpl) + return data +} diff --git a/search-sync-worker/spotlight_test.go b/search-sync-worker/spotlight_test.go new file mode 100644 index 000000000..a567c3085 --- /dev/null +++ b/search-sync-worker/spotlight_test.go @@ -0,0 +1,277 @@ +package main + +import ( + "encoding/json" + "fmt" + "reflect" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/searchengine" +) + +func makeInboxMemberEvent(t *testing.T, typ string, payload *model.InboxMemberEvent, ts int64) []byte { + t.Helper() + payloadData, err := json.Marshal(payload) + require.NoError(t, err) + evt := model.OutboxEvent{ + Type: typ, + SiteID: "site-a", + DestSiteID: "site-a", + Payload: payloadData, + Timestamp: ts, + } + data, err := json.Marshal(evt) + require.NoError(t, err) + return data +} + +// baseInboxMemberEvent builds a single-account event for the common unit-test +// case. Bulk-invite test cases supply their own Accounts slice. +func baseInboxMemberEvent() *model.InboxMemberEvent { + const joinedAt int64 = 1735689600000 + return &model.InboxMemberEvent{ + RoomID: "r-eng", + RoomName: "engineering", + RoomType: model.RoomTypeGroup, + SiteID: "site-a", + Accounts: []string{"alice"}, + JoinedAt: joinedAt, + Timestamp: joinedAt, + } +} + +func TestSpotlightCollection_Metadata(t *testing.T) { + coll := newSpotlightCollection("spotlight-site-a-v1-chat") + + assert.Equal(t, "spotlight-sync", coll.ConsumerName()) + assert.Equal(t, "spotlight_template", coll.TemplateName()) + + cfg := coll.StreamConfig("site-a") + assert.Equal(t, "INBOX_site-a", cfg.Name) + assert.Equal(t, []string{ + "chat.inbox.site-a.*", + "chat.inbox.site-a.aggregate.>", + }, cfg.Subjects) + assert.Empty(t, cfg.Sources) + + filters := coll.FilterSubjects("site-a") + assert.ElementsMatch(t, []string{ + "chat.inbox.site-a.member_added", + "chat.inbox.site-a.member_removed", + "chat.inbox.site-a.aggregate.member_added", + "chat.inbox.site-a.aggregate.member_removed", + }, filters) +} + +func TestSpotlightCollection_TemplateBody(t *testing.T) { + coll := newSpotlightCollection("spotlight-site-a-v1-chat") + body := coll.TemplateBody() + require.NotNil(t, body) + + var parsed map[string]any + require.NoError(t, json.Unmarshal(body, &parsed)) + + patterns, ok := parsed["index_patterns"].([]any) + require.True(t, ok) + assert.Equal(t, "spotlight-site-a-v1-chat", patterns[0]) + + tmpl := parsed["template"].(map[string]any) + mappings := tmpl["mappings"].(map[string]any) + props := mappings["properties"].(map[string]any) + assert.Contains(t, props, "userAccount") + assert.Contains(t, props, "roomId") + assert.Contains(t, props, "roomName") + assert.Contains(t, props, "joinedAt") + assert.Equal(t, false, mappings["dynamic"]) + + roomName := props["roomName"].(map[string]any) + assert.Equal(t, "search_as_you_type", roomName["type"]) + assert.Equal(t, "custom_analyzer", roomName["analyzer"]) +} + +func TestSpotlightTemplateProperties_MatchesStruct(t *testing.T) { + props := esPropertiesFromStruct[SpotlightSearchIndex]() + + typ := reflect.TypeOf(SpotlightSearchIndex{}) + esFieldCount := 0 + for i := range typ.NumField() { + field := typ.Field(i) + esTag := field.Tag.Get("es") + if esTag == "" || esTag == "-" { + continue + } + esFieldCount++ + jsonTag := field.Tag.Get("json") + name, _, _ := strings.Cut(jsonTag, ",") + _, ok := props[name] + assert.True(t, ok, "template missing property for field %s (json %s)", field.Name, name) + } + assert.Equal(t, esFieldCount, len(props)) +} + +func TestSpotlightCollection_BuildAction_MemberAdded(t *testing.T) { + coll := newSpotlightCollection("spotlight-site-a-v1-chat") + payload := baseInboxMemberEvent() + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 1000) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + require.Len(t, actions, 1) + + action := actions[0] + assert.Equal(t, searchengine.ActionIndex, action.Action) + assert.Equal(t, "spotlight-site-a-v1-chat", action.Index) + // DocID is synthesized as {account}_{roomID} since the new payload shape + // doesn't carry subscription IDs. + assert.Equal(t, "alice_r-eng", action.DocID) + assert.Equal(t, int64(1000), action.Version) + require.NotNil(t, action.Doc) + + var doc map[string]any + require.NoError(t, json.Unmarshal(action.Doc, &doc)) + assert.Equal(t, "alice", doc["userAccount"]) + assert.Equal(t, "r-eng", doc["roomId"]) + assert.Equal(t, "engineering", doc["roomName"]) + assert.Equal(t, "group", doc["roomType"]) + assert.Equal(t, "site-a", doc["siteId"]) +} + +func TestSpotlightCollection_BuildAction_MemberRemoved(t *testing.T) { + coll := newSpotlightCollection("spotlight-site-a-v1-chat") + payload := baseInboxMemberEvent() + data := makeInboxMemberEvent(t, model.OutboxMemberRemoved, payload, 2000) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + require.Len(t, actions, 1) + + action := actions[0] + assert.Equal(t, searchengine.ActionDelete, action.Action) + assert.Equal(t, "spotlight-site-a-v1-chat", action.Index) + assert.Equal(t, "alice_r-eng", action.DocID) + assert.Equal(t, int64(2000), action.Version) + assert.Nil(t, action.Doc) +} + +func TestSpotlightCollection_BuildAction_RestrictedRoomSkipped(t *testing.T) { + coll := newSpotlightCollection("spotlight-site-a-v1-chat") + payload := baseInboxMemberEvent() + payload.Accounts = []string{"alice", "bob"} + // Event-level HistorySharedSince short-circuits the entire bulk — + // restricted rooms are handled by the search service via DB+cache. + payload.HistorySharedSince = 1735689500000 + + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 100) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + assert.Empty(t, actions, "restricted-room event should produce no actions") +} + +func TestSpotlightCollection_BuildAction_Errors(t *testing.T) { + coll := newSpotlightCollection("spotlight-site-a-v1-chat") + + t.Run("malformed outbox event", func(t *testing.T) { + _, err := coll.BuildAction([]byte("{invalid")) + assert.Error(t, err) + }) + + t.Run("malformed payload", func(t *testing.T) { + evt := model.OutboxEvent{ + Type: model.OutboxMemberAdded, + Payload: []byte("not json"), + Timestamp: 100, + } + data, _ := json.Marshal(evt) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("empty accounts", func(t *testing.T) { + payload := baseInboxMemberEvent() + payload.Accounts = nil + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 100) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("empty account in list", func(t *testing.T) { + payload := baseInboxMemberEvent() + payload.Accounts = []string{"alice", ""} + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 100) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("missing roomId", func(t *testing.T) { + payload := baseInboxMemberEvent() + payload.RoomID = "" + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 100) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("missing timestamp", func(t *testing.T) { + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, baseInboxMemberEvent(), 0) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("unsupported event type", func(t *testing.T) { + data := makeInboxMemberEvent(t, "room_created", baseInboxMemberEvent(), 100) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) +} + +// TestSpotlightCollection_BuildAction_BulkInvite verifies fan-out: a single +// event carrying N accounts produces N index actions, all sharing the same +// external Version (event timestamp). +func TestSpotlightCollection_BuildAction_BulkInvite(t *testing.T) { + coll := newSpotlightCollection("spotlight-site-a-v1-chat") + payload := baseInboxMemberEvent() + payload.Accounts = []string{"alice", "bob", "carol"} + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 12345) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + require.Len(t, actions, 3, "3 accounts should fan out to 3 actions") + + seenDocIDs := make(map[string]bool) + for _, action := range actions { + assert.Equal(t, searchengine.ActionIndex, action.Action) + assert.Equal(t, "spotlight-site-a-v1-chat", action.Index) + assert.Equal(t, int64(12345), action.Version, + "all fan-out actions share the source event's timestamp as their external version") + require.NotNil(t, action.Doc) + seenDocIDs[action.DocID] = true + } + for _, account := range payload.Accounts { + assert.True(t, seenDocIDs[fmt.Sprintf("%s_%s", account, payload.RoomID)], + "expected DocID for %s", account) + } +} + +// TestSpotlightCollection_BuildAction_BulkRemove verifies fan-out on remove: +// N accounts → N delete actions. +func TestSpotlightCollection_BuildAction_BulkRemove(t *testing.T) { + coll := newSpotlightCollection("spotlight-site-a-v1-chat") + payload := baseInboxMemberEvent() + payload.Accounts = []string{"alice", "bob"} + data := makeInboxMemberEvent(t, model.OutboxMemberRemoved, payload, 67890) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + require.Len(t, actions, 2) + + for _, action := range actions { + assert.Equal(t, searchengine.ActionDelete, action.Action) + assert.Equal(t, int64(67890), action.Version) + assert.Nil(t, action.Doc) + } +} diff --git a/search-sync-worker/template.go b/search-sync-worker/template.go new file mode 100644 index 000000000..5165511e7 --- /dev/null +++ b/search-sync-worker/template.go @@ -0,0 +1,43 @@ +package main + +import ( + "reflect" + "strings" +) + +// esPropertiesFromStruct reflects over struct T's fields to build an +// Elasticsearch mapping properties map from `es` struct tags. Fields are +// keyed by the `json` tag name. +// +// The `es` tag grammar is "type[,analyzer]" — e.g. `es:"keyword"` or +// `es:"text,custom_analyzer"`. Fields are skipped when: +// - the `es` tag is missing or `-` +// - the `json` tag is missing, empty, or `-` (fail closed: we never emit +// a mapping entry under the empty string, which would silently corrupt +// the template for any future struct that adds an `es`-tagged field +// without a matching `json` tag) +func esPropertiesFromStruct[T any]() map[string]any { + var zero T + t := reflect.TypeOf(zero) + props := make(map[string]any, t.NumField()) + for i := range t.NumField() { + field := t.Field(i) + esTag := field.Tag.Get("es") + if esTag == "" || esTag == "-" { + continue + } + jsonTag := field.Tag.Get("json") + name, _, _ := strings.Cut(jsonTag, ",") + if name == "" || name == "-" { + continue + } + + esType, analyzer, _ := strings.Cut(esTag, ",") + prop := map[string]any{"type": esType} + if analyzer != "" { + prop["analyzer"] = analyzer + } + props[name] = prop + } + return props +} diff --git a/search-sync-worker/user_room.go b/search-sync-worker/user_room.go new file mode 100644 index 000000000..0c02c58cf --- /dev/null +++ b/search-sync-worker/user_room.go @@ -0,0 +1,229 @@ +package main + +import ( + "encoding/json" + "fmt" + "time" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/searchengine" +) + +// userRoomCollection implements Collection for the user-room access-control +// index. It maintains a per-user rooms array (one doc per user) used by the +// search service as a terms filter on message search queries. +// +// Concurrency model: the collection is safe to run with multiple search-sync +// worker pods sharing the same durable consumer. Out-of-order delivery of +// (added, removed) pairs for the same (user, room) is handled by an +// application-level timestamp guard inside the painless scripts — each update +// carries the OutboxEvent timestamp in `params.ts` and compares against the +// per-room stored timestamp in `ctx._source.roomTimestamps`, skipping the +// write (via `ctx.op = 'none'`) if the incoming event is stale. Concurrent +// writers from different pods serialize on the primary shard's per-doc lock, +// so the guard converges on last-write-wins-by-event-timestamp regardless of +// physical arrival order. +type userRoomCollection struct { + inboxMemberCollection + indexName string +} + +func newUserRoomCollection(indexName string) *userRoomCollection { + return &userRoomCollection{indexName: indexName} +} + +func (c *userRoomCollection) ConsumerName() string { + return "user-room-sync" +} + +func (c *userRoomCollection) TemplateName() string { + return "user_room_template" +} + +func (c *userRoomCollection) TemplateBody() json.RawMessage { + return userRoomTemplateBody(c.indexName) +} + +// addRoomScript / removeRoomScript implement application-level last-write-wins +// on (user, room) using `params.ts` (OutboxEvent.Timestamp in millis). Stale +// events short-circuit via `ctx.op = 'none'` which tells ES to skip the write +// entirely — no version bump, no disk I/O. +const ( + addRoomScript = `if (ctx._source.roomTimestamps == null) { ctx._source.roomTimestamps = [:]; } ` + + `if (ctx._source.rooms == null) { ctx._source.rooms = []; } ` + + `long stored = ctx._source.roomTimestamps.containsKey(params.rid) ` + + `? ((Number)ctx._source.roomTimestamps.get(params.rid)).longValue() : 0L; ` + + `if (params.ts > stored) { ` + + `if (!ctx._source.rooms.contains(params.rid)) { ctx._source.rooms.add(params.rid); } ` + + `ctx._source.roomTimestamps.put(params.rid, params.ts); ` + + `ctx._source.updatedAt = params.now; ` + + `} else { ctx.op = 'none'; }` + + removeRoomScript = `if (ctx._source.roomTimestamps == null) { ctx._source.roomTimestamps = [:]; } ` + + `long stored = ctx._source.roomTimestamps.containsKey(params.rid) ` + + `? ((Number)ctx._source.roomTimestamps.get(params.rid)).longValue() : 0L; ` + + `if (params.ts > stored) { ` + + `if (ctx._source.rooms != null) { ` + + `int idx = ctx._source.rooms.indexOf(params.rid); ` + + `if (idx >= 0) { ctx._source.rooms.remove(idx); } } ` + + `ctx._source.roomTimestamps.put(params.rid, params.ts); ` + + `} else { ctx.op = 'none'; }` +) + +// BuildAction fans a member_added / member_removed event out into one ES +// update per account in the payload. Bulk invites produce N distinct +// user-room doc updates from a single event (each account touches a +// different user's doc, keyed by account). +// +// Restricted rooms (HistorySharedSince != 0 on the event) short-circuit the +// entire bulk and return an empty slice — the search service handles +// restricted rooms via DB+cache at query time, so nothing needs to be +// indexed. The handler then acks the source message without touching ES. +func (c *userRoomCollection) BuildAction(data []byte) ([]searchengine.BulkAction, error) { + evt, payload, err := parseMemberEvent(data) + if err != nil { + return nil, err + } + if payload.RoomID == "" { + return nil, fmt.Errorf("build user-room action: missing roomId") + } + // Event-level restricted-room short-circuit runs BEFORE account + // validation so restricted events with any payload shape (including an + // empty Accounts slice) are uniformly skipped per the InboxMemberEvent + // contract — no surprise error on an otherwise-valid "skip me" event. + if payload.HistorySharedSince != 0 { + return nil, nil + } + if len(payload.Accounts) == 0 { + return nil, fmt.Errorf("build user-room action: empty accounts") + } + + ts := evt.Timestamp + roomID := payload.RoomID + actions := make([]searchengine.BulkAction, 0, len(payload.Accounts)) + for i, account := range payload.Accounts { + if account == "" { + return nil, fmt.Errorf("build user-room action: empty account at index %d", i) + } + + switch evt.Type { + case model.OutboxMemberAdded: + body, err := buildAddRoomUpdateBody(account, roomID, ts) + if err != nil { + return nil, err + } + actions = append(actions, searchengine.BulkAction{ + Action: searchengine.ActionUpdate, + Index: c.indexName, + DocID: account, + Doc: body, + }) + case model.OutboxMemberRemoved: + body, err := buildRemoveRoomUpdateBody(roomID, ts) + if err != nil { + return nil, err + } + actions = append(actions, searchengine.BulkAction{ + Action: searchengine.ActionUpdate, + Index: c.indexName, + DocID: account, + Doc: body, + }) + default: + return nil, fmt.Errorf("build user-room action: unsupported event type %q", evt.Type) + } + } + return actions, nil +} + +// userRoomUpsertDoc is the full document inserted when the user has no prior +// user-room entry (i.e., the first time a room is added for this user). The +// `RoomTimestamps` map seeds the per-room timestamp guard. +type userRoomUpsertDoc struct { + UserAccount string `json:"userAccount"` + Rooms []string `json:"rooms"` + RoomTimestamps map[string]int64 `json:"roomTimestamps"` + CreatedAt string `json:"createdAt"` + UpdatedAt string `json:"updatedAt"` +} + +func buildAddRoomUpdateBody(account, roomID string, ts int64) (json.RawMessage, error) { + now := time.UnixMilli(ts).UTC().Format(time.RFC3339Nano) + body := map[string]any{ + "script": map[string]any{ + "source": addRoomScript, + "lang": "painless", + "params": map[string]any{ + "rid": roomID, + "ts": ts, + "now": now, + }, + }, + "upsert": userRoomUpsertDoc{ + UserAccount: account, + Rooms: []string{roomID}, + RoomTimestamps: map[string]int64{roomID: ts}, + CreatedAt: now, + UpdatedAt: now, + }, + } + data, err := json.Marshal(body) + if err != nil { + return nil, fmt.Errorf("marshal add room update: %w", err) + } + return data, nil +} + +func buildRemoveRoomUpdateBody(roomID string, ts int64) (json.RawMessage, error) { + body := map[string]any{ + "script": map[string]any{ + "source": removeRoomScript, + "lang": "painless", + "params": map[string]any{ + "rid": roomID, + "ts": ts, + }, + }, + } + data, err := json.Marshal(body) + if err != nil { + return nil, fmt.Errorf("marshal remove room update: %w", err) + } + return data, nil +} + +// userRoomTemplateBody builds the ES index template for the user-room +// collection. The `index_patterns` field is set to the exact configured +// index name so a custom USER_ROOM_INDEX value still receives the correct +// mapping. The `roomTimestamps` field is mapped as `flattened` so new +// roomIds don't balloon the mapping with per-key dynamic sub-fields. +func userRoomTemplateBody(indexName string) json.RawMessage { + tmpl := map[string]any{ + "index_patterns": []string{indexName}, + "template": map[string]any{ + "settings": map[string]any{ + "index": map[string]any{ + "number_of_shards": 1, + "number_of_replicas": 1, + }, + }, + "mappings": map[string]any{ + "dynamic": false, + "properties": map[string]any{ + "userAccount": map[string]any{"type": "keyword"}, + "rooms": map[string]any{ + "type": "text", + "fields": map[string]any{ + "keyword": map[string]any{"type": "keyword", "ignore_above": 256}, + }, + }, + "roomTimestamps": map[string]any{"type": "flattened"}, + "createdAt": map[string]any{"type": "date"}, + "updatedAt": map[string]any{"type": "date"}, + }, + }, + }, + } + data, _ := json.Marshal(tmpl) + return data +} diff --git a/search-sync-worker/user_room_test.go b/search-sync-worker/user_room_test.go new file mode 100644 index 000000000..3fee0302d --- /dev/null +++ b/search-sync-worker/user_room_test.go @@ -0,0 +1,240 @@ +package main + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/searchengine" +) + +func TestUserRoomCollection_Metadata(t *testing.T) { + coll := newUserRoomCollection("user-room-site-a") + + assert.Equal(t, "user-room-sync", coll.ConsumerName()) + assert.Equal(t, "user_room_template", coll.TemplateName()) + assert.NotNil(t, coll.TemplateBody()) + + cfg := coll.StreamConfig("site-a") + assert.Equal(t, "INBOX_site-a", cfg.Name) + assert.Equal(t, []string{ + "chat.inbox.site-a.*", + "chat.inbox.site-a.aggregate.>", + }, cfg.Subjects) + assert.Empty(t, cfg.Sources) + + filters := coll.FilterSubjects("site-a") + assert.ElementsMatch(t, []string{ + "chat.inbox.site-a.member_added", + "chat.inbox.site-a.member_removed", + "chat.inbox.site-a.aggregate.member_added", + "chat.inbox.site-a.aggregate.member_removed", + }, filters) +} + +func TestUserRoomCollection_TemplateBody(t *testing.T) { + coll := newUserRoomCollection("user-room-site-a") + body := coll.TemplateBody() + require.NotNil(t, body) + + var parsed map[string]any + require.NoError(t, json.Unmarshal(body, &parsed)) + + patterns, ok := parsed["index_patterns"].([]any) + require.True(t, ok) + assert.Equal(t, "user-room-site-a", patterns[0]) + + tmpl := parsed["template"].(map[string]any) + mappings := tmpl["mappings"].(map[string]any) + props := mappings["properties"].(map[string]any) + + assert.Contains(t, props, "userAccount") + assert.Contains(t, props, "rooms") + assert.Contains(t, props, "roomTimestamps") + assert.Contains(t, props, "createdAt") + assert.Contains(t, props, "updatedAt") + + rt := props["roomTimestamps"].(map[string]any) + assert.Equal(t, "flattened", rt["type"]) + + rooms := props["rooms"].(map[string]any) + assert.Equal(t, "text", rooms["type"]) +} + +func TestUserRoomCollection_BuildAction_MemberAdded(t *testing.T) { + coll := newUserRoomCollection("user-room-site-a") + payload := baseInboxMemberEvent() + const ts int64 = 1735689600000 + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, ts) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + require.Len(t, actions, 1) + + action := actions[0] + assert.Equal(t, searchengine.ActionUpdate, action.Action) + assert.Equal(t, "user-room-site-a", action.Index) + assert.Equal(t, "alice", action.DocID) + // Update actions must NOT use external versioning — ES rejects the combo. + assert.Zero(t, action.Version) + require.NotNil(t, action.Doc) + + var body map[string]any + require.NoError(t, json.Unmarshal(action.Doc, &body)) + + script, ok := body["script"].(map[string]any) + require.True(t, ok) + src := script["source"].(string) + assert.Contains(t, src, "ctx._source.rooms.add") + assert.Contains(t, src, "ctx.op = 'none'") + assert.Contains(t, src, "roomTimestamps") + assert.Contains(t, src, "params.ts") + + params := script["params"].(map[string]any) + assert.Equal(t, "r-eng", params["rid"]) + assert.Equal(t, float64(ts), params["ts"]) + assert.NotEmpty(t, params["now"]) + + upsert, ok := body["upsert"].(map[string]any) + require.True(t, ok) + assert.Equal(t, "alice", upsert["userAccount"]) + rooms := upsert["rooms"].([]any) + require.Len(t, rooms, 1) + assert.Equal(t, "r-eng", rooms[0]) + + roomTimestamps := upsert["roomTimestamps"].(map[string]any) + assert.Equal(t, float64(ts), roomTimestamps["r-eng"]) + assert.NotEmpty(t, upsert["createdAt"]) +} + +func TestUserRoomCollection_BuildAction_MemberRemoved(t *testing.T) { + coll := newUserRoomCollection("user-room-site-a") + payload := baseInboxMemberEvent() + const ts int64 = 1735689700000 + data := makeInboxMemberEvent(t, model.OutboxMemberRemoved, payload, ts) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + require.Len(t, actions, 1) + + action := actions[0] + assert.Equal(t, searchengine.ActionUpdate, action.Action) + assert.Equal(t, "user-room-site-a", action.Index) + assert.Equal(t, "alice", action.DocID) + + var body map[string]any + require.NoError(t, json.Unmarshal(action.Doc, &body)) + + script, ok := body["script"].(map[string]any) + require.True(t, ok) + src := script["source"].(string) + assert.Contains(t, src, "ctx._source.rooms.remove") + assert.Contains(t, src, "ctx.op = 'none'") + assert.Contains(t, src, "roomTimestamps") + assert.NotContains(t, src, "updatedAt") + + params := script["params"].(map[string]any) + assert.Equal(t, "r-eng", params["rid"]) + assert.Equal(t, float64(ts), params["ts"]) + assert.NotContains(t, params, "now") + assert.Len(t, params, 2) + + _, hasUpsert := body["upsert"] + assert.False(t, hasUpsert, "remove update body must not contain upsert") +} + +// TestUserRoomCollection_BuildAction_RestrictedRoomSkipped verifies the +// event-level restricted-room short-circuit: when HistorySharedSince is +// non-zero on the event, every account in the bulk is skipped (no actions). +// The search service handles restricted rooms via DB+cache at query time. +func TestUserRoomCollection_BuildAction_RestrictedRoomSkipped(t *testing.T) { + coll := newUserRoomCollection("user-room-site-a") + payload := baseInboxMemberEvent() + payload.Accounts = []string{"alice", "bob"} + payload.HistorySharedSince = 1735689500000 + + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 100) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + assert.Empty(t, actions, "restricted-room event should produce no actions") +} + +// TestUserRoomCollection_BuildAction_BulkInvite verifies fan-out: one event +// with N accounts produces N distinct user-room update actions (each keyed +// by a different account). +func TestUserRoomCollection_BuildAction_BulkInvite(t *testing.T) { + coll := newUserRoomCollection("user-room-site-a") + payload := baseInboxMemberEvent() + payload.Accounts = []string{"alice", "bob", "carol"} + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 12345) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + require.Len(t, actions, 3, "3 accounts → 3 update actions") + + seenDocIDs := make(map[string]bool) + for _, action := range actions { + assert.Equal(t, searchengine.ActionUpdate, action.Action) + assert.Equal(t, "user-room-site-a", action.Index) + assert.Zero(t, action.Version) + seenDocIDs[action.DocID] = true + } + assert.True(t, seenDocIDs["alice"]) + assert.True(t, seenDocIDs["bob"]) + assert.True(t, seenDocIDs["carol"]) +} + +func TestUserRoomCollection_BuildAction_Errors(t *testing.T) { + coll := newUserRoomCollection("user-room-site-a") + + t.Run("malformed outbox event", func(t *testing.T) { + _, err := coll.BuildAction([]byte("{invalid")) + assert.Error(t, err) + }) + + t.Run("malformed payload", func(t *testing.T) { + data, _ := json.Marshal(map[string]any{"type": model.OutboxMemberAdded, "payload": "not-bytes"}) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("empty account in list", func(t *testing.T) { + payload := baseInboxMemberEvent() + payload.Accounts = []string{"alice", ""} + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 100) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("missing room id", func(t *testing.T) { + payload := baseInboxMemberEvent() + payload.RoomID = "" + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 100) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("empty accounts", func(t *testing.T) { + payload := baseInboxMemberEvent() + payload.Accounts = nil + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, payload, 100) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("missing timestamp", func(t *testing.T) { + data := makeInboxMemberEvent(t, model.OutboxMemberAdded, baseInboxMemberEvent(), 0) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) + + t.Run("unsupported event type", func(t *testing.T) { + data := makeInboxMemberEvent(t, "room_deleted", baseInboxMemberEvent(), 100) + _, err := coll.BuildAction(data) + assert.Error(t, err) + }) +}