From 16c307311bd98b596ed3e69702c3ff4b34757b4d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 06:25:06 +0000 Subject: [PATCH 01/23] test(search-service): share NATS / ES / Valkey containers across integration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The integration_test.go fixtures used to start fresh containers per test — ~14 fixture invocations across the file, with ES startup (~30-60s) being the worst offender. The four search.rooms tests each waited on a brand-new ES container. This refactor moves shared container startup into setup_shared_test.go, using the same sync.Once pattern already used by pkg/testutil/mongo.go. Each fixture isolates per-test state instead of using a fresh container: - Elasticsearch: unique index name per test via t.Name() hash; the index is DELETEd on cleanup. - Valkey: keyspace is FLUSHDB'd on cleanup so the next test starts clean. Uses a raw go-redis client to avoid exposing FLUSHDB on the production valkeyutil.Client interface. - NATS: each test creates its own *nats.Conn pair; subscriptions are removed via router.Shutdown + nc.Drain before the next test runs. CCS tests are the one exception: they need two networked ES nodes with shared docker-network aliases, which doesn't fit a process-shared pool. They keep their per-test ES pair but now piggyback on the shared Valkey and NATS. Expected impact on a clean run: ES container starts drop from ~6 (4 Rooms + 2 CCS) to 3 (1 shared + 2 CCS); NATS starts drop from ~11 to 1; Valkey starts drop from ~5 to 1. 
--- search-service/integration_test.go | 145 ++++-------------- search-service/setup_shared_test.go | 227 ++++++++++++++++++++++++++++ 2 files changed, 257 insertions(+), 115 deletions(-) create mode 100644 search-service/setup_shared_test.go diff --git a/search-service/integration_test.go b/search-service/integration_test.go index 14dcd5780..3e65be074 100644 --- a/search-service/integration_test.go +++ b/search-service/integration_test.go @@ -17,7 +17,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/testcontainers/testcontainers-go" - natsmod "github.com/testcontainers/testcontainers-go/modules/nats" "github.com/testcontainers/testcontainers-go/network" "github.com/testcontainers/testcontainers-go/wait" @@ -31,7 +30,6 @@ import ( "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" "github.com/hmchangw/chat/pkg/testutil/testimages" - "github.com/hmchangw/chat/pkg/valkeyutil" ) const testUserRoomIndex = "user-room" @@ -53,9 +51,11 @@ type ccsFixture struct { clientNATS *nats.Conn } -// setupCCSFixture stands up the whole CCS environment. Total cost is ~ES -// container start × 2 (~60-90s) so tests that use it should reuse via -// TestMain when added. +// setupCCSFixture stands up the CCS environment. It owns the pair of +// networked ES containers (they need a shared docker network with +// transport-port aliases, so they can't be process-shared like the +// single-node ES used by other fixtures), but piggybacks on the +// process-shared Valkey and NATS from setup_shared_test.go. 
// // Every major step emits a `t.Logf` so a CI failure (where raw logs are // often opaque on public runs) leaves enough breadcrumbs in the `go test` @@ -102,15 +102,9 @@ func setupCCSFixture(t *testing.T) *ccsFixture { remoteEngine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: remoteURL}) require.NoError(t, err, "build searchengine for remote") - t.Logf("CCS fixture: starting valkey") - valkeyAddr := startValkey(t) - valkeyClient, err := valkeyutil.Connect(ctx, valkeyAddr, "") - require.NoError(t, err, "connect valkey") - t.Cleanup(func() { valkeyutil.Disconnect(valkeyClient) }) - t.Logf("CCS fixture: valkey at %s", valkeyAddr) + valkeyClient := freshValkeyClient(t) - t.Logf("CCS fixture: starting NATS") - natsURL := startNATS(t) + natsURL := sharedNATS(t) serverNC, err := natsutil.Connect(natsURL, "") require.NoError(t, err, "connect nats (server side)") t.Cleanup(func() { _ = serverNC.Drain() }) @@ -197,40 +191,6 @@ func startESForCCS(t *testing.T, nw *testcontainers.DockerNetwork, alias, cluste return fmt.Sprintf("http://%s:%s", host, port.Port()) } -func startValkey(t *testing.T) string { - t.Helper() - ctx := context.Background() - container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Valkey, - ExposedPorts: []string{"6379/tcp"}, - Cmd: []string{"valkey-server", "--save", "", "--appendonly", "no"}, - WaitingFor: wait.ForLog("Ready to accept connections").WithStartupTimeout(30 * time.Second), - }, - Started: true, - }) - require.NoError(t, err, "start valkey") - t.Cleanup(func() { _ = container.Terminate(ctx) }) - - host, err := container.Host(ctx) - require.NoError(t, err) - port, err := container.MappedPort(ctx, "6379") - require.NoError(t, err) - return fmt.Sprintf("%s:%s", host, port.Port()) -} - -func startNATS(t *testing.T) string { - t.Helper() - ctx := context.Background() - c, err := natsmod.Run(ctx, 
testimages.NATS) - require.NoError(t, err, "start nats") - t.Cleanup(func() { _ = c.Terminate(ctx) }) - - url, err := c.ConnectionString(ctx) - require.NoError(t, err, "nats connection string") - return url -} - // --- Index templates --------------------------------------------------------- // buildTestTemplate wraps a pattern + property map with single-node-friendly @@ -653,19 +613,10 @@ type appsFixture struct { func setupAppsFixture(t *testing.T) *appsFixture { t.Helper() - ctx := context.Background() mongoDB := testutil.MongoDB(t, "search_service_test") - // Start NATS (reuse the existing NATS container helper). - natsContainer, err := natsmod.Run(ctx, testimages.NATS, - testcontainers.WithWaitStrategy(wait.ForLog("Server is ready").WithStartupTimeout(60*time.Second)), - ) - require.NoError(t, err) - t.Cleanup(func() { _ = natsContainer.Terminate(ctx) }) - - natsURL, err := natsContainer.ConnectionString(ctx) - require.NoError(t, err) + natsURL := sharedNATS(t) serverNATS, err := natsutil.Connect(natsURL, "") require.NoError(t, err) @@ -793,8 +744,7 @@ func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixtu stub := httptest.NewServer(thirdPartyHandler) t.Cleanup(stub.Close) - // NATS. - natsURL := startNATS(t) + natsURL := sharedNATS(t) serverNC, err := natsutil.Connect(natsURL, "") require.NoError(t, err, "connect nats (server side)") t.Cleanup(func() { _ = serverNC.Drain() }) @@ -891,51 +841,28 @@ func TestIntegration_SearchUsers_ThirdPartyErrorReturnsInternal(t *testing.T) { // roomsFixture wires a real ES container (for the spotlight index) and // NATS. search.rooms is served directly from the spotlight index, so no -// Mongo is involved. +// Mongo is involved. The ES container is process-shared; per-test +// isolation comes from a unique spotlight index name (deleted on +// cleanup) plus a Valkey FLUSHDB on cleanup. 
type roomsFixture struct { - clientNATS *nats.Conn - esURL string + clientNATS *nats.Conn + esURL string + spotlightIndex string } -// setupRoomsFixture stands up ES (spotlight index) and NATS. It registers -// t.Cleanup for all containers and returns a ready fixture. +// setupRoomsFixture wires the search-service router against the +// process-shared ES, Valkey and NATS containers. The spotlight index +// name is unique per test so leftovers from a sibling test can't leak +// into this one's hit set. func setupRoomsFixture(t *testing.T) *roomsFixture { t.Helper() ctx := context.Background() - // Single ES node — no CCS needed; spotlight is always local. - container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Elasticsearch, - ExposedPorts: []string{"9200/tcp"}, - Env: map[string]string{ - "discovery.type": "single-node", - "xpack.security.enabled": "false", - "ES_JAVA_OPTS": "-Xms512m -Xmx512m", - "cluster.routing.allocation.disk.threshold_enabled": "false", - }, - WaitingFor: wait.ForAll( - wait.ForHTTP("/").WithPort("9200/tcp").WithStartupTimeout(120*time.Second), - wait.ForHTTP("/_cluster/health?wait_for_status=yellow&timeout=60s"). - WithPort("9200/tcp"). 
- WithStartupTimeout(120*time.Second), - ), - }, - Started: true, - }) - require.NoError(t, err, "start elasticsearch for subs fixture") - t.Cleanup(func() { _ = container.Terminate(ctx) }) - - host, err := container.Host(ctx) - require.NoError(t, err) - port, err := container.MappedPort(ctx, "9200") - require.NoError(t, err) - esURL := fmt.Sprintf("http://%s:%s", host, port.Port()) - - spotlightIndex := "spotlight-subs-test" + esURL := sharedSingleNodeES(t) + spotlightIndex := uniqueESIndex(t, "spotlight") putTestSpotlightIndex(t, esURL, spotlightIndex) - natsURL := startNATS(t) + natsURL := sharedNATS(t) serverNC, err := natsutil.Connect(natsURL, "") require.NoError(t, err, "connect nats (server side)") t.Cleanup(func() { _ = serverNC.Drain() }) @@ -948,7 +875,7 @@ func setupRoomsFixture(t *testing.T) *roomsFixture { require.NoError(t, err, "build searchengine for subs fixture") esStore := newESStore(engine, testUserRoomIndex) - cache := newValkeyCache(newSubsValkeyClient(t)) + cache := newValkeyCache(freshValkeyClient(t)) h := newHandler(esStore, nil, nil, cache, handlerConfig{ DocCounts: 25, MaxDocCounts: 100, @@ -965,18 +892,7 @@ func setupRoomsFixture(t *testing.T) *roomsFixture { require.NoError(t, serverNC.NatsConn().Flush()) t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) - return &roomsFixture{clientNATS: clientNC, esURL: esURL} -} - -// newSubsValkeyClient starts a Valkey testcontainer and returns a connected -// client for use by the subs fixture. Reuses the existing startValkey helper. 
-func newSubsValkeyClient(t *testing.T) valkeyutil.Client { - t.Helper() - addr := startValkey(t) - client, err := valkeyutil.Connect(context.Background(), addr, "") - require.NoError(t, err, "connect valkey for subs fixture") - t.Cleanup(func() { valkeyutil.Disconnect(client) }) - return client + return &roomsFixture{clientNATS: clientNC, esURL: esURL, spotlightIndex: spotlightIndex} } // putTestSpotlightIndex creates a minimal spotlight index in ES with the @@ -1022,7 +938,7 @@ func TestIntegration_SearchRooms_HappyPath(t *testing.T) { now := time.Now().UTC() // Seed spotlight docs for two rooms alice is in. - seedDoc(t, f.esURL, "spotlight-subs-test", "spot-r1", map[string]any{ + seedDoc(t, f.esURL, f.spotlightIndex, "spot-r1", map[string]any{ "roomId": "r1", "roomName": "engineering-announcements", "roomType": "channel", @@ -1030,7 +946,7 @@ func TestIntegration_SearchRooms_HappyPath(t *testing.T) { "siteId": "site-local", "joinedAt": now.Add(-48 * time.Hour).Format(time.RFC3339), }) - seedDoc(t, f.esURL, "spotlight-subs-test", "spot-r2", map[string]any{ + seedDoc(t, f.esURL, f.spotlightIndex, "spot-r2", map[string]any{ "roomId": "r2", "roomName": "engineering-random", "roomType": "channel", @@ -1041,7 +957,7 @@ func TestIntegration_SearchRooms_HappyPath(t *testing.T) { // A matching room owned by a different account. With the Mongo // hydration removed, the spotlight userAccount term filter is the // sole access boundary — this must not leak into alice's results. 
- seedDoc(t, f.esURL, "spotlight-subs-test", "spot-r3", map[string]any{ + seedDoc(t, f.esURL, f.spotlightIndex, "spot-r3", map[string]any{ "roomId": "r3", "roomName": "engineering-secret", "roomType": "channel", @@ -1076,7 +992,7 @@ func TestIntegration_SearchRooms_RoomTypeChannelFilter(t *testing.T) { const account = "bob" now := time.Now().UTC() - seedDoc(t, f.esURL, "spotlight-subs-test", "spot-b-r1", map[string]any{ + seedDoc(t, f.esURL, f.spotlightIndex, "spot-b-r1", map[string]any{ "roomId": "b-r1", "roomName": "bob-alice", "roomType": "dm", @@ -1084,7 +1000,7 @@ func TestIntegration_SearchRooms_RoomTypeChannelFilter(t *testing.T) { "siteId": "site-local", "joinedAt": now.Add(-1 * time.Hour).Format(time.RFC3339), }) - seedDoc(t, f.esURL, "spotlight-subs-test", "spot-b-r2", map[string]any{ + seedDoc(t, f.esURL, f.spotlightIndex, "spot-b-r2", map[string]any{ "roomId": "b-r2", "roomName": "bob-channel", "roomType": "channel", @@ -1171,8 +1087,7 @@ func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { fakeValkey := newFakeCache() fakeValkey.store["alice"] = map[string]int64{} // empty restricted map, cache hit - // NATS - natsURL := startNATS(t) + natsURL := sharedNATS(t) serverNATS, err := natsutil.Connect(natsURL, "") require.NoError(t, err) diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go new file mode 100644 index 000000000..391946053 --- /dev/null +++ b/search-service/setup_shared_test.go @@ -0,0 +1,227 @@ +//go:build integration + +package main + +// This file owns the process-shared test infrastructure used by every +// fixture in integration_test.go. Each container is started exactly once +// via sync.Once and lives for the entire `go test` run; Ryuk (from +// testcontainers-go) reaps it after the process exits. 
+// +// Sharing is safe because tests within this package run sequentially and +// each fixture isolates state per-test: +// +// - Elasticsearch: unique index name per test (uniqueESIndex), DELETEd +// on cleanup. +// - Valkey: flushSharedValkey wipes the keyspace on cleanup. +// - NATS: each test creates its own *nats.Conn pair and +// router.Shutdown / nc.Close remove subscriptions before the next +// test starts. Each fixture also uses a distinct queue group name. +// +// CCS tests are the one exception — they need two networked ES nodes and +// stand up their own pair inside setupCCSFixture. They still piggyback on +// the shared Valkey and NATS, since those don't care about the topology. + +import ( + "context" + "fmt" + "hash/fnv" + "net/http" + "sync" + "testing" + "time" + + goredis "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/require" + "github.com/testcontainers/testcontainers-go" + natsmod "github.com/testcontainers/testcontainers-go/modules/nats" + "github.com/testcontainers/testcontainers-go/wait" + + "github.com/hmchangw/chat/pkg/testutil/testimages" + "github.com/hmchangw/chat/pkg/valkeyutil" +) + +var ( + sharedESOnce sync.Once + sharedESURL string + sharedESErr error + + sharedValkeyOnce sync.Once + sharedValkeyAddr string + sharedValkeyErr error + + sharedNATSOnce sync.Once + sharedNATSURL string + sharedNATSErr error +) + +// sharedSingleNodeES returns the URL of a process-shared single-node ES +// container. CCS tests do NOT use this — they need a pair of networked +// clusters and stand up their own. 
+func sharedSingleNodeES(t *testing.T) string { + t.Helper() + sharedESOnce.Do(func() { + ctx := context.Background() + container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: testcontainers.ContainerRequest{ + Image: testimages.Elasticsearch, + ExposedPorts: []string{"9200/tcp"}, + Env: map[string]string{ + "discovery.type": "single-node", + "xpack.security.enabled": "false", + "ES_JAVA_OPTS": "-Xms512m -Xmx512m", + "cluster.routing.allocation.disk.threshold_enabled": "false", + }, + WaitingFor: wait.ForAll( + wait.ForHTTP("/").WithPort("9200/tcp").WithStartupTimeout(120*time.Second), + wait.ForHTTP("/_cluster/health?wait_for_status=yellow&timeout=60s"). + WithPort("9200/tcp"). + WithStartupTimeout(120*time.Second), + ), + }, + Started: true, + }) + if err != nil { + sharedESErr = fmt.Errorf("start shared elasticsearch: %w", err) + return + } + host, err := container.Host(ctx) + if err != nil { + _ = container.Terminate(ctx) + sharedESErr = fmt.Errorf("get shared es host: %w", err) + return + } + port, err := container.MappedPort(ctx, "9200") + if err != nil { + _ = container.Terminate(ctx) + sharedESErr = fmt.Errorf("get shared es port: %w", err) + return + } + sharedESURL = fmt.Sprintf("http://%s:%s", host, port.Port()) + }) + if sharedESErr != nil { + t.Fatalf("shared elasticsearch: %v", sharedESErr) + } + return sharedESURL +} + +// sharedValkey returns the addr of a process-shared Valkey container. +// Callers should obtain a fresh client via freshValkeyClient so the +// keyspace is wiped on test cleanup. 
+func sharedValkey(t *testing.T) string { + t.Helper() + sharedValkeyOnce.Do(func() { + ctx := context.Background() + container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: testcontainers.ContainerRequest{ + Image: testimages.Valkey, + ExposedPorts: []string{"6379/tcp"}, + Cmd: []string{"valkey-server", "--save", "", "--appendonly", "no"}, + WaitingFor: wait.ForLog("Ready to accept connections").WithStartupTimeout(30 * time.Second), + }, + Started: true, + }) + if err != nil { + sharedValkeyErr = fmt.Errorf("start shared valkey: %w", err) + return + } + host, err := container.Host(ctx) + if err != nil { + _ = container.Terminate(ctx) + sharedValkeyErr = fmt.Errorf("get shared valkey host: %w", err) + return + } + port, err := container.MappedPort(ctx, "6379") + if err != nil { + _ = container.Terminate(ctx) + sharedValkeyErr = fmt.Errorf("get shared valkey port: %w", err) + return + } + sharedValkeyAddr = fmt.Sprintf("%s:%s", host, port.Port()) + }) + if sharedValkeyErr != nil { + t.Fatalf("shared valkey: %v", sharedValkeyErr) + } + return sharedValkeyAddr +} + +// sharedNATS returns the URL of a process-shared NATS container. 
+func sharedNATS(t *testing.T) string { + t.Helper() + sharedNATSOnce.Do(func() { + ctx := context.Background() + c, err := natsmod.Run(ctx, testimages.NATS, + testcontainers.WithWaitStrategy(wait.ForLog("Server is ready").WithStartupTimeout(60*time.Second)), + ) + if err != nil { + sharedNATSErr = fmt.Errorf("start shared nats: %w", err) + return + } + url, err := c.ConnectionString(ctx) + if err != nil { + _ = c.Terminate(ctx) + sharedNATSErr = fmt.Errorf("get shared nats url: %w", err) + return + } + sharedNATSURL = url + }) + if sharedNATSErr != nil { + t.Fatalf("shared nats: %v", sharedNATSErr) + } + return sharedNATSURL +} + +// uniqueESIndex returns a per-test ES index name derived from t.Name() +// and registers a cleanup that DELETEs the index from the shared ES +// when the test ends. The hash keeps the name short, deterministic per +// test, and free of characters that ES dislikes (slashes from subtests). +func uniqueESIndex(t *testing.T, prefix string) string { + t.Helper() + esURL := sharedSingleNodeES(t) + h := fnv.New64a() + _, _ = h.Write([]byte(t.Name())) + name := fmt.Sprintf("%s-%x", prefix, h.Sum64()) + t.Cleanup(func() { + req, err := http.NewRequest(http.MethodDelete, esURL+"/"+name, nil) + if err != nil { + t.Logf("delete index %s: build request: %v", name, err) + return + } + resp, err := testHTTPClient.Do(req) + if err != nil { + t.Logf("delete index %s: %v", name, err) + return + } + _ = resp.Body.Close() + }) + return name +} + +// freshValkeyClient returns a valkeyutil.Client connected to the shared +// Valkey, with cleanup that flushes the keyspace at test end so the next +// test starts clean. Tests in this package run sequentially, so a flush +// is sufficient isolation. 
+func freshValkeyClient(t *testing.T) valkeyutil.Client { + t.Helper() + addr := sharedValkey(t) + client, err := valkeyutil.Connect(context.Background(), addr, "") + require.NoError(t, err, "connect shared valkey") + t.Cleanup(func() { + flushValkey(t, addr) + valkeyutil.Disconnect(client) + }) + return client +} + +// flushValkey wipes the keyspace at addr. Uses a raw go-redis client so +// we don't have to expose FLUSHDB on the production valkeyutil.Client +// interface. +func flushValkey(t *testing.T, addr string) { + t.Helper() + rc := goredis.NewClient(&goredis.Options{Addr: addr}) + defer func() { _ = rc.Close() }() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := rc.FlushDB(ctx).Err(); err != nil { + t.Logf("flush valkey at %s: %v", addr, err) + } +} From 4252c54782ba6c305b9657e4c6f6b292d2109c94 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 06:29:56 +0000 Subject: [PATCH 02/23] test(search-service): fail tests when shared Valkey FLUSHDB cleanup errors A silent log on FLUSHDB failure could let state leak into the next sibling test, surfacing far from the real root cause. t.Errorf marks the offending test failed while still allowing the rest of its cleanup chain (valkeyutil.Disconnect, etc.) to run. Per CodeRabbit review on PR #208. --- search-service/setup_shared_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index 391946053..586f464de 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -214,7 +214,9 @@ func freshValkeyClient(t *testing.T) valkeyutil.Client { // flushValkey wipes the keyspace at addr. Uses a raw go-redis client so // we don't have to expose FLUSHDB on the production valkeyutil.Client -// interface. +// interface. 
A FLUSHDB failure here is fatal to the test: state would +// leak into the next sibling test and produce a confusing assertion +// failure far from the real root cause. func flushValkey(t *testing.T, addr string) { t.Helper() rc := goredis.NewClient(&goredis.Options{Addr: addr}) @@ -222,6 +224,6 @@ func flushValkey(t *testing.T, addr string) { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() if err := rc.FlushDB(ctx).Err(); err != nil { - t.Logf("flush valkey at %s: %v", addr, err) + t.Errorf("flush valkey at %s: %v", addr, err) } } From 6b23294c598c6011fdee2529f8a90b3d0b0411c1 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 07:50:21 +0000 Subject: [PATCH 03/23] test(search-service): split CCS integration tests into their own file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CCS is the one fixture in the file that owns special infrastructure — a pair of ES nodes on a shared docker network with transport-port aliases — so it doesn't fit the process-shared container pattern the other fixtures now use. Moving it (plus its CCS-only helpers: putClusterSetting, waitForRemoteConnected, installTemplates, buildTestTemplate, messageTestTemplate, userRoomTestTemplate) into integration_ccs_test.go leaves integration_test.go homogeneous: all remaining tests follow the same "NATS request → assert response" pattern. testHTTPClient and seedDoc stay in integration_test.go because they're shared with the non-CCS path (uniqueESIndex cleanup, putTestSpotlightIndex, the Rooms tests' seeds). 
--- search-service/integration_ccs_test.go | 580 +++++++++++++++++++++++++ search-service/integration_test.go | 550 +---------------------- 2 files changed, 585 insertions(+), 545 deletions(-) create mode 100644 search-service/integration_ccs_test.go diff --git a/search-service/integration_ccs_test.go b/search-service/integration_ccs_test.go new file mode 100644 index 000000000..f1031d0b3 --- /dev/null +++ b/search-service/integration_ccs_test.go @@ -0,0 +1,580 @@ +//go:build integration + +package main + +// This file owns the cross-cluster-search (CCS) integration tests and +// every helper that only CCS needs. The two CCS tests are the one +// exception to the shared-container pattern in setup_shared_test.go: +// they need a pair of ES nodes on a shared docker network with +// transport-port aliases (`es-local`, `es-remote`), which doesn't fit +// the process-shared single-node ES. NATS and Valkey are still shared. +// +// Shared utilities used here (seedDoc, testHTTPClient, testUserRoomIndex) +// live in integration_test.go. + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "testing" + "time" + + "github.com/nats-io/nats.go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/network" + "github.com/testcontainers/testcontainers-go/wait" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/searchengine" + "github.com/hmchangw/chat/pkg/subject" + "github.com/hmchangw/chat/pkg/testutil/testimages" +) + +// --- Fixture ----------------------------------------------------------------- + +// ccsFixture is the full stack for cross-cluster integration tests: two ES +// containers on a shared Docker network (with CCS configured from local → +// remote), plus Valkey and NATS, plus the wired search-service router. 
+// +// localURL / remoteURL are the host-mapped HTTP URLs for seeding; the +// search-service itself sees only localURL. `clientNATS` is the raw NATS +// client used to issue request/reply calls. +type ccsFixture struct { + localURL string + remoteURL string + localES searchengine.SearchEngine + remoteES searchengine.SearchEngine + clientNATS *nats.Conn +} + +// setupCCSFixture stands up the CCS environment. It owns the pair of +// networked ES containers (they need a shared docker network with +// transport-port aliases, so they can't be process-shared like the +// single-node ES used by other fixtures), but piggybacks on the +// process-shared Valkey and NATS from setup_shared_test.go. +// +// Every major step emits a `t.Logf` so a CI failure (where raw logs are +// often opaque on public runs) leaves enough breadcrumbs in the `go test` +// output to pinpoint which phase broke. +func setupCCSFixture(t *testing.T) *ccsFixture { + t.Helper() + ctx := context.Background() + + t.Logf("CCS fixture: creating docker network") + nw, err := network.New(ctx) + require.NoError(t, err, "create docker network") + t.Cleanup(func() { _ = nw.Remove(ctx) }) + t.Logf("CCS fixture: network %q created", nw.Name) + + t.Logf("CCS fixture: starting remote ES container (alias=es-remote)") + remoteURL := startESForCCS(t, nw, "es-remote", "remote-cluster") + t.Logf("CCS fixture: remote ES up at %s", remoteURL) + + t.Logf("CCS fixture: starting local ES container (alias=es-local)") + localURL := startESForCCS(t, nw, "es-local", "local-cluster") + t.Logf("CCS fixture: local ES up at %s", localURL) + + // Wire local ES to reach the remote in PROXY mode. 
Proxy mode opens a + // single direct connection to the configured address and skips the + // sniff-then-reconnect dance that sniff mode does — that dance requires + // each remote node to advertise a reachable publish address, which is + // fragile when docker containers bind transport on 0.0.0.0 and the + // publish address defaults to an interface the peer can't route to. + // Proxy mode is the robust choice for CCS over an ephemeral docker + // network. Ref: ES docs "Remote cluster settings" → `mode=proxy`. + t.Logf("CCS fixture: configuring cluster.remote.remote1 (proxy mode → es-remote:9300)") + putClusterSetting(t, localURL, map[string]any{ + "persistent": map[string]any{ + "cluster.remote.remote1.mode": "proxy", + "cluster.remote.remote1.proxy_address": "es-remote:9300", + }, + }) + t.Logf("CCS fixture: waiting for remote1 to report connected=true (timeout 120s)") + waitForRemoteConnected(t, localURL, "remote1", 120*time.Second) + t.Logf("CCS fixture: remote1 connected") + + localEngine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: localURL}) + require.NoError(t, err, "build searchengine for local") + remoteEngine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: remoteURL}) + require.NoError(t, err, "build searchengine for remote") + + valkeyClient := freshValkeyClient(t) + + natsURL := sharedNATS(t) + serverNC, err := natsutil.Connect(natsURL, "") + require.NoError(t, err, "connect nats (server side)") + t.Cleanup(func() { _ = serverNC.Drain() }) + + clientNC, err := nats.Connect(natsURL) + require.NoError(t, err, "connect nats (client side)") + t.Cleanup(func() { clientNC.Close() }) + t.Logf("CCS fixture: NATS at %s", natsURL) + + userRoomIndex := testUserRoomIndex + store := newESStore(localEngine, userRoomIndex) + cache := newValkeyCache(valkeyClient) + handler := newHandler(store, nil, nil, cache, handlerConfig{ + DocCounts: 25, + MaxDocCounts: 100, + RestrictedRoomsCacheTTL: 5 * 
time.Minute, + RecentWindow: 365 * 24 * time.Hour, + UserRoomIndex: userRoomIndex, + SpotlightReadPattern: "spotlight-test-*", + }) + + router := natsrouter.New(serverNC, "search-service-test") + router.Use(natsrouter.RequestID()) + handler.Register(router) + // Flush — see setupAppsFixture for the rationale. + require.NoError(t, serverNC.NatsConn().Flush()) + + return &ccsFixture{ + localURL: localURL, + remoteURL: remoteURL, + localES: localEngine, + remoteES: remoteEngine, + clientNATS: clientNC, + } +} + +// startESForCCS starts one ES node on the shared network with the given +// network alias so the peer can reach it at `{alias}:9300`. Returns the +// host-mapped HTTP URL for seeding. +// +// `transport.host: 0.0.0.0` is required so the transport port binds on all +// interfaces, including the bridge network (ES 8.x defaults to `_site_` +// which excludes the container's bridge IP in some setups). CCS itself +// uses `proxy` mode to avoid publish-address sensitivity — see +// setupCCSFixture. `xpack.security.enabled=false` matches the local dev +// deps compose. 
+func startESForCCS(t *testing.T, nw *testcontainers.DockerNetwork, alias, clusterName string) string { + t.Helper() + ctx := context.Background() + + container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: testcontainers.ContainerRequest{ + Image: testimages.Elasticsearch, + ExposedPorts: []string{"9200/tcp", "9300/tcp"}, + Networks: []string{nw.Name}, + NetworkAliases: map[string][]string{ + nw.Name: {alias}, + }, + Env: map[string]string{ + "cluster.name": clusterName, + "discovery.type": "single-node", + "xpack.security.enabled": "false", + "network.host": "0.0.0.0", + "transport.host": "0.0.0.0", + "cluster.routing.allocation.disk.threshold_enabled": "false", + "ES_JAVA_OPTS": "-Xms512m -Xmx512m", + }, + WaitingFor: wait.ForAll( + wait.ForHTTP("/").WithPort("9200/tcp").WithStartupTimeout(120*time.Second), + wait.ForHTTP("/_cluster/health?wait_for_status=yellow&timeout=60s"). + WithPort("9200/tcp"). + WithStartupTimeout(120*time.Second), + ), + }, + Started: true, + }) + require.NoError(t, err, "start elasticsearch (%s)", alias) + t.Cleanup(func() { _ = container.Terminate(ctx) }) + + host, err := container.Host(ctx) + require.NoError(t, err) + port, err := container.MappedPort(ctx, "9200") + require.NoError(t, err) + return fmt.Sprintf("http://%s:%s", host, port.Port()) +} + +// --- Index templates --------------------------------------------------------- + +// buildTestTemplate wraps a pattern + property map with single-node-friendly +// index settings (1 shard, 0 replicas, 1s refresh) and `dynamic: false` +// mappings. The templates below hand-roll their property sets so the tests +// remain independent of search-sync-worker's custom-analyzer configuration. 
+func buildTestTemplate(pattern string, properties map[string]any) json.RawMessage { + body := map[string]any{ + "index_patterns": []string{pattern}, + "template": map[string]any{ + "settings": map[string]any{ + "index": map[string]any{ + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "1s", + }, + }, + "mappings": map[string]any{ + "dynamic": false, + "properties": properties, + }, + }, + } + data, _ := json.Marshal(body) + return data +} + +func messageTestTemplate() json.RawMessage { + return buildTestTemplate("messages-*", map[string]any{ + "messageId": map[string]any{"type": "keyword"}, + "roomId": map[string]any{"type": "keyword"}, + "siteId": map[string]any{"type": "keyword"}, + "userId": map[string]any{"type": "keyword"}, + "userAccount": map[string]any{"type": "keyword"}, + "content": map[string]any{ + "type": "text", + "fields": map[string]any{ + "keyword": map[string]any{"type": "keyword"}, + }, + }, + "createdAt": map[string]any{"type": "date"}, + "threadParentMessageId": map[string]any{"type": "keyword"}, + "threadParentMessageCreatedAt": map[string]any{"type": "date"}, + "tshow": map[string]any{"type": "boolean"}, + }) +} + +func userRoomTestTemplate() json.RawMessage { + return buildTestTemplate(testUserRoomIndex, map[string]any{ + "userAccount": map[string]any{"type": "keyword"}, + "rooms": map[string]any{ + "type": "text", + "fields": map[string]any{ + "keyword": map[string]any{"type": "keyword", "ignore_above": 256}, + }, + }, + "restrictedRooms": map[string]any{"type": "flattened"}, + "roomTimestamps": map[string]any{"type": "flattened"}, + "createdAt": map[string]any{"type": "date"}, + "updatedAt": map[string]any{"type": "date"}, + }) +} + +// --- CCS HTTP helpers -------------------------------------------------------- + +// putClusterSetting pushes a /_cluster/settings update. Used to configure +// the CCS remote after both clusters are up. 
+func putClusterSetting(t *testing.T, esURL string, body map[string]any) {
+	t.Helper()
+	// Fail here on an unencodable body rather than sending an empty payload
+	// and debugging the resulting non-200 from Elasticsearch.
+	data, err := json.Marshal(body)
+	require.NoError(t, err, "marshal cluster settings")
+	req, err := http.NewRequest(http.MethodPut, esURL+"/_cluster/settings", bytes.NewReader(data))
+	require.NoError(t, err)
+	req.Header.Set("Content-Type", "application/json")
+	resp, err := testHTTPClient.Do(req)
+	require.NoError(t, err, "put cluster settings")
+	defer resp.Body.Close()
+	// Best-effort read: the body is only used to enrich the failure message.
+	respBody, _ := io.ReadAll(resp.Body)
+	require.Equal(t, http.StatusOK, resp.StatusCode, "put cluster settings: %s", respBody)
+}
+
+// waitForRemoteConnected polls /_remote/info until the given remote cluster
+// reports connected=true. CCS registration is async — the settings call
+// returns immediately but the transport handshake happens in the
+// background. On timeout, the last-seen /_remote/info body and the last
+// request error are captured in the failure message so CI can diagnose
+// whether the remote was ever registered, what mode it ended up in, or why
+// it couldn't connect — including the case where the endpoint itself never
+// answered.
+func waitForRemoteConnected(t *testing.T, localURL, remoteName string, timeout time.Duration) {
+	t.Helper()
+	deadline := time.Now().Add(timeout)
+	var (
+		lastBody string
+		lastErr  error
+	)
+	for time.Now().Before(deadline) {
+		resp, err := testHTTPClient.Get(localURL + "/_remote/info")
+		if err != nil {
+			// Remember why the poll failed; without this a timeout caused by
+			// an unreachable endpoint reports nothing useful.
+			lastErr = err
+		} else {
+			// Best-effort read: a short body simply fails to decode below and
+			// the loop polls again.
+			body, _ := io.ReadAll(resp.Body)
+			resp.Body.Close()
+			lastBody = string(body)
+			var info map[string]struct {
+				Connected bool `json:"connected"`
+			}
+			if json.Unmarshal(body, &info) == nil {
+				if entry, ok := info[remoteName]; ok && entry.Connected {
+					return
+				}
+			}
+		}
+		time.Sleep(1 * time.Second)
+	}
+	t.Fatalf("remote cluster %q never became connected within %s\nlast /_remote/info body: %s\nlast request error: %v",
+		remoteName, timeout, lastBody, lastErr)
+}
+
+// --- Templates on both clusters ---------------------------------------------
+
+// installTemplates upserts the messages template on both clusters and the
+// user-room template on the local cluster only, logging each step.
+func (f *ccsFixture) installTemplates(t *testing.T) {
+	t.Helper()
+	ctx := context.Background()
+
+	t.Logf("templates: upserting messages_template on local")
+	require.NoError(t, f.localES.UpsertTemplate(ctx, "messages_template", messageTestTemplate()),
+		"upsert messages_template on local")
+	t.Logf("templates: upserting messages_template on remote")
+	require.NoError(t, f.remoteES.UpsertTemplate(ctx, "messages_template", messageTestTemplate()),
+		"upsert messages_template on remote")
+	// user-room is local-only per the search-service architecture.
+	t.Logf("templates: upserting user_room_template on local")
+	require.NoError(t, f.localES.UpsertTemplate(ctx, "user_room_template", userRoomTestTemplate()),
+		"upsert user_room_template on local")
+	t.Logf("templates: all upserted")
+}
+
+// --- Tests -------------------------------------------------------------------
+
+// TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted verifies
+// the core CCS promise: a user's search crosses from the local cluster
+// (`messages-*`) to a remote cluster (`*:messages-*`) and the service
+// returns the merged result set. Both rooms are unrestricted — they live in
+// the user-room doc's `rooms[]` — and the terms-lookup clause handles them
+// uniformly regardless of which site hosts the message.
+func TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted(t *testing.T) {
+	f := setupCCSFixture(t)
+	f.installTemplates(t)
+
+	// --- Seed --------------------------------------------------------------
+	//
+	// Alice is a member of two unrestricted rooms: one lives on the local
+	// site, the other on the remote site. The user-room doc (local-only)
+	// lists BOTH in `rooms[]` — the sync-worker would normally populate
+	// this via INBOX events; here we seed directly.
+	const account = "alice"
+	const localRoomID = "room-local-1"
+	const remoteRoomID = "room-remote-1"
+
+	now := time.Now().UTC()
+	createdAt := now.Add(-time.Hour)
+	monthIdx := "messages-" + createdAt.Format("2006-01")
+
+	// user-room doc: unrestricted memberships in both rooms.
+	seedDoc(t, f.localURL, testUserRoomIndex, account, map[string]any{
+		"userAccount":     account,
+		"rooms":           []string{localRoomID, remoteRoomID},
+		"restrictedRooms": map[string]int64{},
+		"roomTimestamps": map[string]int64{
+			localRoomID:  createdAt.UnixMilli(),
+			remoteRoomID: createdAt.UnixMilli(),
+		},
+		"createdAt": createdAt.Format(time.RFC3339Nano),
+		"updatedAt": createdAt.Format(time.RFC3339Nano),
+	})
+
+	// Local message in local room.
+	seedDoc(t, f.localURL, monthIdx, "msg-local-1", map[string]any{
+		"messageId":   "msg-local-1",
+		"roomId":      localRoomID,
+		"siteId":      "site-local",
+		"userId":      "user-bob",
+		"userAccount": "bob",
+		"content":     "hello from local",
+		"createdAt":   createdAt.Format(time.RFC3339Nano),
+	})
+
+	// Remote message in remote room. Same index pattern (`messages-*`) on
+	// the remote cluster — CCS resolves the `*:messages-*` segment on the
+	// local query.
+	seedDoc(t, f.remoteURL, monthIdx, "msg-remote-1", map[string]any{
+		"messageId":   "msg-remote-1",
+		"roomId":      remoteRoomID,
+		"siteId":      "site-remote",
+		"userId":      "user-carol",
+		"userAccount": "carol",
+		"content":     "hello from remote",
+		"createdAt":   createdAt.Format(time.RFC3339Nano),
+	})
+
+	// --- Search via NATS ---------------------------------------------------
+	//
+	// Round-trips through the real natsrouter: the handler reads
+	// restrictedRooms from Valkey (miss → ES prefetch → Valkey SET), then
+	// builds the CCS query against `messages-*,*:messages-*` and parses
+	// the merged response.
+	req := model.SearchMessagesRequest{Query: "hello"}
+	reqData, err := json.Marshal(req)
+	require.NoError(t, err)
+
+	// Generous timeout: first request is Valkey miss → ES prefetch of
+	// user-room doc → CCS fanout → response parse. Tight timeouts mask
+	// real latency bugs in integration.
+	msg, err := f.clientNATS.Request(subject.SearchMessages(account), reqData, 30*time.Second)
+	require.NoError(t, err, "NATS request failed")
+
+	t.Logf("response: %s", msg.Data)
+
+	var resp model.SearchMessagesResponse
+	require.NoError(t, json.Unmarshal(msg.Data, &resp), "decode response: %s", msg.Data)
+
+	assert.EqualValues(t, 2, resp.Total, "expected both local + remote hits; got body=%s", msg.Data)
+	require.Len(t, resp.Messages, 2, "expected 2 hits; got body=%s", msg.Data)
+
+	gotRooms := map[string]string{}
+	for _, hit := range resp.Messages {
+		gotRooms[hit.RoomID] = hit.SiteID
+	}
+	assert.Equal(t, "site-local", gotRooms[localRoomID], "local message should be present")
+	assert.Equal(t, "site-remote", gotRooms[remoteRoomID], "remote message should be present via CCS")
+}
+
+// TestSearchService_SearchMessages_CCS_CrossCluster_Restricted verifies
+// the restricted-room access-control clauses fire correctly across the
+// CCS boundary. Alice is a member of one UNRESTRICTED local room and one
+// RESTRICTED remote room with historySharedSince (HSS) set to a specific
+// cutoff. The user-room doc (local-only) routes the remote room into
+// `restrictedRooms{rid: hssMillis}`.
+//
+// Seed on the remote cluster covers every branch the query builder
+// encodes for restricted rooms:
+//
+//   - pre-HSS parent → MUST NOT match (Clause A: createdAt < hss)
+//   - post-HSS parent → MUST match (Clause A)
+//   - post-HSS thread reply, tshow=true → MUST match (Clause B1: outer gate passes + tshow=true fires B1, even though parent is pre-HSS)
+//   - post-HSS thread reply, tshow=false → MUST NOT match (Clause B fails: outer gate passes but inner OR fails — tshow=false AND parent < hss so B2 also fails)
+//
+// Plus one unrestricted local parent to prove the two paths interact
+// cleanly on the same search. Total expected hits: 3 (local + post-HSS
+// remote parent + post-HSS remote reply with tshow=true).
+func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) {
+	fx := setupCCSFixture(t)
+	fx.installTemplates(t)
+
+	const (
+		account      = "alice"
+		localRoomID  = "room-local-unrestricted"
+		remoteRoomID = "room-remote-restricted"
+	)
+
+	// Timeline, all derived from one reading of the clock:
+	//   - hss:     the history-shared-since bound for the restricted remote room.
+	//   - preHSS:  three hours before hss, so clearly behind the gate.
+	//   - postHSS: one hour after hss.
+	// Every instant sits well inside the default 1-year `recent_window`, so
+	// the global createdAt range filter never removes any of them.
+	now := time.Now().UTC()
+	hss := now.Add(-2 * time.Hour)
+	preHSS := hss.Add(-3 * time.Hour)
+	postHSS := hss.Add(time.Hour)
+
+	// stamp renders a timestamp the way the seeded documents store it.
+	stamp := func(ts time.Time) string { return ts.Format(time.RFC3339Nano) }
+	// indexFor names the monthly messages index a timestamp falls into.
+	indexFor := func(ts time.Time) string { return "messages-" + ts.Format("2006-01") }
+	// remoteDoc builds the fields shared by every message seeded into the
+	// restricted remote room; thread fields are added by the caller.
+	remoteDoc := func(id, content string, createdAt time.Time) map[string]any {
+		return map[string]any{
+			"messageId":   id,
+			"roomId":      remoteRoomID,
+			"siteId":      "site-remote",
+			"userId":      "user-carol",
+			"userAccount": "carol",
+			"content":     content,
+			"createdAt":   stamp(createdAt),
+		}
+	}
+
+	// Membership doc: the local room is unrestricted, the remote room is
+	// restricted and gated at hss.
+	t.Logf("seed: upserting user-room doc for %s (restricted %s since %s)", account, remoteRoomID, hss.Format(time.RFC3339))
+	nowMillis := now.UnixMilli()
+	seedDoc(t, fx.localURL, testUserRoomIndex, account, map[string]any{
+		"userAccount":     account,
+		"rooms":           []string{localRoomID},
+		"restrictedRooms": map[string]int64{remoteRoomID: hss.UnixMilli()},
+		"roomTimestamps": map[string]int64{
+			localRoomID:  nowMillis,
+			remoteRoomID: nowMillis,
+		},
+		"createdAt": stamp(now),
+		"updatedAt": stamp(now),
+	})
+
+	// --- LOCAL unrestricted room ----------------------------------------
+	// A single plain message; it is expected to match through the
+	// terms-lookup branch with no HSS involvement.
+	t.Logf("seed: local unrestricted message in %s", localRoomID)
+	seedDoc(t, fx.localURL, indexFor(postHSS), "msg-local-1", map[string]any{
+		"messageId":   "msg-local-1",
+		"roomId":      localRoomID,
+		"siteId":      "site-local",
+		"userId":      "user-bob",
+		"userAccount": "bob",
+		"content":     "hello from local",
+		"createdAt":   stamp(postHSS),
+	})
+
+	// --- REMOTE restricted room -----------------------------------------
+	// Four messages, one per branch of the restricted-room clauses. The
+	// pre-HSS parent is `msg-remote-pre-parent`; both thread replies point
+	// at it via threadParentMessageId and carry
+	// threadParentMessageCreatedAt=preHSS.
+	t.Logf("seed: remote pre-HSS parent (MUST NOT match)")
+	seedDoc(t, fx.remoteURL, indexFor(preHSS), "msg-remote-pre-parent",
+		remoteDoc("msg-remote-pre-parent", "hello pre-hss parent", preHSS))
+
+	t.Logf("seed: remote post-HSS parent (Clause A match)")
+	seedDoc(t, fx.remoteURL, indexFor(postHSS), "msg-remote-post-parent",
+		remoteDoc("msg-remote-post-parent", "hello post-hss parent", postHSS))
+
+	// Reply written after hss to a parent written before it, tshow=true →
+	// Clause B1. The reply's own createdAt clears Clause B's outer gate
+	// (createdAt >= hss) and tshow=true then fires B1 whatever the parent's
+	// age. Without the outer gate a pre-HSS tshow=true reply would leak
+	// history the user never had access to.
+	t.Logf("seed: remote post-HSS reply with tshow=true, pre-HSS parent (Clause B1 match)")
+	tshowReply := remoteDoc("msg-remote-reply-tshow", "hello tshow reply", postHSS.Add(time.Minute))
+	tshowReply["threadParentMessageId"] = "msg-remote-pre-parent"
+	tshowReply["threadParentMessageCreatedAt"] = stamp(preHSS)
+	tshowReply["tshow"] = true
+	seedDoc(t, fx.remoteURL, indexFor(postHSS), "msg-remote-reply-tshow", tshowReply)
+
+	// Same shape of reply but without tshow → Clause B rejects it. The outer
+	// gate passes (reply createdAt >= hss) yet the inner OR fails: no tshow
+	// blocks B1 and the parent's pre-HSS createdAt blocks B2.
+	t.Logf("seed: remote post-HSS reply without tshow, pre-HSS parent (MUST NOT match)")
+	plainReply := remoteDoc("msg-remote-reply-plain", "hello plain reply", postHSS.Add(2*time.Minute))
+	plainReply["threadParentMessageId"] = "msg-remote-pre-parent"
+	plainReply["threadParentMessageCreatedAt"] = stamp(preHSS)
+	seedDoc(t, fx.remoteURL, indexFor(postHSS), "msg-remote-reply-plain", plainReply)
+
+	// --- Search ---------------------------------------------------------
+	payload, err := json.Marshal(model.SearchMessagesRequest{Query: "hello"})
+	require.NoError(t, err)
+
+	reply, err := fx.clientNATS.Request(subject.SearchMessages(account), payload, 30*time.Second)
+	require.NoError(t, err, "NATS request failed")
+	t.Logf("response: %s", reply.Data)
+
+	var result model.SearchMessagesResponse
+	require.NoError(t, json.Unmarshal(reply.Data, &result), "decode response: %s", reply.Data)
+
+	// Index the hits by message id so each expectation is a single lookup.
+	matched := make(map[string]bool, len(result.Messages))
+	for _, hit := range result.Messages {
+		matched[hit.MessageID] = true
+	}
+
+	// Messages that must come back:
+	assert.True(t, matched["msg-local-1"], "local unrestricted message must match via terms-lookup")
+	assert.True(t, matched["msg-remote-post-parent"], "post-HSS remote parent must match via Clause A (CCS)")
+	assert.True(t, matched["msg-remote-reply-tshow"], "post-HSS remote reply with tshow=true must match via Clause B1 (CCS)")
+
+	// Messages that must stay hidden:
+	assert.False(t, matched["msg-remote-pre-parent"], "pre-HSS remote parent must be excluded by Clause A gate")
+	assert.False(t, matched["msg-remote-reply-plain"], "post-HSS remote reply without tshow + pre-HSS parent must be excluded (outer gate passes; B1 and B2 both fail)")
+
+	assert.EqualValues(t, 3, result.Total, "expected exactly 3 hits; got body=%s", reply.Data)
+	require.Len(t, result.Messages, 3, "expected 3 hits; got body=%s", reply.Data)
+}
diff --git a/search-service/integration_test.go b/search-service/integration_test.go index 3e65be074..1f095ff81 100644 --- a/search-service/integration_test.go +++ b/search-service/integration_test.go @@ -16,9 +16,6 @@ import ( "github.com/nats-io/nats.go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/testcontainers/testcontainers-go" - "github.com/testcontainers/testcontainers-go/network" - "github.com/testcontainers/testcontainers-go/wait" "go.mongodb.org/mongo-driver/v2/mongo" @@ -29,285 +26,22 @@ import ( "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" - "github.com/hmchangw/chat/pkg/testutil/testimages" ) const testUserRoomIndex = "user-room" -// --- Fixture ----------------------------------------------------------------- - -// ccsFixture is the full stack for cross-cluster integration tests: two ES -// containers on a shared Docker network (with CCS configured from local → -// remote), plus Valkey and NATS, plus the wired search-service router. -// -// localURL / remoteURL are the host-mapped HTTP URLs for seeding; the -// search-service itself sees only localURL.
`clientNATS` is the raw NATS -// client used to issue request/reply calls. -type ccsFixture struct { - localURL string - remoteURL string - localES searchengine.SearchEngine - remoteES searchengine.SearchEngine - clientNATS *nats.Conn -} - -// setupCCSFixture stands up the CCS environment. It owns the pair of -// networked ES containers (they need a shared docker network with -// transport-port aliases, so they can't be process-shared like the -// single-node ES used by other fixtures), but piggybacks on the -// process-shared Valkey and NATS from setup_shared_test.go. -// -// Every major step emits a `t.Logf` so a CI failure (where raw logs are -// often opaque on public runs) leaves enough breadcrumbs in the `go test` -// output to pinpoint which phase broke. -func setupCCSFixture(t *testing.T) *ccsFixture { - t.Helper() - ctx := context.Background() - - t.Logf("CCS fixture: creating docker network") - nw, err := network.New(ctx) - require.NoError(t, err, "create docker network") - t.Cleanup(func() { _ = nw.Remove(ctx) }) - t.Logf("CCS fixture: network %q created", nw.Name) - - t.Logf("CCS fixture: starting remote ES container (alias=es-remote)") - remoteURL := startESForCCS(t, nw, "es-remote", "remote-cluster") - t.Logf("CCS fixture: remote ES up at %s", remoteURL) - - t.Logf("CCS fixture: starting local ES container (alias=es-local)") - localURL := startESForCCS(t, nw, "es-local", "local-cluster") - t.Logf("CCS fixture: local ES up at %s", localURL) - - // Wire local ES to reach the remote in PROXY mode. Proxy mode opens a - // single direct connection to the configured address and skips the - // sniff-then-reconnect dance that sniff mode does — that dance requires - // each remote node to advertise a reachable publish address, which is - // fragile when docker containers bind transport on 0.0.0.0 and the - // publish address defaults to an interface the peer can't route to. - // Proxy mode is the robust choice for CCS over an ephemeral docker - // network. 
Ref: ES docs "Remote cluster settings" → `mode=proxy`. - t.Logf("CCS fixture: configuring cluster.remote.remote1 (proxy mode → es-remote:9300)") - putClusterSetting(t, localURL, map[string]any{ - "persistent": map[string]any{ - "cluster.remote.remote1.mode": "proxy", - "cluster.remote.remote1.proxy_address": "es-remote:9300", - }, - }) - t.Logf("CCS fixture: waiting for remote1 to report connected=true (timeout 120s)") - waitForRemoteConnected(t, localURL, "remote1", 120*time.Second) - t.Logf("CCS fixture: remote1 connected") - - localEngine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: localURL}) - require.NoError(t, err, "build searchengine for local") - remoteEngine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: remoteURL}) - require.NoError(t, err, "build searchengine for remote") - - valkeyClient := freshValkeyClient(t) - - natsURL := sharedNATS(t) - serverNC, err := natsutil.Connect(natsURL, "") - require.NoError(t, err, "connect nats (server side)") - t.Cleanup(func() { _ = serverNC.Drain() }) - - clientNC, err := nats.Connect(natsURL) - require.NoError(t, err, "connect nats (client side)") - t.Cleanup(func() { clientNC.Close() }) - t.Logf("CCS fixture: NATS at %s", natsURL) - - userRoomIndex := testUserRoomIndex - store := newESStore(localEngine, userRoomIndex) - cache := newValkeyCache(valkeyClient) - handler := newHandler(store, nil, nil, cache, handlerConfig{ - DocCounts: 25, - MaxDocCounts: 100, - RestrictedRoomsCacheTTL: 5 * time.Minute, - RecentWindow: 365 * 24 * time.Hour, - UserRoomIndex: userRoomIndex, - SpotlightReadPattern: "spotlight-test-*", - }) - - router := natsrouter.New(serverNC, "search-service-test") - router.Use(natsrouter.RequestID()) - handler.Register(router) - // Flush — see setupAppsFixture for the rationale. 
- require.NoError(t, serverNC.NatsConn().Flush()) - - return &ccsFixture{ - localURL: localURL, - remoteURL: remoteURL, - localES: localEngine, - remoteES: remoteEngine, - clientNATS: clientNC, - } -} - -// startESForCCS starts one ES node on the shared network with the given -// network alias so the peer can reach it at `{alias}:9300`. Returns the -// host-mapped HTTP URL for seeding. -// -// `transport.host: 0.0.0.0` is required so the transport port binds on all -// interfaces, including the bridge network (ES 8.x defaults to `_site_` -// which excludes the container's bridge IP in some setups). CCS itself -// uses `proxy` mode to avoid publish-address sensitivity — see -// setupCCSFixture. `xpack.security.enabled=false` matches the local dev -// deps compose. -func startESForCCS(t *testing.T, nw *testcontainers.DockerNetwork, alias, clusterName string) string { - t.Helper() - ctx := context.Background() - - container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Elasticsearch, - ExposedPorts: []string{"9200/tcp", "9300/tcp"}, - Networks: []string{nw.Name}, - NetworkAliases: map[string][]string{ - nw.Name: {alias}, - }, - Env: map[string]string{ - "cluster.name": clusterName, - "discovery.type": "single-node", - "xpack.security.enabled": "false", - "network.host": "0.0.0.0", - "transport.host": "0.0.0.0", - "cluster.routing.allocation.disk.threshold_enabled": "false", - "ES_JAVA_OPTS": "-Xms512m -Xmx512m", - }, - WaitingFor: wait.ForAll( - wait.ForHTTP("/").WithPort("9200/tcp").WithStartupTimeout(120*time.Second), - wait.ForHTTP("/_cluster/health?wait_for_status=yellow&timeout=60s"). - WithPort("9200/tcp"). 
- WithStartupTimeout(120*time.Second), - ), - }, - Started: true, - }) - require.NoError(t, err, "start elasticsearch (%s)", alias) - t.Cleanup(func() { _ = container.Terminate(ctx) }) - - host, err := container.Host(ctx) - require.NoError(t, err) - port, err := container.MappedPort(ctx, "9200") - require.NoError(t, err) - return fmt.Sprintf("http://%s:%s", host, port.Port()) -} - -// --- Index templates --------------------------------------------------------- - -// buildTestTemplate wraps a pattern + property map with single-node-friendly -// index settings (1 shard, 0 replicas, 1s refresh) and `dynamic: false` -// mappings. The templates below hand-roll their property sets so the tests -// remain independent of search-sync-worker's custom-analyzer configuration. -func buildTestTemplate(pattern string, properties map[string]any) json.RawMessage { - body := map[string]any{ - "index_patterns": []string{pattern}, - "template": map[string]any{ - "settings": map[string]any{ - "index": map[string]any{ - "number_of_shards": 1, - "number_of_replicas": 0, - "refresh_interval": "1s", - }, - }, - "mappings": map[string]any{ - "dynamic": false, - "properties": properties, - }, - }, - } - data, _ := json.Marshal(body) - return data -} - -func messageTestTemplate() json.RawMessage { - return buildTestTemplate("messages-*", map[string]any{ - "messageId": map[string]any{"type": "keyword"}, - "roomId": map[string]any{"type": "keyword"}, - "siteId": map[string]any{"type": "keyword"}, - "userId": map[string]any{"type": "keyword"}, - "userAccount": map[string]any{"type": "keyword"}, - "content": map[string]any{ - "type": "text", - "fields": map[string]any{ - "keyword": map[string]any{"type": "keyword"}, - }, - }, - "createdAt": map[string]any{"type": "date"}, - "threadParentMessageId": map[string]any{"type": "keyword"}, - "threadParentMessageCreatedAt": map[string]any{"type": "date"}, - "tshow": map[string]any{"type": "boolean"}, - }) -} - -func userRoomTestTemplate() 
json.RawMessage { - return buildTestTemplate(testUserRoomIndex, map[string]any{ - "userAccount": map[string]any{"type": "keyword"}, - "rooms": map[string]any{ - "type": "text", - "fields": map[string]any{ - "keyword": map[string]any{"type": "keyword", "ignore_above": 256}, - }, - }, - "restrictedRooms": map[string]any{"type": "flattened"}, - "roomTimestamps": map[string]any{"type": "flattened"}, - "createdAt": map[string]any{"type": "date"}, - "updatedAt": map[string]any{"type": "date"}, - }) -} - -// --- HTTP helpers ------------------------------------------------------------ +// --- Shared HTTP helpers ----------------------------------------------------- // testHTTPClient is a bounded HTTP client for ES control-plane calls — // stalled containers shouldn't be able to hang the integration job past // the per-call deadline. Kept small on purpose: these calls hit localhost // (docker-mapped port) and are cheap when they succeed. +// +// Used by seedDoc (below), by the index-cleanup path in uniqueESIndex +// (setup_shared_test.go), by putTestSpotlightIndex, and by the CCS-only +// helpers in integration_ccs_test.go. var testHTTPClient = &http.Client{Timeout: 10 * time.Second} -// putClusterSetting pushes a /_cluster/settings update. Used to configure -// the CCS remote after both clusters are up. -func putClusterSetting(t *testing.T, esURL string, body map[string]any) { - t.Helper() - data, _ := json.Marshal(body) - req, err := http.NewRequest(http.MethodPut, esURL+"/_cluster/settings", bytes.NewReader(data)) - require.NoError(t, err) - req.Header.Set("Content-Type", "application/json") - resp, err := testHTTPClient.Do(req) - require.NoError(t, err, "put cluster settings") - defer resp.Body.Close() - respBody, _ := io.ReadAll(resp.Body) - require.Equal(t, http.StatusOK, resp.StatusCode, "put cluster settings: %s", respBody) -} - -// waitForRemoteConnected polls /_remote/info until the given remote cluster -// reports connected=true. 
CCS registration is async — the settings call -// returns immediately but the transport handshake happens in the -// background. On timeout, the last-seen /_remote/info body is captured in -// the failure message so CI can diagnose whether the remote was ever -// registered, what mode it ended up in, or why it couldn't connect. -func waitForRemoteConnected(t *testing.T, localURL, remoteName string, timeout time.Duration) { - t.Helper() - deadline := time.Now().Add(timeout) - var lastBody string - for time.Now().Before(deadline) { - resp, err := testHTTPClient.Get(localURL + "/_remote/info") - if err == nil { - body, _ := io.ReadAll(resp.Body) - resp.Body.Close() - lastBody = string(body) - var info map[string]struct { - Connected bool `json:"connected"` - } - if json.Unmarshal(body, &info) == nil { - if entry, ok := info[remoteName]; ok && entry.Connected { - return - } - } - } - time.Sleep(1 * time.Second) - } - t.Fatalf("remote cluster %q never became connected within %s\nlast /_remote/info body: %s", - remoteName, timeout, lastBody) -} - // seedDoc PUTs a JSON document into ES, synchronously refreshing the index // so the next search sees it. 
func seedDoc(t *testing.T, esURL, index, id string, doc any) { @@ -326,280 +60,6 @@ func seedDoc(t *testing.T, esURL, index, id string, doc any) { "seedDoc %s/%s: status=%d body=%s", index, id, resp.StatusCode, body) } -// --- Templates on both clusters --------------------------------------------- - -func (f *ccsFixture) installTemplates(t *testing.T) { - t.Helper() - ctx := context.Background() - - t.Logf("templates: upserting messages_template on local") - require.NoError(t, f.localES.UpsertTemplate(ctx, "messages_template", messageTestTemplate()), - "upsert messages_template on local") - t.Logf("templates: upserting messages_template on remote") - require.NoError(t, f.remoteES.UpsertTemplate(ctx, "messages_template", messageTestTemplate()), - "upsert messages_template on remote") - // user-room is local-only per the search-service architecture. - t.Logf("templates: upserting user_room_template on local") - require.NoError(t, f.localES.UpsertTemplate(ctx, "user_room_template", userRoomTestTemplate()), - "upsert user_room_template on local") - t.Logf("templates: all upserted") -} - -// --- Test -------------------------------------------------------------------- - -// TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted verifies -// the core CCS promise: a user's search crosses from the local cluster -// (`messages-*`) to a remote cluster (`*:messages-*`) and the service -// returns the merged result set. Both rooms are unrestricted — they live in -// the user-room doc's `rooms[]` — and the terms-lookup clause handles them -// uniformly regardless of which site hosts the message. -func TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted(t *testing.T) { - f := setupCCSFixture(t) - f.installTemplates(t) - - // --- Seed -------------------------------------------------------------- - // - // Alice is a member of two unrestricted rooms: one lives on the local - // site, the other on the remote site. 
The user-room doc (local-only) - // lists BOTH in `rooms[]` — the sync-worker would normally populate - // this via INBOX events; here we seed directly. - const account = "alice" - const localRoomID = "room-local-1" - const remoteRoomID = "room-remote-1" - - now := time.Now().UTC() - createdAt := now.Add(-time.Hour) - monthIdx := "messages-" + createdAt.Format("2006-01") - - // user-room doc: unrestricted memberships in both rooms. - seedDoc(t, f.localURL, testUserRoomIndex, account, map[string]any{ - "userAccount": account, - "rooms": []string{localRoomID, remoteRoomID}, - "restrictedRooms": map[string]int64{}, - "roomTimestamps": map[string]int64{ - localRoomID: createdAt.UnixMilli(), - remoteRoomID: createdAt.UnixMilli(), - }, - "createdAt": createdAt.Format(time.RFC3339Nano), - "updatedAt": createdAt.Format(time.RFC3339Nano), - }) - - // Local message in local room. - seedDoc(t, f.localURL, monthIdx, "msg-local-1", map[string]any{ - "messageId": "msg-local-1", - "roomId": localRoomID, - "siteId": "site-local", - "userId": "user-bob", - "userAccount": "bob", - "content": "hello from local", - "createdAt": createdAt.Format(time.RFC3339Nano), - }) - - // Remote message in remote room. Same index pattern (`messages-*`) on - // the remote cluster — CCS resolves the `*:messages-*` segment on the - // local query. - seedDoc(t, f.remoteURL, monthIdx, "msg-remote-1", map[string]any{ - "messageId": "msg-remote-1", - "roomId": remoteRoomID, - "siteId": "site-remote", - "userId": "user-carol", - "userAccount": "carol", - "content": "hello from remote", - "createdAt": createdAt.Format(time.RFC3339Nano), - }) - - // --- Search via NATS --------------------------------------------------- - // - // Round-trips through the real natsrouter: the handler reads - // restrictedRooms from Valkey (miss → ES prefetch → Valkey SET), then - // builds the CCS query against `messages-*,*:messages-*` and parses - // the merged response. 
- req := model.SearchMessagesRequest{Query: "hello"} - reqData, err := json.Marshal(req) - require.NoError(t, err) - - // Generous timeout: first request is Valkey miss → ES prefetch of - // user-room doc → CCS fanout → response parse. Tight timeouts mask - // real latency bugs in integration. - msg, err := f.clientNATS.Request(subject.SearchMessages(account), reqData, 30*time.Second) - require.NoError(t, err, "NATS request failed") - - t.Logf("response: %s", msg.Data) - - var resp model.SearchMessagesResponse - require.NoError(t, json.Unmarshal(msg.Data, &resp), "decode response: %s", msg.Data) - - assert.EqualValues(t, 2, resp.Total, "expected both local + remote hits; got body=%s", msg.Data) - require.Len(t, resp.Messages, 2, "expected 2 hits; got body=%s", msg.Data) - - gotRooms := map[string]string{} - for _, hit := range resp.Messages { - gotRooms[hit.RoomID] = hit.SiteID - } - assert.Equal(t, "site-local", gotRooms[localRoomID], "local message should be present") - assert.Equal(t, "site-remote", gotRooms[remoteRoomID], "remote message should be present via CCS") -} - -// TestSearchService_SearchMessages_CCS_CrossCluster_Restricted verifies -// the restricted-room access-control clauses fire correctly across the -// CCS boundary. Alice is a member of one UNRESTRICTED local room and one -// RESTRICTED remote room with historySharedSince (HSS) set to a specific -// cutoff. The user-room doc (local-only) routes the remote room into -// `restrictedRooms{rid: hssMillis}`. 
-// -// Seed on the remote cluster covers every branch the query builder -// encodes for restricted rooms: -// -// - pre-HSS parent → MUST NOT match (Clause A: createdAt < hss) -// - post-HSS parent → MUST match (Clause A) -// - post-HSS thread reply, tshow=true → MUST match (Clause B1: outer gate passes + tshow=true fires B1, even though parent is pre-HSS) -// - post-HSS thread reply, tshow=false → MUST NOT match (Clause B fails: outer gate passes but inner OR fails — tshow=false AND parent < hss so B2 also fails) -// -// Plus one unrestricted local parent to prove the two paths interact -// cleanly on the same search. Total expected hits: 3 (local + post-HSS -// remote parent + post-HSS remote reply with tshow=true). -func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) { - f := setupCCSFixture(t) - f.installTemplates(t) - - const account = "alice" - const localRoomID = "room-local-unrestricted" - const remoteRoomID = "room-remote-restricted" - - // Temporal setup: - // - hss is the user's join-time bound for the restricted remote room. - // - preHSS is 3 hours before hss (so pre-HSS messages are clearly - // older than the gate). - // - postHSS is 1 hour after hss. - // All well within the default 1-year `recent_window` so none of them - // get filtered out by the global createdAt range filter. - now := time.Now().UTC() - hss := now.Add(-2 * time.Hour) - preHSS := hss.Add(-3 * time.Hour) - postHSS := hss.Add(time.Hour) - monthIdxFor := func(ts time.Time) string { return "messages-" + ts.Format("2006-01") } - - // user-room doc: local room unrestricted, remote room restricted with hss. 
- t.Logf("seed: upserting user-room doc for %s (restricted %s since %s)", account, remoteRoomID, hss.Format(time.RFC3339)) - seedDoc(t, f.localURL, testUserRoomIndex, account, map[string]any{ - "userAccount": account, - "rooms": []string{localRoomID}, - "restrictedRooms": map[string]int64{ - remoteRoomID: hss.UnixMilli(), - }, - "roomTimestamps": map[string]int64{ - localRoomID: now.UnixMilli(), - remoteRoomID: now.UnixMilli(), - }, - "createdAt": now.Format(time.RFC3339Nano), - "updatedAt": now.Format(time.RFC3339Nano), - }) - - // --- LOCAL unrestricted room ---------------------------------------- - // One plain message that should always match via the terms-lookup - // branch (no HSS involved). - t.Logf("seed: local unrestricted message in %s", localRoomID) - seedDoc(t, f.localURL, monthIdxFor(postHSS), "msg-local-1", map[string]any{ - "messageId": "msg-local-1", - "roomId": localRoomID, - "siteId": "site-local", - "userId": "user-bob", - "userAccount": "bob", - "content": "hello from local", - "createdAt": postHSS.Format(time.RFC3339Nano), - }) - - // --- REMOTE restricted room ----------------------------------------- - // Four messages, each exercising one branch of the restricted-room - // clauses. Pre-HSS parent lives at `msg-remote-pre-parent`; its - // thread replies reference it via threadParentMessageId + - // threadParentMessageCreatedAt=preHSS. 
- t.Logf("seed: remote pre-HSS parent (MUST NOT match)") - seedDoc(t, f.remoteURL, monthIdxFor(preHSS), "msg-remote-pre-parent", map[string]any{ - "messageId": "msg-remote-pre-parent", - "roomId": remoteRoomID, - "siteId": "site-remote", - "userId": "user-carol", - "userAccount": "carol", - "content": "hello pre-hss parent", - "createdAt": preHSS.Format(time.RFC3339Nano), - }) - - t.Logf("seed: remote post-HSS parent (Clause A match)") - seedDoc(t, f.remoteURL, monthIdxFor(postHSS), "msg-remote-post-parent", map[string]any{ - "messageId": "msg-remote-post-parent", - "roomId": remoteRoomID, - "siteId": "site-remote", - "userId": "user-carol", - "userAccount": "carol", - "content": "hello post-hss parent", - "createdAt": postHSS.Format(time.RFC3339Nano), - }) - - // Post-HSS reply to a pre-HSS parent, tshow=true → Clause B1 matches. - // The reply's own createdAt satisfies Clause B's outer gate - // (createdAt >= hss); tshow=true then fires B1 regardless of the - // parent's age. If the outer gate weren't there, a pre-HSS tshow=true - // reply would leak history the user never had access to. - t.Logf("seed: remote post-HSS reply with tshow=true, pre-HSS parent (Clause B1 match)") - seedDoc(t, f.remoteURL, monthIdxFor(postHSS), "msg-remote-reply-tshow", map[string]any{ - "messageId": "msg-remote-reply-tshow", - "roomId": remoteRoomID, - "siteId": "site-remote", - "userId": "user-carol", - "userAccount": "carol", - "content": "hello tshow reply", - "createdAt": postHSS.Add(time.Minute).Format(time.RFC3339Nano), - "threadParentMessageId": "msg-remote-pre-parent", - "threadParentMessageCreatedAt": preHSS.Format(time.RFC3339Nano), - "tshow": true, - }) - - // Post-HSS reply to a pre-HSS parent, tshow=false → Clause B rejects. - // Outer gate passes (reply createdAt >= hss) but the inner OR fails: - // tshow=false blocks B1 and the parent's pre-HSS createdAt blocks B2. 
- t.Logf("seed: remote post-HSS reply without tshow, pre-HSS parent (MUST NOT match)") - seedDoc(t, f.remoteURL, monthIdxFor(postHSS), "msg-remote-reply-plain", map[string]any{ - "messageId": "msg-remote-reply-plain", - "roomId": remoteRoomID, - "siteId": "site-remote", - "userId": "user-carol", - "userAccount": "carol", - "content": "hello plain reply", - "createdAt": postHSS.Add(2 * time.Minute).Format(time.RFC3339Nano), - "threadParentMessageId": "msg-remote-pre-parent", - "threadParentMessageCreatedAt": preHSS.Format(time.RFC3339Nano), - }) - - // --- Search --------------------------------------------------------- - reqData, err := json.Marshal(model.SearchMessagesRequest{Query: "hello"}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchMessages(account), reqData, 30*time.Second) - require.NoError(t, err, "NATS request failed") - t.Logf("response: %s", msg.Data) - - var resp model.SearchMessagesResponse - require.NoError(t, json.Unmarshal(msg.Data, &resp), "decode response: %s", msg.Data) - - got := map[string]bool{} - for _, hit := range resp.Messages { - got[hit.MessageID] = true - } - - // Expected matches: - assert.True(t, got["msg-local-1"], "local unrestricted message must match via terms-lookup") - assert.True(t, got["msg-remote-post-parent"], "post-HSS remote parent must match via Clause A (CCS)") - assert.True(t, got["msg-remote-reply-tshow"], "post-HSS remote reply with tshow=true must match via Clause B1 (CCS)") - - // Expected exclusions: - assert.False(t, got["msg-remote-pre-parent"], "pre-HSS remote parent must be excluded by Clause A gate") - assert.False(t, got["msg-remote-reply-plain"], "post-HSS remote reply without tshow + pre-HSS parent must be excluded (outer gate passes; B1 and B2 both fail)") - - assert.EqualValues(t, 3, resp.Total, "expected exactly 3 hits; got body=%s", msg.Data) - require.Len(t, resp.Messages, 3, "expected 3 hits; got body=%s", msg.Data) -} - // --- search.apps integration 
------------------------------------------------ // setupAppsFixture starts an isolated Mongo container (via pkg/testutil) and From 24e9280b434b8edd576e5c0292fac1d2b32d9e03 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 07:55:02 +0000 Subject: [PATCH 04/23] test(search-service): split integration tests by search endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One file per search endpoint, matching the CCS split pattern: integration_apps_test.go — search.apps (Mongo + NATS) integration_users_test.go — search.users (NATS + httptest stub) integration_rooms_test.go — search.rooms (shared ES + Valkey + NATS) integration_messages_test.go — search.messages v2 (ES httptest stub + NATS) testUserRoomIndex, testHTTPClient, and seedDoc move into setup_shared_test.go alongside the shared-container helpers — they're test infrastructure used by multiple endpoints, so colocating them keeps the per-endpoint files focused. integration_test.go is removed; the package no longer has a catch-all integration file. CI auto-discovery and path filtering both still match because every new file lives under search-service/ and carries the //go:build integration tag. 
--- search-service/integration_apps_test.go | 151 +++++ search-service/integration_messages_test.go | 131 +++++ search-service/integration_rooms_test.go | 243 ++++++++ search-service/integration_test.go | 621 -------------------- search-service/integration_users_test.go | 134 +++++ search-service/setup_shared_test.go | 40 +- 6 files changed, 695 insertions(+), 625 deletions(-) create mode 100644 search-service/integration_apps_test.go create mode 100644 search-service/integration_messages_test.go create mode 100644 search-service/integration_rooms_test.go delete mode 100644 search-service/integration_test.go create mode 100644 search-service/integration_users_test.go diff --git a/search-service/integration_apps_test.go b/search-service/integration_apps_test.go new file mode 100644 index 000000000..28ccd8814 --- /dev/null +++ b/search-service/integration_apps_test.go @@ -0,0 +1,151 @@ +//go:build integration + +package main + +// search.apps integration tests. Uses the process-shared Mongo +// (testutil.MongoDB) and NATS (sharedNATS in setup_shared_test.go); ES +// and Valkey are stubbed because the apps path doesn't touch them. + +import ( + "context" + "encoding/json" + "testing" + "time" + + "github.com/nats-io/nats.go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.mongodb.org/mongo-driver/v2/mongo" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/subject" + "github.com/hmchangw/chat/pkg/testutil" +) + +// setupAppsFixture starts an isolated Mongo container (via pkg/testutil) and +// a single search-service router bound to that DB. ES/Valkey are not used by +// search.apps, so we wire fakes (the existing `fakeStore` / `fakeCache` +// satisfy the interfaces but never get called on the apps path). 
+type appsFixture struct { + clientNATS *nats.Conn + mongoDB *mongo.Database +} + +func setupAppsFixture(t *testing.T) *appsFixture { + t.Helper() + + mongoDB := testutil.MongoDB(t, "search_service_test") + + natsURL := sharedNATS(t) + + serverNATS, err := natsutil.Connect(natsURL, "") + require.NoError(t, err) + t.Cleanup(func() { _ = serverNATS.Drain() }) + + clientNATS, err := nats.Connect(natsURL) + require.NoError(t, err) + t.Cleanup(func() { clientNATS.Close() }) + + // Wire the handler with a real mongoStore and stub ES/cache. + mongoStore := newMongoStore(mongoDB) + store := &fakeStore{} + cache := newFakeCache() + h := newHandler(store, mongoStore, nil, cache, handlerConfig{ + DocCounts: 25, + MaxDocCounts: 100, + RestrictedRoomsCacheTTL: 5 * time.Minute, + RecentWindow: 365 * 24 * time.Hour, + RequestTimeout: 5 * time.Second, + SpotlightReadPattern: "spotlight-*", + }) + + router := natsrouter.New(serverNATS, "search-service-test") + router.Use(natsrouter.RequestID()) + h.Register(router) + // Flush ensures subscriptions are registered on the server before the + // fixture returns. Without this, fast tests that fire a request + // immediately can hit "no responders available" while subscriptions + // are still propagating. natsutil.Connect returns an otelnats.Conn + // wrapper that doesn't expose Flush; reach through to the underlying + // *nats.Conn. + require.NoError(t, serverNATS.NatsConn().Flush()) + t.Cleanup(func() { + _ = router.Shutdown(context.Background()) + }) + + return &appsFixture{clientNATS: clientNATS, mongoDB: mongoDB} +} + +func TestIntegration_SearchApps_PrototypePipeline(t *testing.T) { + f := setupAppsFixture(t) + ctx := context.Background() + + // Seed 3 apps in Mongo. The prototype pipeline matches by `name` regex + // (case-insensitive) and applies $limit; the full $lookup access-guard + // pipeline is implemented in a follow-up. 
+ _, err := f.mongoDB.Collection("apps").InsertMany(ctx, []any{ + map[string]any{"_id": "a1", "name": "Weather Alpha", "assistant": map[string]any{"enabled": true, "name": "weather.bot"}}, + map[string]any{"_id": "a2", "name": "Weatherly", "assistant": map[string]any{"enabled": false, "name": "weatherly.bot"}}, + map[string]any{"_id": "a3", "name": "Calendar"}, + }) + require.NoError(t, err) + + reqBytes, err := json.Marshal(model.SearchAppsRequest{Query: "weather"}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchApps("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var resp model.SearchAppsResponse + require.NoError(t, json.Unmarshal(msg.Data, &resp)) + + require.Len(t, resp.Apps, 2, "two apps match the 'weather' regex") + names := []string{resp.Apps[0].Name, resp.Apps[1].Name} + assert.Contains(t, names, "Weather Alpha") + assert.Contains(t, names, "Weatherly") +} + +func TestIntegration_SearchApps_AssistantEnabledFilter(t *testing.T) { + f := setupAppsFixture(t) + ctx := context.Background() + + _, err := f.mongoDB.Collection("apps").InsertMany(ctx, []any{ + map[string]any{"_id": "a1", "name": "Weather Alpha", "assistant": map[string]any{"enabled": true, "name": "weather.bot"}}, + map[string]any{"_id": "a2", "name": "Weatherly", "assistant": map[string]any{"enabled": false, "name": "weatherly.bot"}}, + }) + require.NoError(t, err) + + enabled := true + reqBytes, err := json.Marshal(model.SearchAppsRequest{ + Query: "weather", + AssistantEnabled: &enabled, + }) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchApps("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var resp model.SearchAppsResponse + require.NoError(t, json.Unmarshal(msg.Data, &resp)) + + require.Len(t, resp.Apps, 1) + assert.Equal(t, "Weather Alpha", resp.Apps[0].Name) +} + +func TestIntegration_SearchApps_EmptyQueryReturnsBadRequest(t *testing.T) { + f := setupAppsFixture(t) + + reqBytes, err := 
json.Marshal(model.SearchAppsRequest{Query: ""}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchApps("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var envelope model.ErrorResponse + require.NoError(t, json.Unmarshal(msg.Data, &envelope)) + require.NotEmpty(t, envelope.Error) + assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) +} diff --git a/search-service/integration_messages_test.go b/search-service/integration_messages_test.go new file mode 100644 index 000000000..4f28578a5 --- /dev/null +++ b/search-service/integration_messages_test.go @@ -0,0 +1,131 @@ +//go:build integration + +package main + +// search.messages v2 integration tests. Stubs ES with an httptest +// server because the messages path is pure ES — no Mongo round-trip — +// and uses the process-shared NATS from setup_shared_test.go. + +import ( + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/nats-io/nats.go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/searchengine" + "github.com/hmchangw/chat/pkg/subject" +) + +// messagesV2Fixture stubs ES with a fake HTTP server (httptest). The +// messages path is pure ES — no Mongo round-trip — so no Mongo fixture +// is wired. +type messagesV2Fixture struct { + clientNATS *nats.Conn +} + +func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { + t.Helper() + ctx := context.Background() + + // Stub ES: always return a canned response containing one hit. + esStub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Drain the body so the HTTP/1.1 connection stays open. 
+ _, _ = io.Copy(io.Discard, r.Body) + // The Elastic Go client performs a "product check" handshake on + // connect and rejects any server that doesn't advertise itself + // as Elasticsearch via this header. Set it on every response so + // the stub passes the check regardless of which endpoint is hit. + w.Header().Set("X-Elastic-Product", "Elasticsearch") + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"hits":{"total":{"value":1},"hits":[{"_source":{` + + `"messageId":"m1","roomId":"r1","siteId":"site-a","userId":"u1",` + + `"userAccount":"alice","content":"hello","createdAt":"2026-04-01T12:00:00Z"}}]}}`)) + })) + t.Cleanup(esStub.Close) + + // Valkey stub — use the fakeCache wired in-process via handler injection. + fakeValkey := newFakeCache() + fakeValkey.store["alice"] = map[string]int64{} // empty restricted map, cache hit + + natsURL := sharedNATS(t) + + serverNATS, err := natsutil.Connect(natsURL, "") + require.NoError(t, err) + t.Cleanup(func() { _ = serverNATS.Drain() }) + + clientNATS, err := nats.Connect(natsURL) + require.NoError(t, err) + t.Cleanup(func() { clientNATS.Close() }) + + // Wire search-service with the stub ES engine. No Mongo store needed + // for the messages path. + engine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: esStub.URL}) + require.NoError(t, err) + esStore := newESStore(engine, testUserRoomIndex) + + h := newHandler(esStore, nil, nil, fakeValkey, handlerConfig{ + DocCounts: 25, + MaxDocCounts: 100, + RestrictedRoomsCacheTTL: 5 * time.Minute, + RecentWindow: 365 * 24 * time.Hour, + RequestTimeout: 5 * time.Second, + UserRoomIndex: testUserRoomIndex, + SpotlightReadPattern: "spotlight-*", + }) + + router := natsrouter.New(serverNATS, "search-service-test-v2") + router.Use(natsrouter.RequestID()) + h.Register(router) + // Flush — see setupAppsFixture for the rationale. 
+ require.NoError(t, serverNATS.NatsConn().Flush()) + t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) + + return &messagesV2Fixture{clientNATS: clientNATS} +} + +func TestIntegration_SearchMessages_V2_HitProjection(t *testing.T) { + f := setupMessagesV2Fixture(t) + + reqBytes, err := json.Marshal(model.SearchMessagesRequest{Query: "hello"}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchMessages("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var resp model.SearchMessagesResponse + require.NoError(t, json.Unmarshal(msg.Data, &resp)) + + require.Len(t, resp.Messages, 1) + assert.EqualValues(t, 1, resp.Total) + + got := resp.Messages[0] + assert.Equal(t, "m1", got.MessageID) + assert.Equal(t, "r1", got.RoomID) + assert.Equal(t, "site-a", got.SiteID) + assert.Equal(t, "alice", got.UserAccount) + assert.Equal(t, "hello", got.Content) +} + +func TestIntegration_SearchMessages_V2_EmptyQueryReturnsBadRequest(t *testing.T) { + f := setupMessagesV2Fixture(t) + + reqBytes, err := json.Marshal(model.SearchMessagesRequest{Query: ""}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchMessages("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var envelope model.ErrorResponse + require.NoError(t, json.Unmarshal(msg.Data, &envelope)) + assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) +} diff --git a/search-service/integration_rooms_test.go b/search-service/integration_rooms_test.go new file mode 100644 index 000000000..d0e90ba23 --- /dev/null +++ b/search-service/integration_rooms_test.go @@ -0,0 +1,243 @@ +//go:build integration + +package main + +// search.rooms integration tests. Uses the process-shared ES, NATS, and +// Valkey from setup_shared_test.go; per-test isolation comes from a +// unique spotlight index name (deleted on cleanup) plus a Valkey +// FLUSHDB on cleanup. 
+ +import ( + "bytes" + "context" + "encoding/json" + "io" + "net/http" + "testing" + "time" + + "github.com/nats-io/nats.go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/searchengine" + "github.com/hmchangw/chat/pkg/subject" +) + +// roomsFixture wires a real ES container (for the spotlight index) and +// NATS. search.rooms is served directly from the spotlight index, so no +// Mongo is involved. The ES container is process-shared; per-test +// isolation comes from a unique spotlight index name (deleted on +// cleanup) plus a Valkey FLUSHDB on cleanup. +type roomsFixture struct { + clientNATS *nats.Conn + esURL string + spotlightIndex string +} + +// setupRoomsFixture wires the search-service router against the +// process-shared ES, Valkey and NATS containers. The spotlight index +// name is unique per test so leftovers from a sibling test can't leak +// into this one's hit set. 
+func setupRoomsFixture(t *testing.T) *roomsFixture { + t.Helper() + ctx := context.Background() + + esURL := sharedSingleNodeES(t) + spotlightIndex := uniqueESIndex(t, "spotlight") + putTestSpotlightIndex(t, esURL, spotlightIndex) + + natsURL := sharedNATS(t) + serverNC, err := natsutil.Connect(natsURL, "") + require.NoError(t, err, "connect nats (server side)") + t.Cleanup(func() { _ = serverNC.Drain() }) + + clientNC, err := nats.Connect(natsURL) + require.NoError(t, err, "connect nats (client side)") + t.Cleanup(func() { clientNC.Close() }) + + engine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: esURL}) + require.NoError(t, err, "build searchengine for subs fixture") + + esStore := newESStore(engine, testUserRoomIndex) + cache := newValkeyCache(freshValkeyClient(t)) + h := newHandler(esStore, nil, nil, cache, handlerConfig{ + DocCounts: 25, + MaxDocCounts: 100, + RestrictedRoomsCacheTTL: 5 * time.Minute, + RecentWindow: 365 * 24 * time.Hour, + RequestTimeout: 5 * time.Second, + SpotlightReadPattern: spotlightIndex, + }) + + router := natsrouter.New(serverNC, "search-service-test-subs") + router.Use(natsrouter.RequestID()) + h.Register(router) + // Flush — see setupAppsFixture for the rationale. + require.NoError(t, serverNC.NatsConn().Flush()) + t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) + + return &roomsFixture{clientNATS: clientNC, esURL: esURL, spotlightIndex: spotlightIndex} +} + +// putTestSpotlightIndex creates a minimal spotlight index in ES with the +// fields needed by the subscription search query. 
+func putTestSpotlightIndex(t *testing.T, esURL, index string) { + t.Helper() + body := map[string]any{ + "settings": map[string]any{ + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "1s", + }, + "mappings": map[string]any{ + "dynamic": false, + "properties": map[string]any{ + "roomId": map[string]any{"type": "keyword"}, + "roomName": map[string]any{ + "type": "search_as_you_type", + }, + "roomType": map[string]any{"type": "keyword"}, + "userAccount": map[string]any{"type": "keyword"}, + "siteId": map[string]any{"type": "keyword"}, + "joinedAt": map[string]any{"type": "date"}, + }, + }, + } + data, _ := json.Marshal(body) + req, err := http.NewRequest(http.MethodPut, esURL+"/"+index, bytes.NewReader(data)) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + resp, err := testHTTPClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusCreated, + "create spotlight index: status=%d body=%s", resp.StatusCode, b) +} + +func TestIntegration_SearchRooms_HappyPath(t *testing.T) { + f := setupRoomsFixture(t) + + const account = "alice" + now := time.Now().UTC() + + // Seed spotlight docs for two rooms alice is in. + seedDoc(t, f.esURL, f.spotlightIndex, "spot-r1", map[string]any{ + "roomId": "r1", + "roomName": "engineering-announcements", + "roomType": "channel", + "userAccount": account, + "siteId": "site-local", + "joinedAt": now.Add(-48 * time.Hour).Format(time.RFC3339), + }) + seedDoc(t, f.esURL, f.spotlightIndex, "spot-r2", map[string]any{ + "roomId": "r2", + "roomName": "engineering-random", + "roomType": "channel", + "userAccount": account, + "siteId": "site-local", + "joinedAt": now.Add(-24 * time.Hour).Format(time.RFC3339), + }) + // A matching room owned by a different account. 
With the Mongo + // hydration removed, the spotlight userAccount term filter is the + // sole access boundary — this must not leak into alice's results. + seedDoc(t, f.esURL, f.spotlightIndex, "spot-r3", map[string]any{ + "roomId": "r3", + "roomName": "engineering-secret", + "roomType": "channel", + "userAccount": "mallory", + "siteId": "site-local", + "joinedAt": now.Add(-12 * time.Hour).Format(time.RFC3339), + }) + + reqBytes, err := json.Marshal(model.SearchRoomsRequest{Query: "engineering"}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchRooms(account), reqBytes, 10*time.Second) + require.NoError(t, err) + + var resp model.SearchRoomsResponse + require.NoError(t, json.Unmarshal(msg.Data, &resp)) + + require.Len(t, resp.Rooms, 2, "both rooms matching 'engineering' must be returned") + byID := map[string]model.SearchRoom{} + for _, r := range resp.Rooms { + byID[r.RoomID] = r + } + assert.Equal(t, model.SearchRoom{RoomID: "r1", Name: "engineering-announcements", RoomType: "channel", SiteID: "site-local"}, byID["r1"]) + assert.Equal(t, model.SearchRoom{RoomID: "r2", Name: "engineering-random", RoomType: "channel", SiteID: "site-local"}, byID["r2"]) + _, leaked := byID["r3"] + assert.False(t, leaked, "rooms owned by another account must not leak") +} + +func TestIntegration_SearchRooms_RoomTypeChannelFilter(t *testing.T) { + f := setupRoomsFixture(t) + + const account = "bob" + now := time.Now().UTC() + + seedDoc(t, f.esURL, f.spotlightIndex, "spot-b-r1", map[string]any{ + "roomId": "b-r1", + "roomName": "bob-alice", + "roomType": "dm", + "userAccount": account, + "siteId": "site-local", + "joinedAt": now.Add(-1 * time.Hour).Format(time.RFC3339), + }) + seedDoc(t, f.esURL, f.spotlightIndex, "spot-b-r2", map[string]any{ + "roomId": "b-r2", + "roomName": "bob-channel", + "roomType": "channel", + "userAccount": account, + "siteId": "site-local", + "joinedAt": now.Add(-2 * time.Hour).Format(time.RFC3339), + }) + + reqBytes, err := 
json.Marshal(model.SearchRoomsRequest{Query: "bob", RoomType: "channel"}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchRooms(account), reqBytes, 10*time.Second) + require.NoError(t, err) + + var resp model.SearchRoomsResponse + require.NoError(t, json.Unmarshal(msg.Data, &resp)) + + require.Len(t, resp.Rooms, 1) + assert.Equal(t, model.SearchRoom{RoomID: "b-r2", Name: "bob-channel", RoomType: "channel", SiteID: "site-local"}, resp.Rooms[0], + "only the channel room must match roomType=channel filter") +} + +func TestIntegration_SearchRooms_EmptyQueryReturnsBadRequest(t *testing.T) { + f := setupRoomsFixture(t) + + reqBytes, err := json.Marshal(model.SearchRoomsRequest{Query: ""}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchRooms("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var envelope model.ErrorResponse + require.NoError(t, json.Unmarshal(msg.Data, &envelope)) + require.NotEmpty(t, envelope.Error) + assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) +} + +func TestIntegration_SearchRooms_RoomTypeAppReturnsBadRequest(t *testing.T) { + f := setupRoomsFixture(t) + + reqBytes, err := json.Marshal(model.SearchRoomsRequest{Query: "x", RoomType: "app"}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchRooms("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var envelope model.ErrorResponse + require.NoError(t, json.Unmarshal(msg.Data, &envelope)) + require.NotEmpty(t, envelope.Error) + assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) + assert.Contains(t, envelope.Error, "invalid roomType") +} diff --git a/search-service/integration_test.go b/search-service/integration_test.go deleted file mode 100644 index 1f095ff81..000000000 --- a/search-service/integration_test.go +++ /dev/null @@ -1,621 +0,0 @@ -//go:build integration - -package main - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "io" - "net/http" - 
"net/http/httptest" - "testing" - "time" - - "github.com/nats-io/nats.go" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "go.mongodb.org/mongo-driver/v2/mongo" - - "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsrouter" - "github.com/hmchangw/chat/pkg/natsutil" - "github.com/hmchangw/chat/pkg/restyutil" - "github.com/hmchangw/chat/pkg/searchengine" - "github.com/hmchangw/chat/pkg/subject" - "github.com/hmchangw/chat/pkg/testutil" -) - -const testUserRoomIndex = "user-room" - -// --- Shared HTTP helpers ----------------------------------------------------- - -// testHTTPClient is a bounded HTTP client for ES control-plane calls — -// stalled containers shouldn't be able to hang the integration job past -// the per-call deadline. Kept small on purpose: these calls hit localhost -// (docker-mapped port) and are cheap when they succeed. -// -// Used by seedDoc (below), by the index-cleanup path in uniqueESIndex -// (setup_shared_test.go), by putTestSpotlightIndex, and by the CCS-only -// helpers in integration_ccs_test.go. -var testHTTPClient = &http.Client{Timeout: 10 * time.Second} - -// seedDoc PUTs a JSON document into ES, synchronously refreshing the index -// so the next search sees it. 
-func seedDoc(t *testing.T, esURL, index, id string, doc any) { - t.Helper() - data, err := json.Marshal(doc) - require.NoError(t, err) - url := fmt.Sprintf("%s/%s/_doc/%s?refresh=true", esURL, index, id) - req, err := http.NewRequest(http.MethodPut, url, bytes.NewReader(data)) - require.NoError(t, err) - req.Header.Set("Content-Type", "application/json") - resp, err := testHTTPClient.Do(req) - require.NoError(t, err) - defer resp.Body.Close() - body, _ := io.ReadAll(resp.Body) - require.Truef(t, resp.StatusCode == http.StatusCreated || resp.StatusCode == http.StatusOK, - "seedDoc %s/%s: status=%d body=%s", index, id, resp.StatusCode, body) -} - -// --- search.apps integration ------------------------------------------------ - -// setupAppsFixture starts an isolated Mongo container (via pkg/testutil) and -// a single search-service router bound to that DB. ES/Valkey are not used by -// search.apps, so we wire fakes (the existing `fakeStore` / `fakeCache` -// satisfy the interfaces but never get called on the apps path). -type appsFixture struct { - clientNATS *nats.Conn - mongoDB *mongo.Database -} - -func setupAppsFixture(t *testing.T) *appsFixture { - t.Helper() - - mongoDB := testutil.MongoDB(t, "search_service_test") - - natsURL := sharedNATS(t) - - serverNATS, err := natsutil.Connect(natsURL, "") - require.NoError(t, err) - t.Cleanup(func() { _ = serverNATS.Drain() }) - - clientNATS, err := nats.Connect(natsURL) - require.NoError(t, err) - t.Cleanup(func() { clientNATS.Close() }) - - // Wire the handler with a real mongoStore and stub ES/cache. 
- mongoStore := newMongoStore(mongoDB) - store := &fakeStore{} - cache := newFakeCache() - h := newHandler(store, mongoStore, nil, cache, handlerConfig{ - DocCounts: 25, - MaxDocCounts: 100, - RestrictedRoomsCacheTTL: 5 * time.Minute, - RecentWindow: 365 * 24 * time.Hour, - RequestTimeout: 5 * time.Second, - SpotlightReadPattern: "spotlight-*", - }) - - router := natsrouter.New(serverNATS, "search-service-test") - router.Use(natsrouter.RequestID()) - h.Register(router) - // Flush ensures subscriptions are registered on the server before the - // fixture returns. Without this, fast tests that fire a request - // immediately can hit "no responders available" while subscriptions - // are still propagating. natsutil.Connect returns an otelnats.Conn - // wrapper that doesn't expose Flush; reach through to the underlying - // *nats.Conn. - require.NoError(t, serverNATS.NatsConn().Flush()) - t.Cleanup(func() { - _ = router.Shutdown(context.Background()) - }) - - return &appsFixture{clientNATS: clientNATS, mongoDB: mongoDB} -} - -func TestIntegration_SearchApps_PrototypePipeline(t *testing.T) { - f := setupAppsFixture(t) - ctx := context.Background() - - // Seed 3 apps in Mongo. The prototype pipeline matches by `name` regex - // (case-insensitive) and applies $limit; the full $lookup access-guard - // pipeline is implemented in a follow-up. 
- _, err := f.mongoDB.Collection("apps").InsertMany(ctx, []any{ - map[string]any{"_id": "a1", "name": "Weather Alpha", "assistant": map[string]any{"enabled": true, "name": "weather.bot"}}, - map[string]any{"_id": "a2", "name": "Weatherly", "assistant": map[string]any{"enabled": false, "name": "weatherly.bot"}}, - map[string]any{"_id": "a3", "name": "Calendar"}, - }) - require.NoError(t, err) - - reqBytes, err := json.Marshal(model.SearchAppsRequest{Query: "weather"}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchApps("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var resp model.SearchAppsResponse - require.NoError(t, json.Unmarshal(msg.Data, &resp)) - - require.Len(t, resp.Apps, 2, "two apps match the 'weather' regex") - names := []string{resp.Apps[0].Name, resp.Apps[1].Name} - assert.Contains(t, names, "Weather Alpha") - assert.Contains(t, names, "Weatherly") -} - -func TestIntegration_SearchApps_AssistantEnabledFilter(t *testing.T) { - f := setupAppsFixture(t) - ctx := context.Background() - - _, err := f.mongoDB.Collection("apps").InsertMany(ctx, []any{ - map[string]any{"_id": "a1", "name": "Weather Alpha", "assistant": map[string]any{"enabled": true, "name": "weather.bot"}}, - map[string]any{"_id": "a2", "name": "Weatherly", "assistant": map[string]any{"enabled": false, "name": "weatherly.bot"}}, - }) - require.NoError(t, err) - - enabled := true - reqBytes, err := json.Marshal(model.SearchAppsRequest{ - Query: "weather", - AssistantEnabled: &enabled, - }) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchApps("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var resp model.SearchAppsResponse - require.NoError(t, json.Unmarshal(msg.Data, &resp)) - - require.Len(t, resp.Apps, 1) - assert.Equal(t, "Weather Alpha", resp.Apps[0].Name) -} - -func TestIntegration_SearchApps_EmptyQueryReturnsBadRequest(t *testing.T) { - f := setupAppsFixture(t) - - reqBytes, err := 
json.Marshal(model.SearchAppsRequest{Query: ""}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchApps("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) -} - -// --- search.users integration ------------------------------------------------ - -// usersFixture is a minimal fixture for the search.users path: NATS for the -// request/reply layer, and an httptest.Server standing in for the third-party -// HR endpoint. No Mongo or ES containers are needed. -type usersFixture struct { - clientNATS *nats.Conn - thirdParty *httptest.Server // controls the stub response -} - -func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixture { - t.Helper() - - // Start the stub third-party server. - stub := httptest.NewServer(thirdPartyHandler) - t.Cleanup(stub.Close) - - natsURL := sharedNATS(t) - serverNC, err := natsutil.Connect(natsURL, "") - require.NoError(t, err, "connect nats (server side)") - t.Cleanup(func() { _ = serverNC.Drain() }) - - clientNC, err := nats.Connect(natsURL) - require.NoError(t, err, "connect nats (client side)") - t.Cleanup(func() { clientNC.Close() }) - - // Wire the handler with a real httpUsersClient pointing at the stub. - usersRC := restyutil.New(stub.URL, restyutil.WithTimeout(5*time.Second)) - usersClient := newHTTPUsersClient(usersRC, "") - - h := newHandler(nil, nil, usersClient, newFakeCache(), handlerConfig{ - DocCounts: 25, - MaxDocCounts: 100, - RequestTimeout: 5 * time.Second, - }) - - router := natsrouter.New(serverNC, "search-service-test") - router.Use(natsrouter.RequestID()) - h.Register(router) - // Flush — see setupAppsFixture for the rationale. 
- require.NoError(t, serverNC.NatsConn().Flush()) - t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) - - return &usersFixture{clientNATS: clientNC, thirdParty: stub} -} - -func TestIntegration_SearchUsers_Happy(t *testing.T) { - // Stub returns two users matching the query. - stubResp := `[{"account":"alice","engName":"Alice Wang"},{"account":"alice2","engName":"Alice Chen"}]` - - f := setupUsersFixture(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte(stubResp)) - })) - - reqBytes, err := json.Marshal(model.SearchUsersRequest{Query: "alice"}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchUsers("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var users []model.SearchUser - require.NoError(t, json.Unmarshal(msg.Data, &users)) - - require.Len(t, users, 2) - assert.Equal(t, "alice", users[0].Account) - assert.Equal(t, "Alice Wang", users[0].EngName) -} - -func TestIntegration_SearchUsers_EmptyQueryReturnsBadRequest(t *testing.T) { - // Stub should never be called for a bad-request scenario. - f := setupUsersFixture(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - t.Error("third-party stub should not be called for empty query") - w.WriteHeader(http.StatusInternalServerError) - })) - - reqBytes, err := json.Marshal(model.SearchUsersRequest{Query: ""}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchUsers("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) -} - -func TestIntegration_SearchUsers_ThirdPartyErrorReturnsInternal(t *testing.T) { - // Stub returns a 503 to simulate a backend outage. 
- f := setupUsersFixture(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusServiceUnavailable) - })) - - reqBytes, err := json.Marshal(model.SearchUsersRequest{Query: "alice"}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchUsers("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeInternal, envelope.Code, - "non-2xx from third-party must surface as internal error, not raw status") - // Raw third-party details must not leak to the caller. - assert.NotContains(t, envelope.Error, "503", "status code from third-party must not leak") -} - -// --- search.rooms integration ---------------------------------------- - -// roomsFixture wires a real ES container (for the spotlight index) and -// NATS. search.rooms is served directly from the spotlight index, so no -// Mongo is involved. The ES container is process-shared; per-test -// isolation comes from a unique spotlight index name (deleted on -// cleanup) plus a Valkey FLUSHDB on cleanup. -type roomsFixture struct { - clientNATS *nats.Conn - esURL string - spotlightIndex string -} - -// setupRoomsFixture wires the search-service router against the -// process-shared ES, Valkey and NATS containers. The spotlight index -// name is unique per test so leftovers from a sibling test can't leak -// into this one's hit set. 
-func setupRoomsFixture(t *testing.T) *roomsFixture { - t.Helper() - ctx := context.Background() - - esURL := sharedSingleNodeES(t) - spotlightIndex := uniqueESIndex(t, "spotlight") - putTestSpotlightIndex(t, esURL, spotlightIndex) - - natsURL := sharedNATS(t) - serverNC, err := natsutil.Connect(natsURL, "") - require.NoError(t, err, "connect nats (server side)") - t.Cleanup(func() { _ = serverNC.Drain() }) - - clientNC, err := nats.Connect(natsURL) - require.NoError(t, err, "connect nats (client side)") - t.Cleanup(func() { clientNC.Close() }) - - engine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: esURL}) - require.NoError(t, err, "build searchengine for subs fixture") - - esStore := newESStore(engine, testUserRoomIndex) - cache := newValkeyCache(freshValkeyClient(t)) - h := newHandler(esStore, nil, nil, cache, handlerConfig{ - DocCounts: 25, - MaxDocCounts: 100, - RestrictedRoomsCacheTTL: 5 * time.Minute, - RecentWindow: 365 * 24 * time.Hour, - RequestTimeout: 5 * time.Second, - SpotlightReadPattern: spotlightIndex, - }) - - router := natsrouter.New(serverNC, "search-service-test-subs") - router.Use(natsrouter.RequestID()) - h.Register(router) - // Flush — see setupAppsFixture for the rationale. - require.NoError(t, serverNC.NatsConn().Flush()) - t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) - - return &roomsFixture{clientNATS: clientNC, esURL: esURL, spotlightIndex: spotlightIndex} -} - -// putTestSpotlightIndex creates a minimal spotlight index in ES with the -// fields needed by the subscription search query. 
-func putTestSpotlightIndex(t *testing.T, esURL, index string) { - t.Helper() - body := map[string]any{ - "settings": map[string]any{ - "number_of_shards": 1, - "number_of_replicas": 0, - "refresh_interval": "1s", - }, - "mappings": map[string]any{ - "dynamic": false, - "properties": map[string]any{ - "roomId": map[string]any{"type": "keyword"}, - "roomName": map[string]any{ - "type": "search_as_you_type", - }, - "roomType": map[string]any{"type": "keyword"}, - "userAccount": map[string]any{"type": "keyword"}, - "siteId": map[string]any{"type": "keyword"}, - "joinedAt": map[string]any{"type": "date"}, - }, - }, - } - data, _ := json.Marshal(body) - req, err := http.NewRequest(http.MethodPut, esURL+"/"+index, bytes.NewReader(data)) - require.NoError(t, err) - req.Header.Set("Content-Type", "application/json") - resp, err := testHTTPClient.Do(req) - require.NoError(t, err) - defer resp.Body.Close() - b, _ := io.ReadAll(resp.Body) - require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusCreated, - "create spotlight index: status=%d body=%s", resp.StatusCode, b) -} - -func TestIntegration_SearchRooms_HappyPath(t *testing.T) { - f := setupRoomsFixture(t) - - const account = "alice" - now := time.Now().UTC() - - // Seed spotlight docs for two rooms alice is in. - seedDoc(t, f.esURL, f.spotlightIndex, "spot-r1", map[string]any{ - "roomId": "r1", - "roomName": "engineering-announcements", - "roomType": "channel", - "userAccount": account, - "siteId": "site-local", - "joinedAt": now.Add(-48 * time.Hour).Format(time.RFC3339), - }) - seedDoc(t, f.esURL, f.spotlightIndex, "spot-r2", map[string]any{ - "roomId": "r2", - "roomName": "engineering-random", - "roomType": "channel", - "userAccount": account, - "siteId": "site-local", - "joinedAt": now.Add(-24 * time.Hour).Format(time.RFC3339), - }) - // A matching room owned by a different account. 
With the Mongo - // hydration removed, the spotlight userAccount term filter is the - // sole access boundary — this must not leak into alice's results. - seedDoc(t, f.esURL, f.spotlightIndex, "spot-r3", map[string]any{ - "roomId": "r3", - "roomName": "engineering-secret", - "roomType": "channel", - "userAccount": "mallory", - "siteId": "site-local", - "joinedAt": now.Add(-12 * time.Hour).Format(time.RFC3339), - }) - - reqBytes, err := json.Marshal(model.SearchRoomsRequest{Query: "engineering"}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchRooms(account), reqBytes, 10*time.Second) - require.NoError(t, err) - - var resp model.SearchRoomsResponse - require.NoError(t, json.Unmarshal(msg.Data, &resp)) - - require.Len(t, resp.Rooms, 2, "both rooms matching 'engineering' must be returned") - byID := map[string]model.SearchRoom{} - for _, r := range resp.Rooms { - byID[r.RoomID] = r - } - assert.Equal(t, model.SearchRoom{RoomID: "r1", Name: "engineering-announcements", RoomType: "channel", SiteID: "site-local"}, byID["r1"]) - assert.Equal(t, model.SearchRoom{RoomID: "r2", Name: "engineering-random", RoomType: "channel", SiteID: "site-local"}, byID["r2"]) - _, leaked := byID["r3"] - assert.False(t, leaked, "rooms owned by another account must not leak") -} - -func TestIntegration_SearchRooms_RoomTypeChannelFilter(t *testing.T) { - f := setupRoomsFixture(t) - - const account = "bob" - now := time.Now().UTC() - - seedDoc(t, f.esURL, f.spotlightIndex, "spot-b-r1", map[string]any{ - "roomId": "b-r1", - "roomName": "bob-alice", - "roomType": "dm", - "userAccount": account, - "siteId": "site-local", - "joinedAt": now.Add(-1 * time.Hour).Format(time.RFC3339), - }) - seedDoc(t, f.esURL, f.spotlightIndex, "spot-b-r2", map[string]any{ - "roomId": "b-r2", - "roomName": "bob-channel", - "roomType": "channel", - "userAccount": account, - "siteId": "site-local", - "joinedAt": now.Add(-2 * time.Hour).Format(time.RFC3339), - }) - - reqBytes, err := 
json.Marshal(model.SearchRoomsRequest{Query: "bob", RoomType: "channel"}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchRooms(account), reqBytes, 10*time.Second) - require.NoError(t, err) - - var resp model.SearchRoomsResponse - require.NoError(t, json.Unmarshal(msg.Data, &resp)) - - require.Len(t, resp.Rooms, 1) - assert.Equal(t, model.SearchRoom{RoomID: "b-r2", Name: "bob-channel", RoomType: "channel", SiteID: "site-local"}, resp.Rooms[0], - "only the channel room must match roomType=channel filter") -} - -func TestIntegration_SearchRooms_EmptyQueryReturnsBadRequest(t *testing.T) { - f := setupRoomsFixture(t) - - reqBytes, err := json.Marshal(model.SearchRoomsRequest{Query: ""}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchRooms("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) -} - -func TestIntegration_SearchRooms_RoomTypeAppReturnsBadRequest(t *testing.T) { - f := setupRoomsFixture(t) - - reqBytes, err := json.Marshal(model.SearchRoomsRequest{Query: "x", RoomType: "app"}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchRooms("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) - assert.Contains(t, envelope.Error, "invalid roomType") -} - -// --- search.messages v2 integration ----------------------------------------- - -// messagesV2Fixture stubs ES with a fake HTTP server (httptest). The -// messages path is pure ES — no Mongo round-trip — so no Mongo fixture -// is wired. 
-type messagesV2Fixture struct { - clientNATS *nats.Conn -} - -func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { - t.Helper() - ctx := context.Background() - - // Stub ES: always return a canned response containing one hit. - esStub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - // Drain the body so the HTTP/1.1 connection stays open. - _, _ = io.Copy(io.Discard, r.Body) - // The Elastic Go client performs a "product check" handshake on - // connect and rejects any server that doesn't advertise itself - // as Elasticsearch via this header. Set it on every response so - // the stub passes the check regardless of which endpoint is hit. - w.Header().Set("X-Elastic-Product", "Elasticsearch") - w.Header().Set("Content-Type", "application/json") - _, _ = w.Write([]byte(`{"hits":{"total":{"value":1},"hits":[{"_source":{` + - `"messageId":"m1","roomId":"r1","siteId":"site-a","userId":"u1",` + - `"userAccount":"alice","content":"hello","createdAt":"2026-04-01T12:00:00Z"}}]}}`)) - })) - t.Cleanup(esStub.Close) - - // Valkey stub — use the fakeCache wired in-process via handler injection. - fakeValkey := newFakeCache() - fakeValkey.store["alice"] = map[string]int64{} // empty restricted map, cache hit - - natsURL := sharedNATS(t) - - serverNATS, err := natsutil.Connect(natsURL, "") - require.NoError(t, err) - t.Cleanup(func() { _ = serverNATS.Drain() }) - - clientNATS, err := nats.Connect(natsURL) - require.NoError(t, err) - t.Cleanup(func() { clientNATS.Close() }) - - // Wire search-service with the stub ES engine. No Mongo store needed - // for the messages path. 
- engine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: esStub.URL}) - require.NoError(t, err) - esStore := newESStore(engine, testUserRoomIndex) - - h := newHandler(esStore, nil, nil, fakeValkey, handlerConfig{ - DocCounts: 25, - MaxDocCounts: 100, - RestrictedRoomsCacheTTL: 5 * time.Minute, - RecentWindow: 365 * 24 * time.Hour, - RequestTimeout: 5 * time.Second, - UserRoomIndex: testUserRoomIndex, - SpotlightReadPattern: "spotlight-*", - }) - - router := natsrouter.New(serverNATS, "search-service-test-v2") - router.Use(natsrouter.RequestID()) - h.Register(router) - // Flush — see setupAppsFixture for the rationale. - require.NoError(t, serverNATS.NatsConn().Flush()) - t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) - - return &messagesV2Fixture{clientNATS: clientNATS} -} - -func TestIntegration_SearchMessages_V2_HitProjection(t *testing.T) { - f := setupMessagesV2Fixture(t) - - reqBytes, err := json.Marshal(model.SearchMessagesRequest{Query: "hello"}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchMessages("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var resp model.SearchMessagesResponse - require.NoError(t, json.Unmarshal(msg.Data, &resp)) - - require.Len(t, resp.Messages, 1) - assert.EqualValues(t, 1, resp.Total) - - got := resp.Messages[0] - assert.Equal(t, "m1", got.MessageID) - assert.Equal(t, "r1", got.RoomID) - assert.Equal(t, "site-a", got.SiteID) - assert.Equal(t, "alice", got.UserAccount) - assert.Equal(t, "hello", got.Content) -} - -func TestIntegration_SearchMessages_V2_EmptyQueryReturnsBadRequest(t *testing.T) { - f := setupMessagesV2Fixture(t) - - reqBytes, err := json.Marshal(model.SearchMessagesRequest{Query: ""}) - require.NoError(t, err) - - msg, err := f.clientNATS.Request(subject.SearchMessages("alice"), reqBytes, 5*time.Second) - require.NoError(t, err) - - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, 
&envelope)) - assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) -} diff --git a/search-service/integration_users_test.go b/search-service/integration_users_test.go new file mode 100644 index 000000000..95b6b3330 --- /dev/null +++ b/search-service/integration_users_test.go @@ -0,0 +1,134 @@ +//go:build integration + +package main + +// search.users integration tests. The path needs only NATS plus an +// httptest stub for the third-party HR endpoint — no ES, Mongo, or +// Valkey. + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/nats-io/nats.go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/restyutil" + "github.com/hmchangw/chat/pkg/subject" +) + +// usersFixture is a minimal fixture for the search.users path: NATS for the +// request/reply layer, and an httptest.Server standing in for the third-party +// HR endpoint. No Mongo or ES containers are needed. +type usersFixture struct { + clientNATS *nats.Conn + thirdParty *httptest.Server // controls the stub response +} + +func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixture { + t.Helper() + + // Start the stub third-party server. + stub := httptest.NewServer(thirdPartyHandler) + t.Cleanup(stub.Close) + + natsURL := sharedNATS(t) + serverNC, err := natsutil.Connect(natsURL, "") + require.NoError(t, err, "connect nats (server side)") + t.Cleanup(func() { _ = serverNC.Drain() }) + + clientNC, err := nats.Connect(natsURL) + require.NoError(t, err, "connect nats (client side)") + t.Cleanup(func() { clientNC.Close() }) + + // Wire the handler with a real httpUsersClient pointing at the stub. 
+ usersRC := restyutil.New(stub.URL, restyutil.WithTimeout(5*time.Second)) + usersClient := newHTTPUsersClient(usersRC, "") + + h := newHandler(nil, nil, usersClient, newFakeCache(), handlerConfig{ + DocCounts: 25, + MaxDocCounts: 100, + RequestTimeout: 5 * time.Second, + }) + + router := natsrouter.New(serverNC, "search-service-test") + router.Use(natsrouter.RequestID()) + h.Register(router) + // Flush — see setupAppsFixture for the rationale. + require.NoError(t, serverNC.NatsConn().Flush()) + t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) + + return &usersFixture{clientNATS: clientNC, thirdParty: stub} +} + +func TestIntegration_SearchUsers_Happy(t *testing.T) { + // Stub returns two users matching the query. + stubResp := `[{"account":"alice","engName":"Alice Wang"},{"account":"alice2","engName":"Alice Chen"}]` + + f := setupUsersFixture(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(stubResp)) + })) + + reqBytes, err := json.Marshal(model.SearchUsersRequest{Query: "alice"}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchUsers("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var users []model.SearchUser + require.NoError(t, json.Unmarshal(msg.Data, &users)) + + require.Len(t, users, 2) + assert.Equal(t, "alice", users[0].Account) + assert.Equal(t, "Alice Wang", users[0].EngName) +} + +func TestIntegration_SearchUsers_EmptyQueryReturnsBadRequest(t *testing.T) { + // Stub should never be called for a bad-request scenario. 
+ f := setupUsersFixture(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + t.Error("third-party stub should not be called for empty query") + w.WriteHeader(http.StatusInternalServerError) + })) + + reqBytes, err := json.Marshal(model.SearchUsersRequest{Query: ""}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchUsers("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var envelope model.ErrorResponse + require.NoError(t, json.Unmarshal(msg.Data, &envelope)) + require.NotEmpty(t, envelope.Error) + assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) +} + +func TestIntegration_SearchUsers_ThirdPartyErrorReturnsInternal(t *testing.T) { + // Stub returns a 503 to simulate a backend outage. + f := setupUsersFixture(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + + reqBytes, err := json.Marshal(model.SearchUsersRequest{Query: "alice"}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchUsers("alice"), reqBytes, 5*time.Second) + require.NoError(t, err) + + var envelope model.ErrorResponse + require.NoError(t, json.Unmarshal(msg.Data, &envelope)) + require.NotEmpty(t, envelope.Error) + assert.Equal(t, natsrouter.CodeInternal, envelope.Code, + "non-2xx from third-party must surface as internal error, not raw status") + // Raw third-party details must not leak to the caller. + assert.NotContains(t, envelope.Error, "503", "status code from third-party must not leak") +} diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index 586f464de..f0dc444df 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -3,16 +3,16 @@ package main // This file owns the process-shared test infrastructure used by every -// fixture in integration_test.go. 
Each container is started exactly once -// via sync.Once and lives for the entire `go test` run; Ryuk (from -// testcontainers-go) reaps it after the process exits. +// per-endpoint integration_*_test.go file. Each container is started +// exactly once via sync.Once and lives for the entire `go test` run; +// Ryuk (from testcontainers-go) reaps it after the process exits. // // Sharing is safe because tests within this package run sequentially and // each fixture isolates state per-test: // // - Elasticsearch: unique index name per test (uniqueESIndex), DELETEd // on cleanup. -// - Valkey: flushSharedValkey wipes the keyspace on cleanup. +// - Valkey: flushValkey wipes the keyspace on cleanup. // - NATS: each test creates its own *nats.Conn pair and // router.Shutdown / nc.Close remove subscriptions before the next // test starts. Each fixture also uses a distinct queue group name. @@ -20,11 +20,17 @@ package main // CCS tests are the one exception — they need two networked ES nodes and // stand up their own pair inside setupCCSFixture. They still piggyback on // the shared Valkey and NATS, since those don't care about the topology. +// +// This file also owns the test-wide constants and helpers that all +// integration files share: testUserRoomIndex, testHTTPClient, and seedDoc. import ( + "bytes" "context" + "encoding/json" "fmt" "hash/fnv" + "io" "net/http" "sync" "testing" @@ -40,6 +46,32 @@ import ( "github.com/hmchangw/chat/pkg/valkeyutil" ) +const testUserRoomIndex = "user-room" + +// testHTTPClient is a bounded HTTP client for ES control-plane calls — +// stalled containers shouldn't be able to hang the integration job past +// the per-call deadline. Kept small on purpose: these calls hit localhost +// (docker-mapped port) and are cheap when they succeed. +var testHTTPClient = &http.Client{Timeout: 10 * time.Second} + +// seedDoc PUTs a JSON document into ES, synchronously refreshing the index +// so the next search sees it. 
+func seedDoc(t *testing.T, esURL, index, id string, doc any) { + t.Helper() + data, err := json.Marshal(doc) + require.NoError(t, err) + url := fmt.Sprintf("%s/%s/_doc/%s?refresh=true", esURL, index, id) + req, err := http.NewRequest(http.MethodPut, url, bytes.NewReader(data)) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + resp, err := testHTTPClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + body, _ := io.ReadAll(resp.Body) + require.Truef(t, resp.StatusCode == http.StatusCreated || resp.StatusCode == http.StatusOK, + "seedDoc %s/%s: status=%d body=%s", index, id, resp.StatusCode, body) +} + var ( sharedESOnce sync.Once sharedESURL string From 17ef382f214c80474c8f7dfc6e06432ae0de4fe7 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 08:00:40 +0000 Subject: [PATCH 05/23] test(search-service): add router cleanup to CCS fixture; trim comments - Add t.Cleanup(router.Shutdown) to setupCCSFixture to match every other fixture in the package (per CodeRabbit review on PR #208). - Trim narration / what-comments across the integration_*_test.go files and setup_shared_test.go. Preserve genuine WHY comments: X-Elastic-Product product-check, mallory access boundary, FNV hash rationale, FLUSHDB isolation note, Flush before subscribe-race. --- search-service/integration_apps_test.go | 23 ++++--------- search-service/integration_ccs_test.go | 1 + search-service/integration_messages_test.go | 11 +------ search-service/integration_rooms_test.go | 17 ++-------- search-service/integration_users_test.go | 11 +------ search-service/setup_shared_test.go | 36 +++++---------------- 6 files changed, 20 insertions(+), 79 deletions(-) diff --git a/search-service/integration_apps_test.go b/search-service/integration_apps_test.go index 28ccd8814..7c3cedaf4 100644 --- a/search-service/integration_apps_test.go +++ b/search-service/integration_apps_test.go @@ -2,9 +2,7 @@ package main -// search.apps integration tests. 
Uses the process-shared Mongo -// (testutil.MongoDB) and NATS (sharedNATS in setup_shared_test.go); ES -// and Valkey are stubbed because the apps path doesn't touch them. +// Integration tests for search.apps (Mongo + NATS; ES/Valkey stubbed). import ( "context" @@ -25,10 +23,6 @@ import ( "github.com/hmchangw/chat/pkg/testutil" ) -// setupAppsFixture starts an isolated Mongo container (via pkg/testutil) and -// a single search-service router bound to that DB. ES/Valkey are not used by -// search.apps, so we wire fakes (the existing `fakeStore` / `fakeCache` -// satisfy the interfaces but never get called on the apps path). type appsFixture struct { clientNATS *nats.Conn mongoDB *mongo.Database @@ -49,7 +43,6 @@ func setupAppsFixture(t *testing.T) *appsFixture { require.NoError(t, err) t.Cleanup(func() { clientNATS.Close() }) - // Wire the handler with a real mongoStore and stub ES/cache. mongoStore := newMongoStore(mongoDB) store := &fakeStore{} cache := newFakeCache() @@ -65,12 +58,9 @@ func setupAppsFixture(t *testing.T) *appsFixture { router := natsrouter.New(serverNATS, "search-service-test") router.Use(natsrouter.RequestID()) h.Register(router) - // Flush ensures subscriptions are registered on the server before the - // fixture returns. Without this, fast tests that fire a request - // immediately can hit "no responders available" while subscriptions - // are still propagating. natsutil.Connect returns an otelnats.Conn - // wrapper that doesn't expose Flush; reach through to the underlying - // *nats.Conn. + // Flush before returning so a fast test doesn't hit "no responders" + // while subscriptions propagate. otelnats wraps the conn — reach + // through to *nats.Conn. require.NoError(t, serverNATS.NatsConn().Flush()) t.Cleanup(func() { _ = router.Shutdown(context.Background()) @@ -83,9 +73,8 @@ func TestIntegration_SearchApps_PrototypePipeline(t *testing.T) { f := setupAppsFixture(t) ctx := context.Background() - // Seed 3 apps in Mongo. 
The prototype pipeline matches by `name` regex - // (case-insensitive) and applies $limit; the full $lookup access-guard - // pipeline is implemented in a follow-up. + // Prototype pipeline matches by `name` regex + $limit; $lookup + // access-guard is a follow-up. _, err := f.mongoDB.Collection("apps").InsertMany(ctx, []any{ map[string]any{"_id": "a1", "name": "Weather Alpha", "assistant": map[string]any{"enabled": true, "name": "weather.bot"}}, map[string]any{"_id": "a2", "name": "Weatherly", "assistant": map[string]any{"enabled": false, "name": "weatherly.bot"}}, diff --git a/search-service/integration_ccs_test.go b/search-service/integration_ccs_test.go index f1031d0b3..0b618b491 100644 --- a/search-service/integration_ccs_test.go +++ b/search-service/integration_ccs_test.go @@ -134,6 +134,7 @@ func setupCCSFixture(t *testing.T) *ccsFixture { handler.Register(router) // Flush — see setupAppsFixture for the rationale. require.NoError(t, serverNC.NatsConn().Flush()) + t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) return &ccsFixture{ localURL: localURL, diff --git a/search-service/integration_messages_test.go b/search-service/integration_messages_test.go index 4f28578a5..7e0614328 100644 --- a/search-service/integration_messages_test.go +++ b/search-service/integration_messages_test.go @@ -2,9 +2,7 @@ package main -// search.messages v2 integration tests. Stubs ES with an httptest -// server because the messages path is pure ES — no Mongo round-trip — -// and uses the process-shared NATS from setup_shared_test.go. +// Integration tests for search.messages v2 (ES stubbed via httptest, shared NATS). import ( "context" @@ -26,9 +24,6 @@ import ( "github.com/hmchangw/chat/pkg/subject" ) -// messagesV2Fixture stubs ES with a fake HTTP server (httptest). The -// messages path is pure ES — no Mongo round-trip — so no Mongo fixture -// is wired. 
type messagesV2Fixture struct { clientNATS *nats.Conn } @@ -37,7 +32,6 @@ func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { t.Helper() ctx := context.Background() - // Stub ES: always return a canned response containing one hit. esStub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { // Drain the body so the HTTP/1.1 connection stays open. _, _ = io.Copy(io.Discard, r.Body) @@ -53,7 +47,6 @@ func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { })) t.Cleanup(esStub.Close) - // Valkey stub — use the fakeCache wired in-process via handler injection. fakeValkey := newFakeCache() fakeValkey.store["alice"] = map[string]int64{} // empty restricted map, cache hit @@ -67,8 +60,6 @@ func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { require.NoError(t, err) t.Cleanup(func() { clientNATS.Close() }) - // Wire search-service with the stub ES engine. No Mongo store needed - // for the messages path. engine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: esStub.URL}) require.NoError(t, err) esStore := newESStore(engine, testUserRoomIndex) diff --git a/search-service/integration_rooms_test.go b/search-service/integration_rooms_test.go index d0e90ba23..fbc00ed39 100644 --- a/search-service/integration_rooms_test.go +++ b/search-service/integration_rooms_test.go @@ -2,10 +2,7 @@ package main -// search.rooms integration tests. Uses the process-shared ES, NATS, and -// Valkey from setup_shared_test.go; per-test isolation comes from a -// unique spotlight index name (deleted on cleanup) plus a Valkey -// FLUSHDB on cleanup. +// Integration tests for search.rooms (real ES + shared NATS + Valkey). import ( "bytes" @@ -27,21 +24,14 @@ import ( "github.com/hmchangw/chat/pkg/subject" ) -// roomsFixture wires a real ES container (for the spotlight index) and -// NATS. search.rooms is served directly from the spotlight index, so no -// Mongo is involved. 
The ES container is process-shared; per-test -// isolation comes from a unique spotlight index name (deleted on -// cleanup) plus a Valkey FLUSHDB on cleanup. +// roomsFixture uses a per-test spotlight index against the shared ES so +// sibling tests can't leak hits into each other. type roomsFixture struct { clientNATS *nats.Conn esURL string spotlightIndex string } -// setupRoomsFixture wires the search-service router against the -// process-shared ES, Valkey and NATS containers. The spotlight index -// name is unique per test so leftovers from a sibling test can't leak -// into this one's hit set. func setupRoomsFixture(t *testing.T) *roomsFixture { t.Helper() ctx := context.Background() @@ -125,7 +115,6 @@ func TestIntegration_SearchRooms_HappyPath(t *testing.T) { const account = "alice" now := time.Now().UTC() - // Seed spotlight docs for two rooms alice is in. seedDoc(t, f.esURL, f.spotlightIndex, "spot-r1", map[string]any{ "roomId": "r1", "roomName": "engineering-announcements", diff --git a/search-service/integration_users_test.go b/search-service/integration_users_test.go index 95b6b3330..f87c7693c 100644 --- a/search-service/integration_users_test.go +++ b/search-service/integration_users_test.go @@ -2,9 +2,7 @@ package main -// search.users integration tests. The path needs only NATS plus an -// httptest stub for the third-party HR endpoint — no ES, Mongo, or -// Valkey. +// Integration tests for search.users (NATS + httptest stub for HR endpoint). import ( "context" @@ -25,9 +23,6 @@ import ( "github.com/hmchangw/chat/pkg/subject" ) -// usersFixture is a minimal fixture for the search.users path: NATS for the -// request/reply layer, and an httptest.Server standing in for the third-party -// HR endpoint. No Mongo or ES containers are needed. 
type usersFixture struct { clientNATS *nats.Conn thirdParty *httptest.Server // controls the stub response @@ -36,7 +31,6 @@ type usersFixture struct { func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixture { t.Helper() - // Start the stub third-party server. stub := httptest.NewServer(thirdPartyHandler) t.Cleanup(stub.Close) @@ -49,7 +43,6 @@ func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixtu require.NoError(t, err, "connect nats (client side)") t.Cleanup(func() { clientNC.Close() }) - // Wire the handler with a real httpUsersClient pointing at the stub. usersRC := restyutil.New(stub.URL, restyutil.WithTimeout(5*time.Second)) usersClient := newHTTPUsersClient(usersRC, "") @@ -70,7 +63,6 @@ func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixtu } func TestIntegration_SearchUsers_Happy(t *testing.T) { - // Stub returns two users matching the query. stubResp := `[{"account":"alice","engName":"Alice Wang"},{"account":"alice2","engName":"Alice Chen"}]` f := setupUsersFixture(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -113,7 +105,6 @@ func TestIntegration_SearchUsers_EmptyQueryReturnsBadRequest(t *testing.T) { } func TestIntegration_SearchUsers_ThirdPartyErrorReturnsInternal(t *testing.T) { - // Stub returns a 503 to simulate a backend outage. f := setupUsersFixture(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusServiceUnavailable) })) diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index f0dc444df..d6edbc62a 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -2,27 +2,12 @@ package main -// This file owns the process-shared test infrastructure used by every -// per-endpoint integration_*_test.go file. 
Each container is started -// exactly once via sync.Once and lives for the entire `go test` run; -// Ryuk (from testcontainers-go) reaps it after the process exits. -// -// Sharing is safe because tests within this package run sequentially and -// each fixture isolates state per-test: -// -// - Elasticsearch: unique index name per test (uniqueESIndex), DELETEd -// on cleanup. -// - Valkey: flushValkey wipes the keyspace on cleanup. -// - NATS: each test creates its own *nats.Conn pair and -// router.Shutdown / nc.Close remove subscriptions before the next -// test starts. Each fixture also uses a distinct queue group name. -// -// CCS tests are the one exception — they need two networked ES nodes and -// stand up their own pair inside setupCCSFixture. They still piggyback on -// the shared Valkey and NATS, since those don't care about the topology. -// -// This file also owns the test-wide constants and helpers that all -// integration files share: testUserRoomIndex, testHTTPClient, and seedDoc. +// Process-shared ES, Valkey, and NATS containers used by every +// integration_*_test.go file. Each starts once via sync.Once and is +// reaped by Ryuk at process exit. Tests run sequentially and isolate +// per-test via uniqueESIndex (DELETE on cleanup), Valkey FLUSHDB on +// cleanup, and a fresh *nats.Conn pair per test. CCS tests bring their +// own ES pair. import ( "bytes" @@ -48,10 +33,7 @@ import ( const testUserRoomIndex = "user-room" -// testHTTPClient is a bounded HTTP client for ES control-plane calls — -// stalled containers shouldn't be able to hang the integration job past -// the per-call deadline. Kept small on purpose: these calls hit localhost -// (docker-mapped port) and are cheap when they succeed. +// testHTTPClient bounds ES control-plane calls so a stalled container can't hang the job. 
var testHTTPClient = &http.Client{Timeout: 10 * time.Second} // seedDoc PUTs a JSON document into ES, synchronously refreshing the index @@ -86,9 +68,7 @@ var ( sharedNATSErr error ) -// sharedSingleNodeES returns the URL of a process-shared single-node ES -// container. CCS tests do NOT use this — they need a pair of networked -// clusters and stand up their own. +// sharedSingleNodeES returns the URL of the process-shared single-node ES. func sharedSingleNodeES(t *testing.T) string { t.Helper() sharedESOnce.Do(func() { From 1e5d97045168477a5317bae798a5887e607d1d8a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 08:04:44 +0000 Subject: [PATCH 06/23] test(search-service): pre-warm shared containers in parallel; halve ES heap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TestMain pre-warms shared ES, NATS, and Valkey concurrently via goroutines. First-test wall-clock for cold start drops to max(ES, NATS, Valkey) ≈ ES alone (~30-60s) instead of the sum. Refactored each sharedXxx(t) into a no-t ensureSharedXxx() so TestMain can drive them without a *testing.T. - Drop ES heap from -Xms512m -Xmx512m to -Xms256m -Xmx256m for both the shared single-node ES and the two CCS ES nodes. The test seed sizes (<10 docs per test) fit comfortably in 256m; saves ~5-10s startup per container and ~250MB RAM per node. 
--- search-service/integration_ccs_test.go | 2 +- search-service/setup_shared_test.go | 53 +++++++++++++++++++------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/search-service/integration_ccs_test.go b/search-service/integration_ccs_test.go index 0b618b491..1a823fee6 100644 --- a/search-service/integration_ccs_test.go +++ b/search-service/integration_ccs_test.go @@ -174,7 +174,7 @@ func startESForCCS(t *testing.T, nw *testcontainers.DockerNetwork, alias, cluste "network.host": "0.0.0.0", "transport.host": "0.0.0.0", "cluster.routing.allocation.disk.threshold_enabled": "false", - "ES_JAVA_OPTS": "-Xms512m -Xmx512m", + "ES_JAVA_OPTS": "-Xms256m -Xmx256m", }, WaitingFor: wait.ForAll( wait.ForHTTP("/").WithPort("9200/tcp").WithStartupTimeout(120*time.Second), diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index d6edbc62a..b247752ab 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -17,6 +17,7 @@ import ( "hash/fnv" "io" "net/http" + "os" "sync" "testing" "time" @@ -68,9 +69,31 @@ var ( sharedNATSErr error ) +// TestMain pre-warms the shared containers concurrently so the first +// test doesn't pay their startup serially. Total wall-clock for cold +// start drops to max(ES, NATS, Valkey) ≈ ES alone (~30-60s) instead of +// the sum. +func TestMain(m *testing.M) { + var wg sync.WaitGroup + wg.Add(3) + go func() { defer wg.Done(); ensureSharedES() }() + go func() { defer wg.Done(); ensureSharedValkey() }() + go func() { defer wg.Done(); ensureSharedNATS() }() + wg.Wait() + os.Exit(m.Run()) +} + // sharedSingleNodeES returns the URL of the process-shared single-node ES. 
func sharedSingleNodeES(t *testing.T) string { t.Helper() + ensureSharedES() + if sharedESErr != nil { + t.Fatalf("shared elasticsearch: %v", sharedESErr) + } + return sharedESURL +} + +func ensureSharedES() { sharedESOnce.Do(func() { ctx := context.Background() container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ @@ -80,7 +103,7 @@ func sharedSingleNodeES(t *testing.T) string { Env: map[string]string{ "discovery.type": "single-node", "xpack.security.enabled": "false", - "ES_JAVA_OPTS": "-Xms512m -Xmx512m", + "ES_JAVA_OPTS": "-Xms256m -Xmx256m", "cluster.routing.allocation.disk.threshold_enabled": "false", }, WaitingFor: wait.ForAll( @@ -110,10 +133,6 @@ func sharedSingleNodeES(t *testing.T) string { } sharedESURL = fmt.Sprintf("http://%s:%s", host, port.Port()) }) - if sharedESErr != nil { - t.Fatalf("shared elasticsearch: %v", sharedESErr) - } - return sharedESURL } // sharedValkey returns the addr of a process-shared Valkey container. @@ -121,6 +140,14 @@ func sharedSingleNodeES(t *testing.T) string { // keyspace is wiped on test cleanup. func sharedValkey(t *testing.T) string { t.Helper() + ensureSharedValkey() + if sharedValkeyErr != nil { + t.Fatalf("shared valkey: %v", sharedValkeyErr) + } + return sharedValkeyAddr +} + +func ensureSharedValkey() { sharedValkeyOnce.Do(func() { ctx := context.Background() container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ @@ -150,15 +177,19 @@ func sharedValkey(t *testing.T) string { } sharedValkeyAddr = fmt.Sprintf("%s:%s", host, port.Port()) }) - if sharedValkeyErr != nil { - t.Fatalf("shared valkey: %v", sharedValkeyErr) - } - return sharedValkeyAddr } // sharedNATS returns the URL of a process-shared NATS container. 
func sharedNATS(t *testing.T) string { t.Helper() + ensureSharedNATS() + if sharedNATSErr != nil { + t.Fatalf("shared nats: %v", sharedNATSErr) + } + return sharedNATSURL +} + +func ensureSharedNATS() { sharedNATSOnce.Do(func() { ctx := context.Background() c, err := natsmod.Run(ctx, testimages.NATS, @@ -176,10 +207,6 @@ func sharedNATS(t *testing.T) string { } sharedNATSURL = url }) - if sharedNATSErr != nil { - t.Fatalf("shared nats: %v", sharedNATSErr) - } - return sharedNATSURL } // uniqueESIndex returns a per-test ES index name derived from t.Name() From a71664bb9ac78c49fb3c8c55308cab59607f49f7 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 08:32:38 +0000 Subject: [PATCH 07/23] test(search-service): explicit shared-container cleanup; disable Ryuk in CI CI sets TESTCONTAINERS_RYUK_DISABLED=true on the integration job because Ryuk doesn't run reliably on this runner. Without Ryuk the shared containers (started via sync.Once with no t.Cleanup) would leak per test job. Wrap m.Run() so TestMain calls terminateShared() on clean exits and explicitly Terminates each shared container (ES, Valkey, NATS). Locally Ryuk stays enabled as a safety net for SIGKILL / Ctrl+C where m.Run never returns. --- .github/workflows/ci.yml | 8 +++++ search-service/setup_shared_test.go | 56 ++++++++++++++++++++++------- 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3c9507b5d..e3b4c3506 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -322,7 +322,15 @@ jobs: # Explicit `set -o pipefail` so make's exit status (not tee's) # drives the step outcome. Previously a pipefail gap let a # failing integration test silently report green. + # + # TESTCONTAINERS_RYUK_DISABLED=true: Ryuk doesn't work reliably on + # this CI runner. Shared containers in test packages are cleaned + # up explicitly via TestMain (see search-service/setup_shared_test.go); + # per-test containers use t.Cleanup. 
Locally Ryuk stays enabled as + # a safety net for SIGKILL / Ctrl+C. - name: Integration tests (${{ matrix.target.path }}) + env: + TESTCONTAINERS_RYUK_DISABLED: "true" run: | set -o pipefail make test-integration SERVICE=${{ matrix.target.path }} 2>&1 \ diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index b247752ab..462eb10d7 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -56,23 +56,28 @@ func seedDoc(t *testing.T, esURL, index, id string, doc any) { } var ( - sharedESOnce sync.Once - sharedESURL string - sharedESErr error + sharedESOnce sync.Once + sharedESContainer testcontainers.Container + sharedESURL string + sharedESErr error - sharedValkeyOnce sync.Once - sharedValkeyAddr string - sharedValkeyErr error + sharedValkeyOnce sync.Once + sharedValkeyContainer testcontainers.Container + sharedValkeyAddr string + sharedValkeyErr error - sharedNATSOnce sync.Once - sharedNATSURL string - sharedNATSErr error + sharedNATSOnce sync.Once + sharedNATSContainer testcontainers.Container + sharedNATSURL string + sharedNATSErr error ) // TestMain pre-warms the shared containers concurrently so the first -// test doesn't pay their startup serially. Total wall-clock for cold -// start drops to max(ES, NATS, Valkey) ≈ ES alone (~30-60s) instead of -// the sum. +// test doesn't pay their startup serially, then explicitly terminates +// them on clean exit. Explicit cleanup is required because CI runs with +// TESTCONTAINERS_RYUK_DISABLED=true — Ryuk would otherwise reap the +// shared containers (they have no t.Cleanup). Locally Ryuk is enabled +// as a safety net for SIGKILL / Ctrl+C, where m.Run never returns. 
func TestMain(m *testing.M) { var wg sync.WaitGroup wg.Add(3) @@ -80,7 +85,29 @@ func TestMain(m *testing.M) { go func() { defer wg.Done(); ensureSharedValkey() }() go func() { defer wg.Done(); ensureSharedNATS() }() wg.Wait() - os.Exit(m.Run()) + code := m.Run() + terminateShared() + os.Exit(code) +} + +// terminateShared best-effort kills every shared container. Errors are +// logged but don't change the test exit code — the tests already passed +// or failed by this point. +func terminateShared() { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + for name, c := range map[string]testcontainers.Container{ + "elasticsearch": sharedESContainer, + "valkey": sharedValkeyContainer, + "nats": sharedNATSContainer, + } { + if c == nil { + continue + } + if err := c.Terminate(ctx); err != nil { + fmt.Fprintf(os.Stderr, "terminate shared %s: %v\n", name, err) + } + } } // sharedSingleNodeES returns the URL of the process-shared single-node ES. @@ -131,6 +158,7 @@ func ensureSharedES() { sharedESErr = fmt.Errorf("get shared es port: %w", err) return } + sharedESContainer = container sharedESURL = fmt.Sprintf("http://%s:%s", host, port.Port()) }) } @@ -175,6 +203,7 @@ func ensureSharedValkey() { sharedValkeyErr = fmt.Errorf("get shared valkey port: %w", err) return } + sharedValkeyContainer = container sharedValkeyAddr = fmt.Sprintf("%s:%s", host, port.Port()) }) } @@ -205,6 +234,7 @@ func ensureSharedNATS() { sharedNATSErr = fmt.Errorf("get shared nats url: %w", err) return } + sharedNATSContainer = c sharedNATSURL = url }) } From 436dc39177d5d28e78f9570a1dc8991fe003bc28 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 08:37:21 +0000 Subject: [PATCH 08/23] test(testutil,search-service): add TerminateMongo for explicit cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pkg/testutil/mongo.go now exports TerminateMongo, which disconnects the shared client and stops the shared 
container. search-service's terminateShared() calls it so the Mongo container is reaped on clean exits — same need as ES/Valkey/NATS now that CI runs with TESTCONTAINERS_RYUK_DISABLED=true. Other services that use testutil.MongoDB can opt in by calling TerminateMongo from their own TestMain when they need explicit cleanup. --- pkg/testutil/mongo.go | 28 +++++++++++++++++++++++++--- search-service/setup_shared_test.go | 2 ++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pkg/testutil/mongo.go b/pkg/testutil/mongo.go index 5bbe4f4ad..a45f509f5 100644 --- a/pkg/testutil/mongo.go +++ b/pkg/testutil/mongo.go @@ -6,10 +6,12 @@ import ( "context" "fmt" "hash/fnv" + "os" "sync" "testing" "time" + "github.com/testcontainers/testcontainers-go" "github.com/testcontainers/testcontainers-go/modules/mongodb" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" @@ -18,9 +20,10 @@ import ( ) var ( - mongoOnce sync.Once - mongoClient *mongo.Client - mongoInitErr error + mongoOnce sync.Once + mongoClient *mongo.Client + mongoContainer testcontainers.Container + mongoInitErr error ) func ensureMongoClient() (*mongo.Client, error) { @@ -44,10 +47,29 @@ func ensureMongoClient() (*mongo.Client, error) { return } mongoClient = c + mongoContainer = container }) return mongoClient, mongoInitErr } +// TerminateMongo disconnects the shared client and stops the shared +// container. Best-effort; errors go to stderr. Intended for TestMain to +// call on clean exits when Ryuk is disabled (e.g., in CI). 
+func TerminateMongo() { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if mongoClient != nil { + if err := mongoClient.Disconnect(ctx); err != nil { + fmt.Fprintf(os.Stderr, "disconnect shared mongo client: %v\n", err) + } + } + if mongoContainer != nil { + if err := mongoContainer.Terminate(ctx); err != nil { + fmt.Fprintf(os.Stderr, "terminate shared mongo: %v\n", err) + } + } +} + // MongoDB returns an isolated Mongo database for the current test; dropped on t.Cleanup. func MongoDB(t *testing.T, prefix string) *mongo.Database { t.Helper() diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index 462eb10d7..c4ea40461 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -28,6 +28,7 @@ import ( natsmod "github.com/testcontainers/testcontainers-go/modules/nats" "github.com/testcontainers/testcontainers-go/wait" + "github.com/hmchangw/chat/pkg/testutil" "github.com/hmchangw/chat/pkg/testutil/testimages" "github.com/hmchangw/chat/pkg/valkeyutil" ) @@ -108,6 +109,7 @@ func terminateShared() { fmt.Fprintf(os.Stderr, "terminate shared %s: %v\n", name, err) } } + testutil.TerminateMongo() } // sharedSingleNodeES returns the URL of the process-shared single-node ES. From a5a5c5a9e9e89e7012f44822fe9ffb94ce976bdc Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 08:49:11 +0000 Subject: [PATCH 09/23] test(testutil): add Elasticsearch, NATS, Valkey + Terminate{All,Xxx} pkg/testutil now mirrors the Mongo/Cassandra/MinIO pattern for ES, NATS, and Valkey: each exposes Xxx(t) for tests, EnsureXxx() for TestMain pre-warming, and TerminateXxx() for explicit cleanup. testutil.TerminateAll fans out to every Terminate (each is a no-op if its container was never started) so any service's TestMain becomes a one-liner.
NATS starts with --jetstream so consumers that need streams just work against the shared container; core pub/sub consumers pay nothing extra. search-service drops its local sync.Once helpers and uses the testutil helpers throughout. setup_shared_test.go shrinks to just the test-wide testHTTPClient / seedDoc / uniqueESIndex / freshValkeyClient helpers that are specific to the search-* tests. CCS keeps its own networked ES pair (still local) but uses shared NATS/Valkey via testutil. Next: migrate other services that use shared containers (search-sync-worker, room-service, room-worker, etc.) to the testutil helpers + add TestMain calls. --- pkg/testutil/cassandra.go | 33 ++- pkg/testutil/elasticsearch.go | 96 +++++++++ pkg/testutil/minio.go | 27 ++- pkg/testutil/mongo.go | 4 + pkg/testutil/nats.go | 80 +++++++ pkg/testutil/terminate.go | 28 +++ pkg/testutil/valkey.go | 88 ++++++++ search-service/integration_apps_test.go | 2 +- search-service/integration_ccs_test.go | 3 +- search-service/integration_messages_test.go | 3 +- search-service/integration_rooms_test.go | 5 +- search-service/integration_users_test.go | 3 +- search-service/setup_shared_test.go | 227 +++----------------- 13 files changed, 385 insertions(+), 214 deletions(-) create mode 100644 pkg/testutil/elasticsearch.go create mode 100644 pkg/testutil/nats.go create mode 100644 pkg/testutil/terminate.go create mode 100644 pkg/testutil/valkey.go diff --git a/pkg/testutil/cassandra.go b/pkg/testutil/cassandra.go index 370edb086..01c2c9d2c 100644 --- a/pkg/testutil/cassandra.go +++ b/pkg/testutil/cassandra.go @@ -6,6 +6,7 @@ import ( "context" "fmt" "hash/fnv" + "os" "sync" "testing" "time" @@ -18,10 +19,11 @@ import ( const cassandraImage = "cassandra:5" var ( - cassOnce sync.Once - cassHost string - cassSession *gocql.Session - cassInitErr error + cassOnce sync.Once + cassContainer testcontainers.Container + cassHost string + cassSession *gocql.Session + cassInitErr error ) func ensureCassandraSession() 
(string, *gocql.Session, error) { @@ -70,10 +72,33 @@ func ensureCassandraSession() (string, *gocql.Session, error) { } cassHost = addr cassSession = s + cassContainer = container }) return cassHost, cassSession, cassInitErr } +// TerminateCassandra closes the shared session and stops the shared +// container. Best-effort, idempotent. +func TerminateCassandra() { + if cassSession != nil { + cassSession.Close() + cassSession = nil + } + if cassContainer == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := cassContainer.Terminate(ctx); err != nil { + fmt.Fprintf(os.Stderr, "terminate shared cassandra: %v\n", err) + } + cassContainer = nil +} + +// EnsureCassandra starts the shared Cassandra container if not already +// started. No-t variant intended for TestMain pre-warming. +func EnsureCassandra() error { _, _, err := ensureCassandraSession(); return err } + // CassandraKeyspace creates an isolated keyspace for the test (SimpleStrategy, RF=1). // Returns the keyspace name, an admin session for DDL, and the container host. 
func CassandraKeyspace(t *testing.T, prefix string) (keyspace string, admin *gocql.Session, hostAddr string) { diff --git a/pkg/testutil/elasticsearch.go b/pkg/testutil/elasticsearch.go new file mode 100644 index 000000000..c94e8d4d1 --- /dev/null +++ b/pkg/testutil/elasticsearch.go @@ -0,0 +1,96 @@ +//go:build integration + +package testutil + +import ( + "context" + "fmt" + "os" + "sync" + "testing" + "time" + + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/wait" + + "github.com/hmchangw/chat/pkg/testutil/testimages" +) + +var ( + esOnce sync.Once + esContainer testcontainers.Container + esURL string + esInitErr error +) + +func ensureElasticsearch() (string, error) { + esOnce.Do(func() { + ctx := context.Background() + container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: testcontainers.ContainerRequest{ + Image: testimages.Elasticsearch, + ExposedPorts: []string{"9200/tcp"}, + Env: map[string]string{ + "discovery.type": "single-node", + "xpack.security.enabled": "false", + "ES_JAVA_OPTS": "-Xms256m -Xmx256m", + "cluster.routing.allocation.disk.threshold_enabled": "false", + }, + WaitingFor: wait.ForAll( + wait.ForHTTP("/").WithPort("9200/tcp").WithStartupTimeout(120*time.Second), + wait.ForHTTP("/_cluster/health?wait_for_status=yellow&timeout=60s"). + WithPort("9200/tcp"). 
+ WithStartupTimeout(120*time.Second), + ), + }, + Started: true, + }) + if err != nil { + esInitErr = fmt.Errorf("start elasticsearch: %w", err) + return + } + host, err := container.Host(ctx) + if err != nil { + _ = container.Terminate(ctx) + esInitErr = fmt.Errorf("get es host: %w", err) + return + } + port, err := container.MappedPort(ctx, "9200") + if err != nil { + _ = container.Terminate(ctx) + esInitErr = fmt.Errorf("get es port: %w", err) + return + } + esContainer = container + esURL = fmt.Sprintf("http://%s:%s", host, port.Port()) + }) + return esURL, esInitErr +} + +// Elasticsearch returns the URL of a process-shared single-node ES container. +func Elasticsearch(t *testing.T) string { + t.Helper() + u, err := ensureElasticsearch() + if err != nil { + t.Fatalf("testutil.Elasticsearch: %v", err) + } + return u +} + +// EnsureElasticsearch starts the shared ES container if not already +// started. No-t variant intended for TestMain pre-warming via Prewarm. +func EnsureElasticsearch() error { _, err := ensureElasticsearch(); return err } + +// TerminateElasticsearch stops the shared ES container. Best-effort and +// idempotent — safe to call from TestMain even if no test touched ES. 
+func TerminateElasticsearch() { + if esContainer == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := esContainer.Terminate(ctx); err != nil { + fmt.Fprintf(os.Stderr, "terminate shared elasticsearch: %v\n", err) + } + esContainer = nil +} diff --git a/pkg/testutil/minio.go b/pkg/testutil/minio.go index c10aac0e0..1013c1889 100644 --- a/pkg/testutil/minio.go +++ b/pkg/testutil/minio.go @@ -6,6 +6,7 @@ import ( "context" "fmt" "hash/fnv" + "os" "strings" "sync" "testing" @@ -13,15 +14,17 @@ import ( "github.com/minio/minio-go/v7" "github.com/minio/minio-go/v7/pkg/credentials" + "github.com/testcontainers/testcontainers-go" tcminio "github.com/testcontainers/testcontainers-go/modules/minio" "github.com/hmchangw/chat/pkg/testutil/testimages" ) var ( - minioOnce sync.Once - minioClient *minio.Client - minioInitErr error + minioOnce sync.Once + minioClient *minio.Client + minioContainer testcontainers.Container + minioInitErr error ) func ensureMinIOClient() (*minio.Client, error) { @@ -54,10 +57,28 @@ func ensureMinIOClient() (*minio.Client, error) { return } minioClient = c + minioContainer = container }) return minioClient, minioInitErr } +// TerminateMinIO stops the shared MinIO container. Best-effort, idempotent. +func TerminateMinIO() { + if minioContainer == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := minioContainer.Terminate(ctx); err != nil { + fmt.Fprintf(os.Stderr, "terminate shared minio: %v\n", err) + } + minioContainer = nil +} + +// EnsureMinIO starts the shared MinIO container if not already started. +// No-t variant intended for TestMain pre-warming. +func EnsureMinIO() error { _, err := ensureMinIOClient(); return err } + // MinIO returns a shared client + per-test bucket (fnv-hashed from t.Name(); cleaned up via t.Cleanup). 
// Prefix must be S3-valid (3-46 lowercase chars/digits/hyphens, no leading/trailing hyphen); not validated. func MinIO(t *testing.T, prefix string) (*minio.Client, string) { diff --git a/pkg/testutil/mongo.go b/pkg/testutil/mongo.go index a45f509f5..c121f213e 100644 --- a/pkg/testutil/mongo.go +++ b/pkg/testutil/mongo.go @@ -70,6 +70,10 @@ func TerminateMongo() { } } +// EnsureMongo starts the shared Mongo container if not already started. +// No-t variant intended for TestMain pre-warming. +func EnsureMongo() error { _, err := ensureMongoClient(); return err } + // MongoDB returns an isolated Mongo database for the current test; dropped on t.Cleanup. func MongoDB(t *testing.T, prefix string) *mongo.Database { t.Helper() diff --git a/pkg/testutil/nats.go b/pkg/testutil/nats.go new file mode 100644 index 000000000..2d7997445 --- /dev/null +++ b/pkg/testutil/nats.go @@ -0,0 +1,80 @@ +//go:build integration + +package testutil + +import ( + "context" + "fmt" + "os" + "sync" + "testing" + "time" + + "github.com/testcontainers/testcontainers-go" + natsmod "github.com/testcontainers/testcontainers-go/modules/nats" + "github.com/testcontainers/testcontainers-go/wait" + + "github.com/hmchangw/chat/pkg/testutil/testimages" +) + +var ( + natsOnce sync.Once + natsContainer testcontainers.Container + natsURL string + natsInitErr error +) + +// JetStream is enabled unconditionally so consumers that publish/consume +// through streams (search-sync-worker, inbox-worker, etc.) Just Work +// against the shared container. Consumers that only use core NATS +// request/reply pay nothing extra — JS is dormant until used. 
+func ensureNATS() (string, error) { + natsOnce.Do(func() { + ctx := context.Background() + c, err := natsmod.Run(ctx, testimages.NATS, + testcontainers.WithCmdArgs("--jetstream"), + testcontainers.WithWaitStrategy(wait.ForLog("Server is ready").WithStartupTimeout(60*time.Second)), + ) + if err != nil { + natsInitErr = fmt.Errorf("start nats: %w", err) + return + } + url, err := c.ConnectionString(ctx) + if err != nil { + _ = c.Terminate(ctx) + natsInitErr = fmt.Errorf("get nats url: %w", err) + return + } + natsContainer = c + natsURL = url + }) + return natsURL, natsInitErr +} + +// NATS returns the URL of a process-shared NATS container with JetStream +// enabled. +func NATS(t *testing.T) string { + t.Helper() + u, err := ensureNATS() + if err != nil { + t.Fatalf("testutil.NATS: %v", err) + } + return u +} + +// EnsureNATS starts the shared NATS container if not already started. +// No-t variant intended for TestMain pre-warming. +func EnsureNATS() error { _, err := ensureNATS(); return err } + +// TerminateNATS stops the shared NATS container. Best-effort, idempotent. +func TerminateNATS() { + if natsContainer == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := natsContainer.Terminate(ctx); err != nil { + fmt.Fprintf(os.Stderr, "terminate shared nats: %v\n", err) + } + natsContainer = nil +} diff --git a/pkg/testutil/terminate.go b/pkg/testutil/terminate.go new file mode 100644 index 000000000..82d4a4718 --- /dev/null +++ b/pkg/testutil/terminate.go @@ -0,0 +1,28 @@ +//go:build integration + +package testutil + +// TerminateAll stops every process-shared container started by this +// package, in dependency-free order. Each individual Terminate is a +// no-op if its container was never started, so it's safe to call from +// any service's TestMain regardless of which helpers that service uses. 
+// +// Intended usage: +// +// func TestMain(m *testing.M) { +// code := m.Run() +// testutil.TerminateAll() +// os.Exit(code) +// } +// +// Required when running with TESTCONTAINERS_RYUK_DISABLED=true (e.g. +// our CI integration job) — Ryuk would otherwise reap these on process +// exit. Locally Ryuk catches SIGKILL / Ctrl+C, where m.Run never returns. +func TerminateAll() { + TerminateMongo() + TerminateCassandra() + TerminateMinIO() + TerminateElasticsearch() + TerminateNATS() + TerminateValkey() +} diff --git a/pkg/testutil/valkey.go b/pkg/testutil/valkey.go new file mode 100644 index 000000000..e662af062 --- /dev/null +++ b/pkg/testutil/valkey.go @@ -0,0 +1,88 @@ +//go:build integration + +package testutil + +import ( + "context" + "fmt" + "os" + "sync" + "testing" + "time" + + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/wait" + + "github.com/hmchangw/chat/pkg/testutil/testimages" +) + +var ( + valkeyOnce sync.Once + valkeyContainer testcontainers.Container + valkeyAddr string + valkeyInitErr error +) + +func ensureValkey() (string, error) { + valkeyOnce.Do(func() { + ctx := context.Background() + container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: testcontainers.ContainerRequest{ + Image: testimages.Valkey, + ExposedPorts: []string{"6379/tcp"}, + Cmd: []string{"valkey-server", "--save", "", "--appendonly", "no"}, + WaitingFor: wait.ForLog("Ready to accept connections").WithStartupTimeout(30 * time.Second), + }, + Started: true, + }) + if err != nil { + valkeyInitErr = fmt.Errorf("start valkey: %w", err) + return + } + host, err := container.Host(ctx) + if err != nil { + _ = container.Terminate(ctx) + valkeyInitErr = fmt.Errorf("get valkey host: %w", err) + return + } + port, err := container.MappedPort(ctx, "6379") + if err != nil { + _ = container.Terminate(ctx) + valkeyInitErr = fmt.Errorf("get valkey port: %w", err) + return + } + 
valkeyContainer = container + valkeyAddr = fmt.Sprintf("%s:%s", host, port.Port()) + }) + return valkeyAddr, valkeyInitErr +} + +// Valkey returns the addr (host:port) of a process-shared Valkey container. +// Persistence is disabled (--save '' --appendonly no) so the data plane +// is purely in-memory; callers wanting per-test isolation should namespace +// their keys or FLUSHDB on cleanup. +func Valkey(t *testing.T) string { + t.Helper() + addr, err := ensureValkey() + if err != nil { + t.Fatalf("testutil.Valkey: %v", err) + } + return addr +} + +// EnsureValkey starts the shared Valkey container if not already started. +// No-t variant intended for TestMain pre-warming. +func EnsureValkey() error { _, err := ensureValkey(); return err } + +// TerminateValkey stops the shared Valkey container. Best-effort, idempotent. +func TerminateValkey() { + if valkeyContainer == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := valkeyContainer.Terminate(ctx); err != nil { + fmt.Fprintf(os.Stderr, "terminate shared valkey: %v\n", err) + } + valkeyContainer = nil +} diff --git a/search-service/integration_apps_test.go b/search-service/integration_apps_test.go index 7c3cedaf4..5151c4d78 100644 --- a/search-service/integration_apps_test.go +++ b/search-service/integration_apps_test.go @@ -33,7 +33,7 @@ func setupAppsFixture(t *testing.T) *appsFixture { mongoDB := testutil.MongoDB(t, "search_service_test") - natsURL := sharedNATS(t) + natsURL := testutil.NATS(t) serverNATS, err := natsutil.Connect(natsURL, "") require.NoError(t, err) diff --git a/search-service/integration_ccs_test.go b/search-service/integration_ccs_test.go index 1a823fee6..36a406191 100644 --- a/search-service/integration_ccs_test.go +++ b/search-service/integration_ccs_test.go @@ -34,6 +34,7 @@ import ( "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" + 
"github.com/hmchangw/chat/pkg/testutil" "github.com/hmchangw/chat/pkg/testutil/testimages" ) @@ -107,7 +108,7 @@ func setupCCSFixture(t *testing.T) *ccsFixture { valkeyClient := freshValkeyClient(t) - natsURL := sharedNATS(t) + natsURL := testutil.NATS(t) serverNC, err := natsutil.Connect(natsURL, "") require.NoError(t, err, "connect nats (server side)") t.Cleanup(func() { _ = serverNC.Drain() }) diff --git a/search-service/integration_messages_test.go b/search-service/integration_messages_test.go index 7e0614328..9ba2cb08b 100644 --- a/search-service/integration_messages_test.go +++ b/search-service/integration_messages_test.go @@ -22,6 +22,7 @@ import ( "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" + "github.com/hmchangw/chat/pkg/testutil" ) type messagesV2Fixture struct { @@ -50,7 +51,7 @@ func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { fakeValkey := newFakeCache() fakeValkey.store["alice"] = map[string]int64{} // empty restricted map, cache hit - natsURL := sharedNATS(t) + natsURL := testutil.NATS(t) serverNATS, err := natsutil.Connect(natsURL, "") require.NoError(t, err) diff --git a/search-service/integration_rooms_test.go b/search-service/integration_rooms_test.go index fbc00ed39..1ad673397 100644 --- a/search-service/integration_rooms_test.go +++ b/search-service/integration_rooms_test.go @@ -22,6 +22,7 @@ import ( "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" + "github.com/hmchangw/chat/pkg/testutil" ) // roomsFixture uses a per-test spotlight index against the shared ES so @@ -36,11 +37,11 @@ func setupRoomsFixture(t *testing.T) *roomsFixture { t.Helper() ctx := context.Background() - esURL := sharedSingleNodeES(t) + esURL := testutil.Elasticsearch(t) spotlightIndex := uniqueESIndex(t, "spotlight") putTestSpotlightIndex(t, esURL, spotlightIndex) - natsURL := sharedNATS(t) + natsURL := 
testutil.NATS(t) serverNC, err := natsutil.Connect(natsURL, "") require.NoError(t, err, "connect nats (server side)") t.Cleanup(func() { _ = serverNC.Drain() }) diff --git a/search-service/integration_users_test.go b/search-service/integration_users_test.go index f87c7693c..28b34cab0 100644 --- a/search-service/integration_users_test.go +++ b/search-service/integration_users_test.go @@ -21,6 +21,7 @@ import ( "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/restyutil" "github.com/hmchangw/chat/pkg/subject" + "github.com/hmchangw/chat/pkg/testutil" ) type usersFixture struct { @@ -34,7 +35,7 @@ func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixtu stub := httptest.NewServer(thirdPartyHandler) t.Cleanup(stub.Close) - natsURL := sharedNATS(t) + natsURL := testutil.NATS(t) serverNC, err := natsutil.Connect(natsURL, "") require.NoError(t, err, "connect nats (server side)") t.Cleanup(func() { _ = serverNC.Drain() }) diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index c4ea40461..b6ddc0ddf 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -2,12 +2,9 @@ package main -// Process-shared ES, Valkey, and NATS containers used by every -// integration_*_test.go file. Each starts once via sync.Once and is -// reaped by Ryuk at process exit. Tests run sequentially and isolate -// per-test via uniqueESIndex (DELETE on cleanup), Valkey FLUSHDB on -// cleanup, and a fresh *nats.Conn pair per test. CCS tests bring their -// own ES pair. +// Per-package shared test infrastructure. Containers (ES, NATS, Valkey, +// Mongo) come from pkg/testutil and are reaped by testutil.TerminateAll +// in TestMain. CCS tests bring their own ES pair (see integration_ccs_test.go). 
import ( "bytes" @@ -24,12 +21,8 @@ import ( goredis "github.com/redis/go-redis/v9" "github.com/stretchr/testify/require" - "github.com/testcontainers/testcontainers-go" - natsmod "github.com/testcontainers/testcontainers-go/modules/nats" - "github.com/testcontainers/testcontainers-go/wait" "github.com/hmchangw/chat/pkg/testutil" - "github.com/hmchangw/chat/pkg/testutil/testimages" "github.com/hmchangw/chat/pkg/valkeyutil" ) @@ -56,198 +49,33 @@ func seedDoc(t *testing.T, esURL, index, id string, doc any) { "seedDoc %s/%s: status=%d body=%s", index, id, resp.StatusCode, body) } -var ( - sharedESOnce sync.Once - sharedESContainer testcontainers.Container - sharedESURL string - sharedESErr error - - sharedValkeyOnce sync.Once - sharedValkeyContainer testcontainers.Container - sharedValkeyAddr string - sharedValkeyErr error - - sharedNATSOnce sync.Once - sharedNATSContainer testcontainers.Container - sharedNATSURL string - sharedNATSErr error -) - -// TestMain pre-warms the shared containers concurrently so the first -// test doesn't pay their startup serially, then explicitly terminates -// them on clean exit. Explicit cleanup is required because CI runs with -// TESTCONTAINERS_RYUK_DISABLED=true — Ryuk would otherwise reap the -// shared containers (they have no t.Cleanup). Locally Ryuk is enabled -// as a safety net for SIGKILL / Ctrl+C, where m.Run never returns. +// TestMain pre-warms the shared containers concurrently so the first test +// doesn't pay their startup serially, then explicitly terminates them on +// clean exit (required for CI runs with TESTCONTAINERS_RYUK_DISABLED=true; +// harmless when Ryuk is enabled — explicit cleanup just runs first). 
func TestMain(m *testing.M) { var wg sync.WaitGroup - wg.Add(3) - go func() { defer wg.Done(); ensureSharedES() }() - go func() { defer wg.Done(); ensureSharedValkey() }() - go func() { defer wg.Done(); ensureSharedNATS() }() + for _, fn := range []func() error{ + testutil.EnsureElasticsearch, + testutil.EnsureNATS, + testutil.EnsureValkey, + } { + wg.Add(1) + go func(f func() error) { defer wg.Done(); _ = f() }(fn) + } wg.Wait() code := m.Run() - terminateShared() + testutil.TerminateAll() os.Exit(code) } -// terminateShared best-effort kills every shared container. Errors are -// logged but don't change the test exit code — the tests already passed -// or failed by this point. -func terminateShared() { - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - for name, c := range map[string]testcontainers.Container{ - "elasticsearch": sharedESContainer, - "valkey": sharedValkeyContainer, - "nats": sharedNATSContainer, - } { - if c == nil { - continue - } - if err := c.Terminate(ctx); err != nil { - fmt.Fprintf(os.Stderr, "terminate shared %s: %v\n", name, err) - } - } - testutil.TerminateMongo() -} - -// sharedSingleNodeES returns the URL of the process-shared single-node ES. 
-func sharedSingleNodeES(t *testing.T) string { - t.Helper() - ensureSharedES() - if sharedESErr != nil { - t.Fatalf("shared elasticsearch: %v", sharedESErr) - } - return sharedESURL -} - -func ensureSharedES() { - sharedESOnce.Do(func() { - ctx := context.Background() - container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Elasticsearch, - ExposedPorts: []string{"9200/tcp"}, - Env: map[string]string{ - "discovery.type": "single-node", - "xpack.security.enabled": "false", - "ES_JAVA_OPTS": "-Xms256m -Xmx256m", - "cluster.routing.allocation.disk.threshold_enabled": "false", - }, - WaitingFor: wait.ForAll( - wait.ForHTTP("/").WithPort("9200/tcp").WithStartupTimeout(120*time.Second), - wait.ForHTTP("/_cluster/health?wait_for_status=yellow&timeout=60s"). - WithPort("9200/tcp"). - WithStartupTimeout(120*time.Second), - ), - }, - Started: true, - }) - if err != nil { - sharedESErr = fmt.Errorf("start shared elasticsearch: %w", err) - return - } - host, err := container.Host(ctx) - if err != nil { - _ = container.Terminate(ctx) - sharedESErr = fmt.Errorf("get shared es host: %w", err) - return - } - port, err := container.MappedPort(ctx, "9200") - if err != nil { - _ = container.Terminate(ctx) - sharedESErr = fmt.Errorf("get shared es port: %w", err) - return - } - sharedESContainer = container - sharedESURL = fmt.Sprintf("http://%s:%s", host, port.Port()) - }) -} - -// sharedValkey returns the addr of a process-shared Valkey container. -// Callers should obtain a fresh client via freshValkeyClient so the -// keyspace is wiped on test cleanup. 
-func sharedValkey(t *testing.T) string { - t.Helper() - ensureSharedValkey() - if sharedValkeyErr != nil { - t.Fatalf("shared valkey: %v", sharedValkeyErr) - } - return sharedValkeyAddr -} - -func ensureSharedValkey() { - sharedValkeyOnce.Do(func() { - ctx := context.Background() - container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Valkey, - ExposedPorts: []string{"6379/tcp"}, - Cmd: []string{"valkey-server", "--save", "", "--appendonly", "no"}, - WaitingFor: wait.ForLog("Ready to accept connections").WithStartupTimeout(30 * time.Second), - }, - Started: true, - }) - if err != nil { - sharedValkeyErr = fmt.Errorf("start shared valkey: %w", err) - return - } - host, err := container.Host(ctx) - if err != nil { - _ = container.Terminate(ctx) - sharedValkeyErr = fmt.Errorf("get shared valkey host: %w", err) - return - } - port, err := container.MappedPort(ctx, "6379") - if err != nil { - _ = container.Terminate(ctx) - sharedValkeyErr = fmt.Errorf("get shared valkey port: %w", err) - return - } - sharedValkeyContainer = container - sharedValkeyAddr = fmt.Sprintf("%s:%s", host, port.Port()) - }) -} - -// sharedNATS returns the URL of a process-shared NATS container. 
-func sharedNATS(t *testing.T) string { - t.Helper() - ensureSharedNATS() - if sharedNATSErr != nil { - t.Fatalf("shared nats: %v", sharedNATSErr) - } - return sharedNATSURL -} - -func ensureSharedNATS() { - sharedNATSOnce.Do(func() { - ctx := context.Background() - c, err := natsmod.Run(ctx, testimages.NATS, - testcontainers.WithWaitStrategy(wait.ForLog("Server is ready").WithStartupTimeout(60*time.Second)), - ) - if err != nil { - sharedNATSErr = fmt.Errorf("start shared nats: %w", err) - return - } - url, err := c.ConnectionString(ctx) - if err != nil { - _ = c.Terminate(ctx) - sharedNATSErr = fmt.Errorf("get shared nats url: %w", err) - return - } - sharedNATSContainer = c - sharedNATSURL = url - }) -} - -// uniqueESIndex returns a per-test ES index name derived from t.Name() -// and registers a cleanup that DELETEs the index from the shared ES -// when the test ends. The hash keeps the name short, deterministic per -// test, and free of characters that ES dislikes (slashes from subtests). +// uniqueESIndex returns a per-test ES index name derived from t.Name() and +// registers a cleanup that DELETEs the index. The fnv hash keeps the name +// short, deterministic per test, and free of characters that ES dislikes +// (slashes from subtests). func uniqueESIndex(t *testing.T, prefix string) string { t.Helper() - esURL := sharedSingleNodeES(t) + esURL := testutil.Elasticsearch(t) h := fnv.New64a() _, _ = h.Write([]byte(t.Name())) name := fmt.Sprintf("%s-%x", prefix, h.Sum64()) @@ -269,11 +97,10 @@ func uniqueESIndex(t *testing.T, prefix string) string { // freshValkeyClient returns a valkeyutil.Client connected to the shared // Valkey, with cleanup that flushes the keyspace at test end so the next -// test starts clean. Tests in this package run sequentially, so a flush -// is sufficient isolation. +// test starts clean. Tests in this package run sequentially. 
func freshValkeyClient(t *testing.T) valkeyutil.Client { t.Helper() - addr := sharedValkey(t) + addr := testutil.Valkey(t) client, err := valkeyutil.Connect(context.Background(), addr, "") require.NoError(t, err, "connect shared valkey") t.Cleanup(func() { @@ -283,11 +110,9 @@ func freshValkeyClient(t *testing.T) valkeyutil.Client { return client } -// flushValkey wipes the keyspace at addr. Uses a raw go-redis client so -// we don't have to expose FLUSHDB on the production valkeyutil.Client -// interface. A FLUSHDB failure here is fatal to the test: state would -// leak into the next sibling test and produce a confusing assertion -// failure far from the real root cause. +// flushValkey wipes the keyspace at addr. Uses a raw go-redis client so we +// don't have to expose FLUSHDB on the production valkeyutil.Client. A +// FLUSHDB failure is fatal: state would leak into the next sibling test. func flushValkey(t *testing.T, addr string) { t.Helper() rc := goredis.NewClient(&goredis.Options{Addr: addr}) From 08ba0b414e3fd4764fd693672dd9853154b19609 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 08:52:39 +0000 Subject: [PATCH 10/23] test(testutil): disable Ryuk repo-wide via init(); drop CI env var MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ryuk fails to start on our CI runner (can't pull / run the sidecar image), so set TESTCONTAINERS_RYUK_DISABLED=true once in pkg/testutil's init(). Every integration test that imports testutil inherits this automatically — no per-job env config needed. Removes the TESTCONTAINERS_RYUK_DISABLED env var from the CI workflow's integration step; the init() makes it redundant. Developers can still override locally with TESTCONTAINERS_RYUK_DISABLED=false to re-enable Ryuk for debugging container leaks. Cleanup contract: testutil.TerminateAll (called from each service's TestMain) is the only mechanism that reaps the shared containers. 
SIGKILL / Ctrl+C leaks containers — acceptable trade-off for the simplification, and on CI it doesn't matter since the runner VM is torn down anyway. --- .github/workflows/ci.yml | 9 ++------- pkg/testutil/init.go | 20 ++++++++++++++++++++ search-service/setup_shared_test.go | 4 ++-- 3 files changed, 24 insertions(+), 9 deletions(-) create mode 100644 pkg/testutil/init.go diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3b4c3506..74a47bf4d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -323,14 +323,9 @@ jobs: # drives the step outcome. Previously a pipefail gap let a # failing integration test silently report green. # - # TESTCONTAINERS_RYUK_DISABLED=true: Ryuk doesn't work reliably on - # this CI runner. Shared containers in test packages are cleaned - # up explicitly via TestMain (see search-service/setup_shared_test.go); - # per-test containers use t.Cleanup. Locally Ryuk stays enabled as - # a safety net for SIGKILL / Ctrl+C. + # Ryuk is disabled at the pkg/testutil init() level — see + # pkg/testutil/init.go and TerminateAll for the cleanup contract. - name: Integration tests (${{ matrix.target.path }}) - env: - TESTCONTAINERS_RYUK_DISABLED: "true" run: | set -o pipefail make test-integration SERVICE=${{ matrix.target.path }} 2>&1 \ diff --git a/pkg/testutil/init.go b/pkg/testutil/init.go new file mode 100644 index 000000000..8e01dfb1c --- /dev/null +++ b/pkg/testutil/init.go @@ -0,0 +1,20 @@ +//go:build integration + +package testutil + +import "os" + +// init disables testcontainers-go's Ryuk reaper across every integration +// test in this repo. Ryuk fails to start on our CI runner (can't pull / +// run the sidecar image), and Ryuk-on-by-default would block every job. +// Cleanup is handled explicitly via TerminateAll, which each service's +// TestMain calls. 
+// +// LookupEnv guard so a developer debugging container leaks can flip Ryuk +// back on with `TESTCONTAINERS_RYUK_DISABLED=false go test ...` without +// editing code. +func init() { + if _, set := os.LookupEnv("TESTCONTAINERS_RYUK_DISABLED"); !set { + _ = os.Setenv("TESTCONTAINERS_RYUK_DISABLED", "true") + } +} diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index b6ddc0ddf..a7c350787 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -51,8 +51,8 @@ func seedDoc(t *testing.T, esURL, index, id string, doc any) { // TestMain pre-warms the shared containers concurrently so the first test // doesn't pay their startup serially, then explicitly terminates them on -// clean exit (required for CI runs with TESTCONTAINERS_RYUK_DISABLED=true; -// harmless when Ryuk is enabled — explicit cleanup just runs first). +// clean exit. Ryuk is disabled repo-wide via pkg/testutil's init(), so +// TerminateAll is the primary cleanup mechanism. func TestMain(m *testing.M) { var wg sync.WaitGroup for _, fn := range []func() error{ From d356c6a73fa3d7c1b436e9285c7275c144804605 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 09:09:54 +0000 Subject: [PATCH 11/23] test(repo): migrate every integration package to testutil shared containers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per consumer: - search-sync-worker, room-service, room-worker, inbox-worker, tools/loadgen, pkg/natsrouter, pkg/roomkeystore, pkg/roomsubcache: drop inline testcontainers.GenericContainer / natsmod.Run starts and use the testutil helpers (Elasticsearch, NATS, Valkey) instead. Sibling-test isolation: Valkey gets FLUSHDB on cleanup via the new testutil.FlushValkey; ES gets per-test unique indices where the suite needs them; NATS gets unique stream / queue-group names where it already did. 
- pkg/roomkeysender, pkg/roomcrypto: keep their per-test containers (NATS-with-WebSocket and Node have special configs); just add TestMain so they import testutil. - broadcast-worker, message-worker, notification-worker, history-service/internal/{mongorepo,cassrepo,service}, pkg/userstore, pkg/mongoutil, pkg/minioutil: add main_test.go with TestMain calling testutil.TerminateAll. These only consume Mongo/Cassandra/MinIO helpers from testutil — no inline container starts to remove. search-service TestMain now fails fast on pre-warm errors (CodeRabbit). TerminateMongo / TerminateCassandra nil their refs after cleanup so they are idempotent (CodeRabbit). Ryuk works on our CI runner after all, so init() and the CI env var are removed — Ryuk stays enabled as the safety net for SIGKILL / Ctrl+C; TerminateAll runs first on clean exits and the explicit cleanup is what the tests actually rely on. --- .github/workflows/ci.yml | 3 - broadcast-worker/main_test.go | 16 ++ .../internal/cassrepo/main_test.go | 16 ++ .../internal/mongorepo/main_test.go | 16 ++ history-service/internal/service/main_test.go | 16 ++ inbox-worker/integration_test.go | 15 +- inbox-worker/main_test.go | 16 ++ message-worker/main_test.go | 16 ++ notification-worker/main_test.go | 16 ++ pkg/minioutil/main_test.go | 16 ++ pkg/mongoutil/main_test.go | 16 ++ pkg/natsrouter/integration_test.go | 24 +-- pkg/natsrouter/main_test.go | 16 ++ pkg/roomcrypto/main_test.go | 20 +++ pkg/roomkeysender/main_test.go | 21 +++ pkg/roomkeystore/integration_test.go | 32 +--- pkg/roomsubcache/integration_test.go | 34 +--- pkg/testutil/cassandra.go | 15 +- pkg/testutil/init.go | 20 --- pkg/testutil/mongo.go | 5 +- pkg/testutil/valkey.go | 18 +++ pkg/userstore/main_test.go | 16 ++ room-service/integration_test.go | 30 +--- room-service/main_test.go | 16 ++ room-worker/integration_test.go | 33 ++-- room-worker/main_test.go | 16 ++ search-service/setup_shared_test.go | 22 ++- search-sync-worker/integration_test.go | 150 
+++++------------- tools/loadgen/integration_test.go | 59 ++----- 29 files changed, 379 insertions(+), 330 deletions(-) create mode 100644 broadcast-worker/main_test.go create mode 100644 history-service/internal/cassrepo/main_test.go create mode 100644 history-service/internal/mongorepo/main_test.go create mode 100644 history-service/internal/service/main_test.go create mode 100644 inbox-worker/main_test.go create mode 100644 message-worker/main_test.go create mode 100644 notification-worker/main_test.go create mode 100644 pkg/minioutil/main_test.go create mode 100644 pkg/mongoutil/main_test.go create mode 100644 pkg/natsrouter/main_test.go create mode 100644 pkg/roomcrypto/main_test.go create mode 100644 pkg/roomkeysender/main_test.go delete mode 100644 pkg/testutil/init.go create mode 100644 pkg/userstore/main_test.go create mode 100644 room-service/main_test.go create mode 100644 room-worker/main_test.go diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74a47bf4d..3c9507b5d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -322,9 +322,6 @@ jobs: # Explicit `set -o pipefail` so make's exit status (not tee's) # drives the step outcome. Previously a pipefail gap let a # failing integration test silently report green. - # - # Ryuk is disabled at the pkg/testutil init() level — see - # pkg/testutil/init.go and TerminateAll for the cleanup contract. 
- name: Integration tests (${{ matrix.target.path }}) run: | set -o pipefail diff --git a/broadcast-worker/main_test.go b/broadcast-worker/main_test.go new file mode 100644 index 000000000..32bfb4078 --- /dev/null +++ b/broadcast-worker/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package main + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/history-service/internal/cassrepo/main_test.go b/history-service/internal/cassrepo/main_test.go new file mode 100644 index 000000000..4ca88f40c --- /dev/null +++ b/history-service/internal/cassrepo/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package cassrepo + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/history-service/internal/mongorepo/main_test.go b/history-service/internal/mongorepo/main_test.go new file mode 100644 index 000000000..7134a1b31 --- /dev/null +++ b/history-service/internal/mongorepo/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package mongorepo + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/history-service/internal/service/main_test.go b/history-service/internal/service/main_test.go new file mode 100644 index 000000000..af5c2a8d6 --- /dev/null +++ b/history-service/internal/service/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package service_test + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/inbox-worker/integration_test.go b/inbox-worker/integration_test.go index 8eb8599e6..876d48264 100644 --- 
a/inbox-worker/integration_test.go +++ b/inbox-worker/integration_test.go @@ -13,7 +13,6 @@ import ( "github.com/nats-io/nats.go/jetstream" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - natsmod "github.com/testcontainers/testcontainers-go/modules/nats" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" @@ -21,7 +20,6 @@ import ( "github.com/hmchangw/chat/pkg/stream" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" - "github.com/hmchangw/chat/pkg/testutil/testimages" ) func setupMongo(t *testing.T) *mongo.Database { @@ -577,20 +575,13 @@ func TestHandleMemberAdded_DM_PersistsRemoteCounterpartSub(t *testing.T) { assert.False(t, bobSub.IsSubscribed, "DM does not set IsSubscribed=true") } -// setupNATS starts a NATS container with JetStream enabled and returns a -// JetStream client tied to the test's lifetime. +// setupNATS connects to the process-shared NATS (JetStream enabled in +// testutil) and returns a JetStream client tied to the test's lifetime. 
func setupNATS(t *testing.T) (context.Context, jetstream.JetStream) { t.Helper() ctx := context.Background() - c, err := natsmod.Run(ctx, testimages.NATS) - require.NoError(t, err) - t.Cleanup(func() { _ = c.Terminate(ctx) }) - - url, err := c.ConnectionString(ctx) - require.NoError(t, err) - - nc, err := nats.Connect(url) + nc, err := nats.Connect(testutil.NATS(t)) require.NoError(t, err) t.Cleanup(func() { nc.Close() }) diff --git a/inbox-worker/main_test.go b/inbox-worker/main_test.go new file mode 100644 index 000000000..32bfb4078 --- /dev/null +++ b/inbox-worker/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package main + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/message-worker/main_test.go b/message-worker/main_test.go new file mode 100644 index 000000000..32bfb4078 --- /dev/null +++ b/message-worker/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package main + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/notification-worker/main_test.go b/notification-worker/main_test.go new file mode 100644 index 000000000..32bfb4078 --- /dev/null +++ b/notification-worker/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package main + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/pkg/minioutil/main_test.go b/pkg/minioutil/main_test.go new file mode 100644 index 000000000..cecdbadb8 --- /dev/null +++ b/pkg/minioutil/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package minioutil + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + 
testutil.TerminateAll() + os.Exit(code) +} diff --git a/pkg/mongoutil/main_test.go b/pkg/mongoutil/main_test.go new file mode 100644 index 000000000..c9353ea61 --- /dev/null +++ b/pkg/mongoutil/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package mongoutil + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/pkg/natsrouter/integration_test.go b/pkg/natsrouter/integration_test.go index ad6e91220..c66a8e721 100644 --- a/pkg/natsrouter/integration_test.go +++ b/pkg/natsrouter/integration_test.go @@ -13,37 +13,21 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - tcnats "github.com/testcontainers/testcontainers-go/modules/nats" "github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" "github.com/hmchangw/chat/pkg/natsrouter" - "github.com/hmchangw/chat/pkg/testutil/testimages" + "github.com/hmchangw/chat/pkg/testutil" ) -// setupNATS starts a real NATS container and returns a connected otelnats -// client. Required to surface timing races that in-process NATS cannot +// setupNATS returns an otelnats client connected to the process-shared +// NATS. Required to surface timing races that in-process NATS cannot // reproduce (real TCP, real server dispatch goroutines, real latency). func setupNATS(t *testing.T) *otelnats.Conn { t.Helper() - ctx := context.Background() - - container, err := tcnats.Run(ctx, testimages.NATS) - require.NoError(t, err, "start NATS container") - t.Cleanup(func() { - // Best-effort container teardown; failures here don't affect outcome. 
- if err := container.Terminate(ctx); err != nil { - t.Logf("terminate nats container: %v", err) - } - }) - - url, err := container.ConnectionString(ctx) - require.NoError(t, err, "nats connection string") - - nc, err := otelnats.Connect(url) + nc, err := otelnats.Connect(testutil.NATS(t)) require.NoError(t, err, "connect to NATS") t.Cleanup(nc.Close) - return nc } diff --git a/pkg/natsrouter/main_test.go b/pkg/natsrouter/main_test.go new file mode 100644 index 000000000..0c181867a --- /dev/null +++ b/pkg/natsrouter/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package natsrouter_test + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/pkg/roomcrypto/main_test.go b/pkg/roomcrypto/main_test.go new file mode 100644 index 000000000..f9b04c7d3 --- /dev/null +++ b/pkg/roomcrypto/main_test.go @@ -0,0 +1,20 @@ +//go:build integration + +package roomcrypto + +// This package starts its Node container per-test (t.Cleanup handles +// teardown). TestMain still calls TerminateAll for consistency with other +// packages; it is a no-op when no shared testutil containers were started. + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/pkg/roomkeysender/main_test.go b/pkg/roomkeysender/main_test.go new file mode 100644 index 000000000..384b5b2d7 --- /dev/null +++ b/pkg/roomkeysender/main_test.go @@ -0,0 +1,21 @@ +//go:build integration + +package roomkeysender_test + +// This package starts its containers per-test (their t.Cleanups already +// handle teardown). TestMain still calls TerminateAll for consistency +// with the other integration packages; it is a no-op when no shared +// testutil containers were started.
+ +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/pkg/roomkeystore/integration_test.go b/pkg/roomkeystore/integration_test.go index 8ce539a3b..57c8d67aa 100644 --- a/pkg/roomkeystore/integration_test.go +++ b/pkg/roomkeystore/integration_test.go @@ -6,47 +6,25 @@ import ( "bytes" "context" "errors" - "fmt" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/testcontainers/testcontainers-go" - "github.com/testcontainers/testcontainers-go/wait" - "github.com/hmchangw/chat/pkg/testutil/testimages" + "github.com/hmchangw/chat/pkg/testutil" ) -// setupValkey starts a valkey/valkey:8 container and returns a connected valkeyStore. -// The container is terminated via t.Cleanup. +// setupValkey returns a RoomKeyStore connected to the process-shared +// Valkey, with FLUSHDB on cleanup so sibling tests start clean. 
func setupValkey(t *testing.T, gracePeriod time.Duration) RoomKeyStore { t.Helper() - ctx := context.Background() - - container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Valkey, - ExposedPorts: []string{"6379/tcp"}, - WaitingFor: wait.ForLog("Ready to accept connections"), - }, - Started: true, - }) - require.NoError(t, err, "start valkey container") - t.Cleanup(func() { - _ = container.Terminate(ctx) // best-effort; ignore cleanup errors - }) - - host, err := container.Host(ctx) - require.NoError(t, err) - port, err := container.MappedPort(ctx, "6379") - require.NoError(t, err) - store, err := NewValkeyStore(Config{ - Addr: fmt.Sprintf("%s:%s", host, port.Port()), + Addr: testutil.Valkey(t), GracePeriod: gracePeriod, }) require.NoError(t, err, "create valkeyStore") + t.Cleanup(func() { testutil.FlushValkey(t) }) return store } diff --git a/pkg/roomsubcache/integration_test.go b/pkg/roomsubcache/integration_test.go index f3373fce7..9bac30956 100644 --- a/pkg/roomsubcache/integration_test.go +++ b/pkg/roomsubcache/integration_test.go @@ -4,47 +4,27 @@ package roomsubcache_test import ( "context" - "fmt" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/testcontainers/testcontainers-go" - "github.com/testcontainers/testcontainers-go/wait" "github.com/hmchangw/chat/pkg/roomsubcache" - "github.com/hmchangw/chat/pkg/testutil/testimages" + "github.com/hmchangw/chat/pkg/testutil" "github.com/hmchangw/chat/pkg/valkeyutil" ) -// setupValkey starts a valkey/valkey:8 container and returns a connected -// valkeyutil.Client. The container is terminated via t.Cleanup. +// setupValkey returns a client connected to the process-shared Valkey, +// with FLUSHDB on cleanup so sibling tests start clean. 
func setupValkey(t *testing.T) valkeyutil.Client { t.Helper() - ctx := context.Background() - - container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Valkey, - ExposedPorts: []string{"6379/tcp"}, - WaitingFor: wait.ForLog("Ready to accept connections"), - }, - Started: true, - }) - require.NoError(t, err, "start valkey container") + client, err := valkeyutil.Connect(context.Background(), testutil.Valkey(t), "") + require.NoError(t, err, "connect valkey") t.Cleanup(func() { - _ = container.Terminate(ctx) // best-effort; ignore cleanup errors + testutil.FlushValkey(t) + _ = client.Close() }) - - host, err := container.Host(ctx) - require.NoError(t, err) - port, err := container.MappedPort(ctx, "6379") - require.NoError(t, err) - - client, err := valkeyutil.Connect(ctx, fmt.Sprintf("%s:%s", host, port.Port()), "") - require.NoError(t, err, "connect valkey") - t.Cleanup(func() { _ = client.Close() }) // best-effort; ignore cleanup errors return client } diff --git a/pkg/testutil/cassandra.go b/pkg/testutil/cassandra.go index 01c2c9d2c..ece7f82de 100644 --- a/pkg/testutil/cassandra.go +++ b/pkg/testutil/cassandra.go @@ -84,15 +84,14 @@ func TerminateCassandra() { cassSession.Close() cassSession = nil } - if cassContainer == nil { - return - } - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - if err := cassContainer.Terminate(ctx); err != nil { - fmt.Fprintf(os.Stderr, "terminate shared cassandra: %v\n", err) + if cassContainer != nil { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := cassContainer.Terminate(ctx); err != nil { + fmt.Fprintf(os.Stderr, "terminate shared cassandra: %v\n", err) + } + cassContainer = nil } - cassContainer = nil } // EnsureCassandra starts the shared Cassandra container if not already diff --git a/pkg/testutil/init.go 
b/pkg/testutil/init.go deleted file mode 100644 index 8e01dfb1c..000000000 --- a/pkg/testutil/init.go +++ /dev/null @@ -1,20 +0,0 @@ -//go:build integration - -package testutil - -import "os" - -// init disables testcontainers-go's Ryuk reaper across every integration -// test in this repo. Ryuk fails to start on our CI runner (can't pull / -// run the sidecar image), and Ryuk-on-by-default would block every job. -// Cleanup is handled explicitly via TerminateAll, which each service's -// TestMain calls. -// -// LookupEnv guard so a developer debugging container leaks can flip Ryuk -// back on with `TESTCONTAINERS_RYUK_DISABLED=false go test ...` without -// editing code. -func init() { - if _, set := os.LookupEnv("TESTCONTAINERS_RYUK_DISABLED"); !set { - _ = os.Setenv("TESTCONTAINERS_RYUK_DISABLED", "true") - } -} diff --git a/pkg/testutil/mongo.go b/pkg/testutil/mongo.go index c121f213e..af8afc16d 100644 --- a/pkg/testutil/mongo.go +++ b/pkg/testutil/mongo.go @@ -53,8 +53,7 @@ func ensureMongoClient() (*mongo.Client, error) { } // TerminateMongo disconnects the shared client and stops the shared -// container. Best-effort; errors go to stderr. Intended for TestMain to -// call on clean exits when Ryuk is disabled (e.g., in CI). +// container. Best-effort and idempotent — safe to call from any TestMain. 
func TerminateMongo() { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() @@ -62,11 +61,13 @@ func TerminateMongo() { if err := mongoClient.Disconnect(ctx); err != nil { fmt.Fprintf(os.Stderr, "disconnect shared mongo client: %v\n", err) } + mongoClient = nil } if mongoContainer != nil { if err := mongoContainer.Terminate(ctx); err != nil { fmt.Fprintf(os.Stderr, "terminate shared mongo: %v\n", err) } + mongoContainer = nil } } diff --git a/pkg/testutil/valkey.go b/pkg/testutil/valkey.go index e662af062..f9d3e5b08 100644 --- a/pkg/testutil/valkey.go +++ b/pkg/testutil/valkey.go @@ -10,6 +10,7 @@ import ( "testing" "time" + goredis "github.com/redis/go-redis/v9" "github.com/testcontainers/testcontainers-go" "github.com/testcontainers/testcontainers-go/wait" @@ -74,6 +75,23 @@ func Valkey(t *testing.T) string { // No-t variant intended for TestMain pre-warming. func EnsureValkey() error { _, err := ensureValkey(); return err } +// FlushValkey wipes the shared Valkey keyspace. Intended for per-test +// cleanup so sibling tests don't see each other's keys. Uses a raw +// go-redis client so we don't need to expose FLUSHDB on the production +// valkeyutil interface. Failure is a test failure — leftover state would +// silently break the next sibling test. +func FlushValkey(t *testing.T) { + t.Helper() + addr := Valkey(t) + rc := goredis.NewClient(&goredis.Options{Addr: addr}) + defer func() { _ = rc.Close() }() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := rc.FlushDB(ctx).Err(); err != nil { + t.Errorf("flush shared valkey: %v", err) + } +} + // TerminateValkey stops the shared Valkey container. Best-effort, idempotent. 
func TerminateValkey() { if valkeyContainer == nil { diff --git a/pkg/userstore/main_test.go b/pkg/userstore/main_test.go new file mode 100644 index 000000000..593bbc836 --- /dev/null +++ b/pkg/userstore/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package userstore + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/room-service/integration_test.go b/room-service/integration_test.go index cc2c9cfbc..b30745c41 100644 --- a/room-service/integration_test.go +++ b/room-service/integration_test.go @@ -17,9 +17,6 @@ import ( "github.com/nats-io/nats.go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/testcontainers/testcontainers-go" - natsmod "github.com/testcontainers/testcontainers-go/modules/nats" - "github.com/testcontainers/testcontainers-go/wait" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" @@ -29,7 +26,6 @@ import ( "github.com/hmchangw/chat/pkg/roomkeystore" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" - "github.com/hmchangw/chat/pkg/testutil/testimages" ) func setupMongo(t *testing.T) *mongo.Database { @@ -38,23 +34,9 @@ func setupMongo(t *testing.T) *mongo.Database { func setupValkey(t *testing.T) *roomkeystore.Config { t.Helper() - ctx := context.Background() - container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Valkey, - ExposedPorts: []string{"6379/tcp"}, - WaitingFor: wait.ForLog("Ready to accept connections"), - }, - Started: true, - }) - require.NoError(t, err) - t.Cleanup(func() { _ = container.Terminate(ctx) }) - host, err := container.Host(ctx) - require.NoError(t, err) - port, err := container.MappedPort(ctx, "6379") - require.NoError(t, err) + t.Cleanup(func() { testutil.FlushValkey(t) 
}) return &roomkeystore.Config{ - Addr: fmt.Sprintf("%s:%s", host, port.Port()), + Addr: testutil.Valkey(t), GracePeriod: time.Hour, } } @@ -110,13 +92,7 @@ func TestCassMessageReader_GetMessageRoomAndCreatedAt_Integration(t *testing.T) func setupNATS(t *testing.T) string { t.Helper() - ctx := context.Background() - container, err := natsmod.Run(ctx, testimages.NATS) - require.NoError(t, err) - t.Cleanup(func() { _ = container.Terminate(ctx) }) - url, err := container.ConnectionString(ctx) - require.NoError(t, err) - return url + return testutil.NATS(t) } func TestMongoStore_Integration(t *testing.T) { diff --git a/room-service/main_test.go b/room-service/main_test.go new file mode 100644 index 000000000..32bfb4078 --- /dev/null +++ b/room-service/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package main + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} diff --git a/room-worker/integration_test.go b/room-worker/integration_test.go index 87ce1a1bc..3e9e3dfe4 100644 --- a/room-worker/integration_test.go +++ b/room-worker/integration_test.go @@ -5,7 +5,6 @@ package main import ( "context" "encoding/json" - "fmt" "slices" "strings" "sync" @@ -16,8 +15,6 @@ import ( "github.com/nats-io/nats.go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/testcontainers/testcontainers-go" - "github.com/testcontainers/testcontainers-go/wait" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" @@ -29,7 +26,6 @@ import ( "github.com/hmchangw/chat/pkg/roomkeystore" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" - "github.com/hmchangw/chat/pkg/testutil/testimages" ) // capturedPublish records a single publish call for later assertion. 
@@ -1166,33 +1162,22 @@ func TestSyncCreateDM_CrossSite_OutboxPayloadConverges(t *testing.T) { "replay must produce identical Nats-Msg-Id so broker dedup blocks duplicate cross-site events") } -// setupValkey starts a Valkey testcontainer and returns a connected full key store. -// The returned store satisfies both roomkeystore.RoomKeyStore (for seeding) and the -// local RoomKeyStore interface accepted by NewHandler (Get-only subset). +// setupValkey returns a key store backed by the process-shared Valkey, +// with FLUSHDB on cleanup so sibling tests start clean. Satisfies both +// roomkeystore.RoomKeyStore (for seeding) and the local Get-only subset +// accepted by NewHandler. func setupValkey(t *testing.T) roomkeystore.RoomKeyStore { t.Helper() - ctx := context.Background() - container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Valkey, - ExposedPorts: []string{"6379/tcp"}, - WaitingFor: wait.ForLog("Ready to accept connections"), - }, - Started: true, - }) - require.NoError(t, err) - t.Cleanup(func() { _ = container.Terminate(ctx) }) - host, err := container.Host(ctx) - require.NoError(t, err) - port, err := container.MappedPort(ctx, "6379") - require.NoError(t, err) cfg := roomkeystore.Config{ - Addr: fmt.Sprintf("%s:%s", host, port.Port()), + Addr: testutil.Valkey(t), GracePeriod: time.Hour, } ks, err := roomkeystore.NewValkeyStore(cfg) require.NoError(t, err) - t.Cleanup(func() { _ = ks.Close() }) + t.Cleanup(func() { + _ = ks.Close() + testutil.FlushValkey(t) + }) return ks } diff --git a/room-worker/main_test.go b/room-worker/main_test.go new file mode 100644 index 000000000..32bfb4078 --- /dev/null +++ b/room-worker/main_test.go @@ -0,0 +1,16 @@ +//go:build integration + +package main + +import ( + "os" + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + 
os.Exit(code) +} diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index a7c350787..6b7b20917 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -51,19 +51,35 @@ func seedDoc(t *testing.T, esURL, index, id string, doc any) { // TestMain pre-warms the shared containers concurrently so the first test // doesn't pay their startup serially, then explicitly terminates them on -// clean exit. Ryuk is disabled repo-wide via pkg/testutil's init(), so -// TerminateAll is the primary cleanup mechanism. +// clean exit. TerminateAll runs first so containers disappear immediately; +// Ryuk is the safety net for SIGKILL / Ctrl+C where m.Run never returns. +// +// A pre-warm failure aborts the run with code 1 — better than letting +// every test fail individually with confusing "couldn't start container" +// errors. func TestMain(m *testing.M) { var wg sync.WaitGroup + errCh := make(chan error, 3) for _, fn := range []func() error{ testutil.EnsureElasticsearch, testutil.EnsureNATS, testutil.EnsureValkey, } { wg.Add(1) - go func(f func() error) { defer wg.Done(); _ = f() }(fn) + go func(f func() error) { + defer wg.Done() + if err := f(); err != nil { + errCh <- err + } + }(fn) } wg.Wait() + close(errCh) + if err, ok := <-errCh; ok { + fmt.Fprintf(os.Stderr, "prewarm shared containers: %v\n", err) + testutil.TerminateAll() + os.Exit(1) + } code := m.Run() testutil.TerminateAll() os.Exit(code) diff --git a/search-sync-worker/integration_test.go b/search-sync-worker/integration_test.go index 6abc1677f..d5ef4d21f 100644 --- a/search-sync-worker/integration_test.go +++ b/search-sync-worker/integration_test.go @@ -11,6 +11,7 @@ import ( "net/http" "net/url" "os" + "sync" "testing" "time" @@ -18,139 +19,72 @@ import ( "github.com/nats-io/nats.go/jetstream" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/testcontainers/testcontainers-go" - 
"github.com/testcontainers/testcontainers-go/wait" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/stream" "github.com/hmchangw/chat/pkg/subject" - "github.com/hmchangw/chat/pkg/testutil/testimages" + "github.com/hmchangw/chat/pkg/testutil" ) -// Package-level singletons — one Elasticsearch + one NATS JetStream container -// shared across all tests in this package. Tests isolate themselves via unique -// index / stream names (already the case in this suite). On VFS storage, -// spawning ES per-test is prohibitive — a 120s startup * 7 tests = 14min. +// Package-level NATS connection + JetStream client. Connected once in +// TestMain and shared by every test. The underlying NATS and ES +// containers come from pkg/testutil. var ( - testESURL string - testJS jetstream.JetStream - testNATSCon *nats.Conn + testJS jetstream.JetStream + testNATSCon *nats.Conn + testNATSConErr error + testNATSOnce sync.Once ) +// TestMain pre-warms ES + NATS in parallel, opens one JetStream client, +// then terminates the shared containers on clean exit via testutil.TerminateAll. func TestMain(m *testing.M) { - // Wrap the setup logic in an inner function so `defer` runs for every - // successfully-created resource before TestMain returns, regardless of - // which error branch we take. Keeps cleanup in one place instead of - // reinvented cascades at each error site. 
- os.Exit(runTestMain(m)) -} - -func runTestMain(m *testing.M) int { - ctx := context.Background() - - esContainer, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.Elasticsearch, - ExposedPorts: []string{"9200/tcp"}, - Env: map[string]string{ - "discovery.type": "single-node", - "xpack.security.enabled": "false", - "cluster.routing.allocation.disk.threshold_enabled": "false", - "ES_JAVA_OPTS": "-Xms512m -Xmx512m", - }, - WaitingFor: wait.ForAll( - wait.ForHTTP("/").WithPort("9200/tcp").WithStartupTimeout(120*time.Second), - wait.ForHTTP("/_cluster/health?wait_for_status=yellow&timeout=60s"). - WithPort("9200/tcp"). - WithStartupTimeout(120*time.Second), - ), - }, - Started: true, - }) - if err != nil { - fmt.Fprintf(os.Stderr, "start elasticsearch: %v\n", err) - return 1 + var wg sync.WaitGroup + for _, fn := range []func() error{ + testutil.EnsureElasticsearch, + testutil.EnsureNATS, + } { + wg.Add(1) + go func(f func() error) { defer wg.Done(); _ = f() }(fn) } - defer func() { - if err := esContainer.Terminate(ctx); err != nil { - fmt.Fprintf(os.Stderr, "terminate elasticsearch: %v\n", err) - } - }() - esHost, err := esContainer.Host(ctx) - if err != nil { - fmt.Fprintf(os.Stderr, "get es host: %v\n", err) - return 1 - } - esPort, err := esContainer.MappedPort(ctx, "9200") - if err != nil { - fmt.Fprintf(os.Stderr, "get es port: %v\n", err) - return 1 - } - testESURL = fmt.Sprintf("http://%s:%s", esHost, esPort.Port()) - - natsContainer, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.NATS, - ExposedPorts: []string{"4222/tcp"}, - Cmd: []string{"--jetstream"}, - WaitingFor: wait.ForLog("Server is ready").WithStartupTimeout(30 * time.Second), - }, - Started: true, - }) - if err != nil { - fmt.Fprintf(os.Stderr, "start nats: %v\n", err) - return 1 + 
wg.Wait() + code := m.Run() + if testNATSCon != nil { + testNATSCon.Close() } - defer func() { - if err := natsContainer.Terminate(ctx); err != nil { - fmt.Fprintf(os.Stderr, "terminate nats: %v\n", err) - } - }() - natsHost, err := natsContainer.Host(ctx) - if err != nil { - fmt.Fprintf(os.Stderr, "get nats host: %v\n", err) - return 1 - } - natsPort, err := natsContainer.MappedPort(ctx, "4222") - if err != nil { - fmt.Fprintf(os.Stderr, "get nats port: %v\n", err) - return 1 - } - natsURL := fmt.Sprintf("nats://%s:%s", natsHost, natsPort.Port()) - nc, err := nats.Connect(natsURL) - if err != nil { - fmt.Fprintf(os.Stderr, "connect nats: %v\n", err) - return 1 - } - defer nc.Close() - js, err := jetstream.New(nc) - if err != nil { - fmt.Fprintf(os.Stderr, "init jetstream: %v\n", err) - return 1 - } - testJS = js - testNATSCon = nc - - return m.Run() + testutil.TerminateAll() + os.Exit(code) } // setupElasticsearch returns the shared ES URL. Tests must use unique index // names to stay isolated — the existing suite does. func setupElasticsearch(t *testing.T) string { t.Helper() - if testESURL == "" { - t.Fatal("testESURL is empty — TestMain did not run") - } - return testESURL + return testutil.Elasticsearch(t) } // setupNATSJetStream returns the shared (JetStream, Conn). Tests must use // unique stream names to stay isolated — the existing suite does. 
func setupNATSJetStream(t *testing.T) (jetstream.JetStream, *nats.Conn) { t.Helper() - if testJS == nil || testNATSCon == nil { - t.Fatal("testJS/testNATSCon is nil — TestMain did not run") + testNATSOnce.Do(func() { + nc, err := nats.Connect(testutil.NATS(t)) + if err != nil { + testNATSConErr = fmt.Errorf("connect nats: %w", err) + return + } + js, err := jetstream.New(nc) + if err != nil { + nc.Close() + testNATSConErr = fmt.Errorf("init jetstream: %w", err) + return + } + testNATSCon = nc + testJS = js + }) + if testNATSConErr != nil { + t.Fatalf("nats jetstream setup: %v", testNATSConErr) } return testJS, testNATSCon } diff --git a/tools/loadgen/integration_test.go b/tools/loadgen/integration_test.go index b1a0f2948..63d8f8fa9 100644 --- a/tools/loadgen/integration_test.go +++ b/tools/loadgen/integration_test.go @@ -5,69 +5,28 @@ package main import ( "context" "encoding/json" - "fmt" + "os" "testing" "time" "github.com/nats-io/nats.go" "github.com/nats-io/nats.go/jetstream" "github.com/stretchr/testify/require" - "github.com/testcontainers/testcontainers-go" - "github.com/testcontainers/testcontainers-go/modules/mongodb" - "github.com/testcontainers/testcontainers-go/wait" "go.mongodb.org/mongo-driver/v2/bson" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/mongoutil" "github.com/hmchangw/chat/pkg/stream" "github.com/hmchangw/chat/pkg/subject" - "github.com/hmchangw/chat/pkg/testutil/testimages" + "github.com/hmchangw/chat/pkg/testutil" ) -// setupNATS starts a JetStream-enabled NATS container via the generic -// testcontainers interface (no dedicated NATS module is required). 
-func setupNATS(t *testing.T) (string, func()) { - t.Helper() - ctx := context.Background() - c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ - ContainerRequest: testcontainers.ContainerRequest{ - Image: testimages.NATS, - Cmd: []string{"-js"}, - ExposedPorts: []string{"4222/tcp"}, - WaitingFor: wait.ForLog("Server is ready").WithStartupTimeout(30 * time.Second), - }, - Started: true, - }) - require.NoError(t, err) - host, err := c.Host(ctx) - require.NoError(t, err) - port, err := c.MappedPort(ctx, "4222") - require.NoError(t, err) - return fmt.Sprintf("nats://%s:%s", host, port.Port()), func() { _ = c.Terminate(ctx) } -} - -func setupMongo(t *testing.T) (string, func()) { - t.Helper() - ctx := context.Background() - c, err := mongodb.Run(ctx, testimages.Mongo) - require.NoError(t, err) - uri, err := c.ConnectionString(ctx) - require.NoError(t, err) - return uri, func() { _ = c.Terminate(ctx) } -} - // TestLoadgenSmallPreset_EndToEnd verifies the generator publishes messages, // a fake gatekeeper forwards them to MESSAGES_CANONICAL, two JetStream // consumers drain the stream, a fake broadcast-worker emits room events, // and MongoDB shows the seeded room data. func TestLoadgenSmallPreset_EndToEnd(t *testing.T) { ctx := context.Background() - natsURI, stopNATS := setupNATS(t) - defer stopNATS() - mongoURI, stopMongo := setupMongo(t) - defer stopMongo() - - nc, err := nats.Connect(natsURI) + nc, err := nats.Connect(testutil.NATS(t)) require.NoError(t, err) defer nc.Drain() @@ -95,11 +54,7 @@ func TestLoadgenSmallPreset_EndToEnd(t *testing.T) { defer cc.Stop() } - // Connect Mongo and seed fixtures. 
- client, err := mongoutil.Connect(ctx, mongoURI, "", "") - require.NoError(t, err) - defer mongoutil.Disconnect(ctx, client) - db := client.Database("chat") + db := testutil.MongoDB(t, "loadgen") preset, _ := BuiltinPreset("small") fixtures := BuildFixtures(&preset, 42, siteID) @@ -186,3 +141,9 @@ func TestLoadgenSmallPreset_EndToEnd(t *testing.T) { require.NoError(t, err) require.Equal(t, fixtures.Rooms[0].ID, room.ID) } + +func TestMain(m *testing.M) { + code := m.Run() + testutil.TerminateAll() + os.Exit(code) +} From 66cecfe43fad75a8003079ee4a00c2182a167912 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 09:17:46 +0000 Subject: [PATCH 12/23] test(loadgen): migrate members_integration_test.go to testutil.NATS Merge from main pulled in tools/loadgen/members_integration_test.go which referenced the local setupNATS helper that this PR deleted. Swap it to testutil.NATS(t) like the sibling integration test. --- tools/loadgen/members_integration_test.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/loadgen/members_integration_test.go b/tools/loadgen/members_integration_test.go index 848712ba0..7ceaad532 100644 --- a/tools/loadgen/members_integration_test.go +++ b/tools/loadgen/members_integration_test.go @@ -17,6 +17,7 @@ import ( "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/stream" "github.com/hmchangw/chat/pkg/subject" + "github.com/hmchangw/chat/pkg/testutil" ) // TestMembersSustained_EndToEnd verifies the full members-sustained pipeline @@ -25,9 +26,7 @@ import ( // non-zero traffic. 
func TestMembersSustained_EndToEnd(t *testing.T) { ctx := context.Background() - natsURL, stopNATS := setupNATS(t) - defer stopNATS() - + natsURL := testutil.NATS(t) nc, err := nats.Connect(natsURL) require.NoError(t, err) defer nc.Drain() //nolint:errcheck From 94827bc0614eb25780a0080e8f5d49261a82b33d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 10:31:31 +0000 Subject: [PATCH 13/23] test(testutil): re-disable Ryuk via init(); audit per-container cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI reliably needs TESTCONTAINERS_RYUK_DISABLED=true after all. Set it via pkg/testutil's init() so every test process inherits it without per-job env config. Developers can override locally with TESTCONTAINERS_RYUK_DISABLED=false to debug container leaks. Cleanup audit — every test container in this PR has: - A stored container reference (not just URL/addr). - Best-effort Terminate on init-failure paths inside ensureXxx helpers. - Final teardown driven by TestMain → testutil.TerminateAll OR by t.Cleanup for per-test local containers. This holds regardless of test pass/fail: t.Cleanup runs on failure + panic, and TestMain's `code := m.Run(); TerminateAll(); os.Exit(code)` wrap ensures cleanup runs before process exit. --- pkg/testutil/init.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 pkg/testutil/init.go diff --git a/pkg/testutil/init.go b/pkg/testutil/init.go new file mode 100644 index 000000000..ef8f5d595 --- /dev/null +++ b/pkg/testutil/init.go @@ -0,0 +1,21 @@ +//go:build integration + +package testutil + +import "os" + +// init disables testcontainers-go's Ryuk reaper across every integration +// test in this repo. Cleanup is handled explicitly via TerminateAll, which +// each service's TestMain calls. +// +// LookupEnv guard so a developer debugging container leaks can flip Ryuk +// back on with `TESTCONTAINERS_RYUK_DISABLED=false go test ...` without +// editing code. 
+func init() { + if _, set := os.LookupEnv("TESTCONTAINERS_RYUK_DISABLED"); !set { + // Best-effort — process-level env mutation can't realistically + // fail. If it ever did, testcontainers-go would just default to + // Ryuk-on and CI would surface the original failure mode. + _ = os.Setenv("TESTCONTAINERS_RYUK_DISABLED", "true") + } +} From cda13a27e6fe079e10dcc4ad642b09223985f225 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 10:38:10 +0000 Subject: [PATCH 14/23] =?UTF-8?q?test(testutil,search-service):=20simplify?= =?UTF-8?q?=20pass=20=E2=80=94=20RunTests=20helper,=20constants,=20naming?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add testutil.RunTests(m) one-liner helper. Collapses 17 identical per-service TestMains into `func TestMain(m *testing.M) { testutil.RunTests(m) }`. - Drop search-service's local flushValkey + freshValkeyClient — the former duplicates testutil.FlushValkey exactly; the latter is renamed to valkeyClient (the underlying container is shared, "fresh" was misleading). - Promote search-service NATS queue-group literals to testQueueGroup / testQueueGroupSubs / testQueueGroupV2 constants. - Add testutil.EnsureMongo to search-service's prewarm slice — apps tests used to pay Mongo startup serially against the first call. - Trim the over-long docstrings on init.go and terminate.go. - Replace the cross-file "// Flush — see setupAppsFixture for the rationale" comment with the inline one-liner everywhere; no more grep-to-find chain. - Fix stale "Prewarm" reference in elasticsearch.go EnsureXxx comment. 
--- broadcast-worker/main_test.go | 7 +- .../internal/cassrepo/main_test.go | 7 +- .../internal/mongorepo/main_test.go | 7 +- history-service/internal/service/main_test.go | 7 +- inbox-worker/main_test.go | 7 +- message-worker/main_test.go | 7 +- notification-worker/main_test.go | 7 +- pkg/minioutil/main_test.go | 7 +- pkg/mongoutil/main_test.go | 7 +- pkg/natsrouter/main_test.go | 7 +- pkg/roomcrypto/main_test.go | 12 +--- pkg/roomkeysender/main_test.go | 13 +--- pkg/testutil/elasticsearch.go | 2 +- pkg/testutil/init.go | 14 ++-- pkg/testutil/terminate.go | 18 +---- pkg/testutil/testmain.go | 20 ++++++ pkg/userstore/main_test.go | 7 +- room-service/main_test.go | 7 +- room-worker/main_test.go | 7 +- search-service/integration_apps_test.go | 6 +- search-service/integration_ccs_test.go | 8 +-- search-service/integration_messages_test.go | 4 +- search-service/integration_rooms_test.go | 6 +- search-service/integration_users_test.go | 4 +- search-service/setup_shared_test.go | 70 +++++++------------ tools/loadgen/integration_test.go | 7 +- 26 files changed, 88 insertions(+), 187 deletions(-) create mode 100644 pkg/testutil/testmain.go diff --git a/broadcast-worker/main_test.go b/broadcast-worker/main_test.go index 32bfb4078..937f8531a 100644 --- a/broadcast-worker/main_test.go +++ b/broadcast-worker/main_test.go @@ -3,14 +3,9 @@ package main import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/history-service/internal/cassrepo/main_test.go b/history-service/internal/cassrepo/main_test.go index 4ca88f40c..ed7ce21cd 100644 --- a/history-service/internal/cassrepo/main_test.go +++ b/history-service/internal/cassrepo/main_test.go @@ -3,14 +3,9 @@ package cassrepo import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - 
testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/history-service/internal/mongorepo/main_test.go b/history-service/internal/mongorepo/main_test.go index 7134a1b31..22ef73259 100644 --- a/history-service/internal/mongorepo/main_test.go +++ b/history-service/internal/mongorepo/main_test.go @@ -3,14 +3,9 @@ package mongorepo import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/history-service/internal/service/main_test.go b/history-service/internal/service/main_test.go index af5c2a8d6..794fd0441 100644 --- a/history-service/internal/service/main_test.go +++ b/history-service/internal/service/main_test.go @@ -3,14 +3,9 @@ package service_test import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/inbox-worker/main_test.go b/inbox-worker/main_test.go index 32bfb4078..937f8531a 100644 --- a/inbox-worker/main_test.go +++ b/inbox-worker/main_test.go @@ -3,14 +3,9 @@ package main import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/message-worker/main_test.go b/message-worker/main_test.go index 32bfb4078..937f8531a 100644 --- a/message-worker/main_test.go +++ b/message-worker/main_test.go @@ -3,14 +3,9 @@ package main import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/notification-worker/main_test.go 
b/notification-worker/main_test.go index 32bfb4078..937f8531a 100644 --- a/notification-worker/main_test.go +++ b/notification-worker/main_test.go @@ -3,14 +3,9 @@ package main import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/pkg/minioutil/main_test.go b/pkg/minioutil/main_test.go index cecdbadb8..cad311b02 100644 --- a/pkg/minioutil/main_test.go +++ b/pkg/minioutil/main_test.go @@ -3,14 +3,9 @@ package minioutil import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/pkg/mongoutil/main_test.go b/pkg/mongoutil/main_test.go index c9353ea61..4f92525e2 100644 --- a/pkg/mongoutil/main_test.go +++ b/pkg/mongoutil/main_test.go @@ -3,14 +3,9 @@ package mongoutil import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/pkg/natsrouter/main_test.go b/pkg/natsrouter/main_test.go index 0c181867a..8c647ca25 100644 --- a/pkg/natsrouter/main_test.go +++ b/pkg/natsrouter/main_test.go @@ -3,14 +3,9 @@ package natsrouter_test import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/pkg/roomcrypto/main_test.go b/pkg/roomcrypto/main_test.go index f9b04c7d3..ec6f00b95 100644 --- a/pkg/roomcrypto/main_test.go +++ b/pkg/roomcrypto/main_test.go @@ -2,19 +2,13 @@ package roomcrypto -// Import testutil for the Ryuk-disable init() side effect. 
This package -// starts its Node container per-test (t.Cleanup handles teardown); -// TerminateAll is a no-op when no shared testutil containers were started. +// Import testutil for the Ryuk-disable init() side effect even though +// this package starts its containers per-test (t.Cleanup handles teardown). import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/pkg/roomkeysender/main_test.go b/pkg/roomkeysender/main_test.go index 384b5b2d7..d839d4d34 100644 --- a/pkg/roomkeysender/main_test.go +++ b/pkg/roomkeysender/main_test.go @@ -2,20 +2,13 @@ package roomkeysender_test -// Import testutil for the Ryuk-disable init() side effect. TerminateAll -// is called even though this package starts its containers per-test -// (their t.Cleanups already handle teardown); TerminateAll is a no-op -// when no shared testutil containers were started. +// Import testutil for the Ryuk-disable init() side effect even though +// this package starts its containers per-test (t.Cleanup handles teardown). import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/pkg/testutil/elasticsearch.go b/pkg/testutil/elasticsearch.go index c94e8d4d1..6eb2add8d 100644 --- a/pkg/testutil/elasticsearch.go +++ b/pkg/testutil/elasticsearch.go @@ -78,7 +78,7 @@ func Elasticsearch(t *testing.T) string { } // EnsureElasticsearch starts the shared ES container if not already -// started. No-t variant intended for TestMain pre-warming via Prewarm. +// started. No-t variant intended for TestMain pre-warming. func EnsureElasticsearch() error { _, err := ensureElasticsearch(); return err } // TerminateElasticsearch stops the shared ES container. 
Best-effort and diff --git a/pkg/testutil/init.go b/pkg/testutil/init.go index ef8f5d595..ce2fa0af9 100644 --- a/pkg/testutil/init.go +++ b/pkg/testutil/init.go @@ -4,18 +4,12 @@ package testutil import "os" -// init disables testcontainers-go's Ryuk reaper across every integration -// test in this repo. Cleanup is handled explicitly via TerminateAll, which -// each service's TestMain calls. -// -// LookupEnv guard so a developer debugging container leaks can flip Ryuk -// back on with `TESTCONTAINERS_RYUK_DISABLED=false go test ...` without -// editing code. +// init disables testcontainers-go's Ryuk reaper repo-wide because it +// fails to start on our CI runner. Cleanup is handled by TerminateAll. +// LookupEnv guard lets local debugging flip Ryuk back on without an +// edit: `TESTCONTAINERS_RYUK_DISABLED=false go test ...`. func init() { if _, set := os.LookupEnv("TESTCONTAINERS_RYUK_DISABLED"); !set { - // Best-effort — process-level env mutation can't realistically - // fail. If it ever did, testcontainers-go would just default to - // Ryuk-on and CI would surface the original failure mode. _ = os.Setenv("TESTCONTAINERS_RYUK_DISABLED", "true") } } diff --git a/pkg/testutil/terminate.go b/pkg/testutil/terminate.go index 82d4a4718..8bdd48d51 100644 --- a/pkg/testutil/terminate.go +++ b/pkg/testutil/terminate.go @@ -3,21 +3,9 @@ package testutil // TerminateAll stops every process-shared container started by this -// package, in dependency-free order. Each individual Terminate is a -// no-op if its container was never started, so it's safe to call from -// any service's TestMain regardless of which helpers that service uses. -// -// Intended usage: -// -// func TestMain(m *testing.M) { -// code := m.Run() -// testutil.TerminateAll() -// os.Exit(code) -// } -// -// Required when running with TESTCONTAINERS_RYUK_DISABLED=true (e.g. -// our CI integration job) — Ryuk would otherwise reap these on process -// exit. 
Locally Ryuk catches SIGKILL / Ctrl+C, where m.Run never returns. +// package. Each TerminateXxx is a no-op if its container was never +// started, so this is safe from any service's TestMain. Use via +// testutil.RunTests for the standard wrap. func TerminateAll() { TerminateMongo() TerminateCassandra() diff --git a/pkg/testutil/testmain.go b/pkg/testutil/testmain.go new file mode 100644 index 000000000..a74b110e3 --- /dev/null +++ b/pkg/testutil/testmain.go @@ -0,0 +1,20 @@ +//go:build integration + +package testutil + +import ( + "os" + "testing" +) + +// RunTests is the canonical TestMain body for any package that uses +// shared testcontainers from this package. It runs the test binary, +// terminates every container started via testutil, and exits with the +// right code. +// +// func TestMain(m *testing.M) { testutil.RunTests(m) } +func RunTests(m *testing.M) { + code := m.Run() + TerminateAll() + os.Exit(code) +} diff --git a/pkg/userstore/main_test.go b/pkg/userstore/main_test.go index 593bbc836..f1015b8e8 100644 --- a/pkg/userstore/main_test.go +++ b/pkg/userstore/main_test.go @@ -3,14 +3,9 @@ package userstore import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/room-service/main_test.go b/room-service/main_test.go index 32bfb4078..937f8531a 100644 --- a/room-service/main_test.go +++ b/room-service/main_test.go @@ -3,14 +3,9 @@ package main import ( - "os" "testing" "github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/room-worker/main_test.go b/room-worker/main_test.go index 32bfb4078..937f8531a 100644 --- a/room-worker/main_test.go +++ b/room-worker/main_test.go @@ -3,14 +3,9 @@ package main import ( - "os" "testing" 
"github.com/hmchangw/chat/pkg/testutil" ) -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/search-service/integration_apps_test.go b/search-service/integration_apps_test.go index 5151c4d78..e018793a3 100644 --- a/search-service/integration_apps_test.go +++ b/search-service/integration_apps_test.go @@ -55,12 +55,10 @@ func setupAppsFixture(t *testing.T) *appsFixture { SpotlightReadPattern: "spotlight-*", }) - router := natsrouter.New(serverNATS, "search-service-test") + router := natsrouter.New(serverNATS, testQueueGroup) router.Use(natsrouter.RequestID()) h.Register(router) - // Flush before returning so a fast test doesn't hit "no responders" - // while subscriptions propagate. otelnats wraps the conn — reach - // through to *nats.Conn. + // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). require.NoError(t, serverNATS.NatsConn().Flush()) t.Cleanup(func() { _ = router.Shutdown(context.Background()) diff --git a/search-service/integration_ccs_test.go b/search-service/integration_ccs_test.go index 36a406191..f429ec7e2 100644 --- a/search-service/integration_ccs_test.go +++ b/search-service/integration_ccs_test.go @@ -106,7 +106,7 @@ func setupCCSFixture(t *testing.T) *ccsFixture { remoteEngine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: remoteURL}) require.NoError(t, err, "build searchengine for remote") - valkeyClient := freshValkeyClient(t) + cacheClient := valkeyClient(t) natsURL := testutil.NATS(t) serverNC, err := natsutil.Connect(natsURL, "") @@ -120,7 +120,7 @@ func setupCCSFixture(t *testing.T) *ccsFixture { userRoomIndex := testUserRoomIndex store := newESStore(localEngine, userRoomIndex) - cache := newValkeyCache(valkeyClient) + cache := newValkeyCache(cacheClient) handler := newHandler(store, nil, nil, cache, handlerConfig{ DocCounts: 25, MaxDocCounts: 100, @@ 
-130,10 +130,10 @@ func setupCCSFixture(t *testing.T) *ccsFixture { SpotlightReadPattern: "spotlight-test-*", }) - router := natsrouter.New(serverNC, "search-service-test") + router := natsrouter.New(serverNC, testQueueGroup) router.Use(natsrouter.RequestID()) handler.Register(router) - // Flush — see setupAppsFixture for the rationale. + // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). require.NoError(t, serverNC.NatsConn().Flush()) t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) diff --git a/search-service/integration_messages_test.go b/search-service/integration_messages_test.go index 9ba2cb08b..013c42519 100644 --- a/search-service/integration_messages_test.go +++ b/search-service/integration_messages_test.go @@ -75,10 +75,10 @@ func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { SpotlightReadPattern: "spotlight-*", }) - router := natsrouter.New(serverNATS, "search-service-test-v2") + router := natsrouter.New(serverNATS, testQueueGroupV2) router.Use(natsrouter.RequestID()) h.Register(router) - // Flush — see setupAppsFixture for the rationale. + // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). 
require.NoError(t, serverNATS.NatsConn().Flush()) t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) diff --git a/search-service/integration_rooms_test.go b/search-service/integration_rooms_test.go index 1ad673397..ec7d7500c 100644 --- a/search-service/integration_rooms_test.go +++ b/search-service/integration_rooms_test.go @@ -54,7 +54,7 @@ func setupRoomsFixture(t *testing.T) *roomsFixture { require.NoError(t, err, "build searchengine for subs fixture") esStore := newESStore(engine, testUserRoomIndex) - cache := newValkeyCache(freshValkeyClient(t)) + cache := newValkeyCache(valkeyClient(t)) h := newHandler(esStore, nil, nil, cache, handlerConfig{ DocCounts: 25, MaxDocCounts: 100, @@ -64,10 +64,10 @@ func setupRoomsFixture(t *testing.T) *roomsFixture { SpotlightReadPattern: spotlightIndex, }) - router := natsrouter.New(serverNC, "search-service-test-subs") + router := natsrouter.New(serverNC, testQueueGroupSubs) router.Use(natsrouter.RequestID()) h.Register(router) - // Flush — see setupAppsFixture for the rationale. + // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). require.NoError(t, serverNC.NatsConn().Flush()) t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) diff --git a/search-service/integration_users_test.go b/search-service/integration_users_test.go index 28b34cab0..4bb7296d6 100644 --- a/search-service/integration_users_test.go +++ b/search-service/integration_users_test.go @@ -53,10 +53,10 @@ func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixtu RequestTimeout: 5 * time.Second, }) - router := natsrouter.New(serverNC, "search-service-test") + router := natsrouter.New(serverNC, testQueueGroup) router.Use(natsrouter.RequestID()) h.Register(router) - // Flush — see setupAppsFixture for the rationale. + // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). 
require.NoError(t, serverNC.NatsConn().Flush()) t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index 6b7b20917..565565863 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -2,9 +2,8 @@ package main -// Per-package shared test infrastructure. Containers (ES, NATS, Valkey, -// Mongo) come from pkg/testutil and are reaped by testutil.TerminateAll -// in TestMain. CCS tests bring their own ES pair (see integration_ccs_test.go). +// Per-package shared test infrastructure. Containers come from +// pkg/testutil; CCS tests bring their own ES pair (integration_ccs_test.go). import ( "bytes" @@ -19,7 +18,6 @@ import ( "testing" "time" - goredis "github.com/redis/go-redis/v9" "github.com/stretchr/testify/require" "github.com/hmchangw/chat/pkg/testutil" @@ -28,11 +26,18 @@ import ( const testUserRoomIndex = "user-room" +// NATS queue groups. Each search-service router gets its own so a slow +// drain after one test can't deliver to a sibling test's handler. +const ( + testQueueGroup = "search-service-test" // apps, users, CCS + testQueueGroupSubs = "search-service-test-subs" // rooms + testQueueGroupV2 = "search-service-test-v2" // messages v2 +) + // testHTTPClient bounds ES control-plane calls so a stalled container can't hang the job. var testHTTPClient = &http.Client{Timeout: 10 * time.Second} -// seedDoc PUTs a JSON document into ES, synchronously refreshing the index -// so the next search sees it. +// seedDoc PUTs a JSON document into ES, synchronously refreshing the index. func seedDoc(t *testing.T, esURL, index, id string, doc any) { t.Helper() data, err := json.Marshal(doc) @@ -50,21 +55,18 @@ func seedDoc(t *testing.T, esURL, index, id string, doc any) { } // TestMain pre-warms the shared containers concurrently so the first test -// doesn't pay their startup serially, then explicitly terminates them on -// clean exit. 
TerminateAll runs first so containers disappear immediately; -// Ryuk is the safety net for SIGKILL / Ctrl+C where m.Run never returns. -// -// A pre-warm failure aborts the run with code 1 — better than letting -// every test fail individually with confusing "couldn't start container" -// errors. +// doesn't pay their startup serially. A pre-warm failure aborts the run +// before m.Run rather than letting every test fail individually. func TestMain(m *testing.M) { var wg sync.WaitGroup - errCh := make(chan error, 3) - for _, fn := range []func() error{ + prewarms := []func() error{ testutil.EnsureElasticsearch, testutil.EnsureNATS, testutil.EnsureValkey, - } { + testutil.EnsureMongo, + } + errCh := make(chan error, len(prewarms)) + for _, fn := range prewarms { wg.Add(1) go func(f func() error) { defer wg.Done() @@ -80,15 +82,12 @@ func TestMain(m *testing.M) { testutil.TerminateAll() os.Exit(1) } - code := m.Run() - testutil.TerminateAll() - os.Exit(code) + testutil.RunTests(m) } // uniqueESIndex returns a per-test ES index name derived from t.Name() and -// registers a cleanup that DELETEs the index. The fnv hash keeps the name -// short, deterministic per test, and free of characters that ES dislikes -// (slashes from subtests). +// registers cleanup that DELETEs the index. The fnv hash keeps the name +// short and free of characters ES dislikes (slashes from subtests). func uniqueESIndex(t *testing.T, prefix string) string { t.Helper() esURL := testutil.Elasticsearch(t) @@ -111,31 +110,16 @@ func uniqueESIndex(t *testing.T, prefix string) string { return name } -// freshValkeyClient returns a valkeyutil.Client connected to the shared -// Valkey, with cleanup that flushes the keyspace at test end so the next -// test starts clean. Tests in this package run sequentially. -func freshValkeyClient(t *testing.T) valkeyutil.Client { +// valkeyClient returns a valkeyutil.Client connected to the shared Valkey, +// with FLUSHDB on cleanup so sibling tests start clean. 
Tests in this +// package run sequentially. +func valkeyClient(t *testing.T) valkeyutil.Client { t.Helper() - addr := testutil.Valkey(t) - client, err := valkeyutil.Connect(context.Background(), addr, "") + client, err := valkeyutil.Connect(context.Background(), testutil.Valkey(t), "") require.NoError(t, err, "connect shared valkey") t.Cleanup(func() { - flushValkey(t, addr) + testutil.FlushValkey(t) valkeyutil.Disconnect(client) }) return client } - -// flushValkey wipes the keyspace at addr. Uses a raw go-redis client so we -// don't have to expose FLUSHDB on the production valkeyutil.Client. A -// FLUSHDB failure is fatal: state would leak into the next sibling test. -func flushValkey(t *testing.T, addr string) { - t.Helper() - rc := goredis.NewClient(&goredis.Options{Addr: addr}) - defer func() { _ = rc.Close() }() - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - if err := rc.FlushDB(ctx).Err(); err != nil { - t.Errorf("flush valkey at %s: %v", addr, err) - } -} diff --git a/tools/loadgen/integration_test.go b/tools/loadgen/integration_test.go index 63d8f8fa9..101f37dde 100644 --- a/tools/loadgen/integration_test.go +++ b/tools/loadgen/integration_test.go @@ -5,7 +5,6 @@ package main import ( "context" "encoding/json" - "os" "testing" "time" @@ -142,8 +141,4 @@ func TestLoadgenSmallPreset_EndToEnd(t *testing.T) { require.Equal(t, fixtures.Rooms[0].ID, room.ID) } -func TestMain(m *testing.M) { - code := m.Run() - testutil.TerminateAll() - os.Exit(code) -} +func TestMain(m *testing.M) { testutil.RunTests(m) } From 713d5eb8a13c011d55614ed8b99567fd10f504d2 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 10:46:54 +0000 Subject: [PATCH 15/23] test(search-sync-worker): fail-fast on prewarm errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match the pattern already used by search-service's TestMain — collect prewarm errors in a channel and surface them before m.Run 
rather than letting individual tests fail with confusing 'couldn't connect' errors. Per CodeRabbit review on PR #208. --- search-sync-worker/integration_test.go | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/search-sync-worker/integration_test.go b/search-sync-worker/integration_test.go index d5ef4d21f..721ea5fb1 100644 --- a/search-sync-worker/integration_test.go +++ b/search-sync-worker/integration_test.go @@ -37,18 +37,33 @@ var ( testNATSOnce sync.Once ) -// TestMain pre-warms ES + NATS in parallel, opens one JetStream client, -// then terminates the shared containers on clean exit via testutil.TerminateAll. +// TestMain pre-warms ES + NATS in parallel; fails fast if either errors +// (so individual tests don't fail with confusing "couldn't connect" +// messages). Then opens one JetStream client, then terminates the shared +// containers on clean exit via testutil.TerminateAll. func TestMain(m *testing.M) { var wg sync.WaitGroup - for _, fn := range []func() error{ + prewarms := []func() error{ testutil.EnsureElasticsearch, testutil.EnsureNATS, - } { + } + errCh := make(chan error, len(prewarms)) + for _, fn := range prewarms { wg.Add(1) - go func(f func() error) { defer wg.Done(); _ = f() }(fn) + go func(f func() error) { + defer wg.Done() + if err := f(); err != nil { + errCh <- err + } + }(fn) } wg.Wait() + close(errCh) + if err, ok := <-errCh; ok { + fmt.Fprintf(os.Stderr, "prewarm shared containers: %v\n", err) + testutil.TerminateAll() + os.Exit(1) + } code := m.Run() if testNATSCon != nil { testNATSCon.Close() From 1cdad60d29446bb2f7cffbf4d49a3ef7cf693185 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 11:03:40 +0000 Subject: [PATCH 16/23] test: switch integration test files to internal package per CLAUDE.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Repo convention (CLAUDE.md): test files live in the same package as the code under test so they can 
reach unexported identifiers. Seven files were using external _test packages — switch them all to internal: pkg/natsrouter/{integration_test,main_test}.go pkg/roomsubcache/integration_test.go (+ new main_test.go) pkg/roomkeysender/{integration_test,main_test}.go history-service/internal/service/{integration_test,main_test}.go Mechanical changes: strip the self-import, de-prefix package-qualified identifiers (natsrouter.New → New, service.New → New, etc.), change the package declaration. No behaviour change. Added pkg/roomsubcache/main_test.go that was missing — it uses testutil.Valkey but had no TestMain to call testutil.TerminateAll. Resolves CodeRabbit "package _test" review threads. --- .../internal/service/integration_test.go | 9 ++- history-service/internal/service/main_test.go | 2 +- pkg/natsrouter/integration_test.go | 57 +++++++++---------- pkg/natsrouter/main_test.go | 2 +- pkg/roomkeysender/integration_test.go | 5 +- pkg/roomkeysender/main_test.go | 2 +- pkg/roomsubcache/integration_test.go | 17 +++--- pkg/roomsubcache/main_test.go | 11 ++++ 8 files changed, 56 insertions(+), 49 deletions(-) create mode 100644 pkg/roomsubcache/main_test.go diff --git a/history-service/internal/service/integration_test.go b/history-service/internal/service/integration_test.go index 8b7c7d254..abd7b546c 100644 --- a/history-service/internal/service/integration_test.go +++ b/history-service/internal/service/integration_test.go @@ -1,6 +1,6 @@ //go:build integration -package service_test +package service import ( "context" @@ -16,7 +16,6 @@ import ( "github.com/hmchangw/chat/history-service/internal/cassrepo" "github.com/hmchangw/chat/history-service/internal/models" - "github.com/hmchangw/chat/history-service/internal/service" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/msgbucket" "github.com/hmchangw/chat/pkg/natsrouter" @@ -139,7 +138,7 @@ func TestEditMessage_Integration(t *testing.T) { session := setupCassandra(t) repo := 
cassrepo.NewRepository(session, msgbucket.New(24*time.Hour), 365) pub := &recordingPublisher{} - svc := service.New(repo, alwaysSubscribedRepo{}, stubRoomRepo{}, pub, nil, 730*24*time.Hour) + svc := New(repo, alwaysSubscribedRepo{}, stubRoomRepo{}, pub, nil, 730*24*time.Hour) sender := models.Participant{ID: "u1", Account: "alice"} roomID := "r-integ" @@ -202,7 +201,7 @@ func TestDeleteMessage_Integration(t *testing.T) { session := setupCassandra(t) repo := cassrepo.NewRepository(session, msgbucket.New(24*time.Hour), 365) pub := &recordingPublisher{} - svc := service.New(repo, alwaysSubscribedRepo{}, stubRoomRepo{}, pub, nil, 730*24*time.Hour) + svc := New(repo, alwaysSubscribedRepo{}, stubRoomRepo{}, pub, nil, 730*24*time.Hour) sender := models.Participant{ID: "u1", Account: "alice"} roomID := "r-del-integ" @@ -262,7 +261,7 @@ func TestDeleteMessage_ParentWithReplies_NoCascade(t *testing.T) { session := setupCassandra(t) repo := cassrepo.NewRepository(session, msgbucket.New(24*time.Hour), 365) pub := &recordingPublisher{} - svc := service.New(repo, alwaysSubscribedRepo{}, stubRoomRepo{}, pub, nil, 730*24*time.Hour) + svc := New(repo, alwaysSubscribedRepo{}, stubRoomRepo{}, pub, nil, 730*24*time.Hour) sender := models.Participant{ID: "u1", Account: "alice"} roomID := "r-parent-cascade" diff --git a/history-service/internal/service/main_test.go b/history-service/internal/service/main_test.go index 794fd0441..4188eb562 100644 --- a/history-service/internal/service/main_test.go +++ b/history-service/internal/service/main_test.go @@ -1,6 +1,6 @@ //go:build integration -package service_test +package service import ( "testing" diff --git a/pkg/natsrouter/integration_test.go b/pkg/natsrouter/integration_test.go index c66a8e721..61d910b38 100644 --- a/pkg/natsrouter/integration_test.go +++ b/pkg/natsrouter/integration_test.go @@ -1,6 +1,6 @@ //go:build integration -package natsrouter_test +package natsrouter import ( "context" @@ -16,7 +16,6 @@ import ( 
"github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" - "github.com/hmchangw/chat/pkg/natsrouter" "github.com/hmchangw/chat/pkg/testutil" ) @@ -50,17 +49,17 @@ type echoResp struct { // override is needed. func TestIntegration_ConcurrentRequestsWithCopy(t *testing.T) { nc := setupNATS(t) - r := natsrouter.New(nc, "integration-concurrent") - r.Use(natsrouter.RequestID()) - r.Use(natsrouter.Recovery()) - r.Use(natsrouter.Logging()) + r := New(nc, "integration-concurrent") + r.Use(RequestID()) + r.Use(Recovery()) + r.Use(Logging()) // Async goroutines use Copy() — we count them to prove they all ran. var asyncCompleted atomic.Int64 var asyncStarted sync.WaitGroup - natsrouter.Register(r, "chat.user.{account}.echo.{room}", - func(c *natsrouter.Context, req echoReq) (*echoResp, error) { + Register(r, "chat.user.{account}.echo.{room}", + func(c *Context, req echoReq) (*echoResp, error) { c.Set("account", c.Param("account")) c.Set("room", c.Param("room")) @@ -138,13 +137,13 @@ func TestIntegration_ShutdownUnderLoad(t *testing.T) { for cycle := 0; cycle < cycles; cycle++ { t.Run(fmt.Sprintf("cycle-%d", cycle), func(t *testing.T) { nc := setupNATS(t) - r := natsrouter.New(nc, "integration-shutdown") + r := New(nc, "integration-shutdown") var completed atomic.Int64 started := make(chan struct{}) var startOnce sync.Once - natsrouter.Register(r, "load.{id}", - func(c *natsrouter.Context, req echoReq) (*echoResp, error) { + Register(r, "load.{id}", + func(c *Context, req echoReq) (*echoResp, error) { startOnce.Do(func() { close(started) }) time.Sleep(time.Duration(1+req.Seq%7) * time.Millisecond) completed.Add(1) @@ -185,7 +184,7 @@ func TestIntegration_ShutdownUnderLoad(t *testing.T) { // reply rather than blocking. 
func TestIntegration_BusyReplyOnSaturation(t *testing.T) { nc := setupNATS(t) - r := natsrouter.New(nc, "integration-busy", natsrouter.WithMaxConcurrency(1)) + r := New(nc, "integration-busy", WithMaxConcurrency(1)) gate := make(chan struct{}) // Safety net: if any assertion below fails before we close the gate, @@ -204,8 +203,8 @@ func TestIntegration_BusyReplyOnSaturation(t *testing.T) { // signals `entered` before blocking on `gate`, so the busy-reply poll // only starts once the slot is genuinely held. entered := make(chan struct{}, 1) - natsrouter.Register(r, "busy.{id}", - func(c *natsrouter.Context, req echoReq) (*echoResp, error) { + Register(r, "busy.{id}", + func(c *Context, req echoReq) (*echoResp, error) { select { case entered <- struct{}{}: default: @@ -243,9 +242,9 @@ func TestIntegration_BusyReplyOnSaturation(t *testing.T) { data, _ := json.Marshal(echoReq{Seq: 2}) resp, err := nc.Request(context.Background(), "busy.2", data, 2*time.Second) require.NoError(t, err) - var re natsrouter.RouteError + var re RouteError require.NoError(t, json.Unmarshal(resp.Data, &re)) - assert.Equal(t, natsrouter.CodeUnavailable, re.Code, "expected busy reply once slot is held") + assert.Equal(t, CodeUnavailable, re.Code, "expected busy reply once slot is held") // Release the gate; first request must complete normally. close(gate) @@ -271,10 +270,10 @@ func TestIntegration_SpawnSitePanicBackstop(t *testing.T) { // the follow-up "ok" request acquire a slot even if cleanup were // broken, masking the regression. cap=1 forces the test to actually // observe slot release. 
- r := natsrouter.New(nc, "integration-panic-backstop", natsrouter.WithMaxConcurrency(1)) + r := New(nc, "integration-panic-backstop", WithMaxConcurrency(1)) - natsrouter.Register(r, "boom.{id}", - func(c *natsrouter.Context, req echoReq) (*echoResp, error) { + Register(r, "boom.{id}", + func(c *Context, req echoReq) (*echoResp, error) { panic("intentional handler panic") }) @@ -291,8 +290,8 @@ func TestIntegration_SpawnSitePanicBackstop(t *testing.T) { assert.Equal(t, "internal error", payload.Error, "expected internal error reply from backstop") // Process survived: a follow-up normal request must succeed. - natsrouter.Register(r, "ok.{id}", - func(c *natsrouter.Context, req echoReq) (*echoResp, error) { + Register(r, "ok.{id}", + func(c *Context, req echoReq) (*echoResp, error) { return &echoResp{Seq: req.Seq}, nil }) data, _ = json.Marshal(echoReq{Seq: 2}) @@ -308,7 +307,7 @@ func TestIntegration_SpawnSitePanicBackstop(t *testing.T) { // model) have returned, not merely until the dispatcher has stopped. 
func TestIntegration_ShutdownWaitsForSpawnedHandlers(t *testing.T) { nc := setupNATS(t) - r := natsrouter.New(nc, "integration-shutdown-wg", natsrouter.WithMaxConcurrency(8)) + r := New(nc, "integration-shutdown-wg", WithMaxConcurrency(8)) gate := make(chan struct{}) // Safety net: any test failure before close(gate) below would pin @@ -325,8 +324,8 @@ func TestIntegration_ShutdownWaitsForSpawnedHandlers(t *testing.T) { }() var entered atomic.Int64 var completed atomic.Int64 - natsrouter.Register(r, "wg.{id}", - func(c *natsrouter.Context, req echoReq) (*echoResp, error) { + Register(r, "wg.{id}", + func(c *Context, req echoReq) (*echoResp, error) { entered.Add(1) <-gate completed.Add(1) @@ -400,13 +399,13 @@ func TestIntegration_MultipleRouterInstances(t *testing.T) { const queue = "integration-queue-group" const instances = 3 - routers := make([]*natsrouter.Router, instances) + routers := make([]*Router, instances) hits := make([]atomic.Int64, instances) for idx := 0; idx < instances; idx++ { idx := idx - r := natsrouter.New(nc, queue) - natsrouter.Register(r, "qg.work.{id}", - func(c *natsrouter.Context, req echoReq) (*echoResp, error) { + r := New(nc, queue) + Register(r, "qg.work.{id}", + func(c *Context, req echoReq) (*echoResp, error) { hits[idx].Add(1) return &echoResp{Seq: req.Seq}, nil }) @@ -433,7 +432,7 @@ func TestIntegration_MultipleRouterInstances(t *testing.T) { // Each Shutdown call gets its own deadline. Reusing one ticking context // would mean the cleanup loop could see an already-expired ctx after // the warmup-shutdown + 100 sequential RPCs above. 
- shutdown := func(r *natsrouter.Router) { + shutdown := func(r *Router) { ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) defer cancel() require.NoError(t, r.Shutdown(ctx)) diff --git a/pkg/natsrouter/main_test.go b/pkg/natsrouter/main_test.go index 8c647ca25..43854be72 100644 --- a/pkg/natsrouter/main_test.go +++ b/pkg/natsrouter/main_test.go @@ -1,6 +1,6 @@ //go:build integration -package natsrouter_test +package natsrouter import ( "testing" diff --git a/pkg/roomkeysender/integration_test.go b/pkg/roomkeysender/integration_test.go index 1ff0ea59c..efd1210e5 100644 --- a/pkg/roomkeysender/integration_test.go +++ b/pkg/roomkeysender/integration_test.go @@ -1,6 +1,6 @@ //go:build integration -package roomkeysender_test +package roomkeysender import ( "bytes" @@ -27,7 +27,6 @@ import ( "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/roomcrypto" - "github.com/hmchangw/chat/pkg/roomkeysender" "github.com/hmchangw/chat/pkg/testutil/testimages" ) @@ -289,7 +288,7 @@ func TestRoomKeySender_TypeScriptClient(t *testing.T) { time.Sleep(3 * time.Second) // 6. Publish room key via roomkeysender. - sender := roomkeysender.NewSender(nc) + sender := NewSender(nc) evt := &model.RoomKeyEvent{ RoomID: roomID, Version: version, diff --git a/pkg/roomkeysender/main_test.go b/pkg/roomkeysender/main_test.go index d839d4d34..cbf45f72f 100644 --- a/pkg/roomkeysender/main_test.go +++ b/pkg/roomkeysender/main_test.go @@ -1,6 +1,6 @@ //go:build integration -package roomkeysender_test +package roomkeysender // Import testutil for the Ryuk-disable init() side effect even though // this package starts its containers per-test (t.Cleanup handles teardown). 
diff --git a/pkg/roomsubcache/integration_test.go b/pkg/roomsubcache/integration_test.go index 9bac30956..66ec2ae0a 100644 --- a/pkg/roomsubcache/integration_test.go +++ b/pkg/roomsubcache/integration_test.go @@ -1,6 +1,6 @@ //go:build integration -package roomsubcache_test +package roomsubcache import ( "context" @@ -10,7 +10,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/hmchangw/chat/pkg/roomsubcache" "github.com/hmchangw/chat/pkg/testutil" "github.com/hmchangw/chat/pkg/valkeyutil" ) @@ -30,10 +29,10 @@ func setupValkey(t *testing.T) valkeyutil.Client { func TestValkeyCache_Integration_SetGetInvalidate(t *testing.T) { client := setupValkey(t) - cache := roomsubcache.NewValkeyCache(client) + cache := NewValkeyCache(client) ctx := context.Background() - members := []roomsubcache.Member{ + members := []Member{ {ID: "u1", Account: "alice"}, {ID: "u2", Account: "bob"}, } @@ -51,7 +50,7 @@ func TestValkeyCache_Integration_SetGetInvalidate(t *testing.T) { func TestValkeyCache_Integration_MissOnUnsetRoom(t *testing.T) { client := setupValkey(t) - cache := roomsubcache.NewValkeyCache(client) + cache := NewValkeyCache(client) ctx := context.Background() _, err := cache.Get(ctx, "never-set") @@ -60,10 +59,10 @@ func TestValkeyCache_Integration_MissOnUnsetRoom(t *testing.T) { func TestValkeyCache_Integration_TTLExpires(t *testing.T) { client := setupValkey(t) - cache := roomsubcache.NewValkeyCache(client) + cache := NewValkeyCache(client) ctx := context.Background() - require.NoError(t, cache.Set(ctx, "room-ttl", []roomsubcache.Member{{ID: "u1", Account: "a"}}, time.Second)) + require.NoError(t, cache.Set(ctx, "room-ttl", []Member{{ID: "u1", Account: "a"}}, time.Second)) // Poll for expiry — Valkey honors TTL with sub-second granularity but // asserting on a precise deadline is flaky. Allow up to 5s. 
@@ -81,10 +80,10 @@ func TestValkeyCache_Integration_TTLExpires(t *testing.T) { func TestValkeyCache_Integration_EmptyListIsCacheHit(t *testing.T) { client := setupValkey(t) - cache := roomsubcache.NewValkeyCache(client) + cache := NewValkeyCache(client) ctx := context.Background() - require.NoError(t, cache.Set(ctx, "empty-room", []roomsubcache.Member{}, time.Minute)) + require.NoError(t, cache.Set(ctx, "empty-room", []Member{}, time.Minute)) got, err := cache.Get(ctx, "empty-room") require.NoError(t, err) diff --git a/pkg/roomsubcache/main_test.go b/pkg/roomsubcache/main_test.go new file mode 100644 index 000000000..29f35c6de --- /dev/null +++ b/pkg/roomsubcache/main_test.go @@ -0,0 +1,11 @@ +//go:build integration + +package roomsubcache + +import ( + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { testutil.RunTests(m) } From ff2312aa5e2641bf60936369481c2f6e18cc4dc8 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 11:06:41 +0000 Subject: [PATCH 17/23] test(roomkeystore): add missing TestMain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pkg/roomkeystore was using testutil.Valkey + testutil.FlushValkey but had no TestMain to call testutil.TerminateAll. Same pattern as pkg/roomsubcache (also caught and fixed in earlier commit). Consistency audit verified — every integration test package now has a TestMain that calls testutil.TerminateAll (via testutil.RunTests for the common case, or a custom prewarm wrapper for search-service and search-sync-worker). The only exception is auth-service, which uses no testcontainers (pure httptest). 
--- pkg/roomkeystore/main_test.go | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 pkg/roomkeystore/main_test.go diff --git a/pkg/roomkeystore/main_test.go b/pkg/roomkeystore/main_test.go new file mode 100644 index 000000000..1b558b7ea --- /dev/null +++ b/pkg/roomkeystore/main_test.go @@ -0,0 +1,11 @@ +//go:build integration + +package roomkeystore + +import ( + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { testutil.RunTests(m) } From 42552f72dc83fb1711c4da69439b409aa89a63bc Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 11:09:05 +0000 Subject: [PATCH 18/23] docs(CLAUDE.md): document the testcontainer best practice this PR established MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old Integration Tests section instructed contributors to "write setup(t) helpers that start a container" — exactly the pattern this PR consolidated away. Rewrite the section to capture what we actually expect now: - Internal package, never external *_test - Containers come from pkg/testutil helpers, not inline starts - Every package has a TestMain calling testutil.TerminateAll - Ryuk is disabled repo-wide; explicit cleanup is the contract - Per-test isolation responsibility per container type - When inline GenericContainer is acceptable (CCS, roomcrypto, roomkeysender) - Where new shared container helpers belong (pkg/testutil with the Xxx + EnsureXxx + TerminateXxx triple) --- CLAUDE.md | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 0bb5c3956..1d5bc82fa 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -149,10 +149,32 @@ All commands are wrapped in the root Makefile. 
Always use `make` targets — nev - Every exported function in `pkg/` must have corresponding test cases ### Integration Tests -- All integration tests use `//go:build integration` build tag -- Use `testcontainers-go` with official modules (`mongodb`, `cassandra`, `nats`) for real dependencies -- Write `setup(t *testing.T)` helpers that start a container, register `t.Cleanup`, and return a connected client -- Use `_test` as database name to avoid collisions +- All integration tests use the `//go:build integration` build tag +- Test files live in the same package as the code under test (`package main` for services, `package mypkg` for libraries) — never external `*_test` packages +- **Containers come from `pkg/testutil`** — do not start your own with `testcontainers.GenericContainer` / `natsmod.Run` / `mongodb.Run` etc. Each helper is `sync.Once`-shared per test process: + - `testutil.MongoDB(t, prefix) *mongo.Database` — isolated DB per test + - `testutil.CassandraKeyspace(t, prefix) (keyspace, *gocql.Session, host)` — isolated keyspace per test + - `testutil.MinIO(t, prefix) (*minio.Client, bucket)` — isolated bucket per test + - `testutil.Elasticsearch(t) string` — shared ES URL; use a per-test unique index name (fnv hash of `t.Name()`) + - `testutil.NATS(t) string` — shared NATS URL with JetStream enabled + - `testutil.Valkey(t) string` — shared Valkey addr; call `testutil.FlushValkey(t)` in cleanup +- **Every integration test package must have a `TestMain` that drives cleanup**: + ```go + //go:build integration + package mypkg + + import ( + "testing" + "github.com/hmchangw/chat/pkg/testutil" + ) + + func TestMain(m *testing.M) { testutil.RunTests(m) } + ``` + `testutil.RunTests` wraps `m.Run()` + `testutil.TerminateAll()` + `os.Exit(code)`. For packages that want concurrent pre-warming, wrap manually instead — see `search-service/setup_shared_test.go` for the reference pattern (`EnsureXxx` goroutines + error channel + fail-fast). 
+- **Ryuk is disabled repo-wide** (via `pkg/testutil/init.go`) because our CI runner can't run the reaper sidecar. `testutil.TerminateAll` is the only cleanup mechanism on clean exits. SIGKILL / Ctrl+C will leak containers locally — acceptable trade-off; flip Ryuk back on with `TESTCONTAINERS_RYUK_DISABLED=false go test ...` if debugging a leak. +- Per-test isolation is the caller's responsibility: the `MongoDB`/`Cassandra`/`MinIO` helpers already hash `t.Name()`; for ES use a per-test unique index name and DELETE on cleanup; for Valkey call `FlushValkey` on cleanup; for NATS use a per-test `*nats.Conn` pair with `Drain`/`Shutdown` cleanups. +- Inline `testcontainers.GenericContainer` is only acceptable when a shared testutil container can't accommodate the test (e.g. search-service CCS needs two ES nodes on a shared docker network; `pkg/roomkeysender` needs NATS with WebSocket transport; `pkg/roomcrypto` needs a Node container with bundled scripts). Each inline container must store its reference and register `t.Cleanup(container.Terminate)`. +- New shared dependencies (a container type used by ≥2 packages) belong in `pkg/testutil` with the same shape: `Xxx(t)` + `EnsureXxx()` + `TerminateXxx()`, container ref stored at package level, and `TerminateXxx` wired into `TerminateAll`. ### Model Tests - `pkg/model/model_test.go` verifies all domain types marshal/unmarshal correctly via a generic `roundTrip` helper From 7a6cc36a23857349c817aae35445a9210a5f5965 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 02:10:56 +0000 Subject: [PATCH 19/23] Post-merge fixes for Valkey cluster-mode migration - pkg/testutil/terminate.go: drop TerminateValkey from TerminateAll (no process-shared Valkey to clean up; StartValkeyCluster is per-test). - search-service/setup_shared_test.go: drop local valkeyClient helper and EnsureValkey prewarm. 
- search-service/integration_ccs_test.go, search-service/integration_rooms_test.go: use valkeyutil.WrapClusterClient(testutil.StartValkeyCluster(t)). - room-service/integration_test.go: re-apply NATS migration to testutil.NATS (was reverted when merge took main's version). - pkg/roomsubcache/integration_test.go: switch back to internal package (roomsubcache, not roomsubcache_test) for consistency. - pkg/valkeyutil/main_test.go: add missing TestMain. - tools/loadgen/main_test.go: update unit tests for ValkeyAddrs []string config field rename. - CLAUDE.md: document Valkey as per-test, not shared. --- CLAUDE.md | 7 ++++--- pkg/roomsubcache/integration_test.go | 17 ++++++++--------- pkg/testutil/terminate.go | 5 ++++- pkg/valkeyutil/main_test.go | 11 +++++++++++ room-service/integration_test.go | 10 +--------- search-service/integration_ccs_test.go | 3 ++- search-service/integration_rooms_test.go | 3 ++- search-service/setup_shared_test.go | 23 ++++------------------- tools/loadgen/main_test.go | 6 +++--- 9 files changed, 39 insertions(+), 46 deletions(-) create mode 100644 pkg/valkeyutil/main_test.go diff --git a/CLAUDE.md b/CLAUDE.md index 1d5bc82fa..b68620463 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -151,13 +151,14 @@ All commands are wrapped in the root Makefile. Always use `make` targets — nev ### Integration Tests - All integration tests use the `//go:build integration` build tag - Test files live in the same package as the code under test (`package main` for services, `package mypkg` for libraries) — never external `*_test` packages -- **Containers come from `pkg/testutil`** — do not start your own with `testcontainers.GenericContainer` / `natsmod.Run` / `mongodb.Run` etc. Each helper is `sync.Once`-shared per test process: +- **Containers come from `pkg/testutil`** — do not start your own with `testcontainers.GenericContainer` / `natsmod.Run` / `mongodb.Run` etc. 
Process-shared helpers (one container, many tests, started via `sync.Once`, terminated via `TerminateAll`): - `testutil.MongoDB(t, prefix) *mongo.Database` — isolated DB per test - `testutil.CassandraKeyspace(t, prefix) (keyspace, *gocql.Session, host)` — isolated keyspace per test - `testutil.MinIO(t, prefix) (*minio.Client, bucket)` — isolated bucket per test - `testutil.Elasticsearch(t) string` — shared ES URL; use a per-test unique index name (fnv hash of `t.Name()`) - `testutil.NATS(t) string` — shared NATS URL with JetStream enabled - - `testutil.Valkey(t) string` — shared Valkey addr; call `testutil.FlushValkey(t)` in cleanup +- Per-test helper (one cluster per test, registered via `t.Cleanup` — services use cluster-mode Valkey in production): + - `testutil.StartValkeyCluster(t) *redis.ClusterClient` — single-node cluster-mode Valkey. Wrap with `valkeyutil.WrapClusterClient` if you need the `valkeyutil.Client` interface. - **Every integration test package must have a `TestMain` that drives cleanup**: ```go //go:build integration @@ -172,7 +173,7 @@ All commands are wrapped in the root Makefile. Always use `make` targets — nev ``` `testutil.RunTests` wraps `m.Run()` + `testutil.TerminateAll()` + `os.Exit(code)`. For packages that want concurrent pre-warming, wrap manually instead — see `search-service/setup_shared_test.go` for the reference pattern (`EnsureXxx` goroutines + error channel + fail-fast). - **Ryuk is disabled repo-wide** (via `pkg/testutil/init.go`) because our CI runner can't run the reaper sidecar. `testutil.TerminateAll` is the only cleanup mechanism on clean exits. SIGKILL / Ctrl+C will leak containers locally — acceptable trade-off; flip Ryuk back on with `TESTCONTAINERS_RYUK_DISABLED=false go test ...` if debugging a leak. 
-- Per-test isolation is the caller's responsibility: the `MongoDB`/`Cassandra`/`MinIO` helpers already hash `t.Name()`; for ES use a per-test unique index name and DELETE on cleanup; for Valkey call `FlushValkey` on cleanup; for NATS use a per-test `*nats.Conn` pair with `Drain`/`Shutdown` cleanups. +- Per-test isolation is the caller's responsibility: the `MongoDB`/`Cassandra`/`MinIO` helpers already hash `t.Name()`; for ES use a per-test unique index name and DELETE on cleanup; for NATS use a per-test `*nats.Conn` pair with `Drain`/`Shutdown` cleanups; Valkey isolation is automatic (each test gets its own cluster). - Inline `testcontainers.GenericContainer` is only acceptable when a shared testutil container can't accommodate the test (e.g. search-service CCS needs two ES nodes on a shared docker network; `pkg/roomkeysender` needs NATS with WebSocket transport; `pkg/roomcrypto` needs a Node container with bundled scripts). Each inline container must store its reference and register `t.Cleanup(container.Terminate)`. - New shared dependencies (a container type used by ≥2 packages) belong in `pkg/testutil` with the same shape: `Xxx(t)` + `EnsureXxx()` + `TerminateXxx()`, container ref stored at package level, and `TerminateXxx` wired into `TerminateAll`. 
diff --git a/pkg/roomsubcache/integration_test.go b/pkg/roomsubcache/integration_test.go index fea394fa4..f6daff111 100644 --- a/pkg/roomsubcache/integration_test.go +++ b/pkg/roomsubcache/integration_test.go @@ -1,6 +1,6 @@ //go:build integration -package roomsubcache_test +package roomsubcache import ( "context" @@ -10,7 +10,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/hmchangw/chat/pkg/roomsubcache" "github.com/hmchangw/chat/pkg/testutil" "github.com/hmchangw/chat/pkg/valkeyutil" ) @@ -22,10 +21,10 @@ func setupValkey(t *testing.T) valkeyutil.Client { func TestValkeyCache_Integration_SetGetInvalidate(t *testing.T) { client := setupValkey(t) - cache := roomsubcache.NewValkeyCache(client) + cache := NewValkeyCache(client) ctx := context.Background() - members := []roomsubcache.Member{ + members := []Member{ {ID: "u1", Account: "alice"}, {ID: "u2", Account: "bob"}, } @@ -43,7 +42,7 @@ func TestValkeyCache_Integration_SetGetInvalidate(t *testing.T) { func TestValkeyCache_Integration_MissOnUnsetRoom(t *testing.T) { client := setupValkey(t) - cache := roomsubcache.NewValkeyCache(client) + cache := NewValkeyCache(client) ctx := context.Background() _, err := cache.Get(ctx, "never-set") @@ -52,10 +51,10 @@ func TestValkeyCache_Integration_MissOnUnsetRoom(t *testing.T) { func TestValkeyCache_Integration_TTLExpires(t *testing.T) { client := setupValkey(t) - cache := roomsubcache.NewValkeyCache(client) + cache := NewValkeyCache(client) ctx := context.Background() - require.NoError(t, cache.Set(ctx, "room-ttl", []roomsubcache.Member{{ID: "u1", Account: "a"}}, time.Second)) + require.NoError(t, cache.Set(ctx, "room-ttl", []Member{{ID: "u1", Account: "a"}}, time.Second)) // Poll for expiry — Valkey honors TTL with sub-second granularity but // asserting on a precise deadline is flaky. Allow up to 5s. 
@@ -73,10 +72,10 @@ func TestValkeyCache_Integration_TTLExpires(t *testing.T) { func TestValkeyCache_Integration_EmptyListIsCacheHit(t *testing.T) { client := setupValkey(t) - cache := roomsubcache.NewValkeyCache(client) + cache := NewValkeyCache(client) ctx := context.Background() - require.NoError(t, cache.Set(ctx, "empty-room", []roomsubcache.Member{}, time.Minute)) + require.NoError(t, cache.Set(ctx, "empty-room", []Member{}, time.Minute)) got, err := cache.Get(ctx, "empty-room") require.NoError(t, err) diff --git a/pkg/testutil/terminate.go b/pkg/testutil/terminate.go index 8bdd48d51..b6fd776ba 100644 --- a/pkg/testutil/terminate.go +++ b/pkg/testutil/terminate.go @@ -6,11 +6,14 @@ package testutil // package. Each TerminateXxx is a no-op if its container was never // started, so this is safe from any service's TestMain. Use via // testutil.RunTests for the standard wrap. +// +// Valkey is not included — StartValkeyCluster is per-test (each test +// gets its own container with its own t.Cleanup) so there's no shared +// state to terminate here. 
func TerminateAll() { TerminateMongo() TerminateCassandra() TerminateMinIO() TerminateElasticsearch() TerminateNATS() - TerminateValkey() } diff --git a/pkg/valkeyutil/main_test.go b/pkg/valkeyutil/main_test.go new file mode 100644 index 000000000..d0469a2c5 --- /dev/null +++ b/pkg/valkeyutil/main_test.go @@ -0,0 +1,11 @@ +//go:build integration + +package valkeyutil + +import ( + "testing" + + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { testutil.RunTests(m) } diff --git a/room-service/integration_test.go b/room-service/integration_test.go index 810483556..83335aa3d 100644 --- a/room-service/integration_test.go +++ b/room-service/integration_test.go @@ -17,7 +17,6 @@ import ( "github.com/nats-io/nats.go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - natsmod "github.com/testcontainers/testcontainers-go/modules/nats" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" @@ -27,7 +26,6 @@ import ( "github.com/hmchangw/chat/pkg/roomkeystore" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" - "github.com/hmchangw/chat/pkg/testutil/testimages" ) func setupMongo(t *testing.T) *mongo.Database { @@ -90,13 +88,7 @@ func TestCassMessageReader_GetMessageRoomAndCreatedAt_Integration(t *testing.T) func setupNATS(t *testing.T) string { t.Helper() - ctx := context.Background() - container, err := natsmod.Run(ctx, testimages.NATS) - require.NoError(t, err) - t.Cleanup(func() { _ = container.Terminate(ctx) }) - url, err := container.ConnectionString(ctx) - require.NoError(t, err) - return url + return testutil.NATS(t) } func TestMongoStore_Integration(t *testing.T) { diff --git a/search-service/integration_ccs_test.go b/search-service/integration_ccs_test.go index f429ec7e2..f87330e63 100644 --- a/search-service/integration_ccs_test.go +++ b/search-service/integration_ccs_test.go @@ -36,6 +36,7 @@ import ( "github.com/hmchangw/chat/pkg/subject" 
"github.com/hmchangw/chat/pkg/testutil" "github.com/hmchangw/chat/pkg/testutil/testimages" + "github.com/hmchangw/chat/pkg/valkeyutil" ) // --- Fixture ----------------------------------------------------------------- @@ -106,7 +107,7 @@ func setupCCSFixture(t *testing.T) *ccsFixture { remoteEngine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: remoteURL}) require.NoError(t, err, "build searchengine for remote") - cacheClient := valkeyClient(t) + cacheClient := valkeyutil.WrapClusterClient(testutil.StartValkeyCluster(t)) natsURL := testutil.NATS(t) serverNC, err := natsutil.Connect(natsURL, "") diff --git a/search-service/integration_rooms_test.go b/search-service/integration_rooms_test.go index ec7d7500c..a0dc98091 100644 --- a/search-service/integration_rooms_test.go +++ b/search-service/integration_rooms_test.go @@ -23,6 +23,7 @@ import ( "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" + "github.com/hmchangw/chat/pkg/valkeyutil" ) // roomsFixture uses a per-test spotlight index against the shared ES so @@ -54,7 +55,7 @@ func setupRoomsFixture(t *testing.T) *roomsFixture { require.NoError(t, err, "build searchengine for subs fixture") esStore := newESStore(engine, testUserRoomIndex) - cache := newValkeyCache(valkeyClient(t)) + cache := newValkeyCache(valkeyutil.WrapClusterClient(testutil.StartValkeyCluster(t))) h := newHandler(esStore, nil, nil, cache, handlerConfig{ DocCounts: 25, MaxDocCounts: 100, diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index 565565863..586af07b5 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -2,12 +2,13 @@ package main -// Per-package shared test infrastructure. Containers come from -// pkg/testutil; CCS tests bring their own ES pair (integration_ccs_test.go). +// Per-package shared test infrastructure. 
ES / NATS / Mongo come from +// pkg/testutil as process-shared containers; Valkey is per-test via +// testutil.StartValkeyCluster. CCS tests bring their own ES pair +// (integration_ccs_test.go). import ( "bytes" - "context" "encoding/json" "fmt" "hash/fnv" @@ -21,7 +22,6 @@ import ( "github.com/stretchr/testify/require" "github.com/hmchangw/chat/pkg/testutil" - "github.com/hmchangw/chat/pkg/valkeyutil" ) const testUserRoomIndex = "user-room" @@ -62,7 +62,6 @@ func TestMain(m *testing.M) { prewarms := []func() error{ testutil.EnsureElasticsearch, testutil.EnsureNATS, - testutil.EnsureValkey, testutil.EnsureMongo, } errCh := make(chan error, len(prewarms)) @@ -109,17 +108,3 @@ func uniqueESIndex(t *testing.T, prefix string) string { }) return name } - -// valkeyClient returns a valkeyutil.Client connected to the shared Valkey, -// with FLUSHDB on cleanup so sibling tests start clean. Tests in this -// package run sequentially. -func valkeyClient(t *testing.T) valkeyutil.Client { - t.Helper() - client, err := valkeyutil.Connect(context.Background(), testutil.Valkey(t), "") - require.NoError(t, err, "connect shared valkey") - t.Cleanup(func() { - testutil.FlushValkey(t) - valkeyutil.Disconnect(client) - }) - return client -} diff --git a/tools/loadgen/main_test.go b/tools/loadgen/main_test.go index 959edf7b4..1c2196f67 100644 --- a/tools/loadgen/main_test.go +++ b/tools/loadgen/main_test.go @@ -234,7 +234,7 @@ func TestDispatch_MembersSustained_UnknownPreset(t *testing.T) { oldArgs := os.Args defer func() { os.Args = oldArgs }() os.Args = []string{"loadgen", "members-sustained", "--preset=nope"} - cfg := &config{NatsURL: "nats://localhost:1", MongoURI: "mongodb://localhost:1", ValkeyAddr: "localhost:1"} + cfg := &config{NatsURL: "nats://localhost:1", MongoURI: "mongodb://localhost:1", ValkeyAddrs: []string{"localhost:1"}} code := dispatch(context.Background(), cfg) assert.Equal(t, 2, code) } @@ -243,7 +243,7 @@ func TestDispatch_MembersSustained_RejectsBadShape(t 
*testing.T) { oldArgs := os.Args defer func() { os.Args = oldArgs }() os.Args = []string{"loadgen", "members-sustained", "--preset=members-small", "--shape=orgs"} - cfg := &config{NatsURL: "nats://localhost:1", MongoURI: "mongodb://localhost:1", ValkeyAddr: "localhost:1"} + cfg := &config{NatsURL: "nats://localhost:1", MongoURI: "mongodb://localhost:1", ValkeyAddrs: []string{"localhost:1"}} code := dispatch(context.Background(), cfg) assert.Equal(t, 2, code) } @@ -252,7 +252,7 @@ func TestDispatch_MembersCapacity_RequiresTargetSize(t *testing.T) { oldArgs := os.Args defer func() { os.Args = oldArgs }() os.Args = []string{"loadgen", "members-capacity", "--preset=members-capacity"} - cfg := &config{NatsURL: "nats://localhost:1", MongoURI: "mongodb://localhost:1", ValkeyAddr: "localhost:1"} + cfg := &config{NatsURL: "nats://localhost:1", MongoURI: "mongodb://localhost:1", ValkeyAddrs: []string{"localhost:1"}} code := dispatch(context.Background(), cfg) assert.Equal(t, 2, code) } From 80514d199025ab810b7f0206b6ad8cf1c57e4679 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 02:25:20 +0000 Subject: [PATCH 20/23] test(testutil): add SharedValkeyCluster + FlushValkey for shared cluster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit testutil/valkey.go now exposes two variants: - SharedValkeyCluster(t) — process-shared cluster via sync.Once. Fast (cluster boots once per go test invocation). Callers register t.Cleanup(func() { testutil.FlushValkey(t) }) for keyspace isolation. - StartValkeyCluster(t) — per-test cluster (unchanged). Use only for tests asserting on cluster-routing state (pkg/roomkeystore) or that own a store wrapper that closes the underlying client. Wired TerminateValkey back into TerminateAll so the shared cluster is reaped at process exit. 
Migrated safe callers to the shared variant: - search-service/integration_ccs_test.go, integration_rooms_test.go - pkg/roomsubcache/integration_test.go - pkg/valkeyutil/integration_test.go - search-service TestMain prewarm now includes EnsureValkey Kept on per-test (deliberate): - pkg/roomkeystore (CLUSTER KEYSLOT routing assertions) - room-service, room-worker (use NewValkeyClusterStoreFromClient whose store.Close() releases the underlying client — would kill the shared cluster mid-run if a handler ever called Close) Expected wins: ~15-25s per go test ./search-service/... and similar per-package savings elsewhere on packages that hit Valkey heavily. CLAUDE.md updated to document both variants + when to use each. --- CLAUDE.md | 7 +- pkg/roomsubcache/integration_test.go | 3 +- pkg/testutil/terminate.go | 7 +- pkg/testutil/valkey.go | 175 +++++++++++++++++++---- pkg/valkeyutil/integration_test.go | 3 +- search-service/integration_ccs_test.go | 3 +- search-service/integration_rooms_test.go | 3 +- search-service/setup_shared_test.go | 1 + 8 files changed, 168 insertions(+), 34 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index b68620463..be0e3b50f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -157,8 +157,9 @@ All commands are wrapped in the root Makefile. Always use `make` targets — nev - `testutil.MinIO(t, prefix) (*minio.Client, bucket)` — isolated bucket per test - `testutil.Elasticsearch(t) string` — shared ES URL; use a per-test unique index name (fnv hash of `t.Name()`) - `testutil.NATS(t) string` — shared NATS URL with JetStream enabled -- Per-test helper (one cluster per test, registered via `t.Cleanup` — services use cluster-mode Valkey in production): - - `testutil.StartValkeyCluster(t) *redis.ClusterClient` — single-node cluster-mode Valkey. Wrap with `valkeyutil.WrapClusterClient` if you need the `valkeyutil.Client` interface. 
+- Valkey (cluster-mode — services use this in production): + - `testutil.SharedValkeyCluster(t) *redis.ClusterClient` — process-shared cluster (started via `sync.Once`, reaped via `TerminateValkey`/`TerminateAll`). Per-test caller MUST register `t.Cleanup(func() { testutil.FlushValkey(t) })` so sibling tests start with a clean keyspace. Default choice. + - `testutil.StartValkeyCluster(t) *redis.ClusterClient` — per-test cluster (each test gets its own container via `t.Cleanup`). Use ONLY when the test asserts on cluster-routing state (e.g., `pkg/roomkeystore`'s `CLUSTER KEYSLOT` checks) or owns a store wrapper that calls `Close()` on the underlying client. - **Every integration test package must have a `TestMain` that drives cleanup**: ```go //go:build integration @@ -173,7 +174,7 @@ All commands are wrapped in the root Makefile. Always use `make` targets — nev ``` `testutil.RunTests` wraps `m.Run()` + `testutil.TerminateAll()` + `os.Exit(code)`. For packages that want concurrent pre-warming, wrap manually instead — see `search-service/setup_shared_test.go` for the reference pattern (`EnsureXxx` goroutines + error channel + fail-fast). - **Ryuk is disabled repo-wide** (via `pkg/testutil/init.go`) because our CI runner can't run the reaper sidecar. `testutil.TerminateAll` is the only cleanup mechanism on clean exits. SIGKILL / Ctrl+C will leak containers locally — acceptable trade-off; flip Ryuk back on with `TESTCONTAINERS_RYUK_DISABLED=false go test ...` if debugging a leak. -- Per-test isolation is the caller's responsibility: the `MongoDB`/`Cassandra`/`MinIO` helpers already hash `t.Name()`; for ES use a per-test unique index name and DELETE on cleanup; for NATS use a per-test `*nats.Conn` pair with `Drain`/`Shutdown` cleanups; Valkey isolation is automatic (each test gets its own cluster). 
+- Per-test isolation is the caller's responsibility: the `MongoDB`/`Cassandra`/`MinIO` helpers already hash `t.Name()`; for ES use a per-test unique index name and DELETE on cleanup; for NATS use a per-test `*nats.Conn` pair with `Drain`/`Shutdown` cleanups; for shared Valkey call `testutil.FlushValkey(t)` in `t.Cleanup` (StartValkeyCluster's per-test mode is automatic). - Inline `testcontainers.GenericContainer` is only acceptable when a shared testutil container can't accommodate the test (e.g. search-service CCS needs two ES nodes on a shared docker network; `pkg/roomkeysender` needs NATS with WebSocket transport; `pkg/roomcrypto` needs a Node container with bundled scripts). Each inline container must store its reference and register `t.Cleanup(container.Terminate)`. - New shared dependencies (a container type used by ≥2 packages) belong in `pkg/testutil` with the same shape: `Xxx(t)` + `EnsureXxx()` + `TerminateXxx()`, container ref stored at package level, and `TerminateXxx` wired into `TerminateAll`. diff --git a/pkg/roomsubcache/integration_test.go b/pkg/roomsubcache/integration_test.go index f6daff111..220cb0147 100644 --- a/pkg/roomsubcache/integration_test.go +++ b/pkg/roomsubcache/integration_test.go @@ -16,7 +16,8 @@ import ( func setupValkey(t *testing.T) valkeyutil.Client { t.Helper() - return valkeyutil.WrapClusterClient(testutil.StartValkeyCluster(t)) + t.Cleanup(func() { testutil.FlushValkey(t) }) + return valkeyutil.WrapClusterClient(testutil.SharedValkeyCluster(t)) } func TestValkeyCache_Integration_SetGetInvalidate(t *testing.T) { diff --git a/pkg/testutil/terminate.go b/pkg/testutil/terminate.go index b6fd776ba..a0bb6c630 100644 --- a/pkg/testutil/terminate.go +++ b/pkg/testutil/terminate.go @@ -7,13 +7,14 @@ package testutil // started, so this is safe from any service's TestMain. Use via // testutil.RunTests for the standard wrap. 
// -// Valkey is not included — StartValkeyCluster is per-test (each test -// gets its own container with its own t.Cleanup) so there's no shared -// state to terminate here. +// StartValkeyCluster (per-test) is unaffected — those containers are +// cleaned up by their own t.Cleanup hooks. TerminateValkey only stops +// the shared cluster from SharedValkeyCluster, if one was started. func TerminateAll() { TerminateMongo() TerminateCassandra() TerminateMinIO() TerminateElasticsearch() TerminateNATS() + TerminateValkey() } diff --git a/pkg/testutil/valkey.go b/pkg/testutil/valkey.go index 6a447a161..b1c7b7d8a 100644 --- a/pkg/testutil/valkey.go +++ b/pkg/testutil/valkey.go @@ -6,7 +6,9 @@ import ( "context" "fmt" "io" + "os" "strings" + "sync" "testing" "time" @@ -18,16 +20,119 @@ import ( "github.com/hmchangw/chat/pkg/testutil/testimages" ) -// StartValkeyCluster starts a single-node cluster-mode Valkey container, -// assigns all 16384 hash slots to that node, and returns a connected -// *redis.ClusterClient. The ClusterSlots override routes traffic to the -// externally-mapped address rather than the internal 127.0.0.1:6379 that -// the node announces to peers — required for testcontainer port mapping. -// The container and client are terminated/closed via t.Cleanup. +// StartValkeyCluster starts a single-node cluster-mode Valkey container +// and returns a connected *redis.ClusterClient. The container and client +// are terminated/closed via t.Cleanup. Use this when a test needs a +// pristine cluster (e.g. CLUSTER KEYSLOT routing assertions in +// pkg/roomkeystore); use SharedValkeyCluster otherwise. 
func StartValkeyCluster(t *testing.T) *redis.ClusterClient { t.Helper() ctx := context.Background() + container, addr := startValkeyClusterContainer(ctx, t) + t.Cleanup(func() { _ = container.Terminate(ctx) }) + c := newValkeyClusterClient(addr) + t.Cleanup(func() { _ = c.Close() }) + require.NoError(t, pingCluster(ctx, c), "ping valkey cluster") + return c +} + +// SharedValkeyCluster returns a *redis.ClusterClient connected to a +// process-shared cluster-mode Valkey. The cluster boots once per `go test` +// invocation via sync.Once; TerminateValkey (called from TerminateAll) +// tears it down at process exit. Callers must register +// `t.Cleanup(func() { testutil.FlushValkey(t) })` themselves so the next +// test starts with a clean keyspace. +// +// Use this for tests that only care about cache get/set behaviour; use +// StartValkeyCluster if the test asserts on cluster-routing state. +func SharedValkeyCluster(t *testing.T) *redis.ClusterClient { + t.Helper() + ensureSharedValkeyCluster() + if sharedValkeyErr != nil { + t.Fatalf("testutil.SharedValkeyCluster: %v", sharedValkeyErr) + } + return sharedValkeyClient +} + +// EnsureValkey starts the shared Valkey cluster if not already started. +// No-t variant intended for TestMain pre-warming. +func EnsureValkey() error { ensureSharedValkeyCluster(); return sharedValkeyErr } + +// FlushValkey runs FLUSHALL across every master in the shared cluster. +// Intended for per-test cleanup so sibling tests start with an empty +// keyspace. Test-fatal on error — leftover state would silently break +// the next test. 
+func FlushValkey(t *testing.T) { + t.Helper() + if sharedValkeyClient == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + err := sharedValkeyClient.ForEachMaster(ctx, func(ctx context.Context, m *redis.Client) error { + return m.FlushAll(ctx).Err() + }) + if err != nil { + t.Errorf("flush shared valkey cluster: %v", err) + } +} + +// TerminateValkey closes the shared client and stops the shared +// container. Best-effort, idempotent. +func TerminateValkey() { + if sharedValkeyClient != nil { + _ = sharedValkeyClient.Close() + sharedValkeyClient = nil + } + if sharedValkeyContainer == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := sharedValkeyContainer.Terminate(ctx); err != nil { + fmt.Fprintf(os.Stderr, "terminate shared valkey: %v\n", err) + } + sharedValkeyContainer = nil +} + +var ( + sharedValkeyOnce sync.Once + sharedValkeyContainer testcontainers.Container + sharedValkeyClient *redis.ClusterClient + sharedValkeyErr error +) +func ensureSharedValkeyCluster() { + sharedValkeyOnce.Do(func() { + ctx := context.Background() + container, addr, err := startValkeyClusterContainerNoT(ctx) + if err != nil { + sharedValkeyErr = fmt.Errorf("start shared valkey cluster: %w", err) + return + } + c := newValkeyClusterClient(addr) + if err := pingCluster(ctx, c); err != nil { + _ = c.Close() + _ = container.Terminate(ctx) + sharedValkeyErr = fmt.Errorf("ping shared valkey cluster: %w", err) + return + } + sharedValkeyContainer = container + sharedValkeyClient = c + }) +} + +// startValkeyClusterContainer starts a cluster-mode container, assigns +// all 16384 slots to the node, and waits for cluster_state:ok. Test-fatal +// on error; for the no-t variant see startValkeyClusterContainerNoT. 
+func startValkeyClusterContainer(ctx context.Context, t *testing.T) (testcontainers.Container, string) { + t.Helper() + container, addr, err := startValkeyClusterContainerNoT(ctx) + require.NoError(t, err, "start valkey cluster container") + return container, addr +} + +func startValkeyClusterContainerNoT(ctx context.Context) (testcontainers.Container, string, error) { container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ ContainerRequest: testcontainers.ContainerRequest{ Image: testimages.Valkey, @@ -43,29 +148,52 @@ func StartValkeyCluster(t *testing.T) *redis.ClusterClient { }, Started: true, }) - require.NoError(t, err, "start valkey cluster container") - t.Cleanup(func() { _ = container.Terminate(ctx) }) - + if err != nil { + return nil, "", err + } host, err := container.Host(ctx) - require.NoError(t, err) + if err != nil { + _ = container.Terminate(ctx) + return nil, "", fmt.Errorf("get valkey host: %w", err) + } port, err := container.MappedPort(ctx, "6379") - require.NoError(t, err) + if err != nil { + _ = container.Terminate(ctx) + return nil, "", fmt.Errorf("get valkey port: %w", err) + } addr := fmt.Sprintf("%s:%s", host, port.Port()) exitCode, _, err := container.Exec(ctx, []string{"valkey-cli", "CLUSTER", "ADDSLOTSRANGE", "0", "16383"}) - require.NoError(t, err, "exec cluster addslotsrange") - require.Equal(t, 0, exitCode, "cluster addslotsrange must exit 0") + if err != nil { + _ = container.Terminate(ctx) + return nil, "", fmt.Errorf("exec cluster addslotsrange: %w", err) + } + if exitCode != 0 { + _ = container.Terminate(ctx) + return nil, "", fmt.Errorf("cluster addslotsrange exited %d", exitCode) + } - require.Eventually(t, func() bool { + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { _, out, execErr := container.Exec(ctx, []string{"valkey-cli", "CLUSTER", "INFO"}) - if execErr != nil { - return false + if execErr == nil { + buf, _ := io.ReadAll(out) + if 
strings.Contains(string(buf), "cluster_state:ok") { + return container, addr, nil + } } - buf, _ := io.ReadAll(out) - return strings.Contains(string(buf), "cluster_state:ok") - }, 10*time.Second, 100*time.Millisecond, "cluster must reach ok state") + time.Sleep(100 * time.Millisecond) + } + _ = container.Terminate(ctx) + return nil, "", fmt.Errorf("cluster never reached ok state within 10s") +} - c := redis.NewClusterClient(&redis.ClusterOptions{ +// newValkeyClusterClient builds a ClusterClient that routes all 16384 +// slots to the externally-mapped addr. The ClusterSlots override is +// required because the node announces 127.0.0.1:6379 to peers (the +// container-internal address), which the host can't reach. +func newValkeyClusterClient(addr string) *redis.ClusterClient { + return redis.NewClusterClient(&redis.ClusterOptions{ Addrs: []string{addr}, ClusterSlots: func(_ context.Context) ([]redis.ClusterSlot, error) { return []redis.ClusterSlot{ @@ -73,11 +201,10 @@ func StartValkeyCluster(t *testing.T) *redis.ClusterClient { }, nil }, }) - t.Cleanup(func() { _ = c.Close() }) +} +func pingCluster(ctx context.Context, c *redis.ClusterClient) error { pingCtx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() - require.NoError(t, c.Ping(pingCtx).Err(), "ping valkey cluster") - - return c + return c.Ping(pingCtx).Err() } diff --git a/pkg/valkeyutil/integration_test.go b/pkg/valkeyutil/integration_test.go index 84bb5531a..5a2fbaee9 100644 --- a/pkg/valkeyutil/integration_test.go +++ b/pkg/valkeyutil/integration_test.go @@ -22,7 +22,8 @@ import ( // ConnectCluster's error-wrapping path is covered by TestConnectCluster_ErrorPath. 
func setupClusterClient(t *testing.T) Client { t.Helper() - return &clusterClient{c: testutil.StartValkeyCluster(t)} + t.Cleanup(func() { testutil.FlushValkey(t) }) + return &clusterClient{c: testutil.SharedValkeyCluster(t)} } func TestClusterRedisClient_Integration_GetSetDel(t *testing.T) { diff --git a/search-service/integration_ccs_test.go b/search-service/integration_ccs_test.go index f87330e63..619d88f39 100644 --- a/search-service/integration_ccs_test.go +++ b/search-service/integration_ccs_test.go @@ -107,7 +107,8 @@ func setupCCSFixture(t *testing.T) *ccsFixture { remoteEngine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: remoteURL}) require.NoError(t, err, "build searchengine for remote") - cacheClient := valkeyutil.WrapClusterClient(testutil.StartValkeyCluster(t)) + cacheClient := valkeyutil.WrapClusterClient(testutil.SharedValkeyCluster(t)) + t.Cleanup(func() { testutil.FlushValkey(t) }) natsURL := testutil.NATS(t) serverNC, err := natsutil.Connect(natsURL, "") diff --git a/search-service/integration_rooms_test.go b/search-service/integration_rooms_test.go index a0dc98091..54db83839 100644 --- a/search-service/integration_rooms_test.go +++ b/search-service/integration_rooms_test.go @@ -55,7 +55,8 @@ func setupRoomsFixture(t *testing.T) *roomsFixture { require.NoError(t, err, "build searchengine for subs fixture") esStore := newESStore(engine, testUserRoomIndex) - cache := newValkeyCache(valkeyutil.WrapClusterClient(testutil.StartValkeyCluster(t))) + cache := newValkeyCache(valkeyutil.WrapClusterClient(testutil.SharedValkeyCluster(t))) + t.Cleanup(func() { testutil.FlushValkey(t) }) h := newHandler(esStore, nil, nil, cache, handlerConfig{ DocCounts: 25, MaxDocCounts: 100, diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index 586af07b5..d20a9a82f 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -62,6 +62,7 @@ func TestMain(m *testing.M) { 
prewarms := []func() error{ testutil.EnsureElasticsearch, testutil.EnsureNATS, + testutil.EnsureValkey, testutil.EnsureMongo, } errCh := make(chan error, len(prewarms)) From c6424e33934f3fc038bd8af185dc2cc10c4a37ff Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 02:29:22 +0000 Subject: [PATCH 21/23] chore(history-service): remove local docker-compose folder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit history-service/docker-local was a per-service dev compose stack (NATS + Mongo + Cassandra + the service itself). The canonical local dev stack lives in the repo-root docker-local/ — no need for the per-service duplicate. --- history-service/docker-local/.env.example | 14 -- .../docker-local/docker-compose.yml | 152 ------------------ 2 files changed, 166 deletions(-) delete mode 100644 history-service/docker-local/.env.example delete mode 100644 history-service/docker-local/docker-compose.yml diff --git a/history-service/docker-local/.env.example b/history-service/docker-local/.env.example deleted file mode 100644 index f32b88319..000000000 --- a/history-service/docker-local/.env.example +++ /dev/null @@ -1,14 +0,0 @@ -# Environment variables for running history-service locally against docker-local services. -# Copy to .env and adjust if needed: -# cp .env.example .env -# -# Usage from repo root: -# set -a && source history-service/docker-local/.env && set +a -# go run ./history-service/cmd/ - -NATS_URL=nats://localhost:4222 -MONGO_URI=mongodb://localhost:27017 -MONGO_DB=chat -CASSANDRA_HOSTS=localhost -CASSANDRA_KEYSPACE=chat -SITE_ID=site-local diff --git a/history-service/docker-local/docker-compose.yml b/history-service/docker-local/docker-compose.yml deleted file mode 100644 index a032adf74..000000000 --- a/history-service/docker-local/docker-compose.yml +++ /dev/null @@ -1,152 +0,0 @@ -# Local development environment for history-service. 
-# Starts NATS, MongoDB, Cassandra, initializes schema, and runs the service. -# -# Usage: -# cd history-service/docker-local -# docker compose up -d # start everything -# docker compose logs -f history # follow service logs -# docker compose down # stop all services -# docker compose down -v # stop and remove volumes (clean slate) - - -services: - nats: - image: nats:2.11-alpine - ports: - - "4222:4222" - - "8222:8222" - command: ["--jetstream", "--http_port", "8222"] - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:8222/healthz"] - interval: 5s - timeout: 3s - retries: 5 - - mongodb: - image: mongo:8 - ports: - - "27017:27017" - volumes: - - mongo-data:/data/db - healthcheck: - test: ["CMD", "mongosh", "--eval", "db.adminCommand('ping')"] - interval: 5s - timeout: 3s - retries: 5 - - cassandra: - image: cassandra:5 - ports: - - "9042:9042" - volumes: - - cassandra-data:/var/lib/cassandra - environment: - - CASSANDRA_CLUSTER_NAME=chat-dev - healthcheck: - test: ["CMD", "cqlsh", "-e", "DESCRIBE KEYSPACES"] - interval: 10s - timeout: 5s - retries: 10 - start_period: 30s - - cassandra-init: - image: cassandra:5 - depends_on: - cassandra: - condition: service_healthy - restart: "no" - entrypoint: ["/bin/bash", "-c"] - command: - - | - set -e - cqlsh cassandra <<'CQL' - CREATE KEYSPACE IF NOT EXISTS chat WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}; - CREATE TYPE IF NOT EXISTS chat."Participant" (id TEXT, eng_name TEXT, company_name TEXT, app_id TEXT, app_name TEXT, is_bot BOOLEAN, account TEXT); - CREATE TYPE IF NOT EXISTS chat."File" (id TEXT, name TEXT, type TEXT); - CREATE TYPE IF NOT EXISTS chat."Card" (template TEXT, data BLOB); - CREATE TYPE IF NOT EXISTS chat."CardAction" (verb TEXT, text TEXT, card_id TEXT, display_text TEXT, hide_exec_log BOOLEAN, card_tmid TEXT, data BLOB); - CREATE TYPE IF NOT EXISTS chat."QuotedParentMessage" (message_id TEXT, room_id TEXT, sender FROZEN<"Participant">, created_at 
TIMESTAMP, msg TEXT, mentions SET>, attachments LIST, message_link TEXT, thread_parent_id TEXT, thread_parent_created_at TIMESTAMP); - CREATE TABLE IF NOT EXISTS chat.messages_by_room ( - room_id TEXT, - created_at TIMESTAMP, - message_id TEXT, - sender FROZEN<"Participant">, - target_user FROZEN<"Participant">, - msg TEXT, - mentions SET>, - attachments LIST, - file FROZEN<"File">, - card FROZEN<"Card">, - card_action FROZEN<"CardAction">, - tshow BOOLEAN, - tcount INT, - thread_parent_id TEXT, - thread_parent_created_at TIMESTAMP, - quoted_parent_message FROZEN<"QuotedParentMessage">, - visible_to TEXT, - reactions MAP>>>, - deleted BOOLEAN, - type TEXT, - sys_msg_data BLOB, - site_id TEXT, - edited_at TIMESTAMP, - updated_at TIMESTAMP, - PRIMARY KEY ((room_id), created_at, message_id) - ) WITH CLUSTERING ORDER BY (created_at DESC, message_id DESC); - CREATE TABLE IF NOT EXISTS chat.messages_by_id ( - message_id TEXT, - room_id TEXT, - thread_room_id TEXT, - sender FROZEN<"Participant">, - target_user FROZEN<"Participant">, - msg TEXT, - mentions SET>, - attachments LIST, - file FROZEN<"File">, - card FROZEN<"Card">, - card_action FROZEN<"CardAction">, - tshow BOOLEAN, - tcount INT, - thread_parent_id TEXT, - thread_parent_created_at TIMESTAMP, - quoted_parent_message FROZEN<"QuotedParentMessage">, - visible_to TEXT, - reactions MAP>>>, - deleted BOOLEAN, - type TEXT, - sys_msg_data BLOB, - site_id TEXT, - edited_at TIMESTAMP, - created_at TIMESTAMP, - updated_at TIMESTAMP, - pinned_at TIMESTAMP, - pinned_by FROZEN<"Participant">, - PRIMARY KEY (message_id, created_at) - ) WITH CLUSTERING ORDER BY (created_at DESC); - CQL - echo "Schema initialized successfully" - - history: - build: - context: ../.. 
- dockerfile: history-service/deploy/Dockerfile - depends_on: - nats: - condition: service_healthy - mongodb: - condition: service_healthy - cassandra-init: - condition: service_completed_successfully - environment: - NATS_URL: nats://nats:4222 - SITE_ID: site-local - MONGO_URI: mongodb://mongodb:27017 - MONGO_DB: chat - CASSANDRA_HOSTS: cassandra - CASSANDRA_KEYSPACE: chat - ports: - - "8080:8080" - -volumes: - mongo-data: - cassandra-data: From 7b76258d2a36af323988791ccfa9ce0925799b30 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 02:48:47 +0000 Subject: [PATCH 22/23] test: trim comments per simplify pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Net -120 lines of comments / narration across the PR. Targeted only WHAT-comments and progress narration; preserved load-bearing WHY: - pkg/testutil/{init,terminate,testmain,valkey}.go: docstrings cut to one or two lines each; deleted "intended for ..." sentences that restate function names. Kept the newValkeyClusterClient ClusterSlots-override explanation (load-bearing WHY). - search-service/setup_shared_test.go: removed redundant "pre-warms ..." multi-paragraph TestMain comment; same for uniqueESIndex. - search-service/integration_ccs_test.go: deleted the 14 t.Logf progress lines (failures from require.NoError already pinpoint the phase), trimmed setupCCSFixture / installTemplates / startESForCCS / buildTestTemplate docstrings, removed in-test narration like "Local message in local room" / "Round-trips through the real natsrouter ...". Kept the PROXY-mode rationale (real WHY) and the Clause-A/B1/B2 inline scenario comments (load-bearing for restricted query semantics). - search-service/integration_apps_test.go: dropped the "Prototype pipeline" follow-up note (belongs in a TODO, not a comment). - search-service/integration_rooms_test.go: trimmed roomsFixture and putTestSpotlightIndex docstrings. 
- search-service/integration_users_test.go: removed inline "controls the stub response" comment on a self-explanatory field. - search-sync-worker/integration_test.go: TestMain comment cut to one line, matching search-service. - tools/loadgen/integration_test.go: removed "Allow trailing events to flow" / "Assert canonical stream drained" / "Assert seed data visible" — code is self-explanatory. - pkg/roomcrypto/main_test.go, pkg/roomkeysender/main_test.go: removed 2-line "Import testutil for the Ryuk-disable init() side effect" preamble; the import alone now conveys it. CodeRabbit suggestions to extract a setupRouter helper or a PrewarmFailFast helper deferred — out of scope for a comment-trim pass. --- pkg/roomcrypto/main_test.go | 3 - pkg/roomkeysender/main_test.go | 3 - pkg/testutil/init.go | 7 +- pkg/testutil/terminate.go | 10 +- pkg/testutil/testmain.go | 8 +- pkg/testutil/valkey.go | 35 ++----- search-service/integration_apps_test.go | 2 - search-service/integration_ccs_test.go | 123 +++++------------------ search-service/integration_rooms_test.go | 5 +- search-service/integration_users_test.go | 2 +- search-service/setup_shared_test.go | 18 ++-- search-sync-worker/integration_test.go | 5 +- tools/loadgen/integration_test.go | 3 - 13 files changed, 52 insertions(+), 172 deletions(-) diff --git a/pkg/roomcrypto/main_test.go b/pkg/roomcrypto/main_test.go index ec6f00b95..126394e46 100644 --- a/pkg/roomcrypto/main_test.go +++ b/pkg/roomcrypto/main_test.go @@ -2,9 +2,6 @@ package roomcrypto -// Import testutil for the Ryuk-disable init() side effect even though -// this package starts its containers per-test (t.Cleanup handles teardown). 
- import ( "testing" diff --git a/pkg/roomkeysender/main_test.go b/pkg/roomkeysender/main_test.go index cbf45f72f..598e48bed 100644 --- a/pkg/roomkeysender/main_test.go +++ b/pkg/roomkeysender/main_test.go @@ -2,9 +2,6 @@ package roomkeysender -// Import testutil for the Ryuk-disable init() side effect even though -// this package starts its containers per-test (t.Cleanup handles teardown). - import ( "testing" diff --git a/pkg/testutil/init.go b/pkg/testutil/init.go index ce2fa0af9..30b7ba214 100644 --- a/pkg/testutil/init.go +++ b/pkg/testutil/init.go @@ -4,10 +4,9 @@ package testutil import "os" -// init disables testcontainers-go's Ryuk reaper repo-wide because it -// fails to start on our CI runner. Cleanup is handled by TerminateAll. -// LookupEnv guard lets local debugging flip Ryuk back on without an -// edit: `TESTCONTAINERS_RYUK_DISABLED=false go test ...`. +// Disable testcontainers Ryuk reaper repo-wide; our CI runner can't +// run the sidecar. Cleanup is handled by TerminateAll. Set +// TESTCONTAINERS_RYUK_DISABLED=false to flip back on locally. func init() { if _, set := os.LookupEnv("TESTCONTAINERS_RYUK_DISABLED"); !set { _ = os.Setenv("TESTCONTAINERS_RYUK_DISABLED", "true") diff --git a/pkg/testutil/terminate.go b/pkg/testutil/terminate.go index a0bb6c630..324ae3a34 100644 --- a/pkg/testutil/terminate.go +++ b/pkg/testutil/terminate.go @@ -2,14 +2,8 @@ package testutil -// TerminateAll stops every process-shared container started by this -// package. Each TerminateXxx is a no-op if its container was never -// started, so this is safe from any service's TestMain. Use via -// testutil.RunTests for the standard wrap. -// -// StartValkeyCluster (per-test) is unaffected — those containers are -// cleaned up by their own t.Cleanup hooks. TerminateValkey only stops -// the shared cluster from SharedValkeyCluster, if one was started. +// TerminateAll stops every process-shared container. Each TerminateXxx +// is a no-op if its container was never started. 
func TerminateAll() { TerminateMongo() TerminateCassandra() diff --git a/pkg/testutil/testmain.go b/pkg/testutil/testmain.go index a74b110e3..d42ffc39b 100644 --- a/pkg/testutil/testmain.go +++ b/pkg/testutil/testmain.go @@ -7,12 +7,8 @@ import ( "testing" ) -// RunTests is the canonical TestMain body for any package that uses -// shared testcontainers from this package. It runs the test binary, -// terminates every container started via testutil, and exits with the -// right code. -// -// func TestMain(m *testing.M) { testutil.RunTests(m) } +// RunTests runs m.Run, terminates shared containers, and exits. +// Usage: func TestMain(m *testing.M) { testutil.RunTests(m) } func RunTests(m *testing.M) { code := m.Run() TerminateAll() diff --git a/pkg/testutil/valkey.go b/pkg/testutil/valkey.go index b1c7b7d8a..31c6d9a8d 100644 --- a/pkg/testutil/valkey.go +++ b/pkg/testutil/valkey.go @@ -20,11 +20,8 @@ import ( "github.com/hmchangw/chat/pkg/testutil/testimages" ) -// StartValkeyCluster starts a single-node cluster-mode Valkey container -// and returns a connected *redis.ClusterClient. The container and client -// are terminated/closed via t.Cleanup. Use this when a test needs a -// pristine cluster (e.g. CLUSTER KEYSLOT routing assertions in -// pkg/roomkeystore); use SharedValkeyCluster otherwise. +// StartValkeyCluster boots a per-test cluster-mode Valkey. Use when a +// test asserts on cluster-routing state; otherwise prefer SharedValkeyCluster. func StartValkeyCluster(t *testing.T) *redis.ClusterClient { t.Helper() ctx := context.Background() @@ -36,15 +33,10 @@ func StartValkeyCluster(t *testing.T) *redis.ClusterClient { return c } -// SharedValkeyCluster returns a *redis.ClusterClient connected to a -// process-shared cluster-mode Valkey. The cluster boots once per `go test` -// invocation via sync.Once; TerminateValkey (called from TerminateAll) -// tears it down at process exit. 
Callers must register -// `t.Cleanup(func() { testutil.FlushValkey(t) })` themselves so the next -// test starts with a clean keyspace. -// -// Use this for tests that only care about cache get/set behaviour; use -// StartValkeyCluster if the test asserts on cluster-routing state. +// SharedValkeyCluster returns a *redis.ClusterClient against a +// process-shared cluster-mode Valkey (started via sync.Once, reaped via +// TerminateAll). Callers must register +// `t.Cleanup(func() { testutil.FlushValkey(t) })` for keyspace isolation. func SharedValkeyCluster(t *testing.T) *redis.ClusterClient { t.Helper() ensureSharedValkeyCluster() @@ -54,14 +46,11 @@ func SharedValkeyCluster(t *testing.T) *redis.ClusterClient { return sharedValkeyClient } -// EnsureValkey starts the shared Valkey cluster if not already started. -// No-t variant intended for TestMain pre-warming. +// EnsureValkey is the no-t variant for TestMain pre-warming. func EnsureValkey() error { ensureSharedValkeyCluster(); return sharedValkeyErr } -// FlushValkey runs FLUSHALL across every master in the shared cluster. -// Intended for per-test cleanup so sibling tests start with an empty -// keyspace. Test-fatal on error — leftover state would silently break -// the next test. +// FlushValkey runs FLUSHALL on every master in the shared cluster. +// Test-fatal on error — leftover state would silently break the next test. func FlushValkey(t *testing.T) { t.Helper() if sharedValkeyClient == nil { @@ -77,8 +66,7 @@ func FlushValkey(t *testing.T) { } } -// TerminateValkey closes the shared client and stops the shared -// container. Best-effort, idempotent. +// TerminateValkey closes the shared client/container. Idempotent. func TerminateValkey() { if sharedValkeyClient != nil { _ = sharedValkeyClient.Close() @@ -122,9 +110,6 @@ func ensureSharedValkeyCluster() { }) } -// startValkeyClusterContainer starts a cluster-mode container, assigns -// all 16384 slots to the node, and waits for cluster_state:ok. 
Test-fatal -// on error; for the no-t variant see startValkeyClusterContainerNoT. func startValkeyClusterContainer(ctx context.Context, t *testing.T) (testcontainers.Container, string) { t.Helper() container, addr, err := startValkeyClusterContainerNoT(ctx) diff --git a/search-service/integration_apps_test.go b/search-service/integration_apps_test.go index e018793a3..af9ee875d 100644 --- a/search-service/integration_apps_test.go +++ b/search-service/integration_apps_test.go @@ -71,8 +71,6 @@ func TestIntegration_SearchApps_PrototypePipeline(t *testing.T) { f := setupAppsFixture(t) ctx := context.Background() - // Prototype pipeline matches by `name` regex + $limit; $lookup - // access-guard is a follow-up. _, err := f.mongoDB.Collection("apps").InsertMany(ctx, []any{ map[string]any{"_id": "a1", "name": "Weather Alpha", "assistant": map[string]any{"enabled": true, "name": "weather.bot"}}, map[string]any{"_id": "a2", "name": "Weatherly", "assistant": map[string]any{"enabled": false, "name": "weatherly.bot"}}, diff --git a/search-service/integration_ccs_test.go b/search-service/integration_ccs_test.go index 619d88f39..124e91172 100644 --- a/search-service/integration_ccs_test.go +++ b/search-service/integration_ccs_test.go @@ -2,15 +2,10 @@ package main -// This file owns the cross-cluster-search (CCS) integration tests and -// every helper that only CCS needs. The two CCS tests are the one -// exception to the shared-container pattern in setup_shared_test.go: +// CCS integration tests + helpers only CCS uses. The two CCS tests are +// the exception to the shared-container pattern in setup_shared_test.go: // they need a pair of ES nodes on a shared docker network with -// transport-port aliases (`es-local`, `es-remote`), which doesn't fit -// the process-shared single-node ES. NATS and Valkey are still shared. -// -// Shared utilities used here (seedDoc, testHTTPClient, testUserRoomIndex) -// live in integration_test.go. +// transport-port aliases. 
NATS and Valkey are still shared. import ( "bytes" @@ -41,13 +36,8 @@ import ( // --- Fixture ----------------------------------------------------------------- -// ccsFixture is the full stack for cross-cluster integration tests: two ES -// containers on a shared Docker network (with CCS configured from local → -// remote), plus Valkey and NATS, plus the wired search-service router. -// -// localURL / remoteURL are the host-mapped HTTP URLs for seeding; the -// search-service itself sees only localURL. `clientNATS` is the raw NATS -// client used to issue request/reply calls. +// ccsFixture owns the two-ES + Valkey + NATS stack for CCS tests. +// localURL / remoteURL are host-mapped for seeding; the service sees localURL. type ccsFixture struct { localURL string remoteURL string @@ -56,51 +46,31 @@ type ccsFixture struct { clientNATS *nats.Conn } -// setupCCSFixture stands up the CCS environment. It owns the pair of -// networked ES containers (they need a shared docker network with -// transport-port aliases, so they can't be process-shared like the -// single-node ES used by other fixtures), but piggybacks on the -// process-shared Valkey and NATS from setup_shared_test.go. -// -// Every major step emits a `t.Logf` so a CI failure (where raw logs are -// often opaque on public runs) leaves enough breadcrumbs in the `go test` -// output to pinpoint which phase broke. +// setupCCSFixture owns the pair of networked ES containers (can't be +// process-shared — they need a shared docker network with transport-port +// aliases); piggybacks on shared Valkey/NATS. 
func setupCCSFixture(t *testing.T) *ccsFixture { t.Helper() ctx := context.Background() - t.Logf("CCS fixture: creating docker network") nw, err := network.New(ctx) require.NoError(t, err, "create docker network") t.Cleanup(func() { _ = nw.Remove(ctx) }) - t.Logf("CCS fixture: network %q created", nw.Name) - t.Logf("CCS fixture: starting remote ES container (alias=es-remote)") remoteURL := startESForCCS(t, nw, "es-remote", "remote-cluster") - t.Logf("CCS fixture: remote ES up at %s", remoteURL) - - t.Logf("CCS fixture: starting local ES container (alias=es-local)") localURL := startESForCCS(t, nw, "es-local", "local-cluster") - t.Logf("CCS fixture: local ES up at %s", localURL) - - // Wire local ES to reach the remote in PROXY mode. Proxy mode opens a - // single direct connection to the configured address and skips the - // sniff-then-reconnect dance that sniff mode does — that dance requires - // each remote node to advertise a reachable publish address, which is - // fragile when docker containers bind transport on 0.0.0.0 and the - // publish address defaults to an interface the peer can't route to. - // Proxy mode is the robust choice for CCS over an ephemeral docker - // network. Ref: ES docs "Remote cluster settings" → `mode=proxy`. - t.Logf("CCS fixture: configuring cluster.remote.remote1 (proxy mode → es-remote:9300)") + + // Wire local→remote in PROXY mode. Proxy mode skips sniff-then-reconnect, + // which requires the remote to advertise a reachable publish address — + // fragile when containers bind transport on 0.0.0.0 and publish defaults + // to an unreachable interface. Ref: ES "Remote cluster settings" → `mode=proxy`. 
putClusterSetting(t, localURL, map[string]any{ "persistent": map[string]any{ "cluster.remote.remote1.mode": "proxy", "cluster.remote.remote1.proxy_address": "es-remote:9300", }, }) - t.Logf("CCS fixture: waiting for remote1 to report connected=true (timeout 120s)") waitForRemoteConnected(t, localURL, "remote1", 120*time.Second) - t.Logf("CCS fixture: remote1 connected") localEngine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: localURL}) require.NoError(t, err, "build searchengine for local") @@ -118,7 +88,6 @@ func setupCCSFixture(t *testing.T) *ccsFixture { clientNC, err := nats.Connect(natsURL) require.NoError(t, err, "connect nats (client side)") t.Cleanup(func() { clientNC.Close() }) - t.Logf("CCS fixture: NATS at %s", natsURL) userRoomIndex := testUserRoomIndex store := newESStore(localEngine, userRoomIndex) @@ -148,16 +117,9 @@ func setupCCSFixture(t *testing.T) *ccsFixture { } } -// startESForCCS starts one ES node on the shared network with the given -// network alias so the peer can reach it at `{alias}:9300`. Returns the -// host-mapped HTTP URL for seeding. -// -// `transport.host: 0.0.0.0` is required so the transport port binds on all -// interfaces, including the bridge network (ES 8.x defaults to `_site_` -// which excludes the container's bridge IP in some setups). CCS itself -// uses `proxy` mode to avoid publish-address sensitivity — see -// setupCCSFixture. `xpack.security.enabled=false` matches the local dev -// deps compose. +// startESForCCS starts one ES node on the shared network at alias `{alias}`. +// transport.host=0.0.0.0 is required so the transport port binds on the bridge +// network (ES 8.x defaults to `_site_` which excludes the container bridge IP). 
func startESForCCS(t *testing.T, nw *testcontainers.DockerNetwork, alias, clusterName string) string { t.Helper() ctx := context.Background() @@ -200,10 +162,9 @@ func startESForCCS(t *testing.T, nw *testcontainers.DockerNetwork, alias, cluste // --- Index templates --------------------------------------------------------- -// buildTestTemplate wraps a pattern + property map with single-node-friendly -// index settings (1 shard, 0 replicas, 1s refresh) and `dynamic: false` -// mappings. The templates below hand-roll their property sets so the tests -// remain independent of search-sync-worker's custom-analyzer configuration. +// buildTestTemplate wraps properties with single-node-friendly settings +// (1 shard, 0 replicas) so tests don't depend on search-sync-worker's +// analyzer config. func buildTestTemplate(pattern string, properties map[string]any) json.RawMessage { body := map[string]any{ "index_patterns": []string{pattern}, @@ -263,8 +224,6 @@ func userRoomTestTemplate() json.RawMessage { // --- CCS HTTP helpers -------------------------------------------------------- -// putClusterSetting pushes a /_cluster/settings update. Used to configure -// the CCS remote after both clusters are up. func putClusterSetting(t *testing.T, esURL string, body map[string]any) { t.Helper() data, _ := json.Marshal(body) @@ -314,18 +273,13 @@ func waitForRemoteConnected(t *testing.T, localURL, remoteName string, timeout t func (f *ccsFixture) installTemplates(t *testing.T) { t.Helper() ctx := context.Background() - - t.Logf("templates: upserting messages_template on local") require.NoError(t, f.localES.UpsertTemplate(ctx, "messages_template", messageTestTemplate()), "upsert messages_template on local") - t.Logf("templates: upserting messages_template on remote") require.NoError(t, f.remoteES.UpsertTemplate(ctx, "messages_template", messageTestTemplate()), "upsert messages_template on remote") // user-room is local-only per the search-service architecture. 
- t.Logf("templates: upserting user_room_template on local") require.NoError(t, f.localES.UpsertTemplate(ctx, "user_room_template", userRoomTestTemplate()), "upsert user_room_template on local") - t.Logf("templates: all upserted") } // --- Tests ------------------------------------------------------------------- @@ -340,12 +294,9 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted(t *testing.T f := setupCCSFixture(t) f.installTemplates(t) - // --- Seed -------------------------------------------------------------- - // - // Alice is a member of two unrestricted rooms: one lives on the local - // site, the other on the remote site. The user-room doc (local-only) - // lists BOTH in `rooms[]` — the sync-worker would normally populate - // this via INBOX events; here we seed directly. + // Alice is in two unrestricted rooms (one local, one remote); the + // local user-room doc lists both. Sync-worker normally populates it via + // INBOX events — seeded directly here. const account = "alice" const localRoomID = "room-local-1" const remoteRoomID = "room-remote-1" @@ -367,7 +318,6 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted(t *testing.T "updatedAt": createdAt.Format(time.RFC3339Nano), }) - // Local message in local room. seedDoc(t, f.localURL, monthIdx, "msg-local-1", map[string]any{ "messageId": "msg-local-1", "roomId": localRoomID, @@ -378,9 +328,7 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted(t *testing.T "createdAt": createdAt.Format(time.RFC3339Nano), }) - // Remote message in remote room. Same index pattern (`messages-*`) on - // the remote cluster — CCS resolves the `*:messages-*` segment on the - // local query. + // Same index pattern on the remote cluster — CCS resolves `*:messages-*`. 
seedDoc(t, f.remoteURL, monthIdx, "msg-remote-1", map[string]any{ "messageId": "msg-remote-1", "roomId": remoteRoomID, @@ -391,19 +339,11 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted(t *testing.T "createdAt": createdAt.Format(time.RFC3339Nano), }) - // --- Search via NATS --------------------------------------------------- - // - // Round-trips through the real natsrouter: the handler reads - // restrictedRooms from Valkey (miss → ES prefetch → Valkey SET), then - // builds the CCS query against `messages-*,*:messages-*` and parses - // the merged response. req := model.SearchMessagesRequest{Query: "hello"} reqData, err := json.Marshal(req) require.NoError(t, err) - // Generous timeout: first request is Valkey miss → ES prefetch of - // user-room doc → CCS fanout → response parse. Tight timeouts mask - // real latency bugs in integration. + // Long timeout: first request is Valkey miss → ES prefetch → CCS fanout. msg, err := f.clientNATS.Request(subject.SearchMessages(account), reqData, 30*time.Second) require.NoError(t, err, "NATS request failed") @@ -449,13 +389,8 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) const localRoomID = "room-local-unrestricted" const remoteRoomID = "room-remote-restricted" - // Temporal setup: - // - hss is the user's join-time bound for the restricted remote room. - // - preHSS is 3 hours before hss (so pre-HSS messages are clearly - // older than the gate). - // - postHSS is 1 hour after hss. - // All well within the default 1-year `recent_window` so none of them - // get filtered out by the global createdAt range filter. + // hss is the user's join-time bound for the restricted remote room; + // preHSS / postHSS straddle it. All within the 1-year recent_window. 
now := time.Now().UTC() hss := now.Add(-2 * time.Hour) preHSS := hss.Add(-3 * time.Hour) @@ -463,7 +398,6 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) monthIdxFor := func(ts time.Time) string { return "messages-" + ts.Format("2006-01") } // user-room doc: local room unrestricted, remote room restricted with hss. - t.Logf("seed: upserting user-room doc for %s (restricted %s since %s)", account, remoteRoomID, hss.Format(time.RFC3339)) seedDoc(t, f.localURL, testUserRoomIndex, account, map[string]any{ "userAccount": account, "rooms": []string{localRoomID}, @@ -481,7 +415,6 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) // --- LOCAL unrestricted room ---------------------------------------- // One plain message that should always match via the terms-lookup // branch (no HSS involved). - t.Logf("seed: local unrestricted message in %s", localRoomID) seedDoc(t, f.localURL, monthIdxFor(postHSS), "msg-local-1", map[string]any{ "messageId": "msg-local-1", "roomId": localRoomID, @@ -497,7 +430,6 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) // clauses. Pre-HSS parent lives at `msg-remote-pre-parent`; its // thread replies reference it via threadParentMessageId + // threadParentMessageCreatedAt=preHSS. 
- t.Logf("seed: remote pre-HSS parent (MUST NOT match)") seedDoc(t, f.remoteURL, monthIdxFor(preHSS), "msg-remote-pre-parent", map[string]any{ "messageId": "msg-remote-pre-parent", "roomId": remoteRoomID, @@ -508,7 +440,6 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) "createdAt": preHSS.Format(time.RFC3339Nano), }) - t.Logf("seed: remote post-HSS parent (Clause A match)") seedDoc(t, f.remoteURL, monthIdxFor(postHSS), "msg-remote-post-parent", map[string]any{ "messageId": "msg-remote-post-parent", "roomId": remoteRoomID, @@ -524,7 +455,6 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) // (createdAt >= hss); tshow=true then fires B1 regardless of the // parent's age. If the outer gate weren't there, a pre-HSS tshow=true // reply would leak history the user never had access to. - t.Logf("seed: remote post-HSS reply with tshow=true, pre-HSS parent (Clause B1 match)") seedDoc(t, f.remoteURL, monthIdxFor(postHSS), "msg-remote-reply-tshow", map[string]any{ "messageId": "msg-remote-reply-tshow", "roomId": remoteRoomID, @@ -541,7 +471,6 @@ func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) // Post-HSS reply to a pre-HSS parent, tshow=false → Clause B rejects. // Outer gate passes (reply createdAt >= hss) but the inner OR fails: // tshow=false blocks B1 and the parent's pre-HSS createdAt blocks B2. 
- t.Logf("seed: remote post-HSS reply without tshow, pre-HSS parent (MUST NOT match)") seedDoc(t, f.remoteURL, monthIdxFor(postHSS), "msg-remote-reply-plain", map[string]any{ "messageId": "msg-remote-reply-plain", "roomId": remoteRoomID, diff --git a/search-service/integration_rooms_test.go b/search-service/integration_rooms_test.go index 54db83839..eed946da4 100644 --- a/search-service/integration_rooms_test.go +++ b/search-service/integration_rooms_test.go @@ -26,8 +26,7 @@ import ( "github.com/hmchangw/chat/pkg/valkeyutil" ) -// roomsFixture uses a per-test spotlight index against the shared ES so -// sibling tests can't leak hits into each other. +// Per-test spotlight index against shared ES. type roomsFixture struct { clientNATS *nats.Conn esURL string @@ -76,8 +75,6 @@ func setupRoomsFixture(t *testing.T) *roomsFixture { return &roomsFixture{clientNATS: clientNC, esURL: esURL, spotlightIndex: spotlightIndex} } -// putTestSpotlightIndex creates a minimal spotlight index in ES with the -// fields needed by the subscription search query. func putTestSpotlightIndex(t *testing.T, esURL, index string) { t.Helper() body := map[string]any{ diff --git a/search-service/integration_users_test.go b/search-service/integration_users_test.go index 4bb7296d6..a5ef84749 100644 --- a/search-service/integration_users_test.go +++ b/search-service/integration_users_test.go @@ -26,7 +26,7 @@ import ( type usersFixture struct { clientNATS *nats.Conn - thirdParty *httptest.Server // controls the stub response + thirdParty *httptest.Server } func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixture { diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index d20a9a82f..409a567c7 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -2,10 +2,8 @@ package main -// Per-package shared test infrastructure. 
ES / NATS / Mongo come from -// pkg/testutil as process-shared containers; Valkey is per-test via -// testutil.StartValkeyCluster. CCS tests bring their own ES pair -// (integration_ccs_test.go). +// ES / NATS / Valkey / Mongo come from pkg/testutil. CCS tests bring +// their own ES pair (integration_ccs_test.go). import ( "bytes" @@ -34,10 +32,9 @@ const ( testQueueGroupV2 = "search-service-test-v2" // messages v2 ) -// testHTTPClient bounds ES control-plane calls so a stalled container can't hang the job. +// Bounded HTTP client for ES control-plane calls. var testHTTPClient = &http.Client{Timeout: 10 * time.Second} -// seedDoc PUTs a JSON document into ES, synchronously refreshing the index. func seedDoc(t *testing.T, esURL, index, id string, doc any) { t.Helper() data, err := json.Marshal(doc) @@ -54,9 +51,7 @@ func seedDoc(t *testing.T, esURL, index, id string, doc any) { "seedDoc %s/%s: status=%d body=%s", index, id, resp.StatusCode, body) } -// TestMain pre-warms the shared containers concurrently so the first test -// doesn't pay their startup serially. A pre-warm failure aborts the run -// before m.Run rather than letting every test fail individually. +// TestMain pre-warms shared containers in parallel; fails fast on error. func TestMain(m *testing.M) { var wg sync.WaitGroup prewarms := []func() error{ @@ -85,9 +80,8 @@ func TestMain(m *testing.M) { testutil.RunTests(m) } -// uniqueESIndex returns a per-test ES index name derived from t.Name() and -// registers cleanup that DELETEs the index. The fnv hash keeps the name -// short and free of characters ES dislikes (slashes from subtests). +// uniqueESIndex returns a per-test ES index name (fnv hash keeps it short +// and ES-safe across subtest slashes) and registers DELETE on cleanup. 
func uniqueESIndex(t *testing.T, prefix string) string { t.Helper() esURL := testutil.Elasticsearch(t) diff --git a/search-sync-worker/integration_test.go b/search-sync-worker/integration_test.go index 721ea5fb1..ab6a1b574 100644 --- a/search-sync-worker/integration_test.go +++ b/search-sync-worker/integration_test.go @@ -37,10 +37,7 @@ var ( testNATSOnce sync.Once ) -// TestMain pre-warms ES + NATS in parallel; fails fast if either errors -// (so individual tests don't fail with confusing "couldn't connect" -// messages). Then opens one JetStream client, then terminates the shared -// containers on clean exit via testutil.TerminateAll. +// TestMain pre-warms shared containers in parallel; fails fast on error. func TestMain(m *testing.M) { var wg sync.WaitGroup prewarms := []func() error{ diff --git a/tools/loadgen/integration_test.go b/tools/loadgen/integration_test.go index 101f37dde..2fdf23343 100644 --- a/tools/loadgen/integration_test.go +++ b/tools/loadgen/integration_test.go @@ -122,10 +122,8 @@ func TestLoadgenSmallPreset_EndToEnd(t *testing.T) { defer cancel() require.NoError(t, gen.Run(runCtx)) - // Allow trailing events to flow. time.Sleep(2 * time.Second) - // Assert the canonical stream drained. for _, durable := range []string{"message-worker", "broadcast-worker"} { cons, err := js.Consumer(ctx, canonical.Name, durable) require.NoError(t, err) @@ -134,7 +132,6 @@ func TestLoadgenSmallPreset_EndToEnd(t *testing.T) { require.Equal(t, uint64(0), info.NumPending, "durable %s still has pending", durable) } - // Assert seed data is visible in Mongo. 
var room model.Room err = db.Collection("rooms").FindOne(ctx, bson.M{"_id": fixtures.Rooms[0].ID}).Decode(&room) require.NoError(t, err) From e757c4277c438a6a8f24f30ffe5f59aa92eaf8b2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 02:56:15 +0000 Subject: [PATCH 23/23] test(testutil): extract PrewarmFailFast, RunTestsWithPrewarm, ElasticsearchIndex helpers Three new testutil helpers consolidate patterns the PR was already duplicating across 5+ files: - testutil.PrewarmFailFast(fns...): runs EnsureXxx funcs concurrently, returns the first error. - testutil.RunTestsWithPrewarm(m, fns...): the standard TestMain wrap with pre-warm fail-fast. On prewarm failure, exits 1 after TerminateAll. On success, hands off to RunTests. - testutil.ElasticsearchIndex(t, prefix): per-test ES index name (fnv hash of t.Name()) + DELETE on cleanup. Completes the per-test isolation triad alongside MongoDB / CassandraKeyspace / MinIO. Migrations: - search-service/setup_shared_test.go: TestMain collapses from 30 lines (manual wg+errCh+goroutines) to a single RunTestsWithPrewarm call. Drops the local uniqueESIndex helper. Adds a setupRouter helper that every per-endpoint fixture uses to wire NATS server+client conns + router with the given queue group + Flush + cleanups. - search-service/integration_{apps,users,rooms,messages,ccs}_test.go: each fixture's setup function now ends with `clientNATS := setupRouter(t, queueGroup, h.Register)`. Drops ~9 lines of plumbing per fixture (45 lines saved). - search-sync-worker/integration_test.go: replaces the manual prewarm block with testutil.PrewarmFailFast. Keeps its custom m.Run wrap because it needs to close the lazy-init JetStream conn between m.Run and TerminateAll. CLAUDE.md updated to point at the new helpers. 
--- CLAUDE.md | 4 +- pkg/testutil/elasticsearch.go | 34 +++++++++- pkg/testutil/testmain.go | 35 ++++++++++ search-service/integration_apps_test.go | 29 +------- search-service/integration_ccs_test.go | 26 +------- search-service/integration_messages_test.go | 29 +------- search-service/integration_rooms_test.go | 28 ++------ search-service/integration_users_test.go | 26 +------- search-service/setup_shared_test.go | 74 ++++++++------------- search-sync-worker/integration_test.go | 21 +----- 10 files changed, 114 insertions(+), 192 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index be0e3b50f..892fc58b2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -155,7 +155,7 @@ All commands are wrapped in the root Makefile. Always use `make` targets — nev - `testutil.MongoDB(t, prefix) *mongo.Database` — isolated DB per test - `testutil.CassandraKeyspace(t, prefix) (keyspace, *gocql.Session, host)` — isolated keyspace per test - `testutil.MinIO(t, prefix) (*minio.Client, bucket)` — isolated bucket per test - - `testutil.Elasticsearch(t) string` — shared ES URL; use a per-test unique index name (fnv hash of `t.Name()`) + - `testutil.Elasticsearch(t) string` — shared ES URL; pair with `testutil.ElasticsearchIndex(t, prefix)` for a per-test isolated index (DELETEd on cleanup) - `testutil.NATS(t) string` — shared NATS URL with JetStream enabled - Valkey (cluster-mode — services use this in production): - `testutil.SharedValkeyCluster(t) *redis.ClusterClient` — process-shared cluster (started via `sync.Once`, reaped via `TerminateValkey`/`TerminateAll`). Per-test caller MUST register `t.Cleanup(func() { testutil.FlushValkey(t) })` so sibling tests start with a clean keyspace. Default choice. @@ -172,7 +172,7 @@ All commands are wrapped in the root Makefile. Always use `make` targets — nev func TestMain(m *testing.M) { testutil.RunTests(m) } ``` - `testutil.RunTests` wraps `m.Run()` + `testutil.TerminateAll()` + `os.Exit(code)`. 
For packages that want concurrent pre-warming, wrap manually instead — see `search-service/setup_shared_test.go` for the reference pattern (`EnsureXxx` goroutines + error channel + fail-fast). + `testutil.RunTests` wraps `m.Run()` + `testutil.TerminateAll()` + `os.Exit(code)`. For concurrent pre-warming use `testutil.RunTestsWithPrewarm(m, testutil.EnsureElasticsearch, testutil.EnsureNATS, ...)` — runs each `EnsureXxx` concurrently and fails fast on the first error before `m.Run`. The `testutil.PrewarmFailFast(fns...)` building block is also exposed for packages that need extra cleanup between `m.Run` and `os.Exit`. - **Ryuk is disabled repo-wide** (via `pkg/testutil/init.go`) because our CI runner can't run the reaper sidecar. `testutil.TerminateAll` is the only cleanup mechanism on clean exits. SIGKILL / Ctrl+C will leak containers locally — acceptable trade-off; flip Ryuk back on with `TESTCONTAINERS_RYUK_DISABLED=false go test ...` if debugging a leak. - Per-test isolation is the caller's responsibility: the `MongoDB`/`Cassandra`/`MinIO` helpers already hash `t.Name()`; for ES use a per-test unique index name and DELETE on cleanup; for NATS use a per-test `*nats.Conn` pair with `Drain`/`Shutdown` cleanups; for shared Valkey call `testutil.FlushValkey(t)` in `t.Cleanup` (StartValkeyCluster's per-test mode is automatic). - Inline `testcontainers.GenericContainer` is only acceptable when a shared testutil container can't accommodate the test (e.g. search-service CCS needs two ES nodes on a shared docker network; `pkg/roomkeysender` needs NATS with WebSocket transport; `pkg/roomcrypto` needs a Node container with bundled scripts). Each inline container must store its reference and register `t.Cleanup(container.Terminate)`. 
diff --git a/pkg/testutil/elasticsearch.go b/pkg/testutil/elasticsearch.go index 6eb2add8d..b9f2ab1de 100644 --- a/pkg/testutil/elasticsearch.go +++ b/pkg/testutil/elasticsearch.go @@ -5,6 +5,8 @@ package testutil import ( "context" "fmt" + "hash/fnv" + "net/http" "os" "sync" "testing" @@ -16,6 +18,10 @@ import ( "github.com/hmchangw/chat/pkg/testutil/testimages" ) +// esCleanupHTTPClient is a bounded HTTP client for the index-delete cleanup +// in ElasticsearchIndex. Stalled containers shouldn't hang test exit. +var esCleanupHTTPClient = &http.Client{Timeout: 10 * time.Second} + var ( esOnce sync.Once esContainer testcontainers.Container @@ -77,10 +83,34 @@ func Elasticsearch(t *testing.T) string { return u } -// EnsureElasticsearch starts the shared ES container if not already -// started. No-t variant intended for TestMain pre-warming. +// EnsureElasticsearch is the no-t variant for TestMain pre-warming. func EnsureElasticsearch() error { _, err := ensureElasticsearch(); return err } +// ElasticsearchIndex returns a per-test index name (fnv hash of t.Name() +// keeps it short and ES-safe across subtest slashes) and registers a +// DELETE on cleanup so sibling tests start clean. +func ElasticsearchIndex(t *testing.T, prefix string) string { + t.Helper() + url := Elasticsearch(t) + h := fnv.New64a() + _, _ = h.Write([]byte(t.Name())) + name := fmt.Sprintf("%s-%x", prefix, h.Sum64()) + t.Cleanup(func() { + req, err := http.NewRequest(http.MethodDelete, url+"/"+name, nil) + if err != nil { + t.Logf("delete index %s: build request: %v", name, err) + return + } + resp, err := esCleanupHTTPClient.Do(req) + if err != nil { + t.Logf("delete index %s: %v", name, err) + return + } + _ = resp.Body.Close() + }) + return name +} + // TerminateElasticsearch stops the shared ES container. Best-effort and // idempotent — safe to call from TestMain even if no test touched ES. 
func TerminateElasticsearch() { diff --git a/pkg/testutil/testmain.go b/pkg/testutil/testmain.go index d42ffc39b..8563e6f62 100644 --- a/pkg/testutil/testmain.go +++ b/pkg/testutil/testmain.go @@ -3,7 +3,9 @@ package testutil import ( + "fmt" "os" + "sync" "testing" ) @@ -14,3 +16,36 @@ func RunTests(m *testing.M) { TerminateAll() os.Exit(code) } + +// PrewarmFailFast runs each Ensure* concurrently and returns the first +// error, or nil if all succeed. Intended for use in TestMain before m.Run. +func PrewarmFailFast(fns ...func() error) error { + var wg sync.WaitGroup + errCh := make(chan error, len(fns)) + for _, fn := range fns { + wg.Add(1) + go func(f func() error) { + defer wg.Done() + if err := f(); err != nil { + errCh <- err + } + }(fn) + } + wg.Wait() + close(errCh) + if err, ok := <-errCh; ok { + return err + } + return nil +} + +// RunTestsWithPrewarm pre-warms via PrewarmFailFast, then RunTests. +// On prewarm failure, exits with code 1 after TerminateAll cleanup. +func RunTestsWithPrewarm(m *testing.M, prewarms ...func() error) { + if err := PrewarmFailFast(prewarms...); err != nil { + fmt.Fprintf(os.Stderr, "prewarm shared containers: %v\n", err) + TerminateAll() + os.Exit(1) + } + RunTests(m) +} diff --git a/search-service/integration_apps_test.go b/search-service/integration_apps_test.go index af9ee875d..ac066b975 100644 --- a/search-service/integration_apps_test.go +++ b/search-service/integration_apps_test.go @@ -18,7 +18,6 @@ import ( "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" - "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" ) @@ -30,23 +29,8 @@ type appsFixture struct { func setupAppsFixture(t *testing.T) *appsFixture { t.Helper() - mongoDB := testutil.MongoDB(t, "search_service_test") - - natsURL := testutil.NATS(t) - - serverNATS, err := natsutil.Connect(natsURL, "") - require.NoError(t, err) - t.Cleanup(func() { _ = serverNATS.Drain() 
}) - - clientNATS, err := nats.Connect(natsURL) - require.NoError(t, err) - t.Cleanup(func() { clientNATS.Close() }) - - mongoStore := newMongoStore(mongoDB) - store := &fakeStore{} - cache := newFakeCache() - h := newHandler(store, mongoStore, nil, cache, handlerConfig{ + h := newHandler(&fakeStore{}, newMongoStore(mongoDB), nil, newFakeCache(), handlerConfig{ DocCounts: 25, MaxDocCounts: 100, RestrictedRoomsCacheTTL: 5 * time.Minute, @@ -54,16 +38,7 @@ func setupAppsFixture(t *testing.T) *appsFixture { RequestTimeout: 5 * time.Second, SpotlightReadPattern: "spotlight-*", }) - - router := natsrouter.New(serverNATS, testQueueGroup) - router.Use(natsrouter.RequestID()) - h.Register(router) - // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). - require.NoError(t, serverNATS.NatsConn().Flush()) - t.Cleanup(func() { - _ = router.Shutdown(context.Background()) - }) - + clientNATS := setupRouter(t, testQueueGroup, h.Register) return &appsFixture{clientNATS: clientNATS, mongoDB: mongoDB} } diff --git a/search-service/integration_ccs_test.go b/search-service/integration_ccs_test.go index 124e91172..f38db6860 100644 --- a/search-service/integration_ccs_test.go +++ b/search-service/integration_ccs_test.go @@ -25,8 +25,6 @@ import ( "github.com/testcontainers/testcontainers-go/wait" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsrouter" - "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" @@ -80,33 +78,15 @@ func setupCCSFixture(t *testing.T) *ccsFixture { cacheClient := valkeyutil.WrapClusterClient(testutil.SharedValkeyCluster(t)) t.Cleanup(func() { testutil.FlushValkey(t) }) - natsURL := testutil.NATS(t) - serverNC, err := natsutil.Connect(natsURL, "") - require.NoError(t, err, "connect nats (server side)") - t.Cleanup(func() { _ = serverNC.Drain() }) - - clientNC, err := nats.Connect(natsURL) 
- require.NoError(t, err, "connect nats (client side)") - t.Cleanup(func() { clientNC.Close() }) - - userRoomIndex := testUserRoomIndex - store := newESStore(localEngine, userRoomIndex) - cache := newValkeyCache(cacheClient) - handler := newHandler(store, nil, nil, cache, handlerConfig{ + h := newHandler(newESStore(localEngine, testUserRoomIndex), nil, nil, newValkeyCache(cacheClient), handlerConfig{ DocCounts: 25, MaxDocCounts: 100, RestrictedRoomsCacheTTL: 5 * time.Minute, RecentWindow: 365 * 24 * time.Hour, - UserRoomIndex: userRoomIndex, + UserRoomIndex: testUserRoomIndex, SpotlightReadPattern: "spotlight-test-*", }) - - router := natsrouter.New(serverNC, testQueueGroup) - router.Use(natsrouter.RequestID()) - handler.Register(router) - // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). - require.NoError(t, serverNC.NatsConn().Flush()) - t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) + clientNC := setupRouter(t, testQueueGroup, h.Register) return &ccsFixture{ localURL: localURL, diff --git a/search-service/integration_messages_test.go b/search-service/integration_messages_test.go index 013c42519..66d84a2eb 100644 --- a/search-service/integration_messages_test.go +++ b/search-service/integration_messages_test.go @@ -19,10 +19,8 @@ import ( "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" - "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" - "github.com/hmchangw/chat/pkg/testutil" ) type messagesV2Fixture struct { @@ -31,8 +29,6 @@ type messagesV2Fixture struct { func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { t.Helper() - ctx := context.Background() - esStub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { // Drain the body so the HTTP/1.1 connection stays open. 
_, _ = io.Copy(io.Discard, r.Body) @@ -51,21 +47,9 @@ func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { fakeValkey := newFakeCache() fakeValkey.store["alice"] = map[string]int64{} // empty restricted map, cache hit - natsURL := testutil.NATS(t) - - serverNATS, err := natsutil.Connect(natsURL, "") + engine, err := searchengine.New(context.Background(), searchengine.Config{Backend: "elasticsearch", URL: esStub.URL}) require.NoError(t, err) - t.Cleanup(func() { _ = serverNATS.Drain() }) - - clientNATS, err := nats.Connect(natsURL) - require.NoError(t, err) - t.Cleanup(func() { clientNATS.Close() }) - - engine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: esStub.URL}) - require.NoError(t, err) - esStore := newESStore(engine, testUserRoomIndex) - - h := newHandler(esStore, nil, nil, fakeValkey, handlerConfig{ + h := newHandler(newESStore(engine, testUserRoomIndex), nil, nil, fakeValkey, handlerConfig{ DocCounts: 25, MaxDocCounts: 100, RestrictedRoomsCacheTTL: 5 * time.Minute, @@ -74,14 +58,7 @@ func setupMessagesV2Fixture(t *testing.T) *messagesV2Fixture { UserRoomIndex: testUserRoomIndex, SpotlightReadPattern: "spotlight-*", }) - - router := natsrouter.New(serverNATS, testQueueGroupV2) - router.Use(natsrouter.RequestID()) - h.Register(router) - // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). 
- require.NoError(t, serverNATS.NatsConn().Flush()) - t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) - + clientNATS := setupRouter(t, testQueueGroupV2, h.Register) return &messagesV2Fixture{clientNATS: clientNATS} } diff --git a/search-service/integration_rooms_test.go b/search-service/integration_rooms_test.go index eed946da4..5062d8cee 100644 --- a/search-service/integration_rooms_test.go +++ b/search-service/integration_rooms_test.go @@ -19,7 +19,6 @@ import ( "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" - "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" @@ -35,28 +34,16 @@ type roomsFixture struct { func setupRoomsFixture(t *testing.T) *roomsFixture { t.Helper() - ctx := context.Background() - esURL := testutil.Elasticsearch(t) - spotlightIndex := uniqueESIndex(t, "spotlight") + spotlightIndex := testutil.ElasticsearchIndex(t, "spotlight") putTestSpotlightIndex(t, esURL, spotlightIndex) - natsURL := testutil.NATS(t) - serverNC, err := natsutil.Connect(natsURL, "") - require.NoError(t, err, "connect nats (server side)") - t.Cleanup(func() { _ = serverNC.Drain() }) - - clientNC, err := nats.Connect(natsURL) - require.NoError(t, err, "connect nats (client side)") - t.Cleanup(func() { clientNC.Close() }) - - engine, err := searchengine.New(ctx, searchengine.Config{Backend: "elasticsearch", URL: esURL}) + engine, err := searchengine.New(context.Background(), searchengine.Config{Backend: "elasticsearch", URL: esURL}) require.NoError(t, err, "build searchengine for subs fixture") - esStore := newESStore(engine, testUserRoomIndex) cache := newValkeyCache(valkeyutil.WrapClusterClient(testutil.SharedValkeyCluster(t))) t.Cleanup(func() { testutil.FlushValkey(t) }) - h := newHandler(esStore, nil, nil, cache, handlerConfig{ + h := newHandler(newESStore(engine, testUserRoomIndex), nil, nil, cache, handlerConfig{ 
DocCounts: 25, MaxDocCounts: 100, RestrictedRoomsCacheTTL: 5 * time.Minute, @@ -64,14 +51,7 @@ func setupRoomsFixture(t *testing.T) *roomsFixture { RequestTimeout: 5 * time.Second, SpotlightReadPattern: spotlightIndex, }) - - router := natsrouter.New(serverNC, testQueueGroupSubs) - router.Use(natsrouter.RequestID()) - h.Register(router) - // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). - require.NoError(t, serverNC.NatsConn().Flush()) - t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) - + clientNC := setupRouter(t, testQueueGroupSubs, h.Register) return &roomsFixture{clientNATS: clientNC, esURL: esURL, spotlightIndex: spotlightIndex} } diff --git a/search-service/integration_users_test.go b/search-service/integration_users_test.go index a5ef84749..2b26c24de 100644 --- a/search-service/integration_users_test.go +++ b/search-service/integration_users_test.go @@ -5,7 +5,6 @@ package main // Integration tests for search.users (NATS + httptest stub for HR endpoint). 
import ( - "context" "encoding/json" "net/http" "net/http/httptest" @@ -18,10 +17,8 @@ import ( "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" - "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/restyutil" "github.com/hmchangw/chat/pkg/subject" - "github.com/hmchangw/chat/pkg/testutil" ) type usersFixture struct { @@ -31,35 +28,16 @@ type usersFixture struct { func setupUsersFixture(t *testing.T, thirdPartyHandler http.Handler) *usersFixture { t.Helper() - stub := httptest.NewServer(thirdPartyHandler) t.Cleanup(stub.Close) - natsURL := testutil.NATS(t) - serverNC, err := natsutil.Connect(natsURL, "") - require.NoError(t, err, "connect nats (server side)") - t.Cleanup(func() { _ = serverNC.Drain() }) - - clientNC, err := nats.Connect(natsURL) - require.NoError(t, err, "connect nats (client side)") - t.Cleanup(func() { clientNC.Close() }) - usersRC := restyutil.New(stub.URL, restyutil.WithTimeout(5*time.Second)) - usersClient := newHTTPUsersClient(usersRC, "") - - h := newHandler(nil, nil, usersClient, newFakeCache(), handlerConfig{ + h := newHandler(nil, nil, newHTTPUsersClient(usersRC, ""), newFakeCache(), handlerConfig{ DocCounts: 25, MaxDocCounts: 100, RequestTimeout: 5 * time.Second, }) - - router := natsrouter.New(serverNC, testQueueGroup) - router.Use(natsrouter.RequestID()) - h.Register(router) - // Flush so subscriptions reach the server before tests send requests (otelnats wraps the conn). 
- require.NoError(t, serverNC.NatsConn().Flush()) - t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) - + clientNC := setupRouter(t, testQueueGroup, h.Register) return &usersFixture{clientNATS: clientNC, thirdParty: stub} } diff --git a/search-service/setup_shared_test.go b/search-service/setup_shared_test.go index 409a567c7..2a86d34dc 100644 --- a/search-service/setup_shared_test.go +++ b/search-service/setup_shared_test.go @@ -7,18 +7,19 @@ package main import ( "bytes" + "context" "encoding/json" "fmt" - "hash/fnv" "io" "net/http" - "os" - "sync" "testing" "time" + "github.com/nats-io/nats.go" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/testutil" ) @@ -51,55 +52,36 @@ func seedDoc(t *testing.T, esURL, index, id string, doc any) { "seedDoc %s/%s: status=%d body=%s", index, id, resp.StatusCode, body) } -// TestMain pre-warms shared containers in parallel; fails fast on error. func TestMain(m *testing.M) { - var wg sync.WaitGroup - prewarms := []func() error{ + testutil.RunTestsWithPrewarm(m, testutil.EnsureElasticsearch, testutil.EnsureNATS, testutil.EnsureValkey, testutil.EnsureMongo, - } - errCh := make(chan error, len(prewarms)) - for _, fn := range prewarms { - wg.Add(1) - go func(f func() error) { - defer wg.Done() - if err := f(); err != nil { - errCh <- err - } - }(fn) - } - wg.Wait() - close(errCh) - if err, ok := <-errCh; ok { - fmt.Fprintf(os.Stderr, "prewarm shared containers: %v\n", err) - testutil.TerminateAll() - os.Exit(1) - } - testutil.RunTests(m) + ) } -// uniqueESIndex returns a per-test ES index name (fnv hash keeps it short -// and ES-safe across subtest slashes) and registers DELETE on cleanup. 
-func uniqueESIndex(t *testing.T, prefix string) string { +// setupRouter wires the NATS plumbing shared by every search-service +// fixture: server+client conns against the shared NATS, a router with the +// given queue group, RequestID middleware, register, flush, and cleanups. +// The Flush is required because otelnats wraps the conn — subscriptions +// don't reach the server otherwise before tests publish. +func setupRouter(t *testing.T, queueGroup string, register func(*natsrouter.Router)) *nats.Conn { t.Helper() - esURL := testutil.Elasticsearch(t) - h := fnv.New64a() - _, _ = h.Write([]byte(t.Name())) - name := fmt.Sprintf("%s-%x", prefix, h.Sum64()) - t.Cleanup(func() { - req, err := http.NewRequest(http.MethodDelete, esURL+"/"+name, nil) - if err != nil { - t.Logf("delete index %s: build request: %v", name, err) - return - } - resp, err := testHTTPClient.Do(req) - if err != nil { - t.Logf("delete index %s: %v", name, err) - return - } - _ = resp.Body.Close() - }) - return name + natsURL := testutil.NATS(t) + serverNC, err := natsutil.Connect(natsURL, "") + require.NoError(t, err, "connect nats (server side)") + t.Cleanup(func() { _ = serverNC.Drain() }) + + clientNC, err := nats.Connect(natsURL) + require.NoError(t, err, "connect nats (client side)") + t.Cleanup(func() { clientNC.Close() }) + + router := natsrouter.New(serverNC, queueGroup) + router.Use(natsrouter.RequestID()) + register(router) + require.NoError(t, serverNC.NatsConn().Flush()) + t.Cleanup(func() { _ = router.Shutdown(context.Background()) }) + + return clientNC } diff --git a/search-sync-worker/integration_test.go b/search-sync-worker/integration_test.go index ab6a1b574..ece2a40ee 100644 --- a/search-sync-worker/integration_test.go +++ b/search-sync-worker/integration_test.go @@ -38,25 +38,10 @@ var ( ) // TestMain pre-warms shared containers in parallel; fails fast on error. 
+// Custom wrap (not testutil.RunTestsWithPrewarm) so we can close the +// lazy-init JetStream conn between m.Run and TerminateAll. func TestMain(m *testing.M) { - var wg sync.WaitGroup - prewarms := []func() error{ - testutil.EnsureElasticsearch, - testutil.EnsureNATS, - } - errCh := make(chan error, len(prewarms)) - for _, fn := range prewarms { - wg.Add(1) - go func(f func() error) { - defer wg.Done() - if err := f(); err != nil { - errCh <- err - } - }(fn) - } - wg.Wait() - close(errCh) - if err, ok := <-errCh; ok { + if err := testutil.PrewarmFailFast(testutil.EnsureElasticsearch, testutil.EnsureNATS); err != nil { fmt.Fprintf(os.Stderr, "prewarm shared containers: %v\n", err) testutil.TerminateAll() os.Exit(1)