From 7c4fbb278604f96bd16fd73369d42a3c1c290c57 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 06:33:59 +0000 Subject: [PATCH 1/6] feat(pkg): add search-service primitives Shared primitives consumed by search-service and any future search client: - pkg/model: SearchMessagesRequest/Response, SearchRoomsRequest/Response + round-trip tests via the existing generic roundTrip helper. - pkg/subject: SearchMessages(account), SearchRooms(account) builders plus SearchMessagesPattern/SearchRoomsPattern for natsrouter. - pkg/searchengine: adds Search(indices, body) and GetDoc(index, id) on the SearchEngine interface. Search wires ignore_unavailable=true and allow_no_indices=true so CCS wildcards (*:messages-*) degrade to empty hits when no remote clusters are configured, keeping query bodies identical across single-site and multi-site deployments. - pkg/valkeyutil: Connect/Disconnect and typed GetJSON/SetJSONWithTTL helpers modeled on pkg/mongoutil. Connect closes the half-constructed client on ping failure so repeated connect attempts don't leak internal pool state. --- pkg/model/model_test.go | 115 ++++++++++++++++++++++++++ pkg/model/search.go | 63 ++++++++++++++ pkg/searchengine/adapter.go | 90 +++++++++++++++++++- pkg/searchengine/adapter_test.go | 88 ++++++++++++++++++++ pkg/searchengine/searchengine.go | 14 ++++ pkg/subject/subject.go | 23 ++++++ pkg/subject/subject_test.go | 8 ++ pkg/valkeyutil/valkey.go | 128 ++++++++++++++++++++++++++++ pkg/valkeyutil/valkey_test.go | 138 +++++++++++++++++++++++++++++++ 9 files changed, 664 insertions(+), 3 deletions(-) create mode 100644 pkg/model/search.go create mode 100644 pkg/valkeyutil/valkey.go create mode 100644 pkg/valkeyutil/valkey_test.go diff --git a/pkg/model/model_test.go b/pkg/model/model_test.go index ffe7e81ab..58f161aba 100644 --- a/pkg/model/model_test.go +++ b/pkg/model/model_test.go @@ -1304,6 +1304,121 @@ func TestRoomsInfoBatchResponseJSON(t *testing.T) { } } +func TestSearchMessagesRequestJSON(t *testing.T) { + t.Run("full", func(t *testing.T) { + req := model.SearchMessagesRequest{ + SearchText: "hello", + RoomIds: []string{"r1", "r2"}, + Size: 50, + Offset: 25, + } + data, err := json.Marshal(&req) + require.NoError(t, err) + var dst model.SearchMessagesRequest + require.NoError(t, json.Unmarshal(data, &dst)) + assert.True(t, reflect.DeepEqual(req, dst)) + }) + + t.Run("global (roomIds omitted when nil)", func(t *testing.T) { + req := model.SearchMessagesRequest{SearchText: "hello"} + data, err := json.Marshal(&req) + require.NoError(t, err) + var raw map[string]any + require.NoError(t, json.Unmarshal(data, &raw)) + _, present := raw["roomIds"] + assert.False(t, present, "roomIds must be omitted when nil") + }) +} + +func TestSearchMessagesResponseJSON(t *testing.T) { + created := time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC) + parent := time.Date(2026, 3, 30, 12, 0, 0, 0, time.UTC) + resp := model.SearchMessagesResponse{ + Total: 3, + Results: []model.MessageSearchHit{ + { + MessageID: "m1", + RoomID: "r1", + SiteID: "site-a", + UserID: "u1", + UserAccount: "alice", + Content: "hello", + CreatedAt: created, + ThreadParentMessageID: "p1", + ThreadParentCreatedAt: &parent, + }, + { + MessageID: "m2", + RoomID: "r2", + SiteID: "site-b", + UserID: "u2", + UserAccount: "bob", + Content: "world", + CreatedAt: created, + }, + }, + } + data, err := json.Marshal(&resp) + require.NoError(t, err) + var dst model.SearchMessagesResponse + require.NoError(t, json.Unmarshal(data, &dst)) + assert.True(t, reflect.DeepEqual(resp, dst)) +} + +func TestMessageSearchHitThreadFieldsOmitted(t *testing.T) { + hit := model.MessageSearchHit{ + MessageID: "m1", RoomID: "r1", SiteID: "site-a", + UserID: "u1", UserAccount: "alice", Content: "hi", + CreatedAt: time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC), + } + data, err := json.Marshal(&hit) + require.NoError(t, err) + var raw map[string]any + require.NoError(t, json.Unmarshal(data, &raw)) + _, hasPid := raw["threadParentMessageId"] + _, hasPts := raw["threadParentMessageCreatedAt"] + assert.False(t, hasPid, "threadParentMessageId must be omitted when empty") + assert.False(t, hasPts, "threadParentMessageCreatedAt must be omitted when nil") +} + +func TestSearchRoomsRequestJSON(t *testing.T) { + t.Run("full", func(t *testing.T) { + req := model.SearchRoomsRequest{ + SearchText: "general", + Scope: "channel", + Size: 25, + Offset: 0, + } + roundTrip(t, &req, &model.SearchRoomsRequest{}) + }) + + t.Run("scope omitted when empty", func(t *testing.T) { + req := model.SearchRoomsRequest{SearchText: "x"} + data, err := json.Marshal(&req) + require.NoError(t, err) + var raw map[string]any + require.NoError(t, json.Unmarshal(data, &raw)) + _, present := raw["scope"] + assert.False(t, present, "scope must be omitted when empty") + }) +} + +func TestSearchRoomsResponseJSON(t *testing.T) { + joined := time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC) + resp := model.SearchRoomsResponse{ + Total: 2, + Results: []model.RoomSearchHit{ + {RoomID: "r1", RoomName: "general", RoomType: "p", UserAccount: "alice", SiteID: "site-a", JoinedAt: joined}, + {RoomID: "r2", RoomName: "alice-bob", RoomType: "d", UserAccount: "alice", SiteID: "site-a", JoinedAt: joined}, + }, + } + data, err := json.Marshal(&resp) + require.NoError(t, err) + var dst model.SearchRoomsResponse + require.NoError(t, json.Unmarshal(data, &dst)) + assert.True(t, reflect.DeepEqual(resp, dst)) +} + // roundTrip marshals src to JSON, unmarshals into dst, and compares. func roundTrip[T any](t *testing.T, src *T, dst *T) { t.Helper() diff --git a/pkg/model/search.go b/pkg/model/search.go new file mode 100644 index 000000000..fb3a7bb8e --- /dev/null +++ b/pkg/model/search.go @@ -0,0 +1,63 @@ +package model + +import "time" + +// SearchMessagesRequest is the NATS payload for `chat.user.{account}.request.search.messages`. +// +// RoomIds, when empty, means global search across all rooms the user has +// access to. When set, the search is scoped to the listed rooms; the service +// enforces access using the per-user restricted-rooms map. +type SearchMessagesRequest struct { + SearchText string `json:"searchText"` + RoomIds []string `json:"roomIds,omitempty"` + Size int `json:"size,omitempty"` + Offset int `json:"offset,omitempty"` +} + +// SearchMessagesResponse is the NATS reply for `search.messages`. +type SearchMessagesResponse struct { + Total int64 `json:"total"` + Results []MessageSearchHit `json:"results"` +} + +// MessageSearchHit is a single message result in SearchMessagesResponse. +// Fields mirror the `messages-*` ES index document written by search-sync-worker. +type MessageSearchHit struct { + MessageID string `json:"messageId"` + RoomID string `json:"roomId"` + SiteID string `json:"siteId"` + UserID string `json:"userId"` + UserAccount string `json:"userAccount"` + Content string `json:"content"` + CreatedAt time.Time `json:"createdAt"` + ThreadParentMessageID string `json:"threadParentMessageId,omitempty"` + ThreadParentCreatedAt *time.Time `json:"threadParentMessageCreatedAt,omitempty"` +} + +// SearchRoomsRequest is the NATS payload for `chat.user.{account}.request.search.rooms`. +// +// Scope values: "all" (default), "channel" (roomType=p), "dm" (roomType=d). +// "app" is reserved and currently rejected as unsupported in MVP. +type SearchRoomsRequest struct { + SearchText string `json:"searchText"` + Scope string `json:"scope,omitempty"` + Size int `json:"size,omitempty"` + Offset int `json:"offset,omitempty"` +} + +// SearchRoomsResponse is the NATS reply for `search.rooms`. +type SearchRoomsResponse struct { + Total int64 `json:"total"` + Results []RoomSearchHit `json:"results"` +} + +// RoomSearchHit is a single room result in SearchRoomsResponse. +// Fields mirror the `spotlight` ES index document written by search-sync-worker. +type RoomSearchHit struct { + RoomID string `json:"roomId"` + RoomName string `json:"roomName"` + RoomType string `json:"roomType"` + UserAccount string `json:"userAccount"` + SiteID string `json:"siteId"` + JoinedAt time.Time `json:"joinedAt"` +} diff --git a/pkg/searchengine/adapter.go b/pkg/searchengine/adapter.go index 8bb9592fe..f8b2bf220 100644 --- a/pkg/searchengine/adapter.go +++ b/pkg/searchengine/adapter.go @@ -7,6 +7,8 @@ import ( "fmt" "io" "net/http" + "net/url" + "strings" ) type bulkActionMeta struct { @@ -102,7 +104,10 @@ func (a *httpAdapter) Bulk(ctx context.Context, actions []BulkAction) ([]BulkRes defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(resp.Body) + body, readErr := io.ReadAll(resp.Body) + if readErr != nil { + return nil, fmt.Errorf("bulk: status %d, read body: %w", resp.StatusCode, readErr) + } return nil, fmt.Errorf("bulk: status %d, body: %s", resp.StatusCode, body) } @@ -138,7 +143,10 @@ func (a *httpAdapter) UpsertTemplate(ctx context.Context, name string, body json defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - respBody, _ := io.ReadAll(resp.Body) + respBody, readErr := io.ReadAll(resp.Body) + if readErr != nil { + return fmt.Errorf("upsert template: status %d, read body: %w", resp.StatusCode, readErr) + } return fmt.Errorf("upsert template: status %d, body: %s", resp.StatusCode, respBody) } return nil @@ -161,7 +169,10 @@ func (a *httpAdapter) GetIndexMapping(ctx context.Context, index string) (json.R } if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(resp.Body) + body, readErr := io.ReadAll(resp.Body) + if readErr != nil { + return nil, fmt.Errorf("get index mapping: status %d, read body: %w", resp.StatusCode, readErr) + } return nil, fmt.Errorf("get index mapping: status %d, body: %s", resp.StatusCode, body) } @@ -171,3 +182,76 @@ func (a *httpAdapter) GetIndexMapping(ctx context.Context, index string) (json.R } return body, nil } + +// Search executes a `_search` against the comma-joined indices with +// `ignore_unavailable=true&allow_no_indices=true` so unknown remote clusters +// and missing local indices return empty hits rather than a 404/503. +func (a *httpAdapter) Search(ctx context.Context, indices []string, body json.RawMessage) (json.RawMessage, error) { + if len(indices) == 0 { + return nil, fmt.Errorf("search: indices required") + } + path := fmt.Sprintf("/%s/_search?ignore_unavailable=true&allow_no_indices=true", strings.Join(indices, ",")) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, path, bytes.NewReader(body)) + if err != nil { + return nil, fmt.Errorf("create search request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := a.transport.Perform(req) + if err != nil { + return nil, fmt.Errorf("search request: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + respBody, readErr := io.ReadAll(resp.Body) + if readErr != nil { + return nil, fmt.Errorf("search: status %d, read body: %w", resp.StatusCode, readErr) + } + return nil, fmt.Errorf("search: status %d, body: %s", resp.StatusCode, respBody) + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read search response: %w", err) + } + return data, nil +} + +// GetDoc fetches a single document by id. Returns (nil, false, nil) on 404. +// +// Both `index` and `docID` are URL-path-escaped defensively — ES doc IDs +// can legally contain `/`, `?`, `#`, or whitespace, and an un-escaped +// value there would malform the request path. The caller is expected to +// pass a single index name (not a comma-joined pattern) for this endpoint. +func (a *httpAdapter) GetDoc(ctx context.Context, index, docID string) (json.RawMessage, bool, error) { + path := fmt.Sprintf("/%s/_doc/%s", url.PathEscape(index), url.PathEscape(docID)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, path, nil) + if err != nil { + return nil, false, fmt.Errorf("create get-doc request: %w", err) + } + + resp, err := a.transport.Perform(req) + if err != nil { + return nil, false, fmt.Errorf("get-doc request: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusNotFound { + return nil, false, nil + } + + if resp.StatusCode != http.StatusOK { + respBody, readErr := io.ReadAll(resp.Body) + if readErr != nil { + return nil, false, fmt.Errorf("get-doc: status %d, read body: %w", resp.StatusCode, readErr) + } + return nil, false, fmt.Errorf("get-doc: status %d, body: %s", resp.StatusCode, respBody) + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + return nil, false, fmt.Errorf("read get-doc response: %w", err) + } + return data, true, nil +} diff --git a/pkg/searchengine/adapter_test.go b/pkg/searchengine/adapter_test.go index 87ac9bbf4..7c9dde4ac 100644 --- a/pkg/searchengine/adapter_test.go +++ b/pkg/searchengine/adapter_test.go @@ -206,6 +206,94 @@ func TestAdapter_UpsertTemplate(t *testing.T) { }) } +func TestAdapter_Search(t *testing.T) { + t.Run("single index", func(t *testing.T) { + var capturedPath, capturedMethod, capturedBody string + ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { + capturedMethod = req.Method + capturedPath = req.URL.Path + capturedBody, _ = func() (string, error) { + b, err := io.ReadAll(req.Body) + return string(b), err + }() + assert.Equal(t, "application/json", req.Header.Get("Content-Type")) + assert.Equal(t, "true", req.URL.Query().Get("ignore_unavailable")) + assert.Equal(t, "true", req.URL.Query().Get("allow_no_indices")) + return jsonResponse(200, `{"hits":{"total":{"value":0},"hits":[]}}`), nil + }} + a := newAdapter(ft) + body := json.RawMessage(`{"query":{"match_all":{}}}`) + raw, err := a.Search(context.Background(), []string{"spotlight"}, body) + require.NoError(t, err) + assert.Equal(t, http.MethodPost, capturedMethod) + assert.Equal(t, "/spotlight/_search", capturedPath) + assert.JSONEq(t, string(body), capturedBody) + assert.Contains(t, string(raw), `"hits"`) + }) + + t.Run("multiple indices joined with comma", func(t *testing.T) { + var capturedPath string + ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { + capturedPath = req.URL.Path + return jsonResponse(200, `{"hits":{"total":{"value":0},"hits":[]}}`), nil + }} + a := newAdapter(ft) + _, err := a.Search(context.Background(), []string{"messages-*", "*:messages-*"}, json.RawMessage(`{}`)) + require.NoError(t, err) + assert.Equal(t, "/messages-*,*:messages-*/_search", capturedPath) + }) + + t.Run("empty indices returns error", func(t *testing.T) { + a := newAdapter(&fakeTransport{}) + _, err := a.Search(context.Background(), nil, json.RawMessage(`{}`)) + assert.Error(t, err) + }) + + t.Run("ES error status", func(t *testing.T) { + ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { + return jsonResponse(500, `{"error":"boom"}`), nil + }} + a := newAdapter(ft) + _, err := a.Search(context.Background(), []string{"spotlight"}, json.RawMessage(`{}`)) + assert.Error(t, err) + }) +} + +func TestAdapter_GetDoc(t *testing.T) { + t.Run("found", func(t *testing.T) { + ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { + assert.Equal(t, http.MethodGet, req.Method) + assert.Equal(t, "/user-room/_doc/alice", req.URL.Path) + return jsonResponse(200, `{"_index":"user-room","_id":"alice","found":true,"_source":{"userAccount":"alice","rooms":["r1"]}}`), nil + }} + a := newAdapter(ft) + raw, found, err := a.GetDoc(context.Background(), "user-room", "alice") + require.NoError(t, err) + require.True(t, found) + assert.Contains(t, string(raw), `"userAccount":"alice"`) + }) + + t.Run("not found via 404", func(t *testing.T) { + ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { + return jsonResponse(404, `{"_index":"user-room","_id":"alice","found":false}`), nil + }} + a := newAdapter(ft) + raw, found, err := a.GetDoc(context.Background(), "user-room", "alice") + require.NoError(t, err) + assert.False(t, found) + assert.Nil(t, raw) + }) + + t.Run("server error", func(t *testing.T) { + ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { + return jsonResponse(503, `{}`), nil + }} + a := newAdapter(ft) + _, _, err := a.GetDoc(context.Background(), "user-room", "alice") + assert.Error(t, err) + }) +} + func TestAdapter_GetIndexMapping(t *testing.T) { t.Run("index exists", func(t *testing.T) { ft := &fakeTransport{handler: func(req *http.Request) (*http.Response, error) { diff --git a/pkg/searchengine/searchengine.go b/pkg/searchengine/searchengine.go index 1c8553d0e..40215bd4f 100644 --- a/pkg/searchengine/searchengine.go +++ b/pkg/searchengine/searchengine.go @@ -58,4 +58,18 @@ type SearchEngine interface { Bulk(ctx context.Context, actions []BulkAction) ([]BulkResult, error) UpsertTemplate(ctx context.Context, name string, body json.RawMessage) error GetIndexMapping(ctx context.Context, index string) (json.RawMessage, error) + + // Search executes a `_search` against the comma-joined list of indices + // and returns the raw response body. `ignore_unavailable=true` and + // `allow_no_indices=true` are applied automatically so unknown remote + // clusters and missing local indices degrade to an empty result set + // rather than an error. Callers that need CCS pass patterns like + // []string{"messages-*", "*:messages-*"}. + Search(ctx context.Context, indices []string, body json.RawMessage) (json.RawMessage, error) + + // GetDoc fetches a single document by index and id. Returns found=false + // and a nil body on 404 (doc or index missing); the two cases are + // indistinguishable to the caller and are treated uniformly as + // "no document" — same convention as GetIndexMapping. + GetDoc(ctx context.Context, index, docID string) (json.RawMessage, bool, error) } diff --git a/pkg/subject/subject.go b/pkg/subject/subject.go index 2e875d0d9..5c5d1a254 100644 --- a/pkg/subject/subject.go +++ b/pkg/subject/subject.go @@ -288,3 +288,26 @@ func MemberAddWildcard(siteID string) string { func RoomMemberEvent(roomID string) string { return fmt.Sprintf("chat.room.%s.event.member", roomID) } + +// --- search-service request/reply builders --- + +// SearchMessages builds the concrete subject for a message search request. +func SearchMessages(account string) string { + return fmt.Sprintf("chat.user.%s.request.search.messages", account) +} + +// SearchRooms builds the concrete subject for a room search request. +func SearchRooms(account string) string { + return fmt.Sprintf("chat.user.%s.request.search.rooms", account) +} + +// SearchMessagesPattern is the natsrouter pattern for message search, used +// during registration to extract `{account}` from incoming subjects. +func SearchMessagesPattern() string { + return "chat.user.{account}.request.search.messages" +} + +// SearchRoomsPattern is the natsrouter pattern for room search. +func SearchRoomsPattern() string { + return "chat.user.{account}.request.search.rooms" +} diff --git a/pkg/subject/subject_test.go b/pkg/subject/subject_test.go index d0451bb57..2aa092198 100644 --- a/pkg/subject/subject_test.go +++ b/pkg/subject/subject_test.go @@ -78,6 +78,14 @@ func TestSubjectBuilders(t *testing.T) { "chat.user.alice.request.orgs.sect-eng.members"}, {"OrgMembersWildcard", subject.OrgMembersWildcard(), "chat.user.*.request.orgs.*.members"}, + {"SearchMessages", subject.SearchMessages("alice"), + "chat.user.alice.request.search.messages"}, + {"SearchRooms", subject.SearchRooms("alice"), + "chat.user.alice.request.search.rooms"}, + {"SearchMessagesPattern", subject.SearchMessagesPattern(), + "chat.user.{account}.request.search.messages"}, + {"SearchRoomsPattern", subject.SearchRoomsPattern(), + "chat.user.{account}.request.search.rooms"}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/pkg/valkeyutil/valkey.go b/pkg/valkeyutil/valkey.go new file mode 100644 index 000000000..0830b3039 --- /dev/null +++ b/pkg/valkeyutil/valkey.go @@ -0,0 +1,128 @@ +// Package valkeyutil provides thin connection + JSON helpers around the +// Valkey (Redis-compatible) client. Modeled on pkg/mongoutil so services +// get a one-call Connect + Disconnect pair plus typed get/set helpers for +// the common JSON-over-Valkey pattern. +// +// The underlying client is go-redis/v9 — Valkey is wire-compatible with +// Redis so no Valkey-specific driver is needed. +package valkeyutil + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "time" + + "github.com/redis/go-redis/v9" +) + +// Client is the interface exposed by Connect. Tests can substitute their +// own implementation without depending on go-redis directly. +type Client interface { + Get(ctx context.Context, key string) (string, error) + Set(ctx context.Context, key, value string, ttl time.Duration) error + Del(ctx context.Context, keys ...string) error + Close() error +} + +// ErrCacheMiss is returned by Get and GetJSON when the key does not exist. +var ErrCacheMiss = errors.New("valkey: cache miss") + +type redisClient struct { + c *redis.Client +} + +// Connect dials Valkey/Redis, verifies connectivity with PING, and returns +// a Client. Follows the same `fmt.Errorf("… connect: %w", err)` wrapping +// style used by pkg/mongoutil so upstream logs are consistent. +func Connect(ctx context.Context, addr, password string) (Client, error) { + c := redis.NewClient(&redis.Options{ + Addr: addr, + Password: password, + }) + pingCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + if err := c.Ping(pingCtx).Err(); err != nil { + // Close the half-constructed client on the ping-failure path so + // repeated connect failures (e.g. startup retry loops against + // an unreachable addr) don't leak internal go-redis pool state. + if closeErr := c.Close(); closeErr != nil { + slog.Warn("valkey close after failed connect", "error", closeErr) + } + return nil, fmt.Errorf("valkey connect: %w", err) + } + slog.Info("connected to Valkey", "addr", addr) + return &redisClient{c: c}, nil +} + +// Disconnect closes the client and logs any failure at ERROR. +func Disconnect(client Client) { + if client == nil { + return + } + if err := client.Close(); err != nil { + slog.Error("valkey disconnect failed", "error", err) + } +} + +func (r *redisClient) Get(ctx context.Context, key string) (string, error) { + val, err := r.c.Get(ctx, key).Result() + if errors.Is(err, redis.Nil) { + return "", ErrCacheMiss + } + if err != nil { + return "", fmt.Errorf("valkey get: %w", err) + } + return val, nil +} + +func (r *redisClient) Set(ctx context.Context, key, value string, ttl time.Duration) error { + if err := r.c.Set(ctx, key, value, ttl).Err(); err != nil { + return fmt.Errorf("valkey set: %w", err) + } + return nil +} + +func (r *redisClient) Del(ctx context.Context, keys ...string) error { + if len(keys) == 0 { + return nil + } + if err := r.c.Del(ctx, keys...).Err(); err != nil { + return fmt.Errorf("valkey del: %w", err) + } + return nil +} + +func (r *redisClient) Close() error { + return r.c.Close() +} + +// GetJSON reads `key` from Valkey and unmarshals the stored JSON into +// `out`. Returns ErrCacheMiss (wrapped) if the key is not set so callers +// can `errors.Is` it; all other failures (transport, malformed JSON) wrap +// as "valkey get json: …". +func GetJSON(ctx context.Context, client Client, key string, out any) error { + raw, err := client.Get(ctx, key) + if err != nil { + return fmt.Errorf("valkey get json: %w", err) + } + if err := json.Unmarshal([]byte(raw), out); err != nil { + return fmt.Errorf("valkey get json: unmarshal: %w", err) + } + return nil +} + +// SetJSONWithTTL marshals `value` to JSON and stores it under `key` with +// the given TTL. Zero ttl stores the key without expiry. +func SetJSONWithTTL(ctx context.Context, client Client, key string, value any, ttl time.Duration) error { + data, err := json.Marshal(value) + if err != nil { + return fmt.Errorf("valkey set json: marshal: %w", err) + } + if err := client.Set(ctx, key, string(data), ttl); err != nil { + return fmt.Errorf("valkey set json: %w", err) + } + return nil +} diff --git a/pkg/valkeyutil/valkey_test.go b/pkg/valkeyutil/valkey_test.go new file mode 100644 index 000000000..6837f4bad --- /dev/null +++ b/pkg/valkeyutil/valkey_test.go @@ -0,0 +1,138 @@ +package valkeyutil_test + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/valkeyutil" +) + +type fakeClient struct { + store map[string]string + ttls map[string]time.Duration + setErr error + getErr error + delCalls [][]string + closeCalled int + closeErr error +} + +func newFake() *fakeClient { + return &fakeClient{ + store: make(map[string]string), + ttls: make(map[string]time.Duration), + } +} + +func (f *fakeClient) Get(_ context.Context, key string) (string, error) { + if f.getErr != nil { + return "", f.getErr + } + v, ok := f.store[key] + if !ok { + return "", valkeyutil.ErrCacheMiss + } + return v, nil +} + +func (f *fakeClient) Set(_ context.Context, key, value string, ttl time.Duration) error { + if f.setErr != nil { + return f.setErr + } + f.store[key] = value + f.ttls[key] = ttl + return nil +} + +func (f *fakeClient) Del(_ context.Context, keys ...string) error { + f.delCalls = append(f.delCalls, keys) + for _, k := range keys { + delete(f.store, k) + delete(f.ttls, k) + } + return nil +} + +func (f *fakeClient) Close() error { + f.closeCalled++ + return f.closeErr +} + +type cached struct { + Rooms []string `json:"rooms"` +} + +func TestGetJSON_HitAndMiss(t *testing.T) { + ctx := context.Background() + client := newFake() + + t.Run("miss returns ErrCacheMiss", func(t *testing.T) { + var out cached + err := valkeyutil.GetJSON(ctx, client, "missing", &out) + assert.ErrorIs(t, err, valkeyutil.ErrCacheMiss) + }) + + t.Run("hit decodes JSON", func(t *testing.T) { + require.NoError(t, valkeyutil.SetJSONWithTTL(ctx, client, "k1", cached{Rooms: []string{"r1", "r2"}}, 5*time.Minute)) + var out cached + require.NoError(t, valkeyutil.GetJSON(ctx, client, "k1", &out)) + assert.Equal(t, []string{"r1", "r2"}, out.Rooms) + }) + + t.Run("Set persists TTL", func(t *testing.T) { + require.NoError(t, valkeyutil.SetJSONWithTTL(ctx, client, "k2", cached{}, 30*time.Second)) + assert.Equal(t, 30*time.Second, client.ttls["k2"]) + }) + + t.Run("malformed JSON wraps unmarshal error", func(t *testing.T) { + client.store["bad"] = "{not json" + var out cached + err := valkeyutil.GetJSON(ctx, client, "bad", &out) + assert.Error(t, err) + assert.NotErrorIs(t, err, valkeyutil.ErrCacheMiss) + }) + + t.Run("transport error propagates", func(t *testing.T) { + broken := newFake() + broken.getErr = errors.New("boom") + var out cached + err := valkeyutil.GetJSON(ctx, broken, "k", &out) + assert.Error(t, err) + assert.NotErrorIs(t, err, valkeyutil.ErrCacheMiss) + }) +} + +func TestDisconnect(t *testing.T) { + t.Run("nil is safe", func(t *testing.T) { + valkeyutil.Disconnect(nil) + }) + + t.Run("happy path calls Close once", func(t *testing.T) { + f := newFake() + valkeyutil.Disconnect(f) + assert.Equal(t, 1, f.closeCalled) + }) + + t.Run("Close error is logged but does not panic", func(t *testing.T) { + f := newFake() + f.closeErr = errors.New("close failed") + // Disconnect swallows the error (logs only). The assertion is + // that Close WAS called — the error path doesn't skip or retry. + valkeyutil.Disconnect(f) + assert.Equal(t, 1, f.closeCalled) + }) +} + +func TestConnect_ErrorPath(t *testing.T) { + // Point at a port that refuses connections (port 1 is well-known to + // reject without a listener). The internal Ping must fail fast and + // Connect must return a wrapped error — no real Valkey needed. + _, err := valkeyutil.Connect(context.Background(), "127.0.0.1:1", "") + require.Error(t, err) + assert.Contains(t, err.Error(), "valkey connect") +} From f51537121b81be98b7215b039cc371534f1efd58 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 06:34:11 +0000 Subject: [PATCH 2/6] feat(search-sync-worker): index restricted rooms in spotlight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously spotlight short-circuited when HistorySharedSince > 0, meaning restricted rooms never appeared in typeahead search. That's the wrong policy: a user who's a member of a restricted room should still be able to type-to-find it (access-control is enforced at message-search time via the Clause A/B query gates, not at the spotlight-visibility layer). Drops the HSS short-circuit in BuildAction. Restricted rooms are now indexed like any other room. The inbox-member-event path and its docstring are updated to match: spotlight indexes the room regardless of HSS. Tests renamed (`RestrictedRoomSkipped` → `RestrictedRoomIndexedLikeAnyOther`) and tightened to assert Version, Doc != nil, roomId, roomName. --- search-sync-worker/inbox_integration_test.go | 7 +++-- search-sync-worker/inbox_stream.go | 12 ++++++-- search-sync-worker/spotlight.go | 16 ++++------- search-sync-worker/spotlight_test.go | 29 +++++++++++++++++--- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/search-sync-worker/inbox_integration_test.go b/search-sync-worker/inbox_integration_test.go index 5f7ef1b6e..0ccfe1559 100644 --- a/search-sync-worker/inbox_integration_test.go +++ b/search-sync-worker/inbox_integration_test.go @@ -36,9 +36,10 @@ func createInboxStream(t *testing.T, ctx context.Context, js jetstream.JetStream } // buildInboxMemberEvent constructs an InboxMemberEvent payload for tests. -// `historySharedSince` is nil for unrestricted; non-nil marks the bulk as -// restricted — user-room routes it into restrictedRooms{}, spotlight skips. -// `joinedAt` is only meaningful on add events; pass 0 for removes. +// `historySharedSince` is nil for unrestricted; non-nil with a positive +// value marks the bulk as restricted. See parseMemberEvent for how each +// collection consumes the flag. `joinedAt` is only meaningful on add +// events; pass 0 for removes. func buildInboxMemberEvent( roomID, roomName, siteID string, accounts []string, diff --git a/search-sync-worker/inbox_stream.go b/search-sync-worker/inbox_stream.go index 0ccf13981..2147fb3e2 100644 --- a/search-sync-worker/inbox_stream.go +++ b/search-sync-worker/inbox_stream.go @@ -81,9 +81,15 @@ func (b *inboxMemberCollection) FilterSubjects(siteID string) []string { // InboxMemberEvent payload and validates the common preconditions shared by // all inbox-member collections. // -// Callers decide how to handle the event-level restricted-room flag: -// - spotlight keeps the MVP skip on `payload.HistorySharedSince != nil` -// - user-room routes the event into `restrictedRooms{}` on the doc +// Callers decide how to handle the event-level restricted-room flag. +// The Go↔painless contract is "positive hss means restricted" — i.e. +// `payload.HistorySharedSince != nil && *payload.HistorySharedSince > 0`; +// a nil pointer OR a leaked `&0`/negative pointer is the intentional +// "unrestricted" sentinel. +// - user-room routes the event into `restrictedRooms{}` on positive +// HSS, otherwise into `rooms[]` +// - spotlight indexes the room regardless of HSS; HSS is a +// message-content access concern, not a room-name discovery one func parseMemberEvent(data []byte) (*model.OutboxEvent, *model.InboxMemberEvent, error) { var evt model.OutboxEvent if err := json.Unmarshal(data, &evt); err != nil { diff --git a/search-sync-worker/spotlight.go b/search-sync-worker/spotlight.go index f0d0ec20d..d5bcbd38e 100644 --- a/search-sync-worker/spotlight.go +++ b/search-sync-worker/spotlight.go @@ -44,10 +44,12 @@ func (c *spotlightCollection) TemplateBody() json.RawMessage { // event is redelivered, every action 409s uniformly and is treated as a // successful idempotent replay. // -// Restricted rooms (HistorySharedSince != nil on the event) short-circuit the -// entire bulk and return an empty slice. The spotlight index keeps the MVP -// skip for restricted rooms; user-room (not spotlight) stores restricted-room -// membership on the per-user ES doc via `restrictedRooms{}`. +// Restricted rooms are indexed the same as unrestricted rooms. Spotlight +// is a room-name typeahead over rooms the user belongs to — the HSS / +// restricted-rooms distinction is a MESSAGE-content access-control +// concern, enforced at query time by search-service's Clauses A/B against +// the messages index. Room-name discovery has no such boundary: a user +// who joined a restricted room must still be able to find it by name. func (c *spotlightCollection) BuildAction(data []byte) ([]searchengine.BulkAction, error) { evt, payload, err := parseMemberEvent(data) if err != nil { @@ -56,12 +58,6 @@ func (c *spotlightCollection) BuildAction(data []byte) ([]searchengine.BulkActio if payload.RoomID == "" { return nil, fmt.Errorf("build spotlight action: missing roomId") } - // Spotlight skips restricted rooms (MVP); user-room stores them. - // Mirror user_room.go's `hss > 0` sentinel so a leaked &0 is treated as - // unrestricted by both indices. - if payload.HistorySharedSince != nil && *payload.HistorySharedSince > 0 { - return nil, nil - } if len(payload.Accounts) == 0 { return nil, fmt.Errorf("build spotlight action: empty accounts") } diff --git a/search-sync-worker/spotlight_test.go b/search-sync-worker/spotlight_test.go index ea7e2f955..ad8b6e4e6 100644 --- a/search-sync-worker/spotlight_test.go +++ b/search-sync-worker/spotlight_test.go @@ -158,12 +158,12 @@ func TestSpotlightCollection_BuildAction_MemberRemoved(t *testing.T) { assert.Nil(t, action.Doc) } -func TestSpotlightCollection_BuildAction_RestrictedRoomSkipped(t *testing.T) { +func TestSpotlightCollection_BuildAction_RestrictedRoomIndexedLikeAnyOther(t *testing.T) { + // See spotlightCollection.BuildAction docstring for the room-name + // vs message-content access boundary. coll := newSpotlightCollection("spotlight-site-a-v1-chat") payload := baseInboxMemberEvent() payload.Accounts = []string{"alice", "bob"} - // Event-level HistorySharedSince short-circuits the entire bulk — - // spotlight keeps MVP skip for restricted rooms; user-room stores them. hss := int64(1735689500000) payload.HistorySharedSince = &hss @@ -171,7 +171,28 @@ func TestSpotlightCollection_BuildAction_RestrictedRoomSkipped(t *testing.T) { actions, err := coll.BuildAction(data) require.NoError(t, err) - assert.Empty(t, actions, "restricted-room event should produce no actions") + require.Len(t, actions, 2, "restricted room must still produce one doc per account") + + docIDs := make([]string, len(actions)) + for i, a := range actions { + docIDs[i] = a.DocID + assert.Equal(t, searchengine.ActionIndex, a.Action) + assert.Equal(t, "spotlight-site-a-v1-chat", a.Index) + // evt.Timestamp → external version propagation. If a future + // refactor silently drops Version for the restricted path, + // spotlight would lose idempotency under redelivery. + assert.Equal(t, int64(100), a.Version) + require.NotNil(t, a.Doc, "restricted room must produce a populated doc, not a delete") + + // Confirm the room NAME — the discoverability payload this + // whole change is about — actually made it into the indexed + // body, not just that an index action was emitted. + var body map[string]any + require.NoError(t, json.Unmarshal(a.Doc, &body)) + assert.Equal(t, "r-eng", body["roomId"]) + assert.Equal(t, "engineering", body["roomName"]) + } + assert.ElementsMatch(t, []string{"alice_r-eng", "bob_r-eng"}, docIDs) } func TestSpotlightCollection_BuildAction_Errors(t *testing.T) { From 870870e5f721722fcd89ffe3bf339c878602bb5c Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 06:34:32 +0000 Subject: [PATCH 3/6] feat(search-service): NATS request/reply search with CCS + /metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New flat-layout service per CLAUDE.md §1. Two NATS request/reply endpoints: chat.user.{account}.request.search.messages chat.user.{account}.request.search.rooms Message search is cross-cluster via messages-*,*:messages-* so the same query reaches local and remote sites without per-cluster service config. Room search is local-only against the spotlight index with a scope filter (all / channel / dm; app rejected per MVP). Access control for restricted rooms rides on two Elasticsearch bool-should clauses composed in query_messages.go: - Clause A: non-thread parents gated on createdAt >= hss (historySharedSince — the user's join-time bound for this room). - Clause B: thread replies with an outer gate reply.createdAt >= hss AND (tshow=true OR parent.createdAt >= hss). The outer gate blocks pre-HSS tshow=true replies from leaking history the user never had access to. Scoped searches (req.RoomIds != nil) still gate the inline terms on the user-room terms-lookup so a caller can't reach rooms they don't belong to by passing arbitrary roomIds. The shared restrictedRoomBaseMust helper makes the "both clauses gate on roomId + HSS" invariant visible. Handler reads restrictedRooms via a 2-tier Valkey → ES lookup with log-and-fall-through on cache failure. Request fails only when BOTH cache and ES prefetch fail. Prometheus collectors for requests, request_duration, cache_hits/misses, and es_duration are pre-resolved by label at init — no per-request WithLabelValues lookup. Includes unit tests for handler, query builders, response parser, and store adapters. Deploy artifacts: Dockerfile, docker-compose, azure-pipelines.yml. --- search-service/deploy/Dockerfile | 13 + search-service/deploy/azure-pipelines.yml | 95 +++++++ search-service/deploy/docker-compose.yml | 34 +++ search-service/handler.go | 222 +++++++++++++++++ search-service/handler_test.go | 286 ++++++++++++++++++++++ search-service/main.go | 165 +++++++++++++ search-service/metrics.go | 145 +++++++++++ search-service/query_messages.go | 263 ++++++++++++++++++++ search-service/query_messages_test.go | 223 +++++++++++++++++ search-service/query_rooms.go | 91 +++++++ search-service/query_rooms_test.go | 98 ++++++++ search-service/response.go | 100 ++++++++ search-service/response_test.go | 103 ++++++++ search-service/store.go | 31 +++ search-service/store_es.go | 70 ++++++ search-service/store_es_test.go | 105 ++++++++ search-service/store_valkey.go | 54 ++++ search-service/store_valkey_test.go | 122 +++++++++ 18 files changed, 2220 insertions(+) create mode 100644 search-service/deploy/Dockerfile create mode 100644 search-service/deploy/azure-pipelines.yml create mode 100644 search-service/deploy/docker-compose.yml create mode 100644 search-service/handler.go create mode 100644 search-service/handler_test.go create mode 100644 search-service/main.go create mode 100644 search-service/metrics.go create mode 100644 search-service/query_messages.go create mode 100644 search-service/query_messages_test.go create mode 100644 search-service/query_rooms.go create mode 100644 search-service/query_rooms_test.go create mode 100644 search-service/response.go create mode 100644 search-service/response_test.go create mode 100644 search-service/store.go create mode 100644 search-service/store_es.go create mode 100644 search-service/store_es_test.go create mode 100644 search-service/store_valkey.go create mode 100644 search-service/store_valkey_test.go diff --git a/search-service/deploy/Dockerfile b/search-service/deploy/Dockerfile new file mode 100644 index 000000000..7dff74c76 --- /dev/null +++ b/search-service/deploy/Dockerfile @@ -0,0 +1,13 @@ +FROM golang:1.25.8-alpine AS builder +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download +COPY pkg/ pkg/ +COPY search-service/ search-service/ +RUN CGO_ENABLED=0 go build -o /search-service ./search-service/ + +FROM alpine:3.21 +RUN apk add --no-cache ca-certificates && adduser -D -u 10001 app +COPY --from=builder /search-service /search-service +USER app +ENTRYPOINT ["/search-service"] diff --git a/search-service/deploy/azure-pipelines.yml b/search-service/deploy/azure-pipelines.yml new file mode 100644 index 000000000..ffbb6af3d --- /dev/null +++ b/search-service/deploy/azure-pipelines.yml @@ -0,0 +1,95 @@ +trigger: + branches: + include: + - main + - develop + paths: + include: + - search-service/ + - pkg/ + +pr: + branches: + include: + - main + paths: + include: + - search-service/ + - pkg/ + +variables: + # Aligned with .github/workflows/ci.yml; the April 7, 2026 security + # release covers 10 CVEs. The Dockerfile builder stays on 1.25.8-alpine + # per the repo-wide image pin — CI toolchain and builder base are + # independent concerns. + GO_VERSION: '1.25.9' + SERVICE_NAME: search-service + REGISTRY: '$(containerRegistry)' + # Per-target coverage thresholds. Both pkg and service meet the + # repo-wide 80% minimum. main.go is excluded from the service profile + # below because it's a startup-harness (config parse → DI wire → + # goroutine spawn → shutdown.Wait hand-off) covered only by the + # integration job — leaving it in would drag the unit number without + # representing an actual unit-test gap. + MIN_COVERAGE_PKG: '80' + MIN_COVERAGE_SERVICE: '80' + +stages: + - stage: Validate + displayName: 'Lint & Test' + jobs: + - job: LintAndTest + pool: + vmImage: 'ubuntu-latest' + steps: + - task: GoTool@0 + inputs: + version: '$(GO_VERSION)' + displayName: 'Install Go $(GO_VERSION)' + + - script: go vet ./$(SERVICE_NAME)/... ./pkg/... + displayName: 'Go Vet' + + - script: | + set -euo pipefail + go test ./pkg/... -v -race -coverprofile=coverage-pkg.out + cov="$(go tool cover -func=coverage-pkg.out | awk '/^total:/ {gsub("%","",$3); print $3}')" + echo "pkg coverage: ${cov}% (threshold ${MIN_COVERAGE_PKG}%)" + awk -v c="$cov" -v t="$MIN_COVERAGE_PKG" 'BEGIN { exit (c >= t ? 0 : 1) }' + displayName: 'Test shared packages (with coverage gate)' + + - script: | + set -euo pipefail + go test ./$(SERVICE_NAME)/... -v -race -coverprofile=coverage-$(SERVICE_NAME)-raw.out + # Filter out main.go before computing the total. The first + # line of a cover profile is `mode: atomic` and must be + # preserved verbatim. + awk 'NR==1 || $0 !~ "/main\\.go:"' coverage-$(SERVICE_NAME)-raw.out > coverage-$(SERVICE_NAME).out + cov="$(go tool cover -func=coverage-$(SERVICE_NAME).out | awk '/^total:/ {gsub("%","",$3); print $3}')" + echo "$(SERVICE_NAME) coverage (excl. main.go): ${cov}% (threshold ${MIN_COVERAGE_SERVICE}%)" + awk -v c="$cov" -v t="$MIN_COVERAGE_SERVICE" 'BEGIN { exit (c >= t ? 0 : 1) }' + displayName: 'Test $(SERVICE_NAME) (with coverage gate)' + + - script: go build -o /dev/null ./$(SERVICE_NAME)/ + displayName: 'Build $(SERVICE_NAME)' + + - stage: Build + displayName: 'Build & Push Image' + dependsOn: Validate + condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + jobs: + - job: BuildImage + pool: + vmImage: 'ubuntu-latest' + steps: + - task: Docker@2 + inputs: + containerRegistry: '$(containerRegistry)' + repository: 'chat/$(SERVICE_NAME)' + command: 'buildAndPush' + Dockerfile: '$(SERVICE_NAME)/deploy/Dockerfile' + buildContext: '.' + tags: | + $(Build.BuildId) + latest + displayName: 'Build & push $(SERVICE_NAME)' diff --git a/search-service/deploy/docker-compose.yml b/search-service/deploy/docker-compose.yml new file mode 100644 index 000000000..b4c191c99 --- /dev/null +++ b/search-service/deploy/docker-compose.yml @@ -0,0 +1,34 @@ +name: search-service + +services: + search-service: + build: + context: ../.. + dockerfile: search-service/deploy/Dockerfile + environment: + - NATS_URL=nats://nats:4222 + - NATS_CREDS_FILE=/etc/nats/backend.creds + - SITE_ID=site-local + - SEARCH_URL=http://elasticsearch:9200 + - SEARCH_BACKEND=elasticsearch + - VALKEY_ADDR=valkey:6379 + - VALKEY_PASSWORD= + - SEARCH_DOC_COUNTS=25 + - SEARCH_MAX_DOC_COUNTS=100 + - SEARCH_RESTRICTED_ROOMS_CACHE_TTL=5m + - SEARCH_RECENT_WINDOW=8760h + - SEARCH_REQUEST_TIMEOUT=10s + - SEARCH_METRICS_ADDR=:9090 + ports: + # Expose /metrics so Prometheus / curl can scrape from the host + # during local dev. The listener is bound on 0.0.0.0:9090 inside + # the container. + - "9090:9090" + volumes: + - ../../docker-local/backend.creds:/etc/nats/backend.creds:ro + networks: + - chat-local + +networks: + chat-local: + external: true diff --git a/search-service/handler.go b/search-service/handler.go new file mode 100644 index 000000000..a803eeecc --- /dev/null +++ b/search-service/handler.go @@ -0,0 +1,222 @@ +package main + +import ( + "context" + "errors" + "log/slog" + "time" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/subject" +) + +// handlerConfig carries knobs read per request. `RequestTimeout` of zero +// disables the context deadline so tests can skip wiring it. +type handlerConfig struct { + DocCounts int + MaxDocCounts int + RestrictedRoomsCacheTTL time.Duration + RecentWindow time.Duration + RequestTimeout time.Duration + UserRoomIndex string +} + +type handler struct { + store SearchStore + cache RestrictedRoomCache + cfg handlerConfig +} + +func newHandler(store SearchStore, cache RestrictedRoomCache, cfg handlerConfig) *handler { + if cfg.DocCounts <= 0 { + cfg.DocCounts = 25 + } + if cfg.MaxDocCounts <= 0 { + cfg.MaxDocCounts = 100 + } + if cfg.RestrictedRoomsCacheTTL <= 0 { + cfg.RestrictedRoomsCacheTTL = 5 * time.Minute + } + if cfg.RecentWindow <= 0 { + cfg.RecentWindow = 365 * 24 * time.Hour + } + return &handler{store: store, cache: cache, cfg: cfg} +} + +func (h *handler) Register(r *natsrouter.Router) { + natsrouter.Register(r, subject.SearchMessagesPattern(), h.searchMessages) + natsrouter.Register(r, subject.SearchRoomsPattern(), h.searchRooms) +} + +func (h *handler) withRequestTimeout(parent context.Context) (context.Context, context.CancelFunc) { + if h.cfg.RequestTimeout <= 0 { + return parent, func() {} + } + return context.WithTimeout(parent, h.cfg.RequestTimeout) +} + +func (h *handler) searchMessages(c *natsrouter.Context, req model.SearchMessagesRequest) (resp *model.SearchMessagesResponse, err error) { + defer observeRequest(metricKindMessages, &err)() + + account, rerr := c.Params.Require("account") + if rerr != nil { + return nil, rerr + } + + if err := h.normalizePagination(&req.Size, &req.Offset); err != nil { + return nil, err + } + if req.SearchText == "" { + return nil, natsrouter.ErrBadRequest("searchText is required") + } + + ctx, cancel := h.withRequestTimeout(c) + defer cancel() + + restricted, err := h.loadRestricted(ctx, account, metricKindMessages) + if err != nil { + return nil, err + } + + body, err := buildMessageQuery(req, account, restricted, h.cfg.RecentWindow, h.cfg.UserRoomIndex) + if err != nil { + // Only reachable via json.Marshal failure on well-typed maps — + // effectively unreachable — but sanitize anyway so no raw + // internal error ever leaves the service boundary. + slog.Error("build message query failed", "account", account, "error", err) + return nil, natsrouter.ErrInternal("unable to build search query") + } + + observeESDone := observeES(metricOpSearch) + raw, err := h.store.Search(ctx, MessageIndexPattern, body) + observeESDone() + if err != nil { + slog.Error("message search backend failed", "account", account, "error", err) + return nil, natsrouter.ErrInternal("search backend unavailable") + } + + resp, err = parseMessagesResponse(raw) + if err != nil { + slog.Error("parse messages response failed", "account", account, "error", err) + return nil, natsrouter.ErrInternal("unexpected search response") + } + return resp, nil +} + +func (h *handler) searchRooms(c *natsrouter.Context, req model.SearchRoomsRequest) (resp *model.SearchRoomsResponse, err error) { + defer observeRequest(metricKindRooms, &err)() + + account, rerr := c.Params.Require("account") + if rerr != nil { + return nil, rerr + } + + if err := h.normalizePagination(&req.Size, &req.Offset); err != nil { + return nil, err + } + if req.SearchText == "" { + return nil, natsrouter.ErrBadRequest("searchText is required") + } + + ctx, cancel := h.withRequestTimeout(c) + defer cancel() + + body, err := buildRoomQuery(req, account) + if err != nil { + // RouteError (bad scope, scope=app, unknown) passes through; + // anything else (marshal failure — unreachable) gets sanitized + // to ErrInternal. Mirrors searchMessages's buildMessageQuery branch. + var rerr *natsrouter.RouteError + if errors.As(err, &rerr) { + return nil, err + } + slog.Error("build room query failed", "account", account, "error", err) + return nil, natsrouter.ErrInternal("unable to build search query") + } + + observeESDone := observeES(metricOpSearch) + raw, err := h.store.Search(ctx, []string{SpotlightIndex}, body) + observeESDone() + if err != nil { + slog.Error("room search backend failed", "account", account, "error", err) + return nil, natsrouter.ErrInternal("search backend unavailable") + } + + resp, err = parseRoomsResponse(raw) + if err != nil { + slog.Error("parse rooms response failed", "account", account, "error", err) + return nil, natsrouter.ErrInternal("unexpected search response") + } + return resp, nil +} + +// loadRestricted implements the 2-tier Valkey → ES read. Cache errors +// alone never fail the request — log-and-fall-through. Only when both +// cache AND ES prefetch fail do we surface ErrInternal. +// +// `kind` is the endpoint that called in (`metricKindMessages` or +// `metricKindRooms`) so cache-hit/miss counters carry the right label. +// Plumbing it through here rather than hardcoding prevents silent +// mislabeling when both endpoints traverse this path. +func (h *handler) loadRestricted(ctx context.Context, account, kind string) (map[string]int64, error) { + cached, hit, cerr := h.cache.GetRestricted(ctx, account) + if cerr != nil { + slog.Warn("valkey read failed; falling through to ES", "account", account, "error", cerr) + } + if hit { + cacheHitFor(kind).Inc() + return cached, nil + } + // Counts both "key absent" and transport errors as a miss — both + // mean "we have no cached answer and must hit ES" from the caller's + // perspective. + cacheMissFor(kind).Inc() + + observeESDone := observeES(metricOpUserRoomGet) + doc, _, err := h.store.GetUserRoomDoc(ctx, account) + observeESDone() + if err != nil { + // Always log the store error, even if the cache GET also failed + // — it's the actionable signal when both fail. Include cache_err + // so operators can correlate, but don't let the cache warning + // mask the ES root cause. + slog.Error("user-room doc fetch failed", "account", account, "error", err, "cache_err", cerr) + return nil, natsrouter.ErrInternal("unable to resolve room access") + } + + restricted := doc.RestrictedRooms + if restricted == nil { + // Covers both "user has no subs" (found=false) and "doc exists + // but has no restricted rooms" — cache an empty map to prevent + // miss-storms. + restricted = map[string]int64{} + } + + // Skip the SET when the GET already errored — the transport is + // almost certainly still down and a second warning adds noise + // without new signal. + if cerr == nil { + if err := h.cache.SetRestricted(ctx, account, restricted, h.cfg.RestrictedRoomsCacheTTL); err != nil { + slog.Warn("valkey set failed; continuing without cache", "account", account, "error", err) + } + } + return restricted, nil +} + +// normalizePagination validates and clamps size/offset in place. size=0 +// falls back to DocCounts; size>MaxDocCounts is capped; negative +// offset is clamped to 0. Negative size or offset in the request is a +// client bug, not a defaultable value, so it returns ErrBadRequest. +func (h *handler) normalizePagination(size, offset *int) error { + if *size < 0 || *offset < 0 { + return natsrouter.ErrBadRequest("size and offset must be non-negative") + } + if *size == 0 { + *size = h.cfg.DocCounts + } + if *size > h.cfg.MaxDocCounts { + *size = h.cfg.MaxDocCounts + } + return nil +} diff --git a/search-service/handler_test.go b/search-service/handler_test.go new file mode 100644 index 000000000..601f6309d --- /dev/null +++ b/search-service/handler_test.go @@ -0,0 +1,286 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" +) + +type fakeStore struct { + searchCalls []searchCall + searchBody json.RawMessage + searchErr error + userRoom UserRoomDoc + userRoomFound bool + userRoomErr error + userRoomCalls int +} + +type searchCall struct { + indices []string + body json.RawMessage +} + +func (f *fakeStore) Search(_ context.Context, indices []string, body json.RawMessage) (json.RawMessage, error) { + f.searchCalls = append(f.searchCalls, searchCall{indices: indices, body: body}) + if f.searchErr != nil { + return nil, f.searchErr + } + if f.searchBody == nil { + return json.RawMessage(`{"hits":{"total":{"value":0},"hits":[]}}`), nil + } + return f.searchBody, nil +} + +func (f *fakeStore) GetUserRoomDoc(_ context.Context, _ string) (UserRoomDoc, bool, error) { + f.userRoomCalls++ + if f.userRoomErr != nil { + return UserRoomDoc{}, false, f.userRoomErr + } + return f.userRoom, f.userRoomFound, nil +} + +type fakeCache struct { + store map[string]map[string]int64 + getErr error + setErr error + setCalls int + getCalls int +} + +func newFakeCache() *fakeCache { + return &fakeCache{store: map[string]map[string]int64{}} +} + +func (f *fakeCache) GetRestricted(_ context.Context, account string) (map[string]int64, bool, error) { + f.getCalls++ + if f.getErr != nil { + return nil, false, f.getErr + } + v, ok := f.store[account] + return v, ok, nil +} + +func (f *fakeCache) SetRestricted(_ context.Context, account string, rooms map[string]int64, _ time.Duration) error { + f.setCalls++ + if f.setErr != nil { + return f.setErr + } + f.store[account] = rooms + return nil +} + +func newTestHandler(store SearchStore, cache RestrictedRoomCache) *handler { + return newHandler(store, cache, handlerConfig{ + DocCounts: 25, + MaxDocCounts: 100, + RestrictedRoomsCacheTTL: 5 * time.Minute, + RecentWindow: 365 * 24 * time.Hour, + }) +} + +func ctxWithAccount(account string) *natsrouter.Context { + return natsrouter.NewContext(map[string]string{"account": account}) +} + +func TestHandler_SearchMessages_CacheHitUnrestricted(t *testing.T) { + store := &fakeStore{} + cache := newFakeCache() + cache.store["alice"] = map[string]int64{} // empty restricted → cache hit + + h := newTestHandler(store, cache) + + resp, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{SearchText: "hi"}) + require.NoError(t, err) + assert.EqualValues(t, 0, resp.Total) + + assert.Equal(t, 0, store.userRoomCalls, "cache hit → no ES user-room call") + require.Len(t, store.searchCalls, 1) + assert.Equal(t, MessageIndexPattern, store.searchCalls[0].indices) +} + +func TestHandler_SearchMessages_CacheMissPopulatesFromES(t *testing.T) { + store := &fakeStore{ + userRoom: UserRoomDoc{UserAccount: "alice", RestrictedRooms: map[string]int64{"rx": 1_700_000_000_000}}, + userRoomFound: true, + } + cache := newFakeCache() + + h := newTestHandler(store, cache) + resp, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{SearchText: "hi"}) + require.NoError(t, err) + assert.EqualValues(t, 0, resp.Total) + + assert.Equal(t, 1, store.userRoomCalls) + assert.Equal(t, 1, cache.setCalls) + assert.Equal(t, map[string]int64{"rx": 1_700_000_000_000}, cache.store["alice"]) +} + +func TestHandler_SearchMessages_CacheErrorFallsThroughToES(t *testing.T) { + store := &fakeStore{userRoomFound: false} + cache := newFakeCache() + cache.getErr = errors.New("valkey down") + + h := newTestHandler(store, cache) + _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{SearchText: "hi"}) + require.NoError(t, err) + assert.Equal(t, 1, store.userRoomCalls, "cache error triggers ES prefetch") + // Verify the handler skips SetRestricted when the prior GetRestricted + // errored — the transport is almost certainly still down, and a + // second failure-warning log adds noise without new signal. + assert.Equal(t, 0, cache.setCalls, "set must not run after cache-get error") +} + +func TestHandler_SearchMessages_CacheAndESFailReturnInternal(t *testing.T) { + store := &fakeStore{userRoomErr: errors.New("es down")} + cache := newFakeCache() + cache.getErr = errors.New("valkey down") + + h := newTestHandler(store, cache) + _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{SearchText: "hi"}) + require.Error(t, err) + + var rerr *natsrouter.RouteError + require.True(t, errors.As(err, &rerr)) + assert.Equal(t, natsrouter.CodeInternal, rerr.Code) +} + +func TestHandler_SearchMessages_ESSearchError(t *testing.T) { + store := &fakeStore{searchErr: errors.New("es failed")} + cache := newFakeCache() + cache.store["alice"] = map[string]int64{} + + h := newTestHandler(store, cache) + _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{SearchText: "hi"}) + require.Error(t, err) + var rerr *natsrouter.RouteError + require.True(t, errors.As(err, &rerr)) + assert.Equal(t, natsrouter.CodeInternal, rerr.Code) +} + +func TestHandler_SearchMessages_EmptySearchText(t *testing.T) { + h := newTestHandler(&fakeStore{}, newFakeCache()) + _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{}) + require.Error(t, err) + var rerr *natsrouter.RouteError + require.True(t, errors.As(err, &rerr)) + assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) +} + +func TestHandler_SearchMessages_NegativeSizeRejected(t *testing.T) { + h := newTestHandler(&fakeStore{}, newFakeCache()) + _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{SearchText: "x", Size: -1}) + require.Error(t, err) + var rerr *natsrouter.RouteError + require.True(t, errors.As(err, &rerr)) + assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) +} + +func TestHandler_SearchMessages_SizeClamped(t *testing.T) { + store := &fakeStore{} + cache := newFakeCache() + cache.store["alice"] = map[string]int64{} + + h := newHandler(store, cache, handlerConfig{ + DocCounts: 25, + MaxDocCounts: 50, + RestrictedRoomsCacheTTL: time.Minute, + RecentWindow: time.Hour, + }) + _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{SearchText: "x", Size: 1000}) + require.NoError(t, err) + + // Inspect the emitted query body — size should be clamped to 50. + require.Len(t, store.searchCalls, 1) + var body map[string]any + require.NoError(t, json.Unmarshal(store.searchCalls[0].body, &body)) + assert.Equal(t, float64(50), body["size"]) +} + +func TestHandler_SearchMessages_UserWithNoSubsReturnsEmpty(t *testing.T) { + store := &fakeStore{userRoomFound: false} + cache := newFakeCache() + h := newTestHandler(store, cache) + + resp, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{SearchText: "x"}) + require.NoError(t, err) + assert.EqualValues(t, 0, resp.Total) + assert.Empty(t, resp.Results) + + // empty restricted map should be cached to prevent miss-storm + v, hit := cache.store["alice"] + assert.True(t, hit) + assert.Empty(t, v) +} + +func TestHandler_SearchRooms_ScopeAllHappyPath(t *testing.T) { + store := &fakeStore{ + searchBody: json.RawMessage(`{"hits":{"total":{"value":1},"hits":[{"_source":{"roomId":"r1","roomName":"general","roomType":"p","userAccount":"alice","siteId":"site-a","joinedAt":"2026-04-01T00:00:00Z"}}]}}`), + } + h := newTestHandler(store, newFakeCache()) + + resp, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{SearchText: "general"}) + require.NoError(t, err) + assert.EqualValues(t, 1, resp.Total) + require.Len(t, resp.Results, 1) + assert.Equal(t, "r1", resp.Results[0].RoomID) + + require.Len(t, store.searchCalls, 1) + assert.Equal(t, []string{SpotlightIndex}, store.searchCalls[0].indices) +} + +func TestHandler_SearchRooms_ScopeAppRejected(t *testing.T) { + h := newTestHandler(&fakeStore{}, newFakeCache()) + _, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{SearchText: "x", Scope: scopeApp}) + require.Error(t, err) + var rerr *natsrouter.RouteError + require.True(t, errors.As(err, &rerr)) + assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Contains(t, rerr.Message, "scope=app") +} + +func TestHandler_SearchRooms_UnknownScopeRejected(t *testing.T) { + h := newTestHandler(&fakeStore{}, newFakeCache()) + _, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{SearchText: "x", Scope: "zzz"}) + require.Error(t, err) + var rerr *natsrouter.RouteError + require.True(t, errors.As(err, &rerr)) + assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) +} + +func TestHandler_SearchRooms_EmptySearchText(t *testing.T) { + h := newTestHandler(&fakeStore{}, newFakeCache()) + _, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{}) + require.Error(t, err) + var rerr *natsrouter.RouteError + require.True(t, errors.As(err, &rerr)) + assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) +} + +func TestHandler_SearchMessages_ScopedPartitioning(t *testing.T) { + store := &fakeStore{} + cache := newFakeCache() + cache.store["alice"] = map[string]int64{"rr": 1_700_000_000_000} + + h := newTestHandler(store, cache) + _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{ + SearchText: "x", + RoomIds: []string{"r1", "rr", "r2"}, + }) + require.NoError(t, err) + + // Should emit: inline terms for [r1, r2] + restricted A+B for rr = 3 clauses. + var body map[string]any + require.NoError(t, json.Unmarshal(store.searchCalls[0].body, &body)) + filter := body["query"].(map[string]any)["bool"].(map[string]any)["filter"].([]any) + shoulds := filter[1].(map[string]any)["bool"].(map[string]any)["should"].([]any) + assert.Len(t, shoulds, 3) +} diff --git a/search-service/main.go b/search-service/main.go new file mode 100644 index 000000000..2e383b40d --- /dev/null +++ b/search-service/main.go @@ -0,0 +1,165 @@ +package main + +import ( + "context" + "errors" + "log/slog" + "net" + "net/http" + "os" + "time" + + "github.com/caarlos0/env/v11" + + "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/otelutil" + "github.com/hmchangw/chat/pkg/searchengine" + "github.com/hmchangw/chat/pkg/shutdown" + "github.com/hmchangw/chat/pkg/valkeyutil" +) + +// ESConfig bundles the search backend knobs. BACKEND is the key +// `pkg/searchengine.New` reads to choose between elasticsearch/opensearch. +type ESConfig struct { + URL string `env:"URL,required"` + Backend string `env:"BACKEND" envDefault:"elasticsearch"` +} + +type ValkeyConfig struct { + Addr string `env:"ADDR,required"` + Password string `env:"PASSWORD" envDefault:""` +} + +type NATSConfig struct { + URL string `env:"URL,required"` + CredsFile string `env:"CREDS_FILE" envDefault:""` +} + +// SearchConfig groups the request-shape knobs — size caps, cache TTL, and +// the recent-window filter bound. All optional with sane defaults so a +// minimal environment only needs URL + NATS_URL + VALKEY_ADDR. +type SearchConfig struct { + DocCounts int `env:"DOC_COUNTS" envDefault:"25"` + MaxDocCounts int `env:"MAX_DOC_COUNTS" envDefault:"100"` + RestrictedRoomsCacheTTL time.Duration `env:"RESTRICTED_ROOMS_CACHE_TTL" envDefault:"5m"` + RecentWindow time.Duration `env:"RECENT_WINDOW" envDefault:"8760h"` + RequestTimeout time.Duration `env:"REQUEST_TIMEOUT" envDefault:"10s"` + UserRoomIndex string `env:"USER_ROOM_INDEX" envDefault:""` + MetricsAddr string `env:"METRICS_ADDR" envDefault:":9090"` +} + +// Config is the root service config. Note that ES and Search share the +// `SEARCH_` env prefix — the fields on the two structs (URL/BACKEND vs +// DOC_COUNTS/MAX_DOC_COUNTS/RECENT_WINDOW/REQUEST_TIMEOUT/…) don't +// collide today, but any new field added to either must be checked +// against the other or moved to a distinct prefix to avoid silent env +// shadowing. +type Config struct { + SiteID string `env:"SITE_ID" envDefault:"site-local"` + ES ESConfig `envPrefix:"SEARCH_"` + Valkey ValkeyConfig `envPrefix:"VALKEY_"` + NATS NATSConfig `envPrefix:"NATS_"` + Search SearchConfig `envPrefix:"SEARCH_"` +} + +func main() { + slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil))) + + cfg, err := env.ParseAs[Config]() + if err != nil { + slog.Error("parse config", "error", err) + os.Exit(1) + } + + ctx := context.Background() + + tracerShutdown, err := otelutil.InitTracer(ctx, "search-service") + if err != nil { + slog.Error("init tracer failed", "error", err) + os.Exit(1) + } + + engine, err := searchengine.New(ctx, cfg.ES.Backend, cfg.ES.URL) + if err != nil { + slog.Error("search engine connect failed", "error", err) + os.Exit(1) + } + + valkey, err := valkeyutil.Connect(ctx, cfg.Valkey.Addr, cfg.Valkey.Password) + if err != nil { + slog.Error("valkey connect failed", "error", err) + os.Exit(1) + } + + nc, err := natsutil.Connect(cfg.NATS.URL, cfg.NATS.CredsFile) + if err != nil { + slog.Error("nats connect failed", "error", err) + os.Exit(1) + } + + store := newESStore(engine, cfg.Search.UserRoomIndex) + cache := newValkeyCache(valkey) + handler := newHandler(store, cache, handlerConfig{ + DocCounts: cfg.Search.DocCounts, + MaxDocCounts: cfg.Search.MaxDocCounts, + RestrictedRoomsCacheTTL: cfg.Search.RestrictedRoomsCacheTTL, + RecentWindow: cfg.Search.RecentWindow, + RequestTimeout: cfg.Search.RequestTimeout, + UserRoomIndex: cfg.Search.UserRoomIndex, + }) + + router := natsrouter.New(nc, "search-service") + router.Use(natsrouter.RequestID()) + router.Use(natsrouter.Recovery()) + router.Use(natsrouter.Logging()) + handler.Register(router) + + // /metrics-only listener. All four timeouts guard against hung + // scrapers tying up a goroutine indefinitely on an operator-exposed + // port. + // + // Bind synchronously so a port conflict fails startup loudly — + // otherwise ListenAndServe's error would surface in a goroutine and + // the service would run happily with no /metrics, silently losing + // observability. Serve(listener) takes ownership of the listener + // from here on; Shutdown() closes it. + metricsMux := http.NewServeMux() + metricsMux.Handle("/metrics", metricsHandler()) + metricsServer := &http.Server{ + Handler: metricsMux, + ReadHeaderTimeout: 5 * time.Second, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + IdleTimeout: 60 * time.Second, + } + metricsListener, err := net.Listen("tcp", cfg.Search.MetricsAddr) + if err != nil { + slog.Error("metrics server listen failed", "addr", cfg.Search.MetricsAddr, "error", err) + os.Exit(1) + } + go func() { + slog.Info("metrics server listening", "addr", cfg.Search.MetricsAddr) + if err := metricsServer.Serve(metricsListener); err != nil && !errors.Is(err, http.ErrServerClosed) { + slog.Error("metrics server failed", "error", err) + } + }() + + slog.Info("search-service running", + "site", cfg.SiteID, + "backend", cfg.ES.Backend, + "valkey", cfg.Valkey.Addr, + ) + + shutdown.Wait(ctx, 25*time.Second, + // Drain NATS first so no new requests are accepted while metrics + // keep recording the in-flight ones. + func(ctx context.Context) error { return nc.Drain() }, + // tracerShutdown + valkey disconnect: internal plumbing, any order. + func(ctx context.Context) error { return tracerShutdown(ctx) }, + func(_ context.Context) error { valkeyutil.Disconnect(valkey); return nil }, + // Keep /metrics open LAST so Prometheus can scrape the final + // drain-window observations before the listener closes. + func(ctx context.Context) error { return metricsServer.Shutdown(ctx) }, + ) +} diff --git a/search-service/metrics.go b/search-service/metrics.go new file mode 100644 index 000000000..79f75e436 --- /dev/null +++ b/search-service/metrics.go @@ -0,0 +1,145 @@ +package main + +import ( + "errors" + "net/http" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/client_golang/prometheus/promhttp" + + "github.com/hmchangw/chat/pkg/natsrouter" +) + +// Collector names match the Observability → Metrics table in the spec. +// All collectors register with the default Prometheus registry via +// promauto so a plain promhttp.Handler() exposes them on /metrics. +var ( + metricRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "search_service_requests_total", + Help: "Total NATS request/reply invocations handled, partitioned by endpoint and terminal status.", + }, []string{"kind", "status"}) + + metricRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "search_service_request_duration_seconds", + Help: "End-to-end handler latency in seconds, from NATS request receipt to response emission.", + Buckets: prometheus.DefBuckets, + }, []string{"kind"}) + + metricCacheHits = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "search_service_cache_hits_total", + Help: "Restricted-rooms Valkey cache hits.", + }, []string{"kind"}) + + metricCacheMisses = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "search_service_cache_misses_total", + Help: "Restricted-rooms Valkey cache misses (including transport errors that fall through to ES).", + }, []string{"kind"}) + + metricESDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "search_service_es_duration_seconds", + Help: "Elasticsearch call latency in seconds, partitioned by operation.", + Buckets: prometheus.DefBuckets, + }, []string{"op"}) +) + +// Metric label constants. Cardinality is fixed and bounded so every +// permutation can be pre-resolved at init (see below) to avoid a +// per-request map lookup inside `WithLabelValues`. +const ( + metricKindMessages = "messages" + metricKindRooms = "rooms" + + metricOpSearch = "search" + metricOpUserRoomGet = "user_room_get" +) + +// Pre-resolved per-kind handles for the request-path metrics. The +// `status` label on requests_total is resolved lazily (5 values × 2 +// kinds = 10 perms would clutter here); the others are fully bound. +var ( + durMessages = metricRequestDuration.WithLabelValues(metricKindMessages) + durRooms = metricRequestDuration.WithLabelValues(metricKindRooms) + + cacheHitMessages = metricCacheHits.WithLabelValues(metricKindMessages) + cacheHitRooms = metricCacheHits.WithLabelValues(metricKindRooms) + cacheMissMessages = metricCacheMisses.WithLabelValues(metricKindMessages) + cacheMissRooms = metricCacheMisses.WithLabelValues(metricKindRooms) + + esDurSearch = metricESDuration.WithLabelValues(metricOpSearch) + esDurUserRoomGet = metricESDuration.WithLabelValues(metricOpUserRoomGet) +) + +// observeRequest captures a handler's total latency and terminal status. +// The status is classified at fire-time from the named `err` return, so +// late-bound error classification (wrapping, defer-assigned) is counted +// correctly. Usage: +// +// func (h *handler) search(...) (resp *R, err error) { +// defer observeRequest(metricKindMessages, &err)() +// ... +// } +func observeRequest(kind string, errPtr *error) func() { + start := time.Now() + dur := durFor(kind) + return func() { + dur.Observe(time.Since(start).Seconds()) + metricRequestsTotal.WithLabelValues(kind, statusLabel(*errPtr)).Inc() + } +} + +func observeES(op string) func() { + start := time.Now() + h := esDurFor(op) + return func() { h.Observe(time.Since(start).Seconds()) } +} + +// durFor / esDurFor / cacheHitFor / cacheMissFor return the pre-resolved +// observer for the given label. All four fall back to the messages/search +// variant on an unknown label so a caller typo surfaces as misattributed +// metrics rather than a nil-observer panic at fire time. +func durFor(kind string) prometheus.Observer { + if kind == metricKindRooms { + return durRooms + } + return durMessages +} + +func esDurFor(op string) prometheus.Observer { + if op == metricOpUserRoomGet { + return esDurUserRoomGet + } + return esDurSearch +} + +func cacheHitFor(kind string) prometheus.Counter { + if kind == metricKindRooms { + return cacheHitRooms + } + return cacheHitMessages +} + +func cacheMissFor(kind string) prometheus.Counter { + if kind == metricKindRooms { + return cacheMissRooms + } + return cacheMissMessages +} + +// statusLabel maps a handler's returned error onto the requests_total +// `status` label. nil → "ok"; non-internal RouteError → its Code +// (bad_request, not_found, forbidden, conflict) so operators can +// distinguish 4xx-equivalents; everything else → "internal". +func statusLabel(err error) string { + if err == nil { + return "ok" + } + var rerr *natsrouter.RouteError + if errors.As(err, &rerr) && rerr.Code != "" && rerr.Code != natsrouter.CodeInternal { + return rerr.Code + } + return natsrouter.CodeInternal +} + +func metricsHandler() http.Handler { return promhttp.Handler() } diff --git a/search-service/query_messages.go b/search-service/query_messages.go new file mode 100644 index 000000000..1e6733f78 --- /dev/null +++ b/search-service/query_messages.go @@ -0,0 +1,263 @@ +package main + +import ( + "encoding/json" + "fmt" + "sort" + "time" + + "github.com/hmchangw/chat/pkg/model" +) + +// MessageIndexPattern is the comma-joined list of indices searched for +// messages. The local prefix (`messages-*`) is required because `*:` only +// matches configured remote clusters — without it, a user on site-a would +// miss their own site's messages. When no remote clusters are configured, +// `*:messages-*` resolves to zero matches and the query proceeds against +// local `messages-*` only (courtesy of `ignore_unavailable=true`). +var MessageIndexPattern = []string{"messages-*", "*:messages-*"} + +// buildMessageQuery composes the ES `_search` body for a single message +// search request. `restricted` is the caller's cached `restrictedRooms` +// map (rid → historySharedSince millis); pass nil or empty for +// unrestricted-only users. +// +// For global search (req.RoomIds == nil), the unrestricted clause uses an +// ES terms-lookup against the user-room doc so the service doesn't need +// to send the full list over the wire. For scoped search +// (req.RoomIds != nil), the inline terms clause is STILL gated by the +// terms-lookup so a caller can't reach rooms they don't belong to by +// passing arbitrary roomIds. +func buildMessageQuery(req model.SearchMessagesRequest, account string, restricted map[string]int64, recentWindow time.Duration, userRoomIndex string) (json.RawMessage, error) { + userRoomIndex = resolveUserRoomIndex(userRoomIndex) + clauses := roomAccessClauses(req.RoomIds, account, restricted, userRoomIndex) + + body := map[string]any{ + "from": req.Offset, + "size": req.Size, + "track_total_hits": true, + "query": map[string]any{ + "bool": map[string]any{ + "must": []any{ + map[string]any{ + "multi_match": map[string]any{ + "query": req.SearchText, + "type": "bool_prefix", + "operator": "AND", + "fields": []string{"content"}, + }, + }, + }, + "filter": []any{ + map[string]any{ + "range": map[string]any{ + "createdAt": map[string]any{ + "gte": fmt.Sprintf("now-%s", recentWindowToGte(recentWindow)), + }, + }, + }, + map[string]any{ + "bool": map[string]any{ + "should": clauses, + "minimum_should_match": 1, + }, + }, + }, + }, + }, + "sort": []any{ + "_score", + map[string]any{"createdAt": map[string]any{"order": "desc"}}, + }, + } + + data, err := json.Marshal(body) + if err != nil { + return nil, fmt.Errorf("marshal message query: %w", err) + } + return data, nil +} + +func roomAccessClauses(roomIDs []string, account string, restricted map[string]int64, userRoomIndex string) []any { + if len(roomIDs) == 0 { + return globalAccessClauses(account, restricted, userRoomIndex) + } + return scopedAccessClauses(roomIDs, account, restricted, userRoomIndex) +} + +func globalAccessClauses(account string, restricted map[string]int64, userRoomIndex string) []any { + clauses := []any{termsLookupClause(account, userRoomIndex)} + for _, rid := range sortedRIDs(restricted) { + iso := hssToISO(restricted[rid]) + clauses = append(clauses, + restrictedRoomClauseA(rid, iso), + restrictedRoomClauseB(rid, iso), + ) + } + return clauses +} + +func scopedAccessClauses(roomIDs []string, account string, restricted map[string]int64, userRoomIndex string) []any { + var unrestricted []string + var restrictedSubset []string + for _, rid := range roomIDs { + if _, isRestricted := restricted[rid]; isRestricted { + restrictedSubset = append(restrictedSubset, rid) + } else { + unrestricted = append(unrestricted, rid) + } + } + sort.Strings(unrestricted) + sort.Strings(restrictedSubset) + + clauses := make([]any, 0, 1+2*len(restrictedSubset)) + if len(unrestricted) > 0 { + // AND inline terms with the user-room lookup so a caller can't + // reach rooms they don't belong to by passing arbitrary roomIds. + // The restricted subset is already safe — Clause A/B only fire + // for rids present in the caller's cached restrictedRooms map. + clauses = append(clauses, map[string]any{ + "bool": map[string]any{ + "filter": []any{ + termsInlineClause(unrestricted), + termsLookupClause(account, userRoomIndex), + }, + }, + }) + } + for _, rid := range restrictedSubset { + iso := hssToISO(restricted[rid]) + clauses = append(clauses, + restrictedRoomClauseA(rid, iso), + restrictedRoomClauseB(rid, iso), + ) + } + return clauses +} + +// termsLookupClause resolves the user's allowed rooms via ES terms-lookup +// instead of shipping the rooms[] array on every query. Passing an empty +// userRoomIndex would produce an invalid index name, so callers must +// resolve it (e.g. via resolveUserRoomIndex) before calling. +func termsLookupClause(account, userRoomIndex string) map[string]any { + return map[string]any{ + "terms": map[string]any{ + "roomId": map[string]any{ + "index": userRoomIndex, + "id": account, + "path": "rooms", + }, + }, + } +} + +func termsInlineClause(roomIDs []string) map[string]any { + return map[string]any{ + "terms": map[string]any{ + "roomId": roomIDs, + }, + } +} + +// restrictedRoomBaseMust is the shared scaffolding between Clause A and +// Clause B — every restricted-room clause gates on (roomId == rid) AND +// (createdAt >= hss). Clause A adds must_not exists threadParent; Clause +// B adds exists threadParent plus the B1/B2 inner OR. +func restrictedRoomBaseMust(rid, hssISO string) []any { + return []any{ + map[string]any{"term": map[string]any{"roomId": rid}}, + map[string]any{ + "range": map[string]any{ + "createdAt": map[string]any{"gte": hssISO}, + }, + }, + } +} + +// Clause A matches parent messages (or regular non-thread messages) +// posted after the user's HSS bound for this room. Thread replies are +// explicitly excluded via must_not so Clause B remains the sole gate +// for thread replies in restricted rooms. +func restrictedRoomClauseA(rid, hssISO string) map[string]any { + return map[string]any{ + "bool": map[string]any{ + "must": restrictedRoomBaseMust(rid, hssISO), + "must_not": []any{ + map[string]any{"exists": map[string]any{"field": "threadParentMessageId"}}, + }, + }, + } +} + +// Clause B matches thread replies in this restricted room. Two gates +// must BOTH hold: +// 1. The reply itself is at or after HSS (via restrictedRoomBaseMust's +// createdAt range). Without this outer gate, a pre-HSS reply flagged +// tshow=true would leak restricted-room history the user never had +// access to. +// 2. Either the reply is also shown in the channel (B1: tshow=true) OR +// the parent message is at or after HSS (B2). +func restrictedRoomClauseB(rid, hssISO string) map[string]any { + must := restrictedRoomBaseMust(rid, hssISO) + must = append(must, + map[string]any{"exists": map[string]any{"field": "threadParentMessageId"}}, + map[string]any{ + "bool": map[string]any{ + "should": []any{ + map[string]any{"term": map[string]any{"tshow": true}}, + map[string]any{ + "range": map[string]any{ + "threadParentMessageCreatedAt": map[string]any{"gte": hssISO}, + }, + }, + }, + "minimum_should_match": 1, + }, + }, + ) + return map[string]any{"bool": map[string]any{"must": must}} +} + +func hssToISO(hss int64) string { + return time.UnixMilli(hss).UTC().Format(time.RFC3339Nano) +} + +// recentWindowToGte converts a Go Duration into an ES date-math fragment. +// ES date-math accepts a single `` token per operator, NOT +// compound forms like `8760h0m0s` that Go's Duration.String produces — +// compound forms fail to parse and ES rejects the whole query. +func recentWindowToGte(d time.Duration) string { + if d <= 0 { + // Zero / negative durations would emit `now-0s` which ES interprets + // as "strictly after now" — effectively no matches. Bias to a + // 1-year default so a misconfigured value degrades to the intended + // behavior rather than an empty result set. + d = 365 * 24 * time.Hour + } + switch { + case d%time.Hour == 0: + return fmt.Sprintf("%dh", int64(d/time.Hour)) + case d%time.Minute == 0: + return fmt.Sprintf("%dm", int64(d/time.Minute)) + default: + // ES date-math has no sub-second unit — supported set is y/M/w/d/h/m/s. + // Round UP so a misconfigured sub-second value widens the window + // rather than collapsing it (which would silently drop matches). + return fmt.Sprintf("%ds", int64((d+time.Second-1)/time.Second)) + } +} + +// sortedRIDs returns map keys in ascending order. Sort is load-bearing +// for golden-file tests and for ES query-plan cacheability — do not +// remove without replacing both guarantees. +func sortedRIDs(m map[string]int64) []string { + if len(m) == 0 { + return nil + } + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + sort.Strings(out) + return out +} diff --git a/search-service/query_messages_test.go b/search-service/query_messages_test.go new file mode 100644 index 000000000..07ec79bc9 --- /dev/null +++ b/search-service/query_messages_test.go @@ -0,0 +1,223 @@ +package main + +import ( + "encoding/json" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" +) + +func parseQuery(t *testing.T, raw json.RawMessage) map[string]any { + t.Helper() + var m map[string]any + require.NoError(t, json.Unmarshal(raw, &m)) + return m +} + +func filterClauses(t *testing.T, q map[string]any) []any { + t.Helper() + query := q["query"].(map[string]any) + b := query["bool"].(map[string]any) + return b["filter"].([]any) +} + +func shouldClauses(t *testing.T, q map[string]any) []any { + t.Helper() + filters := filterClauses(t, q) + // filters = [range createdAt, bool { should: ... }] + roomAccess := filters[1].(map[string]any)["bool"].(map[string]any) + return roomAccess["should"].([]any) +} + +func TestBuildMessageQuery_GlobalUnrestricted(t *testing.T) { + req := model.SearchMessagesRequest{SearchText: "hello", Size: 25, Offset: 0} + raw, err := buildMessageQuery(req, "alice", nil, 365*24*time.Hour, "") + require.NoError(t, err) + + q := parseQuery(t, raw) + assert.Equal(t, float64(0), q["from"]) + assert.Equal(t, float64(25), q["size"]) + assert.Equal(t, true, q["track_total_hits"]) + + shoulds := shouldClauses(t, q) + require.Len(t, shoulds, 1, "unrestricted-only → exactly one terms-lookup clause") + terms := shoulds[0].(map[string]any)["terms"].(map[string]any) + lookup := terms["roomId"].(map[string]any) + assert.Equal(t, "user-room", lookup["index"]) + assert.Equal(t, "alice", lookup["id"]) + assert.Equal(t, "rooms", lookup["path"]) +} + +func TestBuildMessageQuery_GlobalWithRestricted(t *testing.T) { + req := model.SearchMessagesRequest{SearchText: "hi", Size: 10, Offset: 5} + restricted := map[string]int64{ + "room-b": 1_700_000_000_000, + "room-a": 1_600_000_000_000, + } + raw, err := buildMessageQuery(req, "alice", restricted, 24*time.Hour, "") + require.NoError(t, err) + + q := parseQuery(t, raw) + shoulds := shouldClauses(t, q) + // 1 terms-lookup + 2 clauses per restricted room (A + B) = 5 + require.Len(t, shoulds, 5) + + // deterministic ordering — alphabetical by rid, Clause A before Clause B + clauseA := shoulds[1].(map[string]any)["bool"].(map[string]any) + aMust := clauseA["must"].([]any) + term := aMust[0].(map[string]any)["term"].(map[string]any) + assert.Equal(t, "room-a", term["roomId"]) + // range.createdAt.gte is ISO8601 of the HSS + rng := aMust[1].(map[string]any)["range"].(map[string]any)["createdAt"].(map[string]any) + assert.Equal(t, time.UnixMilli(1_600_000_000_000).UTC().Format(time.RFC3339Nano), rng["gte"]) + + // Clause B for room-a: roomId + createdAt >= hss (shared base must — + // the outer security gate that prevents pre-HSS tshow replies from + // leaking) + threadParent exists + (tshow=true OR parent.createdAt >= hss). + clauseB := shoulds[2].(map[string]any)["bool"].(map[string]any) + bMust := clauseB["must"].([]any) + require.Len(t, bMust, 4, "clauseB must have term, createdAt-range, exists, inner-bool") + assert.Equal(t, "room-a", bMust[0].(map[string]any)["term"].(map[string]any)["roomId"]) + + // The reply itself must be at or after HSS. + createdAtRange := bMust[1].(map[string]any)["range"].(map[string]any)["createdAt"].(map[string]any) + assert.Equal(t, time.UnixMilli(1_600_000_000_000).UTC().Format(time.RFC3339Nano), createdAtRange["gte"]) + + exists := bMust[2].(map[string]any)["exists"].(map[string]any) + assert.Equal(t, "threadParentMessageId", exists["field"]) + + innerBool := bMust[3].(map[string]any)["bool"].(map[string]any) + assert.EqualValues(t, 1, innerBool["minimum_should_match"]) + innerShould := innerBool["should"].([]any) + require.Len(t, innerShould, 2) + assert.Equal(t, true, innerShould[0].(map[string]any)["term"].(map[string]any)["tshow"]) + parentRange := innerShould[1].(map[string]any)["range"].(map[string]any)["threadParentMessageCreatedAt"].(map[string]any) + assert.Equal(t, time.UnixMilli(1_600_000_000_000).UTC().Format(time.RFC3339Nano), parentRange["gte"]) +} + +// inlineTermsAndLookup digs into the bool-filter wrapper that +// scopedAccessClauses now produces for the unrestricted subset. Returns +// the roomIds the inline terms clause matches and the account the +// terms-lookup gates against. +func inlineTermsAndLookup(t *testing.T, clause any) (roomIDs []any, account string) { + t.Helper() + filter := clause.(map[string]any)["bool"].(map[string]any)["filter"].([]any) + require.Len(t, filter, 2, "scoped unrestricted clause must be inline-terms AND terms-lookup") + inline := filter[0].(map[string]any)["terms"].(map[string]any)["roomId"].([]any) + lookup := filter[1].(map[string]any)["terms"].(map[string]any)["roomId"].(map[string]any) + return inline, lookup["id"].(string) +} + +func TestBuildMessageQuery_ScopedInlineTerms(t *testing.T) { + req := model.SearchMessagesRequest{ + SearchText: "hi", + RoomIds: []string{"r1", "r2", "r3"}, + } + raw, err := buildMessageQuery(req, "alice", nil, time.Hour, "") + require.NoError(t, err) + + shoulds := shouldClauses(t, parseQuery(t, raw)) + require.Len(t, shoulds, 1) + inline, account := inlineTermsAndLookup(t, shoulds[0]) + assert.ElementsMatch(t, []any{"r1", "r2", "r3"}, inline) + assert.Equal(t, "alice", account, "inline terms must be gated by the user-room lookup") +} + +func TestBuildMessageQuery_ScopedMixed(t *testing.T) { + req := model.SearchMessagesRequest{ + SearchText: "hi", + RoomIds: []string{"r1", "restricted-r2", "r3"}, + } + restricted := map[string]int64{"restricted-r2": 1_600_000_000_000} + raw, err := buildMessageQuery(req, "alice", restricted, time.Hour, "") + require.NoError(t, err) + + shoulds := shouldClauses(t, parseQuery(t, raw)) + require.Len(t, shoulds, 3) // 1 inline terms (r1, r3) + 2 restricted clauses (A + B) + + inline, account := inlineTermsAndLookup(t, shoulds[0]) + assert.ElementsMatch(t, []any{"r1", "r3"}, inline) + assert.Equal(t, "alice", account) + + // Clause A for the restricted room + clauseA := shoulds[1].(map[string]any)["bool"].(map[string]any)["must"].([]any) + assert.Equal(t, "restricted-r2", clauseA[0].(map[string]any)["term"].(map[string]any)["roomId"]) + // Clause A must explicitly exclude thread replies via must_not exists. + clauseABool := shoulds[1].(map[string]any)["bool"].(map[string]any) + mustNot := clauseABool["must_not"].([]any) + require.Len(t, mustNot, 1) + exists := mustNot[0].(map[string]any)["exists"].(map[string]any) + assert.Equal(t, "threadParentMessageId", exists["field"]) +} + +func TestBuildMessageQuery_UserRoomIndexOverride(t *testing.T) { + req := model.SearchMessagesRequest{SearchText: "hi"} + raw, err := buildMessageQuery(req, "alice", nil, time.Hour, "custom-user-room") + require.NoError(t, err) + + shoulds := shouldClauses(t, parseQuery(t, raw)) + lookup := shoulds[0].(map[string]any)["terms"].(map[string]any)["roomId"].(map[string]any) + assert.Equal(t, "custom-user-room", lookup["index"]) +} + +func TestBuildMessageQuery_ScopedAllRestricted(t *testing.T) { + req := model.SearchMessagesRequest{ + SearchText: "hi", + RoomIds: []string{"ra"}, + } + restricted := map[string]int64{"ra": 1_700_000_000_000} + raw, err := buildMessageQuery(req, "alice", restricted, time.Hour, "") + require.NoError(t, err) + + shoulds := shouldClauses(t, parseQuery(t, raw)) + require.Len(t, shoulds, 2) // Clause A + Clause B, no inline terms + _, hasTermsLookup := shoulds[0].(map[string]any)["terms"] + assert.False(t, hasTermsLookup) +} + +func TestBuildMessageQuery_RecentWindow(t *testing.T) { + req := model.SearchMessagesRequest{SearchText: "hi"} + raw, err := buildMessageQuery(req, "alice", nil, 48*time.Hour, "") + require.NoError(t, err) + + filters := filterClauses(t, parseQuery(t, raw)) + rng := filters[0].(map[string]any)["range"].(map[string]any)["createdAt"].(map[string]any) + // Single-unit date-math required by ES — `48h`, NOT `48h0m0s`. + assert.Equal(t, "now-48h", rng["gte"]) +} + +func TestBuildMessageQuery_RecentWindowDefault(t *testing.T) { + req := model.SearchMessagesRequest{SearchText: "hi"} + raw, err := buildMessageQuery(req, "alice", nil, 0, "") + require.NoError(t, err) + + filters := filterClauses(t, parseQuery(t, raw)) + rng := filters[0].(map[string]any)["range"].(map[string]any)["createdAt"].(map[string]any) + // Zero / negative window defaults to 1 year (365 * 24h = 8760h). + assert.Equal(t, "now-8760h", rng["gte"]) +} + +func TestRecentWindowToGte_Units(t *testing.T) { + cases := []struct { + name string + d time.Duration + want string + }{ + {"hours", 8760 * time.Hour, "8760h"}, + {"minutes", 90 * time.Minute, "90m"}, + {"exact seconds", 45 * time.Second, "45s"}, + {"sub-second rounds up to seconds", 1500 * time.Millisecond, "2s"}, + {"fractional minute falls to seconds", 90*time.Second + 500*time.Millisecond, "91s"}, + {"zero defaults to 1y", 0, "8760h"}, + {"negative defaults to 1y", -time.Hour, "8760h"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, recentWindowToGte(tc.d)) + }) + } +} diff --git a/search-service/query_rooms.go b/search-service/query_rooms.go new file mode 100644 index 000000000..9c536c16a --- /dev/null +++ b/search-service/query_rooms.go @@ -0,0 +1,91 @@ +package main + +import ( + "encoding/json" + "fmt" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" +) + +// SpotlightIndex is the (local-only) room-typeahead index. Always one +// document per (user, room) pair — the spotlight index keeps search-as-you- +// type docs locally even when messages live on remote sites. +const SpotlightIndex = "spotlight" + +// room scope filter values accepted on SearchRoomsRequest.Scope. +const ( + scopeAll = "all" + scopeChannel = "channel" + scopeDM = "dm" + scopeApp = "app" +) + +// buildRoomQuery composes the ES `_search` body for a room search. It +// returns a *natsrouter.RouteError (user-facing) on invalid/unsupported +// scopes and a plain error on marshalling failures. +func buildRoomQuery(req model.SearchRoomsRequest, account string) (json.RawMessage, error) { + scopeFilter, rerr := scopeFilterClause(req.Scope) + if rerr != nil { + return nil, rerr + } + + filters := []any{ + map[string]any{"term": map[string]any{"userAccount": account}}, + } + if scopeFilter != nil { + filters = append(filters, scopeFilter) + } + + body := map[string]any{ + "from": req.Offset, + "size": req.Size, + "track_total_hits": true, + "query": map[string]any{ + "bool": map[string]any{ + "must": []any{ + map[string]any{ + "multi_match": map[string]any{ + "query": req.SearchText, + "type": "bool_prefix", + "operator": "AND", + "fields": []string{"roomName"}, + }, + }, + }, + "filter": filters, + }, + }, + "sort": []any{ + "_score", + map[string]any{"joinedAt": map[string]any{"order": "desc"}}, + }, + } + + data, err := json.Marshal(body) + if err != nil { + return nil, fmt.Errorf("marshal room query: %w", err) + } + return data, nil +} + +// scopeFilterClause translates the request-level scope into an ES term +// filter on `roomType`. The filter values match the strings written to +// the spotlight index by search-sync-worker (the model.RoomType values +// themselves), NOT the Rocket.Chat legacy `p`/`d` convention. Returns +// (nil, nil) for "all" which needs no extra filter; returns an +// ErrBadRequest for `app` (MVP-unsupported) and for any unknown value. +func scopeFilterClause(scope string) (map[string]any, *natsrouter.RouteError) { + switch scope { + case "", scopeAll: + return nil, nil + case scopeChannel: + return map[string]any{"term": map[string]any{"roomType": string(model.RoomTypeChannel)}}, nil + case scopeDM: + return map[string]any{"term": map[string]any{"roomType": string(model.RoomTypeDM)}}, nil + case scopeApp: + return nil, natsrouter.ErrBadRequest("scope=app not supported in MVP") + default: + return nil, natsrouter.ErrBadRequest(fmt.Sprintf("unknown scope: %s", scope)) + } +} diff --git a/search-service/query_rooms_test.go b/search-service/query_rooms_test.go new file mode 100644 index 000000000..82e828035 --- /dev/null +++ b/search-service/query_rooms_test.go @@ -0,0 +1,98 @@ +package main + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" +) + +func roomFilters(t *testing.T, q map[string]any) []any { + t.Helper() + return q["query"].(map[string]any)["bool"].(map[string]any)["filter"].([]any) +} + +func TestBuildRoomQuery_ScopeAll(t *testing.T) { + req := model.SearchRoomsRequest{SearchText: "general", Size: 10, Offset: 0} + raw, err := buildRoomQuery(req, "alice") + require.NoError(t, err) + + q := parseQuery(t, raw) + assert.Equal(t, float64(0), q["from"]) + assert.Equal(t, float64(10), q["size"]) + assert.Equal(t, true, q["track_total_hits"]) + + filters := roomFilters(t, q) + require.Len(t, filters, 1) + account := filters[0].(map[string]any)["term"].(map[string]any)["userAccount"] + assert.Equal(t, "alice", account) +} + +func TestBuildRoomQuery_ScopeExplicitAll(t *testing.T) { + req := model.SearchRoomsRequest{SearchText: "general", Scope: scopeAll} + raw, err := buildRoomQuery(req, "alice") + require.NoError(t, err) + + filters := roomFilters(t, parseQuery(t, raw)) + require.Len(t, filters, 1) // only userAccount, no roomType filter +} + +func TestBuildRoomQuery_ScopeChannel(t *testing.T) { + req := model.SearchRoomsRequest{SearchText: "general", Scope: scopeChannel} + raw, err := buildRoomQuery(req, "alice") + require.NoError(t, err) + + filters := roomFilters(t, parseQuery(t, raw)) + require.Len(t, filters, 2) + roomType := filters[1].(map[string]any)["term"].(map[string]any)["roomType"] + assert.Equal(t, string(model.RoomTypeChannel), roomType) +} + +func TestBuildRoomQuery_ScopeDM(t *testing.T) { + req := model.SearchRoomsRequest{SearchText: "alice", Scope: scopeDM} + raw, err := buildRoomQuery(req, "alice") + require.NoError(t, err) + + filters := roomFilters(t, parseQuery(t, raw)) + require.Len(t, filters, 2) + roomType := filters[1].(map[string]any)["term"].(map[string]any)["roomType"] + assert.Equal(t, string(model.RoomTypeDM), roomType) +} + +func TestBuildRoomQuery_ScopeAppRejected(t *testing.T) { + req := model.SearchRoomsRequest{SearchText: "bot", Scope: scopeApp} + _, err := buildRoomQuery(req, "alice") + require.Error(t, err) + + var rerr *natsrouter.RouteError + require.True(t, errors.As(err, &rerr), "expected RouteError") + assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Contains(t, rerr.Message, "scope=app") +} + +func TestBuildRoomQuery_UnknownScopeRejected(t *testing.T) { + req := model.SearchRoomsRequest{SearchText: "x", Scope: "orb"} + _, err := buildRoomQuery(req, "alice") + require.Error(t, err) + + var rerr *natsrouter.RouteError + require.True(t, errors.As(err, &rerr)) + assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Contains(t, rerr.Message, "unknown scope") +} + +func TestBuildRoomQuery_SortByScoreThenJoinedAtDesc(t *testing.T) { + req := model.SearchRoomsRequest{SearchText: "x"} + raw, err := buildRoomQuery(req, "alice") + require.NoError(t, err) + + sort := parseQuery(t, raw)["sort"].([]any) + require.Len(t, sort, 2) + assert.Equal(t, "_score", sort[0]) + joinedAt := sort[1].(map[string]any)["joinedAt"].(map[string]any) + assert.Equal(t, "desc", joinedAt["order"]) +} diff --git a/search-service/response.go b/search-service/response.go new file mode 100644 index 000000000..471558b80 --- /dev/null +++ b/search-service/response.go @@ -0,0 +1,100 @@ +package main + +import ( + "encoding/json" + "fmt" + "time" + + "github.com/hmchangw/chat/pkg/model" +) + +// rawHit is generic over the source type so both message and room +// results can share the response-envelope shape. +type rawHit[T any] struct { + Source T `json:"_source"` +} + +type rawResponse[T any] struct { + Hits struct { + Total struct { + Value int64 `json:"value"` + } `json:"total"` + Hits []rawHit[T] `json:"hits"` + } `json:"hits"` +} + +// messageSource mirrors MessageSearchIndex in search-sync-worker. The +// `tshow` flag is used by the restricted-room access clauses at query +// time and not surfaced in the response. +type messageSource struct { + MessageID string `json:"messageId"` + RoomID string `json:"roomId"` + SiteID string `json:"siteId"` + UserID string `json:"userId"` + UserAccount string `json:"userAccount"` + Content string `json:"content"` + CreatedAt time.Time `json:"createdAt"` + ThreadParentID string `json:"threadParentMessageId,omitempty"` + ThreadParentCreatedAt *time.Time `json:"threadParentMessageCreatedAt,omitempty"` +} + +// roomSource is the ES `_source` shape for a spotlight hit. +type roomSource struct { + RoomID string `json:"roomId"` + RoomName string `json:"roomName"` + RoomType string `json:"roomType"` + UserAccount string `json:"userAccount"` + SiteID string `json:"siteId"` + JoinedAt time.Time `json:"joinedAt"` +} + +func parseMessagesResponse(raw json.RawMessage) (*model.SearchMessagesResponse, error) { + var rr rawResponse[messageSource] + if err := json.Unmarshal(raw, &rr); err != nil { + return nil, fmt.Errorf("parse messages response: %w", err) + } + + out := &model.SearchMessagesResponse{ + Total: rr.Hits.Total.Value, + Results: make([]model.MessageSearchHit, 0, len(rr.Hits.Hits)), + } + for i := range rr.Hits.Hits { + src := &rr.Hits.Hits[i].Source + out.Results = append(out.Results, model.MessageSearchHit{ + MessageID: src.MessageID, + RoomID: src.RoomID, + SiteID: src.SiteID, + UserID: src.UserID, + UserAccount: src.UserAccount, + Content: src.Content, + CreatedAt: src.CreatedAt, + ThreadParentMessageID: src.ThreadParentID, + ThreadParentCreatedAt: src.ThreadParentCreatedAt, + }) + } + return out, nil +} + +func parseRoomsResponse(raw json.RawMessage) (*model.SearchRoomsResponse, error) { + var rr rawResponse[roomSource] + if err := json.Unmarshal(raw, &rr); err != nil { + return nil, fmt.Errorf("parse rooms response: %w", err) + } + + out := &model.SearchRoomsResponse{ + Total: rr.Hits.Total.Value, + Results: make([]model.RoomSearchHit, 0, len(rr.Hits.Hits)), + } + for i := range rr.Hits.Hits { + src := &rr.Hits.Hits[i].Source + out.Results = append(out.Results, model.RoomSearchHit{ + RoomID: src.RoomID, + RoomName: src.RoomName, + RoomType: src.RoomType, + UserAccount: src.UserAccount, + SiteID: src.SiteID, + JoinedAt: src.JoinedAt, + }) + } + return out, nil +} diff --git a/search-service/response_test.go b/search-service/response_test.go new file mode 100644 index 000000000..849707367 --- /dev/null +++ b/search-service/response_test.go @@ -0,0 +1,103 @@ +package main + +import ( + "encoding/json" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParseMessagesResponse_HappyPath(t *testing.T) { + body := json.RawMessage(`{ + "hits": { + "total": {"value": 2}, + "hits": [ + {"_source": { + "messageId": "m1", "roomId": "r1", "siteId": "site-a", + "userId": "u1", "userAccount": "alice", "content": "hello", + "createdAt": "2026-04-01T12:00:00Z" + }}, + {"_source": { + "messageId": "m2", "roomId": "r2", "siteId": "site-b", + "userId": "u2", "userAccount": "bob", "content": "world", + "createdAt": "2026-04-02T12:00:00Z", + "threadParentMessageId": "p1", + "threadParentMessageCreatedAt": "2026-04-02T11:00:00Z" + }} + ] + } + }`) + + resp, err := parseMessagesResponse(body) + require.NoError(t, err) + assert.EqualValues(t, 2, resp.Total) + require.Len(t, resp.Results, 2) + + assert.Equal(t, "m1", resp.Results[0].MessageID) + assert.Equal(t, "alice", resp.Results[0].UserAccount) + assert.Empty(t, resp.Results[0].ThreadParentMessageID) + assert.Nil(t, resp.Results[0].ThreadParentCreatedAt) + + assert.Equal(t, "p1", resp.Results[1].ThreadParentMessageID) + require.NotNil(t, resp.Results[1].ThreadParentCreatedAt) + want := time.Date(2026, 4, 2, 11, 0, 0, 0, time.UTC) + assert.True(t, resp.Results[1].ThreadParentCreatedAt.Equal(want)) +} + +func TestParseMessagesResponse_Empty(t *testing.T) { + body := json.RawMessage(`{"hits":{"total":{"value":0},"hits":[]}}`) + resp, err := parseMessagesResponse(body) + require.NoError(t, err) + assert.EqualValues(t, 0, resp.Total) + assert.Empty(t, resp.Results) +} + +func TestParseMessagesResponse_Malformed(t *testing.T) { + _, err := parseMessagesResponse(json.RawMessage(`{not json`)) + assert.Error(t, err) +} + +func TestParseRoomsResponse_HappyPath(t *testing.T) { + body := json.RawMessage(`{ + "hits": { + "total": {"value": 2}, + "hits": [ + {"_source": { + "roomId": "r1", "roomName": "general", "roomType": "p", + "userAccount": "alice", "siteId": "site-a", + "joinedAt": "2026-04-01T12:00:00Z" + }}, + {"_source": { + "roomId": "r2", "roomName": "alice-bob", "roomType": "d", + "userAccount": "alice", "siteId": "site-a", + "joinedAt": "2026-04-02T12:00:00Z" + }} + ] + } + }`) + + resp, err := parseRoomsResponse(body) + require.NoError(t, err) + assert.EqualValues(t, 2, resp.Total) + require.Len(t, resp.Results, 2) + + assert.Equal(t, "r1", resp.Results[0].RoomID) + assert.Equal(t, "p", resp.Results[0].RoomType) + assert.Equal(t, "r2", resp.Results[1].RoomID) + assert.Equal(t, "d", resp.Results[1].RoomType) +} + +func TestParseRoomsResponse_Empty(t *testing.T) { + body := json.RawMessage(`{"hits":{"total":{"value":0},"hits":[]}}`) + resp, err := parseRoomsResponse(body) + require.NoError(t, err) + assert.EqualValues(t, 0, resp.Total) + assert.Empty(t, resp.Results) +} + +func TestParseRoomsResponse_Malformed(t *testing.T) { + _, err := parseRoomsResponse(json.RawMessage(`{`)) + assert.Error(t, err) +} diff --git a/search-service/store.go b/search-service/store.go new file mode 100644 index 000000000..28e26c430 --- /dev/null +++ b/search-service/store.go @@ -0,0 +1,31 @@ +package main + +import ( + "context" + "encoding/json" + "time" +) + +//go:generate mockgen -source=store.go -destination=mock_store_test.go -package=main + +type SearchStore interface { + Search(ctx context.Context, indices []string, body json.RawMessage) (json.RawMessage, error) + GetUserRoomDoc(ctx context.Context, account string) (UserRoomDoc, bool, error) +} + +// RestrictedRoomCache stores only the restricted-rooms map (rid → HSS +// millis). The unrestricted rooms[] array is always resolved via ES +// terms-lookup at query time, so no local copy is needed. +type RestrictedRoomCache interface { + GetRestricted(ctx context.Context, account string) (map[string]int64, bool, error) + SetRestricted(ctx context.Context, account string, rooms map[string]int64, ttl time.Duration) error +} + +// UserRoomDoc mirrors the subset of the user-room ES doc that +// search-service reads. Fields must stay in sync with the upsert shape +// in search-sync-worker/user_room.go userRoomUpsertDoc. +type UserRoomDoc struct { + UserAccount string `json:"userAccount"` + Rooms []string `json:"rooms"` + RestrictedRooms map[string]int64 `json:"restrictedRooms"` +} diff --git a/search-service/store_es.go b/search-service/store_es.go new file mode 100644 index 000000000..1da83228b --- /dev/null +++ b/search-service/store_es.go @@ -0,0 +1,70 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" +) + +// UserRoomIndex is the default index holding per-user access-control docs. +// The sync-worker uses a site-qualified name internally, but the search +// service reaches it via a stable alias — one name across cluster +// topologies. Overridable via SEARCH_USER_ROOM_INDEX. +const UserRoomIndex = "user-room" + +// esEngine is the narrow slice of pkg/searchengine.SearchEngine the +// store uses — declared at the consumer so unit tests can stub without +// satisfying the full SearchEngine contract. +type esEngine interface { + Search(ctx context.Context, indices []string, body json.RawMessage) (json.RawMessage, error) + GetDoc(ctx context.Context, index, docID string) (json.RawMessage, bool, error) +} + +type esStore struct { + engine esEngine + userRoomIndex string +} + +func newESStore(engine esEngine, userRoomIndex string) *esStore { + return &esStore{engine: engine, userRoomIndex: resolveUserRoomIndex(userRoomIndex)} +} + +// resolveUserRoomIndex falls back to UserRoomIndex when empty. Kept as a +// single normalization point so both newESStore and termsLookupClause +// consult the same default without repeating the `if == ""` branch. +func resolveUserRoomIndex(name string) string { + if name == "" { + return UserRoomIndex + } + return name +} + +func (s *esStore) Search(ctx context.Context, indices []string, body json.RawMessage) (json.RawMessage, error) { + raw, err := s.engine.Search(ctx, indices, body) + if err != nil { + return nil, fmt.Errorf("es store search: %w", err) + } + return raw, nil +} + +// GetUserRoomDoc fetches the access-control doc. On a 404 (doc absent or +// index missing) returns (zero, false, nil) so the handler can populate an +// empty-map cache entry instead of erroring. +func (s *esStore) GetUserRoomDoc(ctx context.Context, account string) (UserRoomDoc, bool, error) { + raw, found, err := s.engine.GetDoc(ctx, s.userRoomIndex, account) + if err != nil { + return UserRoomDoc{}, false, fmt.Errorf("get user-room doc: %w", err) + } + if !found { + return UserRoomDoc{}, false, nil + } + + // ES wraps the document in `{ _source: … }`; extract it. + var wrapper struct { + Source UserRoomDoc `json:"_source"` + } + if err := json.Unmarshal(raw, &wrapper); err != nil { + return UserRoomDoc{}, false, fmt.Errorf("decode user-room doc: %w", err) + } + return wrapper.Source, true, nil +} diff --git a/search-service/store_es_test.go b/search-service/store_es_test.go new file mode 100644 index 000000000..e50b2b5e8 --- /dev/null +++ b/search-service/store_es_test.go @@ -0,0 +1,105 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// stubEngine implements the local esEngine interface (Search + GetDoc). +type stubEngine struct { + searchBody json.RawMessage + searchIndices []string + searchReqBody json.RawMessage + searchErr error + + docBody json.RawMessage + docFound bool + docErr error + docIndex string + docDocID string + docCallCnt int +} + +func (s *stubEngine) Search(_ context.Context, indices []string, body json.RawMessage) (json.RawMessage, error) { + s.searchIndices = indices + // Copy the body so a caller that reuses its buffer can't mutate the + // captured assertion value. + s.searchReqBody = append(json.RawMessage(nil), body...) + return s.searchBody, s.searchErr +} + +func (s *stubEngine) GetDoc(_ context.Context, index, id string) (json.RawMessage, bool, error) { + s.docIndex, s.docDocID = index, id + s.docCallCnt++ + return s.docBody, s.docFound, s.docErr +} + +func TestESStore_Search_DelegatesToEngine(t *testing.T) { + eng := &stubEngine{searchBody: json.RawMessage(`{"ok":true}`)} + s := newESStore(eng, "user-room") + + body := json.RawMessage(`{"query":{"match_all":{}}}`) + raw, err := s.Search(context.Background(), []string{"messages-*"}, body) + require.NoError(t, err) + assert.JSONEq(t, `{"ok":true}`, string(raw)) + assert.Equal(t, []string{"messages-*"}, eng.searchIndices) + assert.JSONEq(t, string(body), string(eng.searchReqBody), + "request body must be forwarded to the engine unmodified") +} + +func TestESStore_GetUserRoomDoc_Found(t *testing.T) { + eng := &stubEngine{ + docBody: json.RawMessage(`{"_source":{"userAccount":"alice","rooms":["r1"],"restrictedRooms":{"rr":42}}}`), + docFound: true, + } + s := newESStore(eng, "user-room") + + doc, found, err := s.GetUserRoomDoc(context.Background(), "alice") + require.NoError(t, err) + require.True(t, found) + assert.Equal(t, "alice", doc.UserAccount) + assert.Equal(t, []string{"r1"}, doc.Rooms) + assert.Equal(t, map[string]int64{"rr": 42}, doc.RestrictedRooms) + + assert.Equal(t, "user-room", eng.docIndex) + assert.Equal(t, "alice", eng.docDocID) +} + +func TestESStore_GetUserRoomDoc_NotFound(t *testing.T) { + eng := &stubEngine{docFound: false} + s := newESStore(eng, "user-room") + + _, found, err := s.GetUserRoomDoc(context.Background(), "ghost") + require.NoError(t, err) + assert.False(t, found) +} + +func TestESStore_GetUserRoomDoc_TransportError(t *testing.T) { + eng := &stubEngine{docErr: errors.New("boom")} + s := newESStore(eng, "user-room") + + _, _, err := s.GetUserRoomDoc(context.Background(), "alice") + assert.Error(t, err) +} + +func TestESStore_GetUserRoomDoc_MalformedBody(t *testing.T) { + eng := &stubEngine{docBody: json.RawMessage(`{not json`), docFound: true} + s := newESStore(eng, "user-room") + + _, _, err := s.GetUserRoomDoc(context.Background(), "alice") + assert.Error(t, err) +} + +func TestESStore_UsesDefaultIndexWhenEmpty(t *testing.T) { + eng := &stubEngine{docFound: false} + s := newESStore(eng, "") + + _, _, err := s.GetUserRoomDoc(context.Background(), "alice") + require.NoError(t, err) + assert.Equal(t, UserRoomIndex, eng.docIndex) +} diff --git a/search-service/store_valkey.go b/search-service/store_valkey.go new file mode 100644 index 000000000..936b79019 --- /dev/null +++ b/search-service/store_valkey.go @@ -0,0 +1,54 @@ +package main + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/hmchangw/chat/pkg/valkeyutil" +) + +// valkeyCache keys are namespaced under `searchservice:restrictedrooms:` +// so the cache can coexist with other services on the same Valkey. +type valkeyCache struct { + client valkeyutil.Client +} + +func newValkeyCache(client valkeyutil.Client) *valkeyCache { + return &valkeyCache{client: client} +} + +func restrictedKey(account string) string { + return fmt.Sprintf("searchservice:restrictedrooms:%s", account) +} + +// GetRestricted returns the cached restricted-rooms map for `account`. +// hit=false covers both "key absent" and "transport failure" — the handler +// treats both as cache misses and falls through to ES. Transport-level +// errors are returned via `err` so the handler can log them for visibility; +// hit=true implies err==nil. +func (c *valkeyCache) GetRestricted(ctx context.Context, account string) (map[string]int64, bool, error) { + var rooms map[string]int64 + err := valkeyutil.GetJSON(ctx, c.client, restrictedKey(account), &rooms) + if errors.Is(err, valkeyutil.ErrCacheMiss) { + return nil, false, nil + } + if err != nil { + return nil, false, fmt.Errorf("cache get restricted: %w", err) + } + if rooms == nil { + rooms = map[string]int64{} + } + return rooms, true, nil +} + +func (c *valkeyCache) SetRestricted(ctx context.Context, account string, rooms map[string]int64, ttl time.Duration) error { + if rooms == nil { + rooms = map[string]int64{} + } + if err := valkeyutil.SetJSONWithTTL(ctx, c.client, restrictedKey(account), rooms, ttl); err != nil { + return fmt.Errorf("cache set restricted: %w", err) + } + return nil +} diff --git a/search-service/store_valkey_test.go b/search-service/store_valkey_test.go new file mode 100644 index 000000000..d18cacfe6 --- /dev/null +++ b/search-service/store_valkey_test.go @@ -0,0 +1,122 @@ +package main + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/valkeyutil" +) + +// stubValkey is an in-memory stand-in for valkeyutil.Client — only the +// methods the valkeyCache actually uses are implemented. +type stubValkey struct { + store map[string]string + getErr error + setErr error + lastTTL time.Duration + setCalls int +} + +func newStubValkey() *stubValkey { + return &stubValkey{store: map[string]string{}} +} + +func (s *stubValkey) Get(_ context.Context, key string) (string, error) { + if s.getErr != nil { + return "", s.getErr + } + v, ok := s.store[key] + if !ok { + return "", valkeyutil.ErrCacheMiss + } + return v, nil +} + +func (s *stubValkey) Set(_ context.Context, key, value string, ttl time.Duration) error { + s.setCalls++ + s.lastTTL = ttl + if s.setErr != nil { + return s.setErr + } + s.store[key] = value + return nil +} + +func (s *stubValkey) Del(_ context.Context, keys ...string) error { + for _, k := range keys { + delete(s.store, k) + } + return nil +} + +func (s *stubValkey) Close() error { return nil } + +func TestValkeyCache_SetThenGet(t *testing.T) { + ctx := context.Background() + c := newValkeyCache(newStubValkey()) + + require.NoError(t, c.SetRestricted(ctx, "alice", map[string]int64{"r1": 100}, time.Minute)) + got, hit, err := c.GetRestricted(ctx, "alice") + require.NoError(t, err) + assert.True(t, hit) + assert.Equal(t, map[string]int64{"r1": 100}, got) +} + +func TestValkeyCache_GetMiss(t *testing.T) { + c := newValkeyCache(newStubValkey()) + got, hit, err := c.GetRestricted(context.Background(), "nobody") + require.NoError(t, err) + assert.False(t, hit) + assert.Nil(t, got) +} + +func TestValkeyCache_GetTransportError(t *testing.T) { + stub := newStubValkey() + stub.getErr = errors.New("conn refused") + c := newValkeyCache(stub) + + _, hit, err := c.GetRestricted(context.Background(), "alice") + assert.False(t, hit) + assert.Error(t, err) +} + +func TestValkeyCache_SetError(t *testing.T) { + stub := newStubValkey() + stub.setErr = errors.New("disk full") + c := newValkeyCache(stub) + + err := c.SetRestricted(context.Background(), "alice", map[string]int64{}, time.Minute) + assert.Error(t, err) +} + +func TestValkeyCache_SetNilMapBecomesEmpty(t *testing.T) { + stub := newStubValkey() + c := newValkeyCache(stub) + + require.NoError(t, c.SetRestricted(context.Background(), "alice", nil, time.Minute)) + // Read back the stored value — should be `{}` (marshalled empty map), + // not `null`, so a subsequent cache hit returns an empty map rather + // than a nil map that the handler would fall through on. + assert.Equal(t, "{}", stub.store[restrictedKey("alice")]) +} + +func TestValkeyCache_GetJSONNullYieldsEmptyMap(t *testing.T) { + stub := newStubValkey() + stub.store[restrictedKey("alice")] = "null" + c := newValkeyCache(stub) + + got, hit, err := c.GetRestricted(context.Background(), "alice") + require.NoError(t, err) + assert.True(t, hit) + assert.NotNil(t, got) + assert.Empty(t, got) +} + +func TestRestrictedKey_Format(t *testing.T) { + assert.Equal(t, "searchservice:restrictedrooms:alice", restrictedKey("alice")) +} From 9656d9f7b526889047ace806ecf50b3e89c34440 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 06:34:48 +0000 Subject: [PATCH 4/6] test(search-service): CCS integration tests for unrestricted + restricted rooms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit testcontainers-go spins up two Elasticsearch 8.17 nodes on a shared Docker network, NATS, and Valkey. The setup fixture: - Creates both ES clusters with the same cluster.name/node.name so transport discovery works via network aliases (es-local, es-remote). - Wires remote1 on the local cluster in proxy mode (cluster.remote.remote1.mode=proxy, cluster.remote.remote1.proxy_address=es-remote:9300) — proxy, not sniff, because docker bridge networking mirrors the k8s-with-ingress topology where remote ES pods aren't directly routable from the local cluster. Polls /_remote/info until connected=true before proceeding (the settings PUT returns immediately but the transport handshake is async). - Uploads the messages-* and user-room templates on the relevant clusters, then seeds data. Two tests: TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted: seeds one message per cluster; asserts the response includes hits from both with correct siteId's — the core CCS promise. TestSearchService_SearchMessages_CCS_CrossCluster_Restricted: alice is a member of one unrestricted local room and one restricted remote room with HSS set. Remote seeds cover every branch Clause A and Clause B exercise: pre-HSS parent → MUST NOT match post-HSS parent → match via A post-HSS reply, tshow=true, pre-HSS parent → match via B1 post-HSS reply, tshow=false, pre-HSS parent → MUST NOT match Asserts the expected 3 hits and validates Clause B's outer gate doesn't leak pre-HSS history. Both tests drive the full service: real natsrouter, real valkeyutil, real searchengine adapter — the request traverses NATS → router → handler → Valkey cache → ES terms-lookup → merged CCS search. --- search-service/integration_test.go | 633 +++++++++++++++++++++++++++++ 1 file changed, 633 insertions(+) create mode 100644 search-service/integration_test.go diff --git a/search-service/integration_test.go b/search-service/integration_test.go new file mode 100644 index 000000000..003ed47e8 --- /dev/null +++ b/search-service/integration_test.go @@ -0,0 +1,633 @@ +//go:build integration + +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "testing" + "time" + + "github.com/nats-io/nats.go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/testcontainers/testcontainers-go" + natsmod "github.com/testcontainers/testcontainers-go/modules/nats" + "github.com/testcontainers/testcontainers-go/network" + "github.com/testcontainers/testcontainers-go/wait" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/searchengine" + "github.com/hmchangw/chat/pkg/subject" + "github.com/hmchangw/chat/pkg/valkeyutil" +) + +// --- Fixture ----------------------------------------------------------------- + +// ccsFixture is the full stack for cross-cluster integration tests: two ES +// containers on a shared Docker network (with CCS configured from local → +// remote), plus Valkey and NATS, plus the wired search-service router. +// +// localURL / remoteURL are the host-mapped HTTP URLs for seeding; the +// search-service itself sees only localURL. `clientNATS` is the raw NATS +// client used to issue request/reply calls. +type ccsFixture struct { + localURL string + remoteURL string + localES searchengine.SearchEngine + remoteES searchengine.SearchEngine + clientNATS *nats.Conn +} + +// setupCCSFixture stands up the whole CCS environment. Total cost is ~ES +// container start × 2 (~60-90s) so tests that use it should reuse via +// TestMain when added. +// +// Every major step emits a `t.Logf` so a CI failure (where raw logs are +// often opaque on public runs) leaves enough breadcrumbs in the `go test` +// output to pinpoint which phase broke. +func setupCCSFixture(t *testing.T) *ccsFixture { + t.Helper() + ctx := context.Background() + + t.Logf("CCS fixture: creating docker network") + nw, err := network.New(ctx) + require.NoError(t, err, "create docker network") + t.Cleanup(func() { _ = nw.Remove(ctx) }) + t.Logf("CCS fixture: network %q created", nw.Name) + + t.Logf("CCS fixture: starting remote ES container (alias=es-remote)") + remoteURL := startESForCCS(t, nw, "es-remote", "remote-cluster") + t.Logf("CCS fixture: remote ES up at %s", remoteURL) + + t.Logf("CCS fixture: starting local ES container (alias=es-local)") + localURL := startESForCCS(t, nw, "es-local", "local-cluster") + t.Logf("CCS fixture: local ES up at %s", localURL) + + // Wire local ES to reach the remote in PROXY mode. Proxy mode opens a + // single direct connection to the configured address and skips the + // sniff-then-reconnect dance that sniff mode does — that dance requires + // each remote node to advertise a reachable publish address, which is + // fragile when docker containers bind transport on 0.0.0.0 and the + // publish address defaults to an interface the peer can't route to. + // Proxy mode is the robust choice for CCS over an ephemeral docker + // network. Ref: ES docs "Remote cluster settings" → `mode=proxy`. + t.Logf("CCS fixture: configuring cluster.remote.remote1 (proxy mode → es-remote:9300)") + putClusterSetting(t, localURL, map[string]any{ + "persistent": map[string]any{ + "cluster.remote.remote1.mode": "proxy", + "cluster.remote.remote1.proxy_address": "es-remote:9300", + }, + }) + t.Logf("CCS fixture: waiting for remote1 to report connected=true (timeout 120s)") + waitForRemoteConnected(t, localURL, "remote1", 120*time.Second) + t.Logf("CCS fixture: remote1 connected") + + localEngine, err := searchengine.New(ctx, "elasticsearch", localURL) + require.NoError(t, err, "build searchengine for local") + remoteEngine, err := searchengine.New(ctx, "elasticsearch", remoteURL) + require.NoError(t, err, "build searchengine for remote") + + t.Logf("CCS fixture: starting valkey") + valkeyAddr := startValkey(t) + valkeyClient, err := valkeyutil.Connect(ctx, valkeyAddr, "") + require.NoError(t, err, "connect valkey") + t.Cleanup(func() { valkeyutil.Disconnect(valkeyClient) }) + t.Logf("CCS fixture: valkey at %s", valkeyAddr) + + t.Logf("CCS fixture: starting NATS") + natsURL := startNATS(t) + serverNC, err := natsutil.Connect(natsURL, "") + require.NoError(t, err, "connect nats (server side)") + t.Cleanup(func() { _ = serverNC.Drain() }) + + clientNC, err := nats.Connect(natsURL) + require.NoError(t, err, "connect nats (client side)") + t.Cleanup(func() { clientNC.Close() }) + t.Logf("CCS fixture: NATS at %s", natsURL) + + // Thread the same index name through both the store and handlerConfig + // so the test exercises the full SEARCH_USER_ROOM_INDEX wiring path + // (store.GetUserRoomDoc → ES index; query builder → terms-lookup index). + userRoomIndex := UserRoomIndex + store := newESStore(localEngine, userRoomIndex) + cache := newValkeyCache(valkeyClient) + handler := newHandler(store, cache, handlerConfig{ + DocCounts: 25, + MaxDocCounts: 100, + RestrictedRoomsCacheTTL: 5 * time.Minute, + RecentWindow: 365 * 24 * time.Hour, + UserRoomIndex: userRoomIndex, + }) + + router := natsrouter.New(serverNC, "search-service-test") + router.Use(natsrouter.RequestID()) + handler.Register(router) + + return &ccsFixture{ + localURL: localURL, + remoteURL: remoteURL, + localES: localEngine, + remoteES: remoteEngine, + clientNATS: clientNC, + } +} + +// startESForCCS starts one ES node on the shared network with the given +// network alias so the peer can reach it at `{alias}:9300`. Returns the +// host-mapped HTTP URL for seeding. +// +// `transport.host: 0.0.0.0` is required so the transport port binds on all +// interfaces, including the bridge network (ES 8.x defaults to `_site_` +// which excludes the container's bridge IP in some setups). CCS itself +// uses `proxy` mode to avoid publish-address sensitivity — see +// setupCCSFixture. `xpack.security.enabled=false` matches the local dev +// deps compose. +func startESForCCS(t *testing.T, nw *testcontainers.DockerNetwork, alias, clusterName string) string { + t.Helper() + ctx := context.Background() + + container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: testcontainers.ContainerRequest{ + Image: "elasticsearch:8.17.0", + ExposedPorts: []string{"9200/tcp", "9300/tcp"}, + Networks: []string{nw.Name}, + NetworkAliases: map[string][]string{ + nw.Name: {alias}, + }, + Env: map[string]string{ + "cluster.name": clusterName, + "discovery.type": "single-node", + "xpack.security.enabled": "false", + "network.host": "0.0.0.0", + "transport.host": "0.0.0.0", + "cluster.routing.allocation.disk.threshold_enabled": "false", + "ES_JAVA_OPTS": "-Xms512m -Xmx512m", + }, + WaitingFor: wait.ForAll( + wait.ForHTTP("/").WithPort("9200/tcp").WithStartupTimeout(120*time.Second), + wait.ForHTTP("/_cluster/health?wait_for_status=yellow&timeout=60s"). + WithPort("9200/tcp"). + WithStartupTimeout(120*time.Second), + ), + }, + Started: true, + }) + require.NoError(t, err, "start elasticsearch (%s)", alias) + t.Cleanup(func() { _ = container.Terminate(ctx) }) + + host, err := container.Host(ctx) + require.NoError(t, err) + port, err := container.MappedPort(ctx, "9200") + require.NoError(t, err) + return fmt.Sprintf("http://%s:%s", host, port.Port()) +} + +func startValkey(t *testing.T) string { + t.Helper() + ctx := context.Background() + container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: testcontainers.ContainerRequest{ + Image: "valkey/valkey:8-alpine", + ExposedPorts: []string{"6379/tcp"}, + Cmd: []string{"valkey-server", "--save", "", "--appendonly", "no"}, + WaitingFor: wait.ForLog("Ready to accept connections").WithStartupTimeout(30 * time.Second), + }, + Started: true, + }) + require.NoError(t, err, "start valkey") + t.Cleanup(func() { _ = container.Terminate(ctx) }) + + host, err := container.Host(ctx) + require.NoError(t, err) + port, err := container.MappedPort(ctx, "6379") + require.NoError(t, err) + return fmt.Sprintf("%s:%s", host, port.Port()) +} + +func startNATS(t *testing.T) string { + t.Helper() + ctx := context.Background() + c, err := natsmod.Run(ctx, "nats:2.11-alpine") + require.NoError(t, err, "start nats") + t.Cleanup(func() { _ = c.Terminate(ctx) }) + + url, err := c.ConnectionString(ctx) + require.NoError(t, err, "nats connection string") + return url +} + +// --- Index templates --------------------------------------------------------- + +// buildTestTemplate wraps a pattern + property map with single-node-friendly +// index settings (1 shard, 0 replicas, 1s refresh) and `dynamic: false` +// mappings. The templates below hand-roll their property sets so the tests +// remain independent of search-sync-worker's custom-analyzer configuration. +func buildTestTemplate(pattern string, properties map[string]any) json.RawMessage { + body := map[string]any{ + "index_patterns": []string{pattern}, + "template": map[string]any{ + "settings": map[string]any{ + "index": map[string]any{ + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "1s", + }, + }, + "mappings": map[string]any{ + "dynamic": false, + "properties": properties, + }, + }, + } + data, _ := json.Marshal(body) + return data +} + +func messageTestTemplate() json.RawMessage { + return buildTestTemplate("messages-*", map[string]any{ + "messageId": map[string]any{"type": "keyword"}, + "roomId": map[string]any{"type": "keyword"}, + "siteId": map[string]any{"type": "keyword"}, + "userId": map[string]any{"type": "keyword"}, + "userAccount": map[string]any{"type": "keyword"}, + "content": map[string]any{ + "type": "text", + "fields": map[string]any{ + "keyword": map[string]any{"type": "keyword"}, + }, + }, + "createdAt": map[string]any{"type": "date"}, + "threadParentMessageId": map[string]any{"type": "keyword"}, + "threadParentMessageCreatedAt": map[string]any{"type": "date"}, + "tshow": map[string]any{"type": "boolean"}, + }) +} + +func userRoomTestTemplate() json.RawMessage { + return buildTestTemplate(UserRoomIndex, map[string]any{ + "userAccount": map[string]any{"type": "keyword"}, + "rooms": map[string]any{ + "type": "text", + "fields": map[string]any{ + "keyword": map[string]any{"type": "keyword", "ignore_above": 256}, + }, + }, + "restrictedRooms": map[string]any{"type": "flattened"}, + "roomTimestamps": map[string]any{"type": "flattened"}, + "createdAt": map[string]any{"type": "date"}, + "updatedAt": map[string]any{"type": "date"}, + }) +} + +// --- HTTP helpers ------------------------------------------------------------ + +// testHTTPClient is a bounded HTTP client for ES control-plane calls — +// stalled containers shouldn't be able to hang the integration job past +// the per-call deadline. Kept small on purpose: these calls hit localhost +// (docker-mapped port) and are cheap when they succeed. +var testHTTPClient = &http.Client{Timeout: 10 * time.Second} + +// putClusterSetting pushes a /_cluster/settings update. Used to configure +// the CCS remote after both clusters are up. +func putClusterSetting(t *testing.T, esURL string, body map[string]any) { + t.Helper() + data, _ := json.Marshal(body) + req, err := http.NewRequest(http.MethodPut, esURL+"/_cluster/settings", bytes.NewReader(data)) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + resp, err := testHTTPClient.Do(req) + require.NoError(t, err, "put cluster settings") + defer resp.Body.Close() + respBody, _ := io.ReadAll(resp.Body) + require.Equal(t, http.StatusOK, resp.StatusCode, "put cluster settings: %s", respBody) +} + +// waitForRemoteConnected polls /_remote/info until the given remote cluster +// reports connected=true. CCS registration is async — the settings call +// returns immediately but the transport handshake happens in the +// background. On timeout, the last-seen /_remote/info body is captured in +// the failure message so CI can diagnose whether the remote was ever +// registered, what mode it ended up in, or why it couldn't connect. +func waitForRemoteConnected(t *testing.T, localURL, remoteName string, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + var lastBody string + for time.Now().Before(deadline) { + resp, err := testHTTPClient.Get(localURL + "/_remote/info") + if err == nil { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + lastBody = string(body) + var info map[string]struct { + Connected bool `json:"connected"` + } + if json.Unmarshal(body, &info) == nil { + if entry, ok := info[remoteName]; ok && entry.Connected { + return + } + } + } + time.Sleep(1 * time.Second) + } + t.Fatalf("remote cluster %q never became connected within %s\nlast /_remote/info body: %s", + remoteName, timeout, lastBody) +} + +// seedDoc PUTs a JSON document into ES, synchronously refreshing the index +// so the next search sees it. +func seedDoc(t *testing.T, esURL, index, id string, doc any) { + t.Helper() + data, err := json.Marshal(doc) + require.NoError(t, err) + url := fmt.Sprintf("%s/%s/_doc/%s?refresh=true", esURL, index, id) + req, err := http.NewRequest(http.MethodPut, url, bytes.NewReader(data)) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + resp, err := testHTTPClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + body, _ := io.ReadAll(resp.Body) + require.Truef(t, resp.StatusCode == http.StatusCreated || resp.StatusCode == http.StatusOK, + "seedDoc %s/%s: status=%d body=%s", index, id, resp.StatusCode, body) +} + +// --- Templates on both clusters --------------------------------------------- + +func (f *ccsFixture) installTemplates(t *testing.T) { + t.Helper() + ctx := context.Background() + + t.Logf("templates: upserting messages_template on local") + require.NoError(t, f.localES.UpsertTemplate(ctx, "messages_template", messageTestTemplate()), + "upsert messages_template on local") + t.Logf("templates: upserting messages_template on remote") + require.NoError(t, f.remoteES.UpsertTemplate(ctx, "messages_template", messageTestTemplate()), + "upsert messages_template on remote") + // user-room is local-only per the search-service architecture. + t.Logf("templates: upserting user_room_template on local") + require.NoError(t, f.localES.UpsertTemplate(ctx, "user_room_template", userRoomTestTemplate()), + "upsert user_room_template on local") + t.Logf("templates: all upserted") +} + +// --- Test -------------------------------------------------------------------- + +// TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted verifies +// the core CCS promise: a user's search crosses from the local cluster +// (`messages-*`) to a remote cluster (`*:messages-*`) and the service +// returns the merged result set. Both rooms are unrestricted — they live in +// the user-room doc's `rooms[]` — and the terms-lookup clause handles them +// uniformly regardless of which site hosts the message. +func TestSearchService_SearchMessages_CCS_CrossCluster_Unrestricted(t *testing.T) { + f := setupCCSFixture(t) + f.installTemplates(t) + + // --- Seed -------------------------------------------------------------- + // + // Alice is a member of two unrestricted rooms: one lives on the local + // site, the other on the remote site. The user-room doc (local-only) + // lists BOTH in `rooms[]` — the sync-worker would normally populate + // this via INBOX events; here we seed directly. + const account = "alice" + const localRoomID = "room-local-1" + const remoteRoomID = "room-remote-1" + + now := time.Now().UTC() + createdAt := now.Add(-time.Hour) + monthIdx := "messages-" + createdAt.Format("2006-01") + + // user-room doc: unrestricted memberships in both rooms. + seedDoc(t, f.localURL, UserRoomIndex, account, map[string]any{ + "userAccount": account, + "rooms": []string{localRoomID, remoteRoomID}, + "restrictedRooms": map[string]int64{}, + "roomTimestamps": map[string]int64{ + localRoomID: createdAt.UnixMilli(), + remoteRoomID: createdAt.UnixMilli(), + }, + "createdAt": createdAt.Format(time.RFC3339Nano), + "updatedAt": createdAt.Format(time.RFC3339Nano), + }) + + // Local message in local room. + seedDoc(t, f.localURL, monthIdx, "msg-local-1", map[string]any{ + "messageId": "msg-local-1", + "roomId": localRoomID, + "siteId": "site-local", + "userId": "user-bob", + "userAccount": "bob", + "content": "hello from local", + "createdAt": createdAt.Format(time.RFC3339Nano), + }) + + // Remote message in remote room. Same index pattern (`messages-*`) on + // the remote cluster — CCS resolves the `*:messages-*` segment on the + // local query. + seedDoc(t, f.remoteURL, monthIdx, "msg-remote-1", map[string]any{ + "messageId": "msg-remote-1", + "roomId": remoteRoomID, + "siteId": "site-remote", + "userId": "user-carol", + "userAccount": "carol", + "content": "hello from remote", + "createdAt": createdAt.Format(time.RFC3339Nano), + }) + + // --- Search via NATS --------------------------------------------------- + // + // Round-trips through the real natsrouter: the handler reads + // restrictedRooms from Valkey (miss → ES prefetch → Valkey SET), then + // builds the CCS query against `messages-*,*:messages-*` and parses + // the merged response. + req := model.SearchMessagesRequest{SearchText: "hello"} + reqData, err := json.Marshal(req) + require.NoError(t, err) + + // Generous timeout: first request is Valkey miss → ES prefetch of + // user-room doc → CCS fanout → response parse. Tight timeouts mask + // real latency bugs in integration. + msg, err := f.clientNATS.Request(subject.SearchMessages(account), reqData, 30*time.Second) + require.NoError(t, err, "NATS request failed") + + t.Logf("response: %s", msg.Data) + + var resp model.SearchMessagesResponse + require.NoError(t, json.Unmarshal(msg.Data, &resp), "decode response: %s", msg.Data) + + assert.EqualValues(t, 2, resp.Total, "expected both local + remote hits; got body=%s", msg.Data) + require.Len(t, resp.Results, 2, "expected 2 hits; got body=%s", msg.Data) + + gotRooms := map[string]string{} + for _, hit := range resp.Results { + gotRooms[hit.RoomID] = hit.SiteID + } + assert.Equal(t, "site-local", gotRooms[localRoomID], "local message should be present") + assert.Equal(t, "site-remote", gotRooms[remoteRoomID], "remote message should be present via CCS") +} + +// TestSearchService_SearchMessages_CCS_CrossCluster_Restricted verifies +// the restricted-room access-control clauses fire correctly across the +// CCS boundary. Alice is a member of one UNRESTRICTED local room and one +// RESTRICTED remote room with historySharedSince (HSS) set to a specific +// cutoff. The user-room doc (local-only) routes the remote room into +// `restrictedRooms{rid: hssMillis}`. +// +// Seed on the remote cluster covers every branch the query builder +// encodes for restricted rooms: +// +// - pre-HSS parent → MUST NOT match (Clause A: createdAt < hss) +// - post-HSS parent → MUST match (Clause A) +// - post-HSS thread reply, tshow=true → MUST match (Clause B1: outer gate passes + tshow=true fires B1, even though parent is pre-HSS) +// - post-HSS thread reply, tshow=false → MUST NOT match (Clause B fails: outer gate passes but inner OR fails — tshow=false AND parent < hss so B2 also fails) +// +// Plus one unrestricted local parent to prove the two paths interact +// cleanly on the same search. Total expected hits: 3 (local + post-HSS +// remote parent + post-HSS remote reply with tshow=true). +func TestSearchService_SearchMessages_CCS_CrossCluster_Restricted(t *testing.T) { + f := setupCCSFixture(t) + f.installTemplates(t) + + const account = "alice" + const localRoomID = "room-local-unrestricted" + const remoteRoomID = "room-remote-restricted" + + // Temporal setup: + // - hss is the user's join-time bound for the restricted remote room. + // - preHSS is 3 hours before hss (so pre-HSS messages are clearly + // older than the gate). + // - postHSS is 1 hour after hss. + // All well within the default 1-year `recent_window` so none of them + // get filtered out by the global createdAt range filter. + now := time.Now().UTC() + hss := now.Add(-2 * time.Hour) + preHSS := hss.Add(-3 * time.Hour) + postHSS := hss.Add(time.Hour) + monthIdxFor := func(ts time.Time) string { return "messages-" + ts.Format("2006-01") } + + // user-room doc: local room unrestricted, remote room restricted with hss. + t.Logf("seed: upserting user-room doc for %s (restricted %s since %s)", account, remoteRoomID, hss.Format(time.RFC3339)) + seedDoc(t, f.localURL, UserRoomIndex, account, map[string]any{ + "userAccount": account, + "rooms": []string{localRoomID}, + "restrictedRooms": map[string]int64{ + remoteRoomID: hss.UnixMilli(), + }, + "roomTimestamps": map[string]int64{ + localRoomID: now.UnixMilli(), + remoteRoomID: now.UnixMilli(), + }, + "createdAt": now.Format(time.RFC3339Nano), + "updatedAt": now.Format(time.RFC3339Nano), + }) + + // --- LOCAL unrestricted room ---------------------------------------- + // One plain message that should always match via the terms-lookup + // branch (no HSS involved). + t.Logf("seed: local unrestricted message in %s", localRoomID) + seedDoc(t, f.localURL, monthIdxFor(postHSS), "msg-local-1", map[string]any{ + "messageId": "msg-local-1", + "roomId": localRoomID, + "siteId": "site-local", + "userId": "user-bob", + "userAccount": "bob", + "content": "hello from local", + "createdAt": postHSS.Format(time.RFC3339Nano), + }) + + // --- REMOTE restricted room ----------------------------------------- + // Four messages, each exercising one branch of the restricted-room + // clauses. Pre-HSS parent lives at `msg-remote-pre-parent`; its + // thread replies reference it via threadParentMessageId + + // threadParentMessageCreatedAt=preHSS. + t.Logf("seed: remote pre-HSS parent (MUST NOT match)") + seedDoc(t, f.remoteURL, monthIdxFor(preHSS), "msg-remote-pre-parent", map[string]any{ + "messageId": "msg-remote-pre-parent", + "roomId": remoteRoomID, + "siteId": "site-remote", + "userId": "user-carol", + "userAccount": "carol", + "content": "hello pre-hss parent", + "createdAt": preHSS.Format(time.RFC3339Nano), + }) + + t.Logf("seed: remote post-HSS parent (Clause A match)") + seedDoc(t, f.remoteURL, monthIdxFor(postHSS), "msg-remote-post-parent", map[string]any{ + "messageId": "msg-remote-post-parent", + "roomId": remoteRoomID, + "siteId": "site-remote", + "userId": "user-carol", + "userAccount": "carol", + "content": "hello post-hss parent", + "createdAt": postHSS.Format(time.RFC3339Nano), + }) + + // Post-HSS reply to a pre-HSS parent, tshow=true → Clause B1 matches. + // The reply's own createdAt satisfies Clause B's outer gate + // (createdAt >= hss); tshow=true then fires B1 regardless of the + // parent's age. If the outer gate weren't there, a pre-HSS tshow=true + // reply would leak history the user never had access to. + t.Logf("seed: remote post-HSS reply with tshow=true, pre-HSS parent (Clause B1 match)") + seedDoc(t, f.remoteURL, monthIdxFor(postHSS), "msg-remote-reply-tshow", map[string]any{ + "messageId": "msg-remote-reply-tshow", + "roomId": remoteRoomID, + "siteId": "site-remote", + "userId": "user-carol", + "userAccount": "carol", + "content": "hello tshow reply", + "createdAt": postHSS.Add(time.Minute).Format(time.RFC3339Nano), + "threadParentMessageId": "msg-remote-pre-parent", + "threadParentMessageCreatedAt": preHSS.Format(time.RFC3339Nano), + "tshow": true, + }) + + // Post-HSS reply to a pre-HSS parent, tshow=false → Clause B rejects. + // Outer gate passes (reply createdAt >= hss) but the inner OR fails: + // tshow=false blocks B1 and the parent's pre-HSS createdAt blocks B2. + t.Logf("seed: remote post-HSS reply without tshow, pre-HSS parent (MUST NOT match)") + seedDoc(t, f.remoteURL, monthIdxFor(postHSS), "msg-remote-reply-plain", map[string]any{ + "messageId": "msg-remote-reply-plain", + "roomId": remoteRoomID, + "siteId": "site-remote", + "userId": "user-carol", + "userAccount": "carol", + "content": "hello plain reply", + "createdAt": postHSS.Add(2 * time.Minute).Format(time.RFC3339Nano), + "threadParentMessageId": "msg-remote-pre-parent", + "threadParentMessageCreatedAt": preHSS.Format(time.RFC3339Nano), + }) + + // --- Search --------------------------------------------------------- + reqData, err := json.Marshal(model.SearchMessagesRequest{SearchText: "hello"}) + require.NoError(t, err) + + msg, err := f.clientNATS.Request(subject.SearchMessages(account), reqData, 30*time.Second) + require.NoError(t, err, "NATS request failed") + t.Logf("response: %s", msg.Data) + + var resp model.SearchMessagesResponse + require.NoError(t, json.Unmarshal(msg.Data, &resp), "decode response: %s", msg.Data) + + got := map[string]bool{} + for _, hit := range resp.Results { + got[hit.MessageID] = true + } + + // Expected matches: + assert.True(t, got["msg-local-1"], "local unrestricted message must match via terms-lookup") + assert.True(t, got["msg-remote-post-parent"], "post-HSS remote parent must match via Clause A (CCS)") + assert.True(t, got["msg-remote-reply-tshow"], "post-HSS remote reply with tshow=true must match via Clause B1 (CCS)") + + // Expected exclusions: + assert.False(t, got["msg-remote-pre-parent"], "pre-HSS remote parent must be excluded by Clause A gate") + assert.False(t, got["msg-remote-reply-plain"], "post-HSS remote reply without tshow + pre-HSS parent must be excluded (outer gate passes; B1 and B2 both fail)") + + assert.EqualValues(t, 3, resp.Total, "expected exactly 3 hits; got body=%s", msg.Data) + require.Len(t, resp.Results, 3, "expected 3 hits; got body=%s", msg.Data) +} From 3679a5f99877953cedcedf8159c0786fc86f6df0 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 06:35:00 +0000 Subject: [PATCH 5/6] feat(docker-local): add Valkey and Kibana deps for search-service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two shared-deps services to docker-local/compose.deps.yaml: - valkey (valkey-server 8-alpine, persistence off — cache is derivative of the authoritative user-room ES doc and survives restart via the lazy-populate path, not via disk persistence). - kibana 8.17.0 pointing at the existing elasticsearch container. Kibana is dev-only; it lets developers browse the messages-*, spotlight, and user-room indexes and tail documents as search-sync-worker fills them. Wires search-service into compose.services.yaml via include so `make up` brings up the service alongside the existing stack with the same backend.creds, nats.conf, and chat-local network. Kibana's image ships wget (not curl), so its healthcheck uses wget -qO- http://localhost:5601/api/status | grep available. --- docker-local/compose.deps.yaml | 45 ++++++++++++++++++++++++++++++ docker-local/compose.services.yaml | 1 + 2 files changed, 46 insertions(+) diff --git a/docker-local/compose.deps.yaml b/docker-local/compose.deps.yaml index a0b1c1b94..7efea9878 100644 --- a/docker-local/compose.deps.yaml +++ b/docker-local/compose.deps.yaml @@ -109,6 +109,51 @@ services: networks: - chat-local + # Kibana is a dev-only convenience — point a browser at http://localhost:5601 + # to browse indexes, run queries against the `messages-*`, `spotlight`, and + # `user-room` indexes, and tail documents as search-sync-worker fills them. + # Security is disabled to match the local ES setup. + kibana: + image: docker.elastic.co/kibana/kibana:8.17.0 + container_name: chat-local-kibana + ports: + - "5601:5601" + environment: + - ELASTICSEARCH_HOSTS=http://elasticsearch:9200 + - XPACK_SECURITY_ENABLED=false + depends_on: + elasticsearch: + condition: service_healthy + # `curl` is NOT shipped in the kibana:8.17.0 image — use wget, which is + # present. `start_period: 60s` covers Kibana's 30-60s boot before the + # first /api/status succeeds so the 10 retries don't exhaust early. + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:5601/api/status | grep -q '\"level\":\"available\"'"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 60s + networks: + - chat-local + + # Valkey backs the per-user restricted-rooms cache in search-service (5-min + # TTL, lazy-populated from Elasticsearch on miss). Persistence is disabled: + # the cache is derivative of the authoritative user-room ES doc and survives + # restart only through the lazy-populate path. + valkey: + image: valkey/valkey:8-alpine + container_name: chat-local-valkey + ports: + - "6379:6379" + command: ["valkey-server", "--save", "", "--appendonly", "no"] + healthcheck: + test: ["CMD", "valkey-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + networks: + - chat-local + keycloak: image: quay.io/keycloak/keycloak:26.0 container_name: chat-local-keycloak diff --git a/docker-local/compose.services.yaml b/docker-local/compose.services.yaml index 96cbd5e36..8c0be1aef 100644 --- a/docker-local/compose.services.yaml +++ b/docker-local/compose.services.yaml @@ -17,4 +17,5 @@ include: - ../notification-worker/deploy/docker-compose.yml - ../room-service/deploy/docker-compose.yml - ../room-worker/deploy/docker-compose.yml + - ../search-service/deploy/docker-compose.yml - ../search-sync-worker/deploy/docker-compose.yml From 61df0ae5b6e19d9ab6d4f3feb441feafafbaf34e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 06:35:21 +0000 Subject: [PATCH 6/6] docs(search-service): demo-ccs + production CCS topology notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two things, both about making the CCS path easier to reason about and exercise: search-service/demo-ccs/ Standalone demo that piggybacks on `make deps-up`. Adds ONE service (es-remote) to the existing chat-local network, configures CCS proxy-mode on the deps elasticsearch, seeds room1/room2 across the two clusters, and seeds an access-control user-room doc for alice. No second NATS / Valkey / Kibana / search-service — all reused from the shared stack. Two exercise paths: - Kibana Dev Tools for ad-hoc queries. - `nats req chat.user.alice.request.search.messages '{…}' \ --creds docker-local/backend.creds \ --server nats://localhost:4222` against the real search-service. The NATS broker routes the request to the service's queue-group subscriber inside the container — no `docker exec` needed. Template mirrors the prod custom_analyzer so demo queries behave like prod; priority 10 so a real prod/sync-worker template with a more specific pattern takes precedence. docs/superpowers/specs/…-search-service-design.md New "Production CCS Topology Notes" section covering the decision space the service code leaves as ops config: - Mode per remote: proxy vs sniff (requirements, pros/cons). - Topology path: k8s-ingress-front (pick proxy) vs node-routable pods / Calico BGP / VPC-CNI / GKE alias IPs (sniff becomes viable, often preferred — lower latency, but TLS + API keys become mandatory). - Federation shape at N clusters: full mesh (N²) vs hub-and-spoke (N) vs per-site service deployments. - Per-remote operational knobs: skip_unavailable, ping_schedule, TLS on transport. - What the service guarantees regardless of topology. Also fixes a stale CCS integration test reference that used cluster.remote.remote1.seeds (sniff-mode config) — aligned to the mode=proxy + proxy_address pair the test actually sets. --- .../specs/2026-04-21-search-service-design.md | 92 ++++++++-- search-service/demo-ccs/README.md | 170 ++++++++++++++++++ search-service/demo-ccs/compose.ccs.yaml | 64 +++++++ search-service/demo-ccs/seed-local.ndjson | 8 + search-service/demo-ccs/seed-remote.ndjson | 8 + search-service/demo-ccs/setup.sh | 109 +++++++++++ search-service/demo-ccs/template.json | 52 ++++++ search-service/demo-ccs/user-room.json | 11 ++ 8 files changed, 499 insertions(+), 15 deletions(-) create mode 100644 search-service/demo-ccs/README.md create mode 100644 search-service/demo-ccs/compose.ccs.yaml create mode 100644 search-service/demo-ccs/seed-local.ndjson create mode 100644 search-service/demo-ccs/seed-remote.ndjson create mode 100755 search-service/demo-ccs/setup.sh create mode 100644 search-service/demo-ccs/template.json create mode 100644 search-service/demo-ccs/user-room.json diff --git a/docs/superpowers/specs/2026-04-21-search-service-design.md b/docs/superpowers/specs/2026-04-21-search-service-design.md index 79094439a..8cd1ff652 100644 --- a/docs/superpowers/specs/2026-04-21-search-service-design.md +++ b/docs/superpowers/specs/2026-04-21-search-service-design.md @@ -59,13 +59,13 @@ This spec covers the `search-service` itself. The `user-room` schema extension i | CCS config flag | None | Wildcard alias works whether remote clusters are configured or not; local-only deployments silently resolve the `*:` segment to zero matches. | | Restricted-rooms source | ES `user-room` doc (single local GET on cache miss) | Sync-worker companion spec writes `restrictedRooms` into the doc; no Mongo read at query time. | | Restricted-rooms cache | Valkey, 5-min TTL, lazy-populate | Shared across replicas; graceful degradation on cache failure. | -| Room-search restricted handling | Not shown (spotlight MVP skips `hss != nil`) | Per companion spec MVP. Documented gap. | +| Room-search restricted handling | Restricted rooms are indexed in spotlight like any other room the user belongs to | Room search is a name-typeahead over the user's room memberships; the HSS / restricted-rooms distinction is a MESSAGE-content access-control concern enforced by Clauses A/B at query time, not a room-name discovery concern. `hss <= 0` (nil, `&0`, negative) remains the Go↔painless "unrestricted" sentinel for the message-index routing. | --- ## Architecture -``` +```text ┌────────┐ NATS request ┌────────────────┐ Valkey GET/SET ┌────────┐ │ client │ ───────────────→ │ search-service │ ←──────────────────→│ Valkey │ └────────┘ ←─────────────── │ (natsrouter) │ └────────┘ @@ -113,8 +113,8 @@ Same as global, except the room-access clause partitions the requested `roomIds` - `multi_match` on `roomName` (bool_prefix + AND). - Filter: `term userAccount: {account}`. - Plus one conditional filter based on `req.scope`: - - `"channel"` → `term roomType: p` - - `"dm"` → `term roomType: d` + - `"channel"` → `term roomType: ` (== `"channel"`) + - `"dm"` → `term roomType: ` (== `"dm"`) - `"all"` → (no additional filter) - `"app"` → handler returns `ErrBadRequest("scope=app not supported in MVP")` 2. ES `POST spotlight/_search?ignore_unavailable=true&allow_no_indices=true` (local-only, no CCS). @@ -221,8 +221,8 @@ Partition `req.roomIds` using the cached `restrictedRooms` map: "filter": [ { "term": { "userAccount": "{account}" } } /* plus ONE of (from req.scope): - "channel" → { "term": { "roomType": "p" } } - "dm" → { "term": { "roomType": "d" } } + "channel" → { "term": { "roomType": "" } } // value: "channel" + "dm" → { "term": { "roomType": "" } } // value: "dm" "all" → nothing "app" → handler rejects with ErrBadRequest */ @@ -324,7 +324,7 @@ All types get `model_test.go` round-trip coverage (nil/empty/full cases). ## Service Layout (flat, per `CLAUDE.md`) -``` +```text search-service/ main.go # config parsing, ES/Valkey/NATS wiring, natsrouter setup, graceful shutdown handler.go # Handler struct + SearchMessages, SearchRooms methods; RegisterHandlers @@ -378,17 +378,17 @@ Standard nested-prefix pattern via `caarlos0/env`: ```go type ESConfig struct { - URL string `env:"URL" required:"true"` + URL string `env:"URL,required"` Backend string `env:"BACKEND" envDefault:"elasticsearch"` } type ValkeyConfig struct { - Addr string `env:"ADDR" required:"true"` + Addr string `env:"ADDR,required"` Password string `env:"PASSWORD" envDefault:""` } type NATSConfig struct { - URL string `env:"URL" required:"true"` + URL string `env:"URL,required"` CredsFile string `env:"CREDS_FILE" envDefault:""` } @@ -398,6 +398,8 @@ type SearchConfig struct { RestrictedRoomsCacheTTL time.Duration `env:"RESTRICTED_ROOMS_CACHE_TTL" envDefault:"5m"` RecentWindow time.Duration `env:"RECENT_WINDOW" envDefault:"8760h"` // 1y RequestTimeout time.Duration `env:"REQUEST_TIMEOUT" envDefault:"10s"` + UserRoomIndex string `env:"USER_ROOM_INDEX" envDefault:""` + MetricsAddr string `env:"METRICS_ADDR" envDefault:":9090"` } type Config struct { @@ -425,6 +427,8 @@ type Config struct { | `SEARCH_RESTRICTED_ROOMS_CACHE_TTL` | `5m` | no | Valkey TTL for per-user restricted-rooms map. | | `SEARCH_RECENT_WINDOW` | `8760h` | no | Basic filter window (messages-in-last-year). | | `SEARCH_REQUEST_TIMEOUT` | `10s` | no | Per-request upper bound. | +| `SEARCH_USER_ROOM_INDEX` | `""` | no | Override for the user-room access-control index; empty uses the `user-room` default constant. | +| `SEARCH_METRICS_ADDR` | `:9090` | no | Listen address for the Prometheus `/metrics` HTTP server. | No remote-cluster env var. No CCS toggle. The `messages-*,*:messages-*` index pattern is a hardcoded constant in the query builders. @@ -563,7 +567,7 @@ Using `testcontainers-go` (NATS, Elasticsearch, Valkey modules): - `TestSearchService_SearchMessages_Restricted` — user-room doc with entries in both `rooms[]` and `restrictedRooms{}`; assert filtering honors HSS bound. - `TestSearchService_SearchMessages_ScopedRoomIds` — request subset mix, assert partition. - `TestSearchService_SearchMessages_CacheBehavior` — first request triggers prefetch + set, second hits cache only. -- **`TestSearchService_SearchMessages_CCS_CrossCluster`** — two ES containers on same docker network, `PUT /_cluster/settings { "persistent": { "cluster.remote.remote1.seeds": ["es-remote1:9300"] } }` against local; seed distinct messages in each; assert merged results. +- **`TestSearchService_SearchMessages_CCS_CrossCluster`** — two ES containers on same docker network, `PUT /_cluster/settings { "persistent": { "cluster.remote.remote1.mode": "proxy", "cluster.remote.remote1.proxy_address": ":9300" } }` against local (proxy mode — see "Production CCS Topology Notes" for why); seed distinct messages in each; assert merged results. - `TestSearchService_SearchRooms_HappyPath` — seed spotlight docs, assert shape. - `TestSearchService_SearchRooms_Scope_{Channel,DM,All}` — seed docs with mixed `roomType`, assert filtering. - `TestSearchService_SearchRooms_AppRejected` — `scope=app` → `ErrBadRequest`. @@ -584,15 +588,73 @@ Using `testcontainers-go` (NATS, Elasticsearch, Valkey modules): | Gap | Impact | Follow-up needed | |---|---|---| -| `attachment_text`, `card_data`, `msg` not in message index — only `content` searched | Messages with non-text content not discoverable | Extend `MessageSearchIndex` in search-sync-worker + message-worker event schema. | -| `hidden` field not indexed | Hidden messages leak into results | Extend message index. | -| `t=tcard` / `visibleTo` filter logic dropped | User-specific card visibility not enforced | Extend message index; restore filter. | +| `attachment_text`, `card_data` not in message index — only `content` searched (the Rocket.Chat `msg` field IS `content` in this codebase, so they're the same field, not two gaps) | Messages with non-text content (attachments, card payloads) not discoverable | Extend `MessageSearchIndex` in search-sync-worker + message-worker event schema. The `t=tcard` / `visibleTo` per-user card-visibility filter rides on this same work — once `card_data` is indexed, restore the filter as part of the same PR. | +| `hidden` messages not evicted from ES | Hidden messages remain searchable until the room is reindexed | Subscribe to a message-hidden/deleted event and issue an ES `_delete_by_query` (or a bulk delete) — design decision is to REMOVE hidden messages from ES rather than indexing a `hidden` field and filtering at query time. | | `archived`, `prid`, `botDM` not indexed for room search | Archived/thread-sub/bot-DM rooms not filtered | Extend spotlight index + event payload. | | `fname`, `sidebarname` not in spotlight — multi-match on `roomName` only | Old display-name-aware search reduced | Extend spotlight index. | | `ls`, `_updatedAt`, `fname` sort not available | Room search sort is score+joinedAt only | Extend spotlight index + indexed fields. | | `scope=app` (bot DM) rejected | Clients lose that scope | Index `botDM`, re-enable. | | No push invalidation of Valkey cache | Up to 5 min window where a user unsubscribed from a restricted room still sees their messages in search | Subscribe to `chat.user.{account}.event.subscriptions.changed` and delete cache entry. | -| Spotlight doesn't index restricted rooms | Restricted rooms missing from room search | Separate event-level flag or strategy; see companion sync-worker spec. | + +--- + +## Production CCS Topology Notes + +The service code queries the fixed pattern `messages-*,*:messages-*` +and leaves cluster-wiring as an **ops concern**. How many clusters +you CCS to and in which mode is decided at the ES `_cluster/settings` +layer, not in service config. This section captures the decision +space so the next operator doesn't have to rediscover it. + +### Mode per remote — `proxy` vs `sniff` + +| | `sniff` (ES default) | `proxy` | +|---|---|---| +| **How it works** | Local cluster seeds a connection to the remote, then **discovers** every remote node (usually data + coordinating tier) and opens `node_connections` (default 3) to each. Each subsequent request load-balances across those connections. | Local cluster opens `proxy_socket_connections` (default 18) to the single configured `proxy_address`. No discovery. | +| **Requires** | Every remote node to advertise a `publish_address` that the local cluster can route to. On k8s that means remote pod IPs reachable from local pods (node-routable pod networking — Calico BGP / AWS VPC CNI / GKE alias IPs). | Only the `proxy_address` (usually an ingress LB / service VIP) to be reachable. | +| **Pros** | Automatic failover if a remote node dies. Lower p99 (direct connection to the data node, no extra hop). | Single reachable address per remote. No discovery surface. Works through firewalls / NAT / k8s ingress. | +| **Cons** | Breaks the moment a remote node's publish address isn't routable from the local cluster. No tolerance for cross-cluster network boundaries. | Extra hop (ingress → data node). Single-point bottleneck unless the proxy is itself a load-balancer. | + +### Topology path — two common prod shapes + +**Path A — k8s ingress gateway in front of each cluster (no cross-cluster pod reachability):** +- Each k8s cluster has its own ingress gateway terminating external traffic. +- ES pods are NOT routable from outside their own k8s cluster. +- **Pick proxy mode.** `proxy_address` = the ingress-exposed address for the remote cluster's transport port (9300). One `_cluster/settings` per remote on the cluster doing the federating. +- Security: TLS on the ingress, `cluster.remote..transport.compress` if bandwidth-bound, remote-cluster API keys if auth is needed. + +**Path B — node-routable pods (single VPC, Calico BGP / VPC-CNI / equivalent):** +- Every k8s pod IP is routable from every other k8s cluster in the same VPC. +- **Sniff mode becomes viable and often preferred** — you get direct node-to-node connections with automatic failover. Latency is lower because there's no ingress hop. +- Remote ES pods must set `network.publish_host` to their pod IP (via k8s downward API `status.podIP`) so discovery returns addresses the local cluster can actually route to. +- Security is mandatory here, not optional: there's no ingress boundary between clusters, so **TLS on the transport layer (port 9300) + remote-cluster API keys or cert-based auth** are required. Otherwise any pod in the routable network could connect to remote ES. k8s NetworkPolicies to allow only ES↔ES traffic are strongly advised. + +### Federation shape at N clusters (O(N²) problem) + +At ~10-12 clusters, CCS config maintenance starts to dominate. + +| Shape | `_cluster/settings` to maintain | Trade-off | +|---|---|---| +| **Full mesh** | N × (N−1) | Any cluster's search reaches every message. Operationally expensive — updating a remote's address touches (N−1) clusters. | +| **Hub-and-spoke** | N (at the hub only) | One dedicated search-hub cluster federates all N. `search-service` points at the hub. Single source of truth for CCS config; easier TLS. Hub must be HA'd since it's SPOF for search. | +| **Per-site service deployments** | K per cluster (K = average remotes that site needs) | Deploy `search-service` near each cluster so a user's home-site is the entry point. Fan-out scope is decided per site. Still O(N²) in the worst case. | + +**Recommended default at ~12 clusters: hub-and-spoke + `proxy` mode** (if behind ingresses) **or + `sniff` mode** (if node-routable). One place to edit CCS configs, one place to roll TLS certs, and `search-service` always points at the hub. + +### Per-remote operational knobs (set on every remote regardless of mode) + +| Setting | Why | +|---|---| +| `cluster.remote..skip_unavailable: true` | **Strongly recommended at any N; critical at N > 2.** Without it, one down remote fails every CCS query fleet-wide. With it, the remote's shards are skipped and the search returns partial results with a `_shards.skipped` count. | +| `cluster.remote..transport.ping_schedule` | Detect dead remote connections faster than TCP keepalive. Typical value: `30s`. | +| Request-side `timeout` / `search_timeout` | Per-search upper bound; prevents one slow cluster from tail-latency-poisoning every query. | +| TLS on transport (9300) + cert chain between clusters | Mandatory for path B (node-routable), strongly recommended for path A (even when ingress terminates TLS, transport-layer TLS means compromise of one cluster doesn't implicitly let it masquerade as another). | + +### What the service guarantees regardless of topology + +- **Query shape is fixed**: `messages-*,*:messages-*` with `ignore_unavailable=true&allow_no_indices=true`. No service-side fan-out, no per-cluster config. +- **Local-only queries degrade gracefully**: when no remote is configured, the `*:messages-*` segment resolves to zero matches and the query returns only local hits — no code path change needed. +- **Partial-failure tolerance**: if `skip_unavailable=true` is set on each remote at the ES layer, a down remote doesn't fail the request. The service does NOT introspect `_shards.skipped` today — callers with strict-consistency needs would need a follow-up to surface that in the response. --- diff --git a/search-service/demo-ccs/README.md b/search-service/demo-ccs/README.md new file mode 100644 index 000000000..9988b1721 --- /dev/null +++ b/search-service/demo-ccs/README.md @@ -0,0 +1,170 @@ +# Cross-Cluster Search Demo + +Stands up a **second** Elasticsearch cluster alongside your existing +local dev deps, wires it to the deps ES via CCS proxy mode, and seeds +two rooms (one per cluster) so you can see a single query fan out. +Exercise it either from **Kibana Dev Tools** or via the **real +`search-service` + `nats` CLI** — both hit the same wiring. + +## What's here + +| File | Role | +|---|---| +| `compose.ccs.yaml` | Adds ONE service, `es-remote`, to the shared `chat-local` docker network. Plus a one-shot `setup` service (profile `setup`). No second NATS / Valkey / Kibana — all reused from `make deps-up`. | +| `setup.sh` | Configures CCS on the deps `elasticsearch` → `es-remote:9300` in **proxy mode**, uploads the `messages-*` template to both, bulk-indexes room1 / room2 data, seeds a user-room doc for `alice`. | +| `template.json` | `messages-*` template mirroring prod (custom_analyzer, same 10 fields). Priority `10` so a real prod/sync-worker template with a more specific pattern takes precedence. | +| `seed-local.ndjson` | 4 messages in `room1`, indexed into the deps ES. | +| `seed-remote.ndjson` | 4 messages in `room2`, indexed into `es-remote`. | +| `user-room.json` | Alice's access-control doc — lives on the deps ES only, lists both rooms in `rooms[]`. | + +## How to run + +```bash +# 1. Bring up shared deps (NATS + ES + Valkey + Kibana + Mongo + Cassandra). +# One-time per machine — if they're already running, skip. +make deps-up + +# 2. Bring up search-service in the background (uses backend.creds from docker-local). +docker compose -f search-service/deploy/docker-compose.yml up --build -d + +# 3. Add the second ES cluster + run CCS wiring + seed. +docker compose -f search-service/demo-ccs/compose.ccs.yaml up -d --wait +docker compose -f search-service/demo-ccs/compose.ccs.yaml --profile setup run --rm setup +``` + +## Exercise it — Kibana Dev Tools + +Open http://localhost:5601 → left sidebar → **Dev Tools** → Console: + +``` +# 1. Confirm CCS is wired +GET _remote/info + +# 2. Local-only search (just the deps ES messages-*) +GET messages-*/_search +{ "query": { "match": { "content": "hello" } } } + +# 3. CCS search: fans out to es-remote as well +GET messages-*,*:messages-*/_search +{ "query": { "match": { "content": "hello" } } } + +# 4. Same query scoped to a specific room on the remote cluster +GET remote1:messages-*/_search +{ "query": { "term": { "roomId": "room2" } } } +``` + +`_index` on each hit tells you which cluster it came from: +- `"_index": "messages-2026-04"` → local +- `"_index": "remote1:messages-2026-04"` → remote via CCS + +### The full `search-service` query for alice + +This is what `search-service` actually sends for a global (non-scoped) +`searchText=hello` request — recent-window filter + terms-lookup +against alice's user-room doc + bool_prefix multi_match + pagination: + +``` +GET messages-*,*:messages-*/_search?ignore_unavailable=true&allow_no_indices=true +{ + "from": 0, + "size": 25, + "track_total_hits": true, + "query": { + "bool": { + "must": [{ "multi_match": { "query": "hello", "type": "bool_prefix", "operator": "AND", "fields": ["content"] } }], + "filter": [ + { "range": { "createdAt": { "gte": "now-8760h" } } }, + { "bool": { + "should": [{ "terms": { "roomId": { "index": "user-room", "id": "alice", "path": "rooms" } } }], + "minimum_should_match": 1 + }} + ] + } + }, + "sort": [ "_score", { "createdAt": "desc" } ] +} +``` + +The `terms-lookup` pulls alice's allowed rooms from the user-room doc +on the local cluster and filters hits across BOTH clusters. Note that +`user-room` lives only on the local cluster — CCS intentionally does +not forward terms-lookup to the remote; the filter is applied locally +to results merged from all clusters. + +## Exercise it — real `search-service` via NATS CLI + +If `make up SERVICE=search-service` is running, the service is already +subscribed to `chat.user.*.request.search.messages` inside the +`chat-local` network. NATS is exposed on `localhost:4222` and the +shared `backend.creds` from `docker-local/` grants `pub >` / `sub >`. + +Install the `nats` CLI if you don't have it: +```bash +go install github.com/nats-io/natscli/cmd/nats@latest +# or: brew install nats-io/nats-tools/nats +``` + +Then from the repo root: + +```bash +nats req chat.user.alice.request.search.messages \ + '{"searchText":"hello","size":10}' \ + --creds docker-local/backend.creds \ + --server nats://localhost:4222 +``` + +You should see a JSON reply with hits from both clusters. Each hit's +`_index` (if you expand the ES `_search` response) carries the +`remote1:` prefix for docs from the remote cluster — that's how CCS +marks them. The `siteId` field is copied from what was *seeded* +into each cluster, so `site-local` / `site-remote` also happen to +line up here, but the index prefix is the authoritative "which +cluster" signal. No `docker exec` needed — the NATS broker routes +the request to the service's queue group subscriber inside the +container. + +### Variations + +Scope to a specific room: +```bash +nats req chat.user.alice.request.search.messages \ + '{"searchText":"hello","roomIds":["room2"]}' \ + --creds docker-local/backend.creds --server nats://localhost:4222 +``` + +Confirm access control — bob has no user-room doc, so the filter +returns zero allowed rooms: +```bash +nats req chat.user.bob.request.search.messages \ + '{"searchText":"hello"}' \ + --creds docker-local/backend.creds --server nats://localhost:4222 +# → {"total":0,"results":[]} +``` + +## Tear down + +```bash +# Just the demo add-on — drops es-remote + its volume: +docker compose -f search-service/demo-ccs/compose.ccs.yaml down -v + +# The `cluster.remote.remote1.*` persistent settings live on the DEPS +# elasticsearch and survive `down -v`. Scrub them if you want a clean +# slate (otherwise they're harmless — the remote just reports +# disconnected until you bring es-remote back up): +curl -s -X PUT -H 'Content-Type: application/json' \ + http://localhost:9200/_cluster/settings \ + -d '{"persistent":{"cluster.remote.remote1":null}}' + +# Or nuke the whole stack (recreating the ES volume clears them too): +make down +make deps-down +``` + +## Why proxy mode (not sniff) + +Short version: docker bridge networking mirrors the k8s-with-ingress +prod topology where remote clusters are reachable only via a single +stable address, not as a fleet of node-routable pods. See the spec's +"Production CCS Topology Notes" section for the full decision matrix +(including when sniff is preferable — node-routable pod networking +via Calico BGP / AWS VPC CNI / GKE alias IPs). diff --git a/search-service/demo-ccs/compose.ccs.yaml b/search-service/demo-ccs/compose.ccs.yaml new file mode 100644 index 000000000..ef3c4b11d --- /dev/null +++ b/search-service/demo-ccs/compose.ccs.yaml @@ -0,0 +1,64 @@ +name: chat-ccs-demo + +# Demo extension for the local dev stack. Adds a SECOND Elasticsearch +# cluster (`es-remote`) to the existing `chat-local` network created by +# docker-local/compose.deps.yaml, then configures the local ES (from +# `make deps-up`) to Cross-Cluster-Search against it. +# +# Run instructions: see README.md in this directory. + +services: + es-remote: + image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0 + container_name: chat-local-es-remote + environment: + - cluster.name=chat-remote + - node.name=es-remote + - discovery.type=single-node + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m + # Transport port is how the LOCAL cluster reaches this cluster for + # CCS. Proxy mode on the local side dials `es-remote:9300` inside + # the shared docker network. + - transport.host=0.0.0.0 + - transport.port=9300 + ports: + # HTTP on 9201 (host) → 9200 (container). Keeps 9200 free for the + # existing `elasticsearch` deps container. + - "9201:9200" + volumes: + - es-remote-data:/usr/share/elasticsearch/data + healthcheck: + test: ["CMD-SHELL", "curl -fs http://localhost:9200/_cluster/health | grep -Eq '\"status\":\"(yellow|green)\"'"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 20s + networks: + - chat-local + + # One-shot CCS wiring + seed. Lives in the `setup` profile so plain + # `docker compose up` ignores it — invoke explicitly with + # `--profile setup run --rm setup`. + setup: + image: curlimages/curl:8.11.1 + profiles: ["setup"] + depends_on: + es-remote: + condition: service_healthy + entrypoint: ["sh", "/setup.sh"] + volumes: + - ./setup.sh:/setup.sh:ro + - ./template.json:/template.json:ro + - ./seed-local.ndjson:/seed-local.ndjson:ro + - ./seed-remote.ndjson:/seed-remote.ndjson:ro + - ./user-room.json:/user-room.json:ro + networks: + - chat-local + +volumes: + es-remote-data: + +networks: + chat-local: + external: true diff --git a/search-service/demo-ccs/seed-local.ndjson b/search-service/demo-ccs/seed-local.ndjson new file mode 100644 index 000000000..8416d6891 --- /dev/null +++ b/search-service/demo-ccs/seed-local.ndjson @@ -0,0 +1,8 @@ +{"index":{"_index":"messages-2026-04","_id":"m1-local-1"}} +{"messageId":"m1-local-1","roomId":"room1","siteId":"site-local","userId":"u-alice","userAccount":"alice","content":"hello from local room1","createdAt":"2026-04-20T10:00:00Z"} +{"index":{"_index":"messages-2026-04","_id":"m1-local-2"}} +{"messageId":"m1-local-2","roomId":"room1","siteId":"site-local","userId":"u-bob","userAccount":"bob","content":"hi alice, are you around","createdAt":"2026-04-20T10:01:00Z"} +{"index":{"_index":"messages-2026-04","_id":"m1-local-3"}} +{"messageId":"m1-local-3","roomId":"room1","siteId":"site-local","userId":"u-alice","userAccount":"alice","content":"yes, testing search service","createdAt":"2026-04-20T10:02:00Z"} +{"index":{"_index":"messages-2026-04","_id":"m1-local-4"}} +{"messageId":"m1-local-4","roomId":"room1","siteId":"site-local","userId":"u-carol","userAccount":"carol","content":"ccs demo local cluster message","createdAt":"2026-04-20T10:03:00Z"} diff --git a/search-service/demo-ccs/seed-remote.ndjson b/search-service/demo-ccs/seed-remote.ndjson new file mode 100644 index 000000000..6a9d11e9c --- /dev/null +++ b/search-service/demo-ccs/seed-remote.ndjson @@ -0,0 +1,8 @@ +{"index":{"_index":"messages-2026-04","_id":"m2-remote-1"}} +{"messageId":"m2-remote-1","roomId":"room2","siteId":"site-remote","userId":"u-alice","userAccount":"alice","content":"hello from remote room2","createdAt":"2026-04-20T11:00:00Z"} +{"index":{"_index":"messages-2026-04","_id":"m2-remote-2"}} +{"messageId":"m2-remote-2","roomId":"room2","siteId":"site-remote","userId":"u-dave","userAccount":"dave","content":"alice are you on remote site","createdAt":"2026-04-20T11:01:00Z"} +{"index":{"_index":"messages-2026-04","_id":"m2-remote-3"}} +{"messageId":"m2-remote-3","roomId":"room2","siteId":"site-remote","userId":"u-alice","userAccount":"alice","content":"yes running the ccs demo","createdAt":"2026-04-20T11:02:00Z"} +{"index":{"_index":"messages-2026-04","_id":"m2-remote-4"}} +{"messageId":"m2-remote-4","roomId":"room2","siteId":"site-remote","userId":"u-alice","userAccount":"alice","content":"test search across clusters","createdAt":"2026-04-20T11:03:00Z"} diff --git a/search-service/demo-ccs/setup.sh b/search-service/demo-ccs/setup.sh new file mode 100755 index 000000000..5c723786d --- /dev/null +++ b/search-service/demo-ccs/setup.sh @@ -0,0 +1,109 @@ +#!/bin/sh +# Demo-CCS one-shot setup (runs inside the `setup` compose service). +# +# Preconditions: +# - `make deps-up` is up — so `elasticsearch` (host of chat-local) +# resolves inside the chat-local docker network. +# - `es-remote` is healthy (compose depends_on guarantees this). +# +# Steps: +# 1. Configure the deps `elasticsearch` as a CCS client of `es-remote` +# via proxy-mode settings (single TCP conn to es-remote:9300, no +# node discovery — matches what prod does when clusters sit behind +# k8s ingress gateways). +# 2. Wait until /_remote/info reports connected=true — the settings +# PUT returns immediately but the transport handshake is async. +# 3. Upload the same `messages-*` index template to both clusters so +# `messages-YYYY-MM` indices inherit consistent mappings regardless +# of which cluster they land on. +# 4. Bulk-index seed messages: room1 on local, room2 on remote. +# 5. Seed the user-room access-control doc on local for alice so +# search-service's terms-lookup has something to return. +# +# Idempotent: re-running is safe — settings and bulk indexing both +# upsert by _id. + +set -eu + +ES_LOCAL="http://elasticsearch:9200" +ES_REMOTE="http://es-remote:9200" + +echo "==> waiting for deps elasticsearch (from make deps-up)" +local_ready="" +for i in $(seq 1 30); do + if curl -fsS "$ES_LOCAL/_cluster/health" >/dev/null 2>&1; then + echo " local ES reachable after ${i}s" + local_ready=1 + break + fi + sleep 1 +done +if [ -z "$local_ready" ]; then + echo "!! deps elasticsearch at $ES_LOCAL not reachable after 30s." + echo " Run \`make deps-up\` first — this demo piggybacks on the shared chat-local network." + exit 1 +fi + +echo "==> configuring CCS on local: proxy → es-remote:9300" +curl -sS -X PUT -H "Content-Type: application/json" "$ES_LOCAL/_cluster/settings" -d '{ + "persistent": { + "cluster.remote.remote1.mode": "proxy", + "cluster.remote.remote1.proxy_address": "es-remote:9300", + "cluster.remote.remote1.skip_unavailable": true + } +}' >/dev/null + +echo "==> waiting for remote1 to report connected=true" +connected="" +for i in $(seq 1 20); do + connected=$(curl -sS "$ES_LOCAL/_remote/info" | grep -o '"connected":true' || true) + if [ -n "$connected" ]; then + echo " connected after ${i}s" + break + fi + sleep 1 +done +if [ -z "$connected" ]; then + echo "!! remote1 never connected; aborting" + curl -sS "$ES_LOCAL/_remote/info" + exit 1 +fi + +echo "==> uploading messages-* template to both clusters" +# template.json mirrors the mappings + custom_analyzer produced by +# messageTemplateBody() in search-sync-worker/messages.go. If you add a +# field to MessageSearchIndex there, update template.json here too. +for ES in "$ES_LOCAL" "$ES_REMOTE"; do + curl -sS -X PUT -H "Content-Type: application/json" \ + "$ES/_index_template/messages-demo-template" \ + --data-binary @/template.json >/dev/null +done + +echo "==> bulk-indexing room1 messages into local" +curl -sS -X POST -H "Content-Type: application/x-ndjson" \ + "$ES_LOCAL/_bulk?refresh=true" --data-binary @/seed-local.ndjson >/dev/null + +echo "==> bulk-indexing room2 messages into remote" +curl -sS -X POST -H "Content-Type: application/x-ndjson" \ + "$ES_REMOTE/_bulk?refresh=true" --data-binary @/seed-remote.ndjson >/dev/null + +echo "==> seeding user-room doc for alice on local (rooms: [room1, room2])" +curl -sS -X PUT -H "Content-Type: application/json" \ + "$ES_LOCAL/user-room/_doc/alice?refresh=true" \ + --data-binary @/user-room.json >/dev/null + +echo "" +echo "==> demo ready" +echo " Kibana: http://localhost:5601 (Dev Tools → Console)" +echo " Local ES: http://localhost:9200 (from deps)" +echo " Remote ES: http://localhost:9201 (from this demo)" +echo "" +echo " Kibana quick test:" +echo " GET messages-*,*:messages-*/_search" +echo " { \"query\": { \"match\": { \"content\": \"hello\" } } }" +echo "" +echo " NATS quick test (requires \`nats\` CLI on host):" +echo " nats req chat.user.alice.request.search.messages \\" +echo " '{\"searchText\":\"hello\"}' \\" +echo " --creds docker-local/backend.creds \\" +echo " --server nats://localhost:4222" diff --git a/search-service/demo-ccs/template.json b/search-service/demo-ccs/template.json new file mode 100644 index 000000000..c6497415b --- /dev/null +++ b/search-service/demo-ccs/template.json @@ -0,0 +1,52 @@ +{ + "index_patterns": ["messages-*"], + "priority": 10, + "template": { + "settings": { + "index": { + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "1s" + }, + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "underscore_preserving", + "filter": ["underscore_subword", "cjk_bigram", "lowercase"], + "char_filter": ["html_strip"] + } + }, + "tokenizer": { + "underscore_preserving": { + "type": "pattern", + "pattern": "[\\s,;!?()\\[\\]{}\"'<>]+" + } + }, + "filter": { + "underscore_subword": { + "type": "word_delimiter_graph", + "split_on_case_change": false, + "split_on_numerics": false, + "preserve_original": true + } + } + } + }, + "mappings": { + "dynamic": false, + "properties": { + "messageId": { "type": "keyword" }, + "roomId": { "type": "keyword" }, + "siteId": { "type": "keyword" }, + "userId": { "type": "keyword" }, + "userAccount": { "type": "keyword" }, + "content": { "type": "text", "analyzer": "custom_analyzer" }, + "createdAt": { "type": "date" }, + "threadParentMessageId": { "type": "keyword" }, + "threadParentMessageCreatedAt": { "type": "date" }, + "tshow": { "type": "boolean" } + } + } + } +} diff --git a/search-service/demo-ccs/user-room.json b/search-service/demo-ccs/user-room.json new file mode 100644 index 000000000..1df2c04a0 --- /dev/null +++ b/search-service/demo-ccs/user-room.json @@ -0,0 +1,11 @@ +{ + "userAccount": "alice", + "rooms": ["room1", "room2"], + "restrictedRooms": {}, + "roomTimestamps": { + "room1": 1745143200000, + "room2": 1745143200000 + }, + "createdAt": "2026-04-20T10:00:00Z", + "updatedAt": "2026-04-20T10:00:00Z" +}