diff --git a/.semgrep/errcode.yml b/.semgrep/errcode.yml new file mode 100644 index 000000000..10e67c57c --- /dev/null +++ b/.semgrep/errcode.yml @@ -0,0 +1,61 @@ +rules: + - id: errcode-no-reason-literal-outside-catalog + languages: [go] + severity: ERROR + message: > + Declare Reason codes as typed constants in pkg/errcode/codes_.go, + not inline. Use an existing errcode.Reason constant. + paths: + exclude: + - "**/pkg/errcode/codes_*.go" + - "**/*_test.go" + patterns: + - pattern: errcode.Reason("...") + + - id: errcode-withcause-must-not-wrap-errcode + languages: [go] + severity: ERROR + message: > + WithCause must wrap a raw error, never another errcode error. Propagate a + typed error with `return err` or `fmt.Errorf("...: %w", err)`. + patterns: + - pattern: errcode.WithCause(errcode.$F(...)) + + - id: errcode-no-multi-wrap-errcode + languages: [go] + severity: ERROR + message: > + Multiple %w verbs can place two errcode errors in one chain, defeating + the "one *Error per chain" invariant (Classify picks the first). Use a + single %w only. + paths: + exclude: + # pkg/errcode/** is the implementation + docs of this very contract; + # doc.go intentionally includes a "forbidden" example in a comment that + # the regex-based pattern matches even though it's only documentation. + - "**/pkg/errcode/**" + # pkg/atrest/** does not import errcode; its sentinels (ErrAuthFailed, + # ErrPayloadMalformed) are plain errors.New strings. The multi-%w idiom + # there joins a sentinel with the underlying crypto/json error so + # errors.Is works for both — there is no errcode error in the chain. + - "**/pkg/atrest/**" + - "**/*_test.go" + patterns: + - pattern-regex: 'fmt\.Errorf\([^)]*%w[^)]*%w' + + - id: errcode-prefer-named-constructor + languages: [go] + severity: WARNING + message: > + Prefer the named constructor (errcode.NotFound(msg)) over + errcode.New(errcode.CodeX, msg) for a literal category. Reserve New for + a category chosen at runtime. 
+ paths: + exclude: + - "**/pkg/errcode/**" + - "**/*_test.go" + patterns: + - pattern: errcode.New(errcode.$CODE, ...) + - metavariable-regex: + metavariable: $CODE + regex: '^Code[A-Z].*' diff --git a/CLAUDE.md b/CLAUDE.md index 1b4a3d89d..0e70ea23e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -70,9 +70,9 @@ All commands are wrapped in the root Makefile. Always use `make` targets — nev - Always wrap with context: `fmt.Errorf("short description: %w", err)` — describe what the current function was doing, not what failed underneath - Never return bare `err` or `fmt.Errorf("error: %w", err)` - Never ignore errors silently — comment if intentionally discarded -- Use `model.ErrorResponse` via `natsutil.ReplyError` for all NATS reply errors +- Use `pkg/errcode` for ALL client-facing errors; reply via `errnats.Reply` (NATS) / `errhttp.Write` (Gin). Construct with the named constructors (`errcode.NotFound`, `errcode.Forbidden`, …), attach a domain `reason` from `codes_.go` where the frontend must distinguish cases, and return raw `fmt.Errorf("…: %w", err)` for infra failures (they collapse to `internal` at the boundary). Full guide: `docs/error-handling.md`. Wire-side reference for clients: `docs/client-api.md` §6. - Never compare errors by string — use `errors.Is` and `errors.As` -- Never expose raw internal errors to clients — sanitize errors at service boundaries, return user-safe messages +- Never expose raw internal errors to clients — the unexported `errcode.Error.cause` is never serialized; `Classify` logs it once server-side. Never wrap raw message bodies/tokens into a cause. ### Interfaces & Dependency Injection - Define interfaces in the consumer, not the implementer @@ -222,11 +222,31 @@ All commands are wrapped in the root Makefile. 
Always use `make` targets — nev - Use `iter.Stop()` + `wg.Wait()` + `nc.Drain()` for graceful shutdown — see "JetStream Consumer Pattern" and "Graceful Shutdown" sections - All NATS payloads are JSON — use `encoding/json` with typed structs from `pkg/model`, never `map[string]interface{}` - Use NATS request/reply for synchronous operations; `nc.QueueSubscribe` with service name as queue group -- Use `natsutil.ReplyJSON` for success responses, `natsutil.ReplyError` for errors +- Use `natsutil.ReplyJSON` for success responses; for errors return a typed `*errcode.Error` from the handler and let `errnats.Reply` / `errhttp.Write` marshal the envelope (see `docs/error-handling.md`). - Define all stream configs in `pkg/stream/stream.go` with name pattern `_` - Use durable consumers named after the service - Stream creation is gated by `BOOTSTRAP_STREAMS` (see below); when enabled, use `js.CreateOrUpdateStream` (it's idempotent) via the service's `bootstrapStreams` helper, never inline +### Error Handling at the NATS/HTTP Boundary +`pkg/errcode` has a broad surface, but **day-to-day handler code touches almost none of it.** Use this tiering — if you reach past Tier 1, you should know why. + +- **Tier 1 — every handler (this is 90% of usage).** Return a typed error built from a named constructor, optionally tagged with a `reason`. You do NOT call the adapter, classify, or log — the plumbing does: + - `return errcode.NotFound("room not found")` — pick the constructor whose name matches the HTTP/wire category (`BadRequest`, `NotFound`, `Forbidden`, `Conflict`, `Internal`, …). + - `return errcode.Forbidden("only owners can do this", errcode.WithReason(errcode.RoomNotOwner))` — add `WithReason` **only** when the frontend must branch on the case. Prefer a package-level sentinel (e.g. room-service `helper.go`) over reconstructing the same error at multiple sites, so `errors.Is` matches. 
+ - For an infra failure, `return fmt.Errorf("get subscription: %w", err)` — a raw wrapped error collapses to `internal` at the boundary; do NOT dress it up as an errcode. +- **Tier 2 — one line per handler, written once and copied.** The adapter that turns the returned error into the wire envelope. You pick exactly one, determined by your transport, never both: + - NATS raw handler: `errnats.Reply(ctx, m.Msg, err)`. + - `pkg/natsrouter` handler: returned automatically by the router — you write nothing. + - Gin handler: `errhttp.Write(ctx, c, err)`. +- **Tier 3 — specialist, you'll know when.** Don't use these in ordinary request/reply handlers: + - `errcode.Permanent` / `IsPermanent` — JetStream **workers only**, to Ack-poison vs Nak-retry. + - `errcode.Parse` — **cross-site consumers** decoding a remote envelope (e.g. `memberlist_client.go`). + - `errnats.Marshal` / `MarshalQuiet` / `ReplyQuiet` — outbox/already-logged paths; the plain `Reply` already classifies-and-logs once, so `Quiet` exists only to avoid a double-log. + - `errcode.Classify`, `WithLogger`, `WithLogValues` — boundary/observability plumbing; handlers get request-id logging for free from the router middleware. +- **Never log AND return.** `Reply`/`Write` run `Classify`, which logs once at a category-aware level. A `slog.Error(...)` before returning the same error double-logs. +- **`WithCause` wraps an infra error, never another `*errcode.Error`** (one-errcode-per-chain; it panics otherwise, and semgrep guards it). Never put a raw token/body/subject in a cause or message — it reaches the server log. +- Full guide: `docs/error-handling.md`. Wire reference for clients: `docs/client-api.md` §6. 
+ ### Event Timestamps - Every NATS event struct in `pkg/model` must include a `Timestamp int64 \`json:"timestamp" bson:"timestamp"\`` field - Set the timestamp at the publish site using `time.Now().UTC().UnixMilli()` diff --git a/Makefile b/Makefile index 84861177b..21043920d 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,8 @@ GOSEC_FLAGS := -quiet -severity medium -confidence medium -tests=false \ # semgrep: fail on medium+ (WARNING/ERROR; INFO is informational/low). SEMGREP_FLAGS := --error --severity=WARNING --severity=ERROR --metrics=off \ --exclude=tools --exclude=chat-frontend --exclude=testdata \ - --exclude=docs --config=p/golang --config=p/security-audit + --exclude=docs --config=p/golang --config=p/security-audit \ + --config=.semgrep/errcode.yml # Makefile for the distributed multi-site chat system. diff --git a/auth-service/handler.go b/auth-service/handler.go index 3e5f66066..0b624ee79 100644 --- a/auth-service/handler.go +++ b/auth-service/handler.go @@ -13,6 +13,8 @@ import ( "github.com/nats-io/jwt/v2" "github.com/nats-io/nkeys" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errhttp" pkgoidc "github.com/hmchangw/chat/pkg/oidc" ) @@ -74,26 +76,33 @@ func (h *AuthHandler) HandleAuth(c *gin.Context) { return } + ctx := errcode.WithLogValues(c.Request.Context(), "request_id", c.GetString("request_id")) + var req authRequest if err := c.ShouldBindJSON(&req); err != nil { - c.JSON(http.StatusBadRequest, gin.H{"error": "ssoToken and natsPublicKey are required"}) + errhttp.Write(ctx, c, errcode.BadRequest("ssoToken and natsPublicKey are required", + errcode.WithReason(errcode.AuthMissingFields))) return } if !nkeys.IsValidPublicUserKey(req.NATSPublicKey) { - c.JSON(http.StatusBadRequest, gin.H{"error": "invalid natsPublicKey format"}) + errhttp.Write(ctx, c, errcode.BadRequest("invalid natsPublicKey format", + errcode.WithReason(errcode.AuthInvalidNKey))) return } - claims, err := 
h.validator.Validate(c.Request.Context(), req.SSOToken) + claims, err := h.validator.Validate(ctx, req.SSOToken) if err != nil { if errors.Is(err, pkgoidc.ErrTokenExpired) { - slog.Warn("sso token expired", "error", err) - c.JSON(http.StatusUnauthorized, gin.H{"error": "SSO token has expired, please re-login"}) + errhttp.Write(ctx, c, errcode.Unauthenticated("SSO token has expired, please re-login", + errcode.WithReason(errcode.AuthTokenExpired))) return } - slog.Error("oidc validation failed", "error", err) - c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid SSO token"}) + // Non-expiry failures surface as "invalid SSO token"; attach the raw + // cause so the server log carries the actual reason. + errhttp.Write(ctx, c, errcode.Unauthenticated("invalid SSO token", + errcode.WithReason(errcode.AuthInvalidToken), + errcode.WithCause(err))) return } @@ -101,11 +110,17 @@ func (h *AuthHandler) HandleAuth(c *gin.Context) { if account == "" { account = claims.Name } + if account == "" { + // Blank account would mint a JWT with chat.user..> permissions — refuse. + errhttp.Write(ctx, c, errcode.Unauthenticated("token missing account claim", + errcode.WithReason(errcode.AuthInvalidToken))) + return + } + ctx = errcode.WithLogValues(ctx, "account", account) natsJWT, err := h.signNATSJWT(req.NATSPublicKey, account) if err != nil { - slog.Error("nats jwt signing failed", "error", err, "account", account) - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate NATS token"}) + errhttp.Write(ctx, c, fmt.Errorf("generating NATS token: %w", err)) return } @@ -131,21 +146,26 @@ func (h *AuthHandler) HandleAuth(c *gin.Context) { // handleDevAuth handles auth in dev mode: accepts account name directly // without OIDC validation, for use during local development only. 
func (h *AuthHandler) handleDevAuth(c *gin.Context) { + ctx := errcode.WithLogValues(c.Request.Context(), "request_id", c.GetString("request_id")) + var req devAuthRequest if err := c.ShouldBindJSON(&req); err != nil { - c.JSON(http.StatusBadRequest, gin.H{"error": "account and natsPublicKey are required"}) + errhttp.Write(ctx, c, errcode.BadRequest("account and natsPublicKey are required", + errcode.WithReason(errcode.AuthMissingFields))) return } if !nkeys.IsValidPublicUserKey(req.NATSPublicKey) { - c.JSON(http.StatusBadRequest, gin.H{"error": "invalid natsPublicKey format"}) + errhttp.Write(ctx, c, errcode.BadRequest("invalid natsPublicKey format", + errcode.WithReason(errcode.AuthInvalidNKey))) return } + ctx = errcode.WithLogValues(ctx, "account", req.Account) + natsJWT, err := h.signNATSJWT(req.NATSPublicKey, req.Account) if err != nil { - slog.Error("nats jwt signing failed", "error", err, "account", req.Account) - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate NATS token"}) + errhttp.Write(ctx, c, fmt.Errorf("generating NATS token: %w", err)) return } diff --git a/auth-service/handler_test.go b/auth-service/handler_test.go index 9ee65aaa6..fa409a532 100644 --- a/auth-service/handler_test.go +++ b/auth-service/handler_test.go @@ -16,6 +16,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errtest" pkgoidc "github.com/hmchangw/chat/pkg/oidc" ) @@ -146,7 +148,8 @@ func TestHandleAuth_ExpiredToken(t *testing.T) { router.ServeHTTP(w, req) assert.Equal(t, http.StatusUnauthorized, w.Code) - assert.Contains(t, w.Body.String(), "expired") + errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeUnauthenticated) + errtest.AssertReason(t, w.Body.Bytes(), errcode.AuthTokenExpired) } func TestHandleAuth_InvalidToken(t *testing.T) { @@ -163,7 +166,8 @@ func TestHandleAuth_InvalidToken(t *testing.T) { router.ServeHTTP(w, req) 
assert.Equal(t, http.StatusUnauthorized, w.Code) - assert.Contains(t, w.Body.String(), "invalid SSO token") + errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeUnauthenticated) + errtest.AssertReason(t, w.Body.Bytes(), errcode.AuthInvalidToken) } func TestHandleAuth_InvalidNKey(t *testing.T) { @@ -179,7 +183,7 @@ func TestHandleAuth_InvalidNKey(t *testing.T) { router.ServeHTTP(w, req) assert.Equal(t, http.StatusBadRequest, w.Code) - assert.Contains(t, w.Body.String(), "invalid natsPublicKey format") + errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeBadRequest) } func TestHandleAuth_MissingFields(t *testing.T) { @@ -204,6 +208,7 @@ func TestHandleAuth_MissingFields(t *testing.T) { req.Header.Set("Content-Type", "application/json") router.ServeHTTP(w, req) assert.Equal(t, http.StatusBadRequest, w.Code) + errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeBadRequest) }) } } @@ -286,7 +291,7 @@ func TestHandleAuth_DevMode_MissingAccount(t *testing.T) { router.ServeHTTP(w, req) assert.Equal(t, http.StatusBadRequest, w.Code) - assert.Contains(t, w.Body.String(), "account") + errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeBadRequest) } func TestHandleAuth_DevMode_InvalidNKey(t *testing.T) { @@ -302,7 +307,34 @@ func TestHandleAuth_DevMode_InvalidNKey(t *testing.T) { router.ServeHTTP(w, req) assert.Equal(t, http.StatusBadRequest, w.Code) - assert.Contains(t, w.Body.String(), "invalid natsPublicKey") + errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeBadRequest) +} + +func TestHandleAuth_DevMode_TokenGenerationFailure(t *testing.T) { + // Force signNATSJWT (uc.Encode) to fail by supplying a non-account + // signing key. A user key pair cannot sign a NATS user JWT, so Encode + // returns an error, exercising the 500 internal-error path. The real + // cause is logged via Classify and must NOT appear in the response body. 
+ userKP, err := nkeys.CreateUser() + require.NoError(t, err, "create user key") + + handler := NewAuthHandler(nil, userKP, 2*time.Hour, true) + router := setupRouter(t, handler) + + userPub := mustUserNKey(t) + body := `{"account":"alice","natsPublicKey":"` + userPub + `"}` + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/auth", strings.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + router.ServeHTTP(w, req) + + require.Equal(t, http.StatusInternalServerError, w.Code) + errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeInternal) + + var env errcode.Error + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &env)) + assert.Equal(t, "internal error", env.Message) + assert.NotContains(t, w.Body.String(), "generating NATS token") } func TestHandleHealth(t *testing.T) { diff --git a/auth-service/middleware.go b/auth-service/middleware.go index 437dcba2d..8da55c38d 100644 --- a/auth-service/middleware.go +++ b/auth-service/middleware.go @@ -11,16 +11,20 @@ import ( "github.com/hmchangw/chat/pkg/natsutil" ) -// requestIDMiddleware extracts X-Request-ID (or mints via idgen) and stores it on Gin keys, c.Request.Context() via natsutil, and the response header. +// requestIDMiddleware funnels HTTP X-Request-ID through idgen.ResolveRequestID +// (the same primitive the NATS path uses via natsutil.StampRequestID) so the +// mint-vs-pass-through policy has a single owner. Missing → silent mint; +// malformed → mint + Warn preserving the inbound value for traceability. 
func requestIDMiddleware() gin.HandlerFunc { return func(c *gin.Context) { - id := c.GetHeader(natsutil.RequestIDHeader) - if !idgen.IsValidUUID(id) { - id = idgen.GenerateRequestID() - } + inbound := c.GetHeader(natsutil.RequestIDHeader) + id, replaced := idgen.ResolveRequestID(inbound) c.Set("request_id", id) c.Request = c.Request.WithContext(natsutil.WithRequestID(c.Request.Context(), id)) c.Header(natsutil.RequestIDHeader, id) + if replaced { + slog.WarnContext(c.Request.Context(), "minted request_id (inbound invalid)", "inbound", inbound, "path", c.Request.URL.Path) + } c.Next() } } diff --git a/broadcast-worker/handler.go b/broadcast-worker/handler.go index 74116cf41..95daee71f 100644 --- a/broadcast-worker/handler.go +++ b/broadcast-worker/handler.go @@ -119,7 +119,7 @@ func (h *Handler) handleCreated(ctx context.Context, evt *model.MessageEvent) er case model.RoomTypeDM: return h.publishDMEvents(ctx, meta, clientMsg, resolved.Accounts) default: - slog.Warn("unknown room type, skipping fan-out", "type", meta.Type, "roomID", meta.ID) + slog.Warn("unknown room type, skipping fan-out", "type", meta.Type, "room_id", meta.ID) return nil } } @@ -251,14 +251,14 @@ func (h *Handler) publishMutation(ctx context.Context, room *model.Room, roomEvt "type", roomEvtType, "account", account, "messageID", messageID, - "roomID", room.ID, + "room_id", room.ID, ) } } return nil default: - slog.Warn("unknown room type, skipping mutation fan-out", "type", room.Type, "roomID", room.ID) + slog.Warn("unknown room type, skipping mutation fan-out", "type", room.Type, "room_id", room.ID) return nil } } diff --git a/broadcast-worker/main.go b/broadcast-worker/main.go index 8c5fe03ad..4fc3e65c6 100644 --- a/broadcast-worker/main.go +++ b/broadcast-worker/main.go @@ -167,7 +167,7 @@ func main() { <-sem wg.Done() }() - handlerCtx := natsutil.ContextWithRequestIDFromHeaders(msgCtx, msg.Headers()) + handlerCtx, _ := natsutil.StampRequestID(msgCtx, msg.Headers(), msg.Subject()) if err := 
handler.HandleMessage(handlerCtx, msg.Data()); err != nil { slog.Error("handle message failed", "error", err, "request_id", natsutil.RequestIDFromContext(handlerCtx)) if err := msg.Nak(); err != nil { diff --git a/chat-frontend/CLAUDE.md b/chat-frontend/CLAUDE.md index 9a3f641c9..e32dead4e 100644 --- a/chat-frontend/CLAUDE.md +++ b/chat-frontend/CLAUDE.md @@ -95,6 +95,39 @@ When the op accepts `opts`, **always pass it through** — even when undefined. - `request` is generic — every op passes its response type through. **Don't accept `Promise`** at the call site. - Discriminated subscription kinds: `Subscription` is the base (channels / botDMs / discussions); `DMSubscription extends Subscription` adds `hrInfo?: SubscriptionHRInfo` for DM rooms. State maps that hold either use `Record` so consumers read `.hrInfo` without narrowing. +### Error envelope (server contract) + +Every backend error — NATS sync replies, JetStream async results (`AsyncJobResult`), and HTTP — comes back in one shape, owned by the backend's `pkg/errcode` package: + +```ts +type ErrorEnvelope = { + error: string // human-readable, user-safe message + code: ErrorCode // always present — drives UX category + HTTP status + reason?: string // optional domain code (when frontend must distinguish) + metadata?: Record // optional structured detail +} +type ErrorCode = + | 'bad_request' | 'unauthenticated' | 'forbidden' | 'not_found' + | 'conflict' | 'too_many_requests' | 'unavailable' | 'internal' +``` + +- **Branch on `reason ?? code`** — `reason` when present (e.g. `max_room_size_reached`, `not_subscribed`, `sso_token_expired`), `code` otherwise. +- **Never branch on `error` text** — wording can change without notice; only display it. +- `code: 'internal'` always carries the message `"internal error"`. The real cause is logged server-side and never reaches the client. 
+- `formatAsyncJobError` and the shared transport throwers (`NatsContext.request`, `asyncJob.ts`) already throw an `AsyncJobError` with `.code` / `.reason` populated — consumers just read those fields. Don't re-parse the message. + +Reasons emitted today (full catalog in [`docs/client-api.md`](../docs/client-api.md) §6): +- `max_room_size_reached`, `not_room_member`, `not_room_owner`, `last_owner_cannot_leave`, `bot_in_channel`, `bot_not_available`, `user_not_found`, `invalid_org`, `self_dm`, `last_member_cannot_remove`, `target_not_member`, `already_owner`, `cannot_demote_last_owner`, `promote_requires_individual` — room-service / room-worker +- `large_room_post_restricted`, `not_subscribed`, `outside_access_window` — message-gatekeeper / history-service +- `sso_token_expired`, `invalid_sso_token` — auth-service (drive a redirect-to-relogin) +- `invalid_request`, `invalid_nkey`, `missing_fields` — auth-service (form-validation surface; rarely actionable by the UI today) + +When adding a new client-facing branch in the UI, prefer matching a reason over a message substring. If the case you need isn't in the catalog, ask backend to add a `Reason` constant in `pkg/errcode/codes_*.go` rather than substring-matching the English text. + +`formatAsyncJobError` is now reason-keyed: it looks up the thrown `AsyncJobError`'s `.reason` against an internal `REASON_COPY` map and returns the humanized English copy when present, falling back to `err.message` otherwise. Components calling `setError(formatAsyncJobError(err))` get the right UX automatically without their own per-call switch on reason. To add a new humanized line, edit `REASON_COPY` in `chat-frontend/src/api/_transport/asyncJob.ts` and add the reason to the catalog above. + +**DM-exists is a SUCCESS reply, not an error.** When a client requests a DM that already exists, the reply is `{ status: 'exists', roomId: … }` — open that room. 
The legacy error-shaped reply (`{ error: 'dm already exists', roomId: … }`) is still accepted by `isDMExistsReply` during the backend rollout window, then removed in a follow-up release. The cutover (extending the sync error decoder + `AsyncJobResult` decoder + `isDMExistsReply` to handle both shapes, plus the typecheck/test/smoke gates) is plan Chapter 19 in `docs/superpowers/plans/2026-05-28-centralized-error-codes.md`; it ships in the same release as the backend. + ### What components do ```jsx diff --git a/chat-frontend/src/api/_transport/asyncJob.test.js b/chat-frontend/src/api/_transport/asyncJob.test.js index e36fc8892..26fa2f137 100644 --- a/chat-frontend/src/api/_transport/asyncJob.test.js +++ b/chat-frontend/src/api/_transport/asyncJob.test.js @@ -1,6 +1,6 @@ import { describe, it, expect, vi } from 'vitest' import { StringCodec } from 'nats.ws' -import { requestWithAsyncResult, ASYNC_JOB_ERROR_KINDS, formatAsyncJobError } from './asyncJob' +import { requestWithAsyncResult, ASYNC_JOB_ERROR_KINDS, formatAsyncJobError, AsyncJobError } from './asyncJob' const sc = StringCodec() @@ -218,18 +218,53 @@ describe('formatAsyncJobError', () => { expect(formatAsyncJobError(err)).toMatch(/connection|disconnected|interrupted/i) }) - it('returns the raw message for SyncError (server-supplied user-safe text)', () => { + it('falls back to message for SyncError when no reason is attached (server-supplied user-safe text)', () => { const err = new Error('only owners can add members') err.kind = ASYNC_JOB_ERROR_KINDS.SyncError expect(formatAsyncJobError(err)).toBe('only owners can add members') }) - it('returns the raw message for AsyncError', () => { + it('falls back to message for AsyncError when no reason is attached', () => { const err = new Error('exceeds maximum capacity') err.kind = ASYNC_JOB_ERROR_KINDS.AsyncError expect(formatAsyncJobError(err)).toBe('exceeds maximum capacity') }) + it('returns the humanized REASON_COPY copy when a SyncError carries a known reason', 
() => { + const err = new AsyncJobError('exceeds maximum capacity (50)', ASYNC_JOB_ERROR_KINDS.SyncError, { + reason: 'max_room_size_reached', + }) + expect(formatAsyncJobError(err)).toBe('This room is at capacity.') + }) + + it('returns the humanized copy for not_room_member', () => { + const err = new AsyncJobError('only room members can do that', ASYNC_JOB_ERROR_KINDS.AsyncError, { + reason: 'not_room_member', + }) + expect(formatAsyncJobError(err)).toBe("You're not a member of this room.") + }) + + it('returns the humanized copy for not_subscribed', () => { + const err = new AsyncJobError('not subscribed', ASYNC_JOB_ERROR_KINDS.SyncError, { + reason: 'not_subscribed', + }) + expect(formatAsyncJobError(err)).toBe('You need to join this room first.') + }) + + it('returns the humanized copy for large_room_post_restricted', () => { + const err = new AsyncJobError('only owners/admins may post', ASYNC_JOB_ERROR_KINDS.SyncError, { + reason: 'large_room_post_restricted', + }) + expect(formatAsyncJobError(err)).toBe('Only owners and admins can post here.') + }) + + it('falls back to err.message when the reason is unrecognized', () => { + const err = new AsyncJobError('some unmapped error', ASYNC_JOB_ERROR_KINDS.SyncError, { + reason: 'not_in_catalog', + }) + expect(formatAsyncJobError(err)).toBe('some unmapped error') + }) + it('falls back to message for unknown / untagged errors', () => { expect(formatAsyncJobError(new Error('random'))).toBe('random') expect(formatAsyncJobError(null)).toBe('') diff --git a/chat-frontend/src/api/_transport/asyncJob.ts b/chat-frontend/src/api/_transport/asyncJob.ts index 05553d053..df7f75603 100644 --- a/chat-frontend/src/api/_transport/asyncJob.ts +++ b/chat-frontend/src/api/_transport/asyncJob.ts @@ -30,31 +30,99 @@ export type AsyncJobErrorKind = (typeof ASYNC_JOB_ERROR_KINDS)[keyof typeof ASYNC_JOB_ERROR_KINDS] /** - * Error class thrown by `requestWithAsyncResult`. Use `instanceof` to - * narrow without string-matching the message. 
+ * The 7+1 generic error categories the backend's `pkg/errcode` package emits + * on every error envelope (NATS reply, JetStream result, HTTP). Mirrors the + * closed set in `pkg/errcode/category.go`. `string & {}` is a TypeScript open-union + * escape hatch so a not-yet-mirrored future category still typechecks. + */ +export type ErrorCode = + | 'bad_request' + | 'unauthenticated' + | 'forbidden' + | 'not_found' + | 'conflict' + | 'too_many_requests' + | 'unavailable' + | 'internal' + | (string & {}) + +/** + * Error class thrown by `requestWithAsyncResult` AND `NatsContext.request`. + * Use `instanceof` to narrow without string-matching the message. * * Why a class (not just an interface): callers can do * `if (err instanceof AsyncJobError) …` * which is the idiomatic way to discriminate caught `unknown` in TS. + * + * `code` / `reason` / `metadata` are populated from the backend errcode + * envelope when the failure carries one (`SyncError`, `AsyncError`); they are + * undefined for wire-level failures (`AsyncTimeout`, `SubscriptionClosed`). + * Branch on `reason ?? code` — never on `message`. */ export class AsyncJobError extends Error { readonly kind: AsyncJobErrorKind - constructor(message: string, kind: AsyncJobErrorKind, cause?: unknown) { + readonly code?: ErrorCode + readonly reason?: string + readonly metadata?: Record + constructor( + message: string, + kind: AsyncJobErrorKind, + opts?: { + cause?: unknown + code?: ErrorCode + reason?: string + metadata?: Record + }, + ) { super(message) this.name = 'AsyncJobError' this.kind = kind - if (cause !== undefined) this.cause = cause + if (opts?.cause !== undefined) this.cause = opts.cause + if (opts?.code !== undefined) this.code = opts.code + if (opts?.reason !== undefined) this.reason = opts.reason + if (opts?.metadata !== undefined) this.metadata = opts.metadata } } +/** + * Reason-keyed humanized copy for the errcode reasons emitted today + * (catalog: docs/client-api.md §6 + chat-frontend/CLAUDE.md). 
Used by + * formatAsyncJobError so consumers don't have to maintain their own per-call + * map of reason→copy. sso_token_expired / invalid_sso_token are intentionally + * absent — they drive a redirect (Task 20.7), not a user-facing message. + */ +const REASON_COPY: Record = { + max_room_size_reached: 'This room is at capacity.', + not_room_member: "You're not a member of this room.", + not_room_owner: 'Only owners can do that.', + last_owner_cannot_leave: "You're the last owner — promote someone else first.", + bot_in_channel: "Bots can't join channels.", + bot_not_available: "This bot isn't available right now.", + user_not_found: "We couldn't find that user.", + invalid_org: "We couldn't find that group.", + self_dm: "You can't DM yourself.", + last_member_cannot_remove: "Can't remove the last member — delete the room instead.", + target_not_member: "That user isn't in this room.", + already_owner: 'That user is already an owner.', + cannot_demote_last_owner: "Can't demote the last owner — promote someone else first.", + promote_requires_individual: 'Only individual members can be promoted to owner.', + large_room_post_restricted: 'Only owners and admins can post here.', + not_subscribed: 'You need to join this room first.', + outside_access_window: 'This message is older than your access to this room.', + pin_disabled: 'Pinning is turned off for this site.', + pin_limit_reached: 'This room has reached its pin limit — unpin a message first.', + pin_room_too_large: 'This room is too large for non-admins to pin.', +} + /** * User-facing message for an error thrown by `requestWithAsyncResult`. * - * Server-side errors (`SyncError`, `AsyncError`) already carry a sanitised, - * user-safe message and are returned as-is. Wire-level failures - * (`AsyncTimeout`, `SubscriptionClosed`) get a friendlier hint that says what - * happened and what the user can do about it — the raw "async result timeout" - * isn't actionable. 
+ * Server-side errors (`SyncError`, `AsyncError`) carry the errcode envelope's + * `reason` when applicable — preferred over `message` because reasons are + * stable machine codes (the english text can change without notice). Falls + * back to `err.message` when the reason isn't in the catalog (or absent, e.g. + * a bare `Error` from a non-backend caller). Wire-level failures + * (`AsyncTimeout`, `SubscriptionClosed`) get friendlier actionable hints. */ export function formatAsyncJobError(err: unknown): string { if (!err) return '' @@ -70,8 +138,16 @@ export function formatAsyncJobError(err: unknown): string { return "The server didn't respond in time. The action may still complete — refresh to check." case ASYNC_JOB_ERROR_KINDS.SubscriptionClosed: return 'Connection interrupted before the server confirmed. Refresh to check the result.' - default: + default: { + const reason = + err instanceof AsyncJobError + ? err.reason + : (err as { reason?: string })?.reason + if (reason && REASON_COPY[reason]) { + return REASON_COPY[reason] + } return err instanceof Error ? err.message : String(err) + } } } @@ -84,18 +160,27 @@ type Envelope = | { kind: 'timeout' } /** Common shape of any sync reply we treat specially — `error` triggers - * the failure branch, `status` is the typical 'accepted'/'error' marker. */ + * the failure branch, `status` is the typical 'accepted'/'error'/'exists' + * marker. `code`/`reason`/`metadata` are the new errcode envelope fields + * (present on errors from any post-migration backend; absent on a legacy + * reply during the rollout window). */ interface SyncReplyEnvelope { error?: string status?: string + code?: ErrorCode + reason?: string + metadata?: Record } /** Common shape of any async-job result envelope we receive on the - * response subject. Both `status` and `error` may be set; status === 'error' - * takes the failure path. */ + * response subject. `status === 'error'` takes the failure path. 
+ * `code`/`reason` mirror the backend `AsyncJobResult.Code`/`Reason` fields. */ interface AsyncReplyEnvelope { status?: string error?: string + code?: ErrorCode + reason?: string + metadata?: Record } /** @@ -161,20 +246,26 @@ export async function requestWithAsyncResult( } catch (err) { cleanupSub() const msg = err instanceof Error ? err.message : String(err) - throw new AsyncJobError(msg, ASYNC_JOB_ERROR_KINDS.SyncError, err) + throw new AsyncJobError(msg, ASYNC_JOB_ERROR_KINDS.SyncError, { cause: err }) } // `sync` is generic, but we always inspect the same envelope fields. const syncEnv = sync as unknown as SyncReplyEnvelope if (syncEnv?.error) { - // DM-exists and similar "200 with error+roomId" replies are success cases - // for the caller. The caller opts into this with treatAsSuccess(reply). + // DM-exists and similar replies the caller wants to treat as success + // (legacy `{error:"dm already exists", roomId}` shape, or any other + // 200-with-error contract). The new `{status:"exists", roomId}` shape + // never has `.error`, so it skips this branch entirely. if (treatAsSuccess && treatAsSuccess(sync)) { cleanupSub() return { requestId, sync, async: null } } cleanupSub() - throw new AsyncJobError(syncEnv.error, ASYNC_JOB_ERROR_KINDS.SyncError) + throw new AsyncJobError(syncEnv.error, ASYNC_JOB_ERROR_KINDS.SyncError, { + code: syncEnv.code, + reason: syncEnv.reason, + metadata: syncEnv.metadata, + }) } let timer: ReturnType | undefined @@ -190,7 +281,7 @@ export async function requestWithAsyncResult( if (envelope.kind === 'error') { const cause = envelope.error const msg = cause instanceof Error ? 
cause.message : 'subscription error' - throw new AsyncJobError(msg, ASYNC_JOB_ERROR_KINDS.SubscriptionClosed, cause) + throw new AsyncJobError(msg, ASYNC_JOB_ERROR_KINDS.SubscriptionClosed, { cause }) } if (envelope.kind === 'closed') { throw new AsyncJobError( @@ -203,6 +294,11 @@ export async function requestWithAsyncResult( throw new AsyncJobError( asyncEnv.error || 'operation failed', ASYNC_JOB_ERROR_KINDS.AsyncError, + { + code: asyncEnv.code, + reason: asyncEnv.reason, + metadata: asyncEnv.metadata, + }, ) } cleanupSub() diff --git a/chat-frontend/src/api/auth/oidcClient.js b/chat-frontend/src/api/auth/oidcClient.js index 2f0174a89..b3a94c240 100644 --- a/chat-frontend/src/api/auth/oidcClient.js +++ b/chat-frontend/src/api/auth/oidcClient.js @@ -27,3 +27,34 @@ export function getOidcManager() { export function _resetOidcManagerForTests() { manager = null } + +/** + * Reason-aware re-login redirect. Call from catch blocks when an AsyncJobError + * carries an SSO token-expired/invalid reason. Clears any stashed OIDC session + * state and kicks the OIDC sign-in redirect; the browser navigates away, so + * the caller's catch returns and any subsequent setError(...) never paints. + * + * For dev-mode (no OIDC manager configured), no-ops — dev sessions don't carry + * SSO tokens, so this branch shouldn't fire there anyway. + */ +export function isSSOTokenInvalidError(err) { + if (!err || typeof err !== 'object') return false + const reason = err.reason + return reason === 'sso_token_expired' || reason === 'invalid_sso_token' +} + +export async function redirectToReloginOnTokenInvalid() { + try { + // Clear oidc-client-ts's stashed user + the LoginPage's siteId stash. + if (manager) { + try { await manager.removeUser() } catch { /* best-effort */ } + } + window.sessionStorage.removeItem('oidc.siteId') + const mgr = getOidcManager() + await mgr.signinRedirect() + } catch { + // If the redirect itself fails (e.g. 
test envs without window.location + // navigation), fall through; the caller's outer flow will then surface a + // generic error. + } +} diff --git a/chat-frontend/src/api/index.ts b/chat-frontend/src/api/index.ts index 40bee943d..6755ad6be 100644 --- a/chat-frontend/src/api/index.ts +++ b/chat-frontend/src/api/index.ts @@ -44,7 +44,7 @@ export { ASYNC_JOB_ERROR_KINDS, formatAsyncJobError, } from './_transport/asyncJob' -export type { AsyncJobErrorKind } from './_transport/asyncJob' +export type { AsyncJobErrorKind, ErrorCode } from './_transport/asyncJob' // Shared wire types — mirror pkg/model. Components/contexts import // these from `@/api` instead of deep-importing `@/api/types`. @@ -56,6 +56,7 @@ export type { SubscriptionUpdateAction, AsyncJobOptions, AsyncJobResult, + AsyncJobResultEnvelope, // Domain types User, Room, diff --git a/chat-frontend/src/api/types.ts b/chat-frontend/src/api/types.ts index 44ae09024..7a56dc623 100644 --- a/chat-frontend/src/api/types.ts +++ b/chat-frontend/src/api/types.ts @@ -274,13 +274,37 @@ export interface RoomKeyEvent { timestamp: number } -/** Two-phase async-job result returned by `requestWithAsyncResult`. */ +/** Two-phase async-job result returned by `requestWithAsyncResult`. The + * outer wrapper; `S` is the sync reply shape, `A` is the wire-side async + * payload (typically `AsyncJobResultEnvelope` below). */ export interface AsyncJobResult { requestId: string sync: S async: A | null } +/** + * Wire-side envelope published by room-worker on the per-request response + * subject when an async job finishes. Mirrors `pkg/model.AsyncJobResult` + * field-by-field (the strict TS-mirror rule). Use as the `A` generic on + * `AsyncJobResult` when the caller wants typed access to the worker's + * result payload. + * + * `code` and `reason` mirror the errcode envelope and are populated only + * when `status === 'error'`. Branch on `reason ?? code`; never on `error` + * text. 
See chat-frontend/CLAUDE.md "Error envelope (server contract)". + */ +export interface AsyncJobResultEnvelope { + requestId: string + operation: string + status: 'ok' | 'error' + roomId?: string + error?: string + code?: string + reason?: string + timestamp: number +} + /** Options forwarded to `requestWithAsyncResult` from the api layer. */ export interface AsyncJobOptions { /** When set, a sync reply matching this predicate is treated as diff --git a/chat-frontend/src/components/MainApp/ChatPage/LeaveRoomButton/LeaveRoomButton.jsx b/chat-frontend/src/components/MainApp/ChatPage/LeaveRoomButton/LeaveRoomButton.jsx index 3e4a73482..644d25180 100644 --- a/chat-frontend/src/components/MainApp/ChatPage/LeaveRoomButton/LeaveRoomButton.jsx +++ b/chat-frontend/src/components/MainApp/ChatPage/LeaveRoomButton/LeaveRoomButton.jsx @@ -1,5 +1,5 @@ import { useNats } from '@/context/NatsContext' -import { leaveRoom } from '@/api' +import { leaveRoom, formatAsyncJobError } from '@/api' export default function LeaveRoomButton({ room }) { const nats = useNats() @@ -11,7 +11,7 @@ export default function LeaveRoomButton({ room }) { try { await leaveRoom(nats, { roomId: room.id, siteId: room.siteId }) } catch (err) { - window.alert(`Failed to leave: ${err.message}`) + window.alert(`Failed to leave: ${formatAsyncJobError(err)}`) } } diff --git a/chat-frontend/src/components/MainApp/ChatPage/LeaveRoomButton/LeaveRoomButton.test.jsx b/chat-frontend/src/components/MainApp/ChatPage/LeaveRoomButton/LeaveRoomButton.test.jsx index e7388923b..dd376713a 100644 --- a/chat-frontend/src/components/MainApp/ChatPage/LeaveRoomButton/LeaveRoomButton.test.jsx +++ b/chat-frontend/src/components/MainApp/ChatPage/LeaveRoomButton/LeaveRoomButton.test.jsx @@ -7,6 +7,7 @@ vi.mock('@/context/NatsContext', () => ({ })) import { useNats } from '@/context/NatsContext' +import { AsyncJobError, ASYNC_JOB_ERROR_KINDS } from '@/api/_transport/asyncJob' const channelRoom = { id: 'r1', siteId: 'site-A', name: 
'general', type: 'channel' } const dmRoom = { id: 'r2', siteId: 'site-A', name: 'bob-dm', type: 'dm' } @@ -55,10 +56,13 @@ describe('LeaveRoomButton', () => { expect(request).not.toHaveBeenCalled() }) - it('alerts the user when the request fails', async () => { + it('alerts the user with the humanized reason copy when the request fails (last_owner_cannot_leave)', async () => { vi.spyOn(window, 'confirm').mockReturnValue(true) const alertSpy = vi.spyOn(window, 'alert').mockImplementation(() => {}) - const request = vi.fn().mockRejectedValue(new Error('cannot leave: you are the last owner')) + const err = new AsyncJobError('cannot leave: you are the last owner', ASYNC_JOB_ERROR_KINDS.SyncError, { + reason: 'last_owner_cannot_leave', + }) + const request = vi.fn().mockRejectedValue(err) setup(channelRoom, { request }) fireEvent.click(screen.getByRole('button', { name: /Leave/i })) await waitFor(() => expect(alertSpy).toHaveBeenCalledTimes(1)) diff --git a/chat-frontend/src/components/MainApp/ChatPage/ManageMembersDialog/MemberRoster/MemberRoster.jsx b/chat-frontend/src/components/MainApp/ChatPage/ManageMembersDialog/MemberRoster/MemberRoster.jsx index 1cee4813e..2361940b7 100644 --- a/chat-frontend/src/components/MainApp/ChatPage/ManageMembersDialog/MemberRoster/MemberRoster.jsx +++ b/chat-frontend/src/components/MainApp/ChatPage/ManageMembersDialog/MemberRoster/MemberRoster.jsx @@ -58,7 +58,7 @@ export default function MemberRoster({ room }) { setMembers(resp.members ?? 
[]) } catch (err) { if (gen !== memberListGenRef.current) return - setError(err.message) + setError(formatAsyncJobError(err)) } finally { if (gen === memberListGenRef.current) setLoading(false) } diff --git a/chat-frontend/src/components/MainApp/Sidebar/CreateRoomDialog/CreateRoomDialog.test.jsx b/chat-frontend/src/components/MainApp/Sidebar/CreateRoomDialog/CreateRoomDialog.test.jsx index 3e6657b65..ade27e479 100644 --- a/chat-frontend/src/components/MainApp/Sidebar/CreateRoomDialog/CreateRoomDialog.test.jsx +++ b/chat-frontend/src/components/MainApp/Sidebar/CreateRoomDialog/CreateRoomDialog.test.jsx @@ -16,6 +16,7 @@ vi.mock('@/context/RoomEventsContext', () => ({ import { useNats } from '@/context/NatsContext' import { useRoomSummaries } from '@/context/RoomEventsContext' +import { AsyncJobError, ASYNC_JOB_ERROR_KINDS } from '@/api/_transport/asyncJob' // Pre-populate summaries with the roomIds the success fixtures return so // the dialog's "wait for subscription.update" useEffect resolves on the @@ -250,8 +251,11 @@ describe('CreateRoomDialog', () => { }) }) - it('shows the server error on a failed create and does not close', async () => { - const requestWithAsyncResult = vi.fn().mockRejectedValue(new Error('exceeds maximum capacity (50)')) + it('shows the humanized REASON_COPY copy on a failed create (max_room_size_reached) and does not close', async () => { + const err = new AsyncJobError('exceeds maximum capacity (50)', ASYNC_JOB_ERROR_KINDS.SyncError, { + reason: 'max_room_size_reached', + }) + const requestWithAsyncResult = vi.fn().mockRejectedValue(err) useNats.mockReturnValue({ user: { account: 'alice', siteId: 'site-A' }, request: vi.fn(), @@ -262,7 +266,7 @@ describe('CreateRoomDialog', () => { render() fireEvent.change(screen.getByLabelText(/Name/i), { target: { value: 'huge' } }) fireEvent.click(screen.getByRole('button', { name: /Create/i })) - expect(await screen.findByText(/exceeds maximum capacity/i)).toBeInTheDocument() + expect(await 
screen.findByText(/at capacity/i)).toBeInTheDocument() expect(onClose).not.toHaveBeenCalled() }) }) diff --git a/chat-frontend/src/components/shared/MessageList/MessageRow/MessageActions/MessageActionMenu/MessageActionMenu.jsx b/chat-frontend/src/components/shared/MessageList/MessageRow/MessageActions/MessageActionMenu/MessageActionMenu.jsx index 00e10dc9a..0deeb681c 100644 --- a/chat-frontend/src/components/shared/MessageList/MessageRow/MessageActions/MessageActionMenu/MessageActionMenu.jsx +++ b/chat-frontend/src/components/shared/MessageList/MessageRow/MessageActions/MessageActionMenu/MessageActionMenu.jsx @@ -1,6 +1,6 @@ import { useCallback, useEffect, useRef, useState } from 'react' import { useNats } from '@/context/NatsContext' -import { fetchReadReceipt } from '@/api' +import { fetchReadReceipt, formatAsyncJobError } from '@/api' import './style.css' function formatReaderName(r) { @@ -66,7 +66,7 @@ export default function MessageActionMenu({ message, room }) { }) .catch((err) => { if (!mountedRef.current) return - setError(err?.message || 'Failed to load read receipts') + setError(formatAsyncJobError(err) || 'Failed to load read receipts') setLoading(false) }) } diff --git a/chat-frontend/src/context/NatsContext/NatsContext.jsx b/chat-frontend/src/context/NatsContext/NatsContext.jsx index 5f93f37a2..ad95b123b 100644 --- a/chat-frontend/src/context/NatsContext/NatsContext.jsx +++ b/chat-frontend/src/context/NatsContext/NatsContext.jsx @@ -2,7 +2,11 @@ import { createContext, useContext, useRef, useState, useCallback, useMemo } fro import { connect as natsConnect, StringCodec, jwtAuthenticator } from 'nats.ws' import { createUser } from 'nkeys.js' import { AUTH_URL, NATS_URL } from '@/lib/runtimeConfig' -import { requestWithAsyncResult as asyncJobRequest } from '@/api/_transport/asyncJob' +import { + requestWithAsyncResult as asyncJobRequest, + AsyncJobError, + ASYNC_JOB_ERROR_KINDS, +} from '@/api/_transport/asyncJob' export const NatsContext = 
createContext(null) @@ -49,8 +53,15 @@ export function NatsProvider({ children }) { }) if (!authResp.ok) { + // auth-service emits the errcode envelope {code, reason?, error, metadata?} + // via errhttp.Write. Older auth deployments may return {error} only — + // err.code is then undefined and consumers fall back to err.message text. const errBody = await authResp.json().catch(() => ({})) - throw new Error(errBody.error || `Auth failed: ${authResp.status}`) + throw new AsyncJobError( + errBody.error || `Auth failed: ${authResp.status}`, + ASYNC_JOB_ERROR_KINDS.SyncError, + { code: errBody.code, reason: errBody.reason, metadata: errBody.metadata }, + ) } const { natsJwt, user: userInfo } = await authResp.json() @@ -80,16 +91,27 @@ export function NatsProvider({ children }) { * @param {string} subject * @param {unknown} [data={}] JSON-serialisable payload. * @returns {Promise} Parsed JSON reply. - * @throws if not connected, the request times out (5s), or the reply - * carries `{error}` — in the last case the thrown Error's message - * is the server's user-safe error string. + * @throws {AsyncJobError} On error replies the thrown error carries + * `.code` (undefined only on legacy pre-migration replies) and + * `.reason`/`.metadata` (when the backend emits them). Branch on + * `reason ?? code`; `.message` is the user-safe text for display only. + * Wire-level failures (not connected, request timeout) still throw a plain Error. */ const request = useCallback(async (subject, data = {}) => { if (!ncRef.current) throw new Error('Not connected') const payload = sc.encode(JSON.stringify(data)) const resp = await ncRef.current.request(subject, payload, { timeout: 5000 }) const parsed = JSON.parse(sc.decode(resp.data)) - if (parsed.error) throw new Error(parsed.error) + if (parsed.error) { + // errcode envelope {code, reason?, error, metadata?}. Legacy replies + // (pre-migration backend during rollout) lack code/reason — consumers + // fall back to err.message. 
+ throw new AsyncJobError(parsed.error, ASYNC_JOB_ERROR_KINDS.SyncError, { + code: parsed.code, + reason: parsed.reason, + metadata: parsed.metadata, + }) + } return parsed }, []) diff --git a/chat-frontend/src/lib/constants.js b/chat-frontend/src/lib/constants.js index 416b55bf3..bc9c3941b 100644 --- a/chat-frontend/src/lib/constants.js +++ b/chat-frontend/src/lib/constants.js @@ -2,7 +2,7 @@ // tightly-coupled predicates that test them. Keep in sync with: // pkg/model/subscription.go (Role) // pkg/model/member.go (HistoryMode) -// room-service/helper.go (dmExistsError.Error()) +// pkg/model/event.go (CreateRoomStatusExists) export const ROLE_OWNER = 'owner' export const ROLE_MEMBER = 'member' @@ -10,14 +10,22 @@ export const ROLE_MEMBER = 'member' export const HISTORY_MODE_ALL = 'all' export const HISTORY_MODE_NONE = 'none' -// Server's "DM already exists" sync-reply error string. The dedup reply is -// shape {error: this, roomId: existingId} — a 200-with-error that callers -// treat as success (open the existing room). +// New canonical DM-exists status (post errcode migration). The backend +// returns {status: STATUS_EXISTS, roomId: <existingId>} as a SUCCESS reply. +export const STATUS_EXISTS = 'exists' + +// Legacy DM-exists error string — the pre-migration reply was the error-shaped +// {error: this, roomId: existingId}. Accepted by isDMExistsReply during the +// backend rollout window so the frontend can deploy first; removed in a +// follow-up release once the new envelope is everywhere. export const ERR_DM_ALREADY_EXISTS = 'dm already exists' -// Predicate for the DM-exists sync reply. Co-located with the constant -// because they encode the same contract; any caller that wants to treat -// dedup as success should use this rather than re-deriving the check. +// Predicate for the DM-exists sync reply. 
Accepts BOTH shapes during the +// rollout window: the new {status:"exists", roomId} success envelope and the +// legacy {error:"dm already exists", roomId} 200-with-error. Callers that +// want to treat dedup as success should use this rather than re-deriving the +// check — see plan Chapter 19 for the cutover details. export function isDMExistsReply(reply) { - return reply?.error === ERR_DM_ALREADY_EXISTS && !!reply.roomId + if (!reply || !reply.roomId) return false + return reply.status === STATUS_EXISTS || reply.error === ERR_DM_ALREADY_EXISTS } diff --git a/chat-frontend/src/lib/constants.test.js b/chat-frontend/src/lib/constants.test.js new file mode 100644 index 000000000..86a407389 --- /dev/null +++ b/chat-frontend/src/lib/constants.test.js @@ -0,0 +1,40 @@ +// Regression net for the centralized-error-codes migration: the predicate +// MUST accept both reply shapes during the backend rollout window (plan +// Ch 19). The legacy shape is removed in a follow-up release once the new +// envelope is everywhere. 
+ +import { describe, it, expect } from 'vitest' +import { + isDMExistsReply, + STATUS_EXISTS, + ERR_DM_ALREADY_EXISTS, +} from './constants' + +describe('isDMExistsReply', () => { + it('matches the new success-envelope shape {status:"exists", roomId}', () => { + expect(isDMExistsReply({ status: STATUS_EXISTS, roomId: 'r1' })).toBe(true) + }) + + it('still matches the legacy error-envelope shape {error:"dm already exists", roomId}', () => { + expect(isDMExistsReply({ error: ERR_DM_ALREADY_EXISTS, roomId: 'r1' })).toBe(true) + }) + + it('does NOT match a real failure (no roomId)', () => { + expect(isDMExistsReply({ error: ERR_DM_ALREADY_EXISTS })).toBe(false) + expect(isDMExistsReply({ status: STATUS_EXISTS })).toBe(false) + }) + + it('does NOT match an unrelated error string with a roomId', () => { + expect(isDMExistsReply({ error: 'something else', roomId: 'r1' })).toBe(false) + }) + + it('does NOT match an unrelated success status with a roomId', () => { + expect(isDMExistsReply({ status: 'accepted', roomId: 'r1' })).toBe(false) + }) + + it('handles null / undefined / non-object inputs gracefully', () => { + expect(isDMExistsReply(null)).toBe(false) + expect(isDMExistsReply(undefined)).toBe(false) + expect(isDMExistsReply({})).toBe(false) + }) +}) diff --git a/chat-frontend/src/pages/LoginPage/LoginPage.jsx b/chat-frontend/src/pages/LoginPage/LoginPage.jsx index 1b6e5b32d..0954ebf62 100644 --- a/chat-frontend/src/pages/LoginPage/LoginPage.jsx +++ b/chat-frontend/src/pages/LoginPage/LoginPage.jsx @@ -1,7 +1,8 @@ import { useState } from 'react' import { useNats } from '@/context/NatsContext' import { DEFAULT_SITE_ID, DEV_MODE } from '@/lib/runtimeConfig' -import { getOidcManager } from '@/api/auth/oidcClient' +import { getOidcManager, isSSOTokenInvalidError, redirectToReloginOnTokenInvalid } from '@/api/auth/oidcClient' +import { formatAsyncJobError } from '@/api' import './style.css' export default function LoginPage() { @@ -26,7 +27,11 @@ export default function 
LoginPage() { siteId: siteId.trim(), }) } catch (err) { - setError(err.message) + if (isSSOTokenInvalidError(err)) { + await redirectToReloginOnTokenInvalid() + return + } + setError(formatAsyncJobError(err)) } finally { setLoading(false) } @@ -42,7 +47,11 @@ export default function LoginPage() { await manager.signinRedirect() // Browser navigates away — code below this point is unreachable in prod. } catch (err) { - setError(err.message) + if (isSSOTokenInvalidError(err)) { + await redirectToReloginOnTokenInvalid() + return + } + setError(formatAsyncJobError(err)) setLoading(false) } } diff --git a/chat-frontend/src/pages/LoginPage/LoginPage.test.jsx b/chat-frontend/src/pages/LoginPage/LoginPage.test.jsx index d71043df2..3871c0fc6 100644 --- a/chat-frontend/src/pages/LoginPage/LoginPage.test.jsx +++ b/chat-frontend/src/pages/LoginPage/LoginPage.test.jsx @@ -12,6 +12,8 @@ vi.mock('@/lib/runtimeConfig', () => ({ vi.mock('@/api/auth/oidcClient', () => ({ getOidcManager: vi.fn(), + isSSOTokenInvalidError: vi.fn(() => false), + redirectToReloginOnTokenInvalid: vi.fn(() => Promise.resolve()), })) import LoginPage from './LoginPage' diff --git a/chat-frontend/src/pages/OidcCallback/OidcCallback.jsx b/chat-frontend/src/pages/OidcCallback/OidcCallback.jsx index e326d9bdb..0557aac07 100644 --- a/chat-frontend/src/pages/OidcCallback/OidcCallback.jsx +++ b/chat-frontend/src/pages/OidcCallback/OidcCallback.jsx @@ -1,6 +1,7 @@ import { useEffect, useState } from 'react' import { useNats } from '@/context/NatsContext' -import { getOidcManager } from '@/api/auth/oidcClient' +import { getOidcManager, isSSOTokenInvalidError, redirectToReloginOnTokenInvalid } from '@/api/auth/oidcClient' +import { formatAsyncJobError } from '@/api' // OidcCallback handles the post-redirect leg of the OIDC authorization-code // flow. 
It pulls the access token from oidc-client-ts, then hands it off to @@ -38,7 +39,18 @@ export default function OidcCallback({ onDone }) { } } catch (err) { if (cancelled) return - setError(err.message || String(err)) + if (isSSOTokenInvalidError(err)) { + try { + await redirectToReloginOnTokenInvalid() + return + } catch (redirectErr) { + // Redirect failed (e.g. signinRedirect rejected) — surface the + // error so the user isn't stuck on "Completing sign-in...". + setError(formatAsyncJobError(redirectErr) || String(redirectErr)) + return + } + } + setError(formatAsyncJobError(err) || String(err)) } } diff --git a/chat-frontend/src/pages/OidcCallback/OidcCallback.test.jsx b/chat-frontend/src/pages/OidcCallback/OidcCallback.test.jsx index 63992ce07..6248ddd35 100644 --- a/chat-frontend/src/pages/OidcCallback/OidcCallback.test.jsx +++ b/chat-frontend/src/pages/OidcCallback/OidcCallback.test.jsx @@ -7,6 +7,8 @@ vi.mock('@/context/NatsContext', () => ({ vi.mock('@/api/auth/oidcClient', () => ({ getOidcManager: vi.fn(), + isSSOTokenInvalidError: vi.fn(() => false), + redirectToReloginOnTokenInvalid: vi.fn(() => Promise.resolve()), })) import OidcCallback from './OidcCallback' diff --git a/docs/client-api.md b/docs/client-api.md index 64e10ab71..606cb1247 100644 --- a/docs/client-api.md +++ b/docs/client-api.md @@ -1,5 +1,22 @@ # Chat Backend — Client API Reference +> **Changelog — centralized error codes (current release).** All client-facing +> errors — over NATS sync replies, JetStream async results (`model.AsyncJobResult`), +> and HTTP — now use the same envelope: `{ "error": <message>, "code": <category>, +> "reason"?: <reason>, "metadata"?: {…} }`. `code` is **always present** and +> drives HTTP status (see §6); `reason` is the optional domain code the client +> branches on (`reason ?? code`). Three notable behavior changes: +> +> 1. 
**`POST /auth` 500** now returns `{ "code": "internal", "error": "internal error" }` — +> the real signing-failure cause is logged server-side and never sent to the client. +> 2. **room-service DM-exists** flipped from the legacy error-shaped envelope +> `{ "error": "dm already exists", "roomId": … }` to a SUCCESS reply +> `{ "status": "exists", "roomId": … }`. Clients must route on `status === "exists"`. +> A frontend predicate `isDMExistsReply` accepts BOTH shapes during the rollout +> window; the legacy fallback is removed in a follow-up. +> 3. **message-gatekeeper `not_subscribed`** now carries `code: forbidden` + `reason: not_subscribed` +> (and `large_room_post_restricted` is `forbidden` + that reason) instead of bare error strings. + This document is the integrator-facing reference for the chat backend. It covers every API a client (web, mobile, third-party) can call: @@ -160,13 +177,13 @@ Exchanges an SSO token for a signed NATS user JWT. The returned JWT is what the See [Error envelope](#6-error-envelope-reference). HTTP statuses: -| Status | Meaning | Example body | -|--------|---------|--------------| -| 400 | Missing required fields. | `{ "error": "ssoToken and natsPublicKey are required" }` | -| 400 | `natsPublicKey` is not a valid NATS user public NKey. | `{ "error": "invalid natsPublicKey format" }` | -| 401 | SSO token expired. | `{ "error": "SSO token has expired, please re-login" }` | -| 401 | SSO token invalid (bad signature, audience, etc.). | `{ "error": "invalid SSO token" }` | -| 500 | Server-side JWT signing failure. 
| `{ "error": "failed to generate NATS token" }` | +| Status | `code` | `reason` | Example body | +|--------|-------------------|---------------------|--------------| +| 400 | `bad_request` | — | `{ "code": "bad_request", "error": "ssoToken and natsPublicKey are required" }` | +| 400 | `bad_request` | — | `{ "code": "bad_request", "error": "invalid natsPublicKey format" }` | +| 401 | `unauthenticated` | `sso_token_expired` | `{ "code": "unauthenticated", "reason": "sso_token_expired", "error": "SSO token has expired, please re-login" }` | +| 401 | `unauthenticated` | `invalid_sso_token` | `{ "code": "unauthenticated", "reason": "invalid_sso_token", "error": "invalid SSO token" }` | +| 500 | `internal` | — | `{ "code": "internal", "error": "internal error" }` — the real cause is logged server-side and never sent to the client. | The returned `natsJwt` has a server-configured lifetime (default 2h). Clients should re-call `POST /auth` to refresh before it expires. @@ -232,12 +249,14 @@ The creator's account and the site come from the subject (`chat.user.{account}.r { "status": "accepted", "roomId": "01970a4f8c2d7c9aQ", "roomType": "channel" } ``` -**DM already exists.** When the client asks to create a DM/botDM that already exists, the reply is a non-standard envelope carrying the existing room ID (this is the open-or-create contract — the client should treat it as success and open the existing room): +**DM already exists.** When the client asks to create a DM/botDM that already exists, the reply is a SUCCESS reply carrying the existing room ID (open-or-create contract — the client opens the existing room): ```json -{ "error": "dm already exists", "roomId": "" } +{ "status": "exists", "roomId": "" } ``` +> **Contract change (breaking):** prior to the centralized error-codes migration this case returned the *error*-shaped envelope `{ "error": "dm already exists", "roomId": "" }`. Clients on the new release must route on `status === "exists"`. 
During the rollout window, the frontend predicate `isDMExistsReply` accepts BOTH shapes; see the migration changelog at the top of this file. + ##### Error response See [Error envelope](#6-error-envelope-reference). Returned synchronously on validation/authorization failure: @@ -252,7 +271,7 @@ See [Error envelope](#6-error-envelope-reference). Returned synchronously on val - `"exceeds maximum capacity (N): would create M members"` ```json -{ "error": "channel name is required" } +{ "code": "bad_request", "error": "channel name is required" } ``` ##### Triggered events — success path @@ -320,7 +339,7 @@ The fields `requesterId`, `requesterAccount`, and `timestamp` on the Go `AddMemb See [Error envelope](#6-error-envelope-reference). Returned synchronously when validation or authorization fails (e.g. requester not in room, room is full, room is restricted and requester is not owner, or a `users` entry is a bot — rejected with `"bots cannot be added to a channel"`). Any `orgs` entry that matches zero users (no user with `sectId == orgId` or `deptId == orgId`) is rejected with `org "": invalid org`, and any `users` entry that has no matching user document is rejected with `user "": user not found` (each wrapped with the offending account/org ID) — in both cases the request is not queued and no members are added. ```json -{ "error": "room is at maximum capacity (200): cannot add 5 members to room with 198 existing" } +{ "code": "conflict", "reason": "max_room_size_reached", "error": "room is at maximum capacity" } ``` ##### Triggered events — success path @@ -335,9 +354,13 @@ See [Error envelope](#6-error-envelope-reference). Returned synchronously when v | `operation` | string | One of `"room.create"`, `"room.member.add"`, `"room.member.remove"`, `"room.member.remove_org"`. | | `status` | string | `"ok"` or `"error"`. | | `roomId` | string | Optional. The affected room. | -| `error` | string | Optional. Sanitized message; present only when `status="error"`. 
| +| `error` | string | Optional. User-safe message; present only when `status="error"`. | +| `code` | string | Optional. The errcode category (`bad_request`, `not_found`, `forbidden`, `conflict`, `internal`, …) — same closed set as sync replies (see §6). Present only when `status="error"`. | +| `reason` | string | Optional. Domain reason from `pkg/errcode/codes_room.go` (e.g. `not_room_member`, `max_room_size_reached`) when the frontend needs to distinguish cases. Present only when `status="error"` and a reason was attached server-side. | | `timestamp` | number | Milliseconds since Unix epoch (UTC). | +Success example: + ```json { "requestId": "01970a4f-8c2d-7c9a-abcd-e0123456789f", @@ -348,6 +371,20 @@ See [Error envelope](#6-error-envelope-reference). Returned synchronously when v } ``` +Error example (e.g. requester not in room): + +```json +{ + "requestId": "01970a4f-8c2d-7c9a-abcd-e0123456789f", + "operation": "room.member.add", + "status": "error", + "error": "only room members can list members", + "code": "forbidden", + "reason": "not_room_member", + "timestamp": 1746518400456 +} +``` + **2. `chat.user.{newMember}.event.subscription.update`** — one event per **newly subscribed** member (not the requester, not existing members, not org→individual upgrades). **`subscription.update` schema** (shared by Add Members, Remove Member, Update Member Role): @@ -430,7 +467,7 @@ Exactly one of `account` or `orgId` must be set. The fields `requester`, `roomTy See [Error envelope](#6-error-envelope-reference). Returned synchronously when validation or authorization fails (e.g. neither or both of `account`/`orgId` set, requester is not an owner, target is the last member, or org member cannot leave individually). ```json -{ "error": "exactly one of account or orgId must be set" } +{ "code": "bad_request", "error": "exactly one of account or orgId must be set" } ``` ##### Triggered events — success path @@ -512,7 +549,7 @@ See [Error envelope](#6-error-envelope-reference). 
Returned synchronously when v - Promote attempt on an org-only member (individual subscription required). ```json -{ "error": "only owners can update roles" } +{ "code": "forbidden", "error": "only owners can update roles" } ``` ##### Triggered events — success path @@ -654,7 +691,7 @@ See [Error envelope](#6-error-envelope-reference). Common errors: - A malformed subject surfaces as a generic `"internal error"` (the specific reason is sanitized away). Not normally reachable — the wildcard subscription guarantees a well-formed subject. ```json -{ "error": "only room members can list members" } +{ "code": "forbidden", "reason": "not_room_member", "error": "only room members can list members" } ``` ##### Behaviour notes @@ -838,7 +875,7 @@ See [Error envelope](#6-error-envelope-reference). Common errors: - `"invalid request: messageId is required"` — empty `messageId`. ```json -{ "error": "only the message sender can view read receipts" } +{ "code": "forbidden", "error": "only the message sender can view read receipts" } ``` ##### Behaviour notes @@ -908,7 +945,7 @@ Empty. Send `{}` or no payload. See [Error envelope](#6-error-envelope-reference). ```json -{ "error": "invalid org" } +{ "code": "bad_request", "error": "invalid org" } ``` ##### Triggered events — success path @@ -939,7 +976,16 @@ The paginated read RPCs (Load History, Load Next, Load Surrounding, Get Thread M Both are **hints, not authority**: the server sanitizes them (ignores values that are negative, in the future, or mutually inconsistent) and falls back to a MongoDB fetch when a value is missing or fails sanitization. A client that does not have these values should omit `meta` entirely — correctness is unaffected, only an extra lookup is incurred. 
-Common error strings across these RPCs: `"not subscribed to room"`, `"unable to verify room access"`, `"room not found"`, `"invalid pagination cursor"`, `"message not found"`, and (for access-restricted readers) `"message is outside access window"` / `"thread is outside access window"`. +Common error envelopes across these RPCs (see §6 for the full shape): + +| `code` | When | +|---------------|------| +| `forbidden` | `"not subscribed to room"`, or (for access-restricted readers) `"message is outside access window"` / `"thread is outside access window"`. | +| `not_found` | `"room not found"`, `"message not found"`. | +| `bad_request` | `"invalid pagination cursor"` (malformed `cursor` value), other request-validation failures. | +| `internal` | `"internal error"` — bubbled from store/publisher failures; real cause logged server-side, not sent. | + +The `forbidden` cases carry a domain `reason` (`not_subscribed`, `outside_access_window` — see the §6 reason catalog); the other codes carry none. Clients branch on `reason ?? code` (and use the human-readable `error` only for display, never logic). #### Message schema @@ -1075,7 +1121,7 @@ Live reaction events (`MessageReactedPayload`) carry a single-actor delta (`{sho See [Error envelope](#6-error-envelope-reference). ```json -{ "error": "not subscribed to room" } +{ "code": "forbidden", "reason": "not_subscribed", "error": "not subscribed to room" } ``` ##### Triggered events — success path @@ -1248,7 +1294,7 @@ A single `Message` object. See [Message schema](#message-schema). See [Error envelope](#6-error-envelope-reference). ```json -{ "error": "message not found" } +{ "code": "not_found", "error": "message not found" } ``` ##### Triggered events — success path @@ -1300,7 +1346,15 @@ Only the original sender may edit a message. ##### Error response -See [Error envelope](#6-error-envelope-reference). Common errors: `"only the sender can edit"`, `"message not found"`, `"newMsg must not be empty"`, `"newMsg exceeds maximum size"`, `"failed to edit message"`. +See [Error envelope](#6-error-envelope-reference). 
Errors: + +| `code` | `error` | When | +|--------|---------|------| +| `forbidden` | `only the sender can edit` | Caller is not the message author. | +| `not_found` | `message not found` | `messageId` does not exist (or is outside the access window). | +| `bad_request` | `newMsg must not be empty` | Empty `newMsg`. | +| `bad_request` | `newMsg exceeds maximum size` | `newMsg` exceeds the configured cap. | +| `internal` | `internal error` | Store/publish failure; real cause logged server-side. | ##### Triggered events — success path @@ -1379,7 +1433,13 @@ Soft-deletes a message (sets `deleted=true` on the row; row is preserved for aud ##### Error response -See [Error envelope](#6-error-envelope-reference). Common errors: `"only the sender can delete"`, `"message not found"`, `"failed to delete message"`. +See [Error envelope](#6-error-envelope-reference). Errors: + +| `code` | `error` | When | +|--------|---------|------| +| `forbidden` | `only the sender can delete` | Caller is not the message author. | +| `not_found` | `message not found` | `messageId` does not exist (or is outside the access window). | +| `internal` | `internal error` | Store/publish failure; real cause logged server-side. | ##### Triggered events — success path @@ -1455,18 +1515,14 @@ Pins a message in the room. Idempotent — pinning an already-pinned message suc See [Error envelope](#6-error-envelope-reference). Common errors: -| Code | Message | Cause | -|------|---------|-------| -| `forbidden` | `"pinning is disabled"` | Global kill-switch (`PIN_ENABLED=false`) is off. | -| `forbidden` | `"not subscribed to room"` | Caller has no subscription to the room. | -| `forbidden` | `"room is too large to pin"` | Room member count exceeds the configured `LARGE_ROOM_THRESHOLD`. Owners, admins, and bot accounts are exempt. | -| `forbidden` | `"room pin limit reached"` | Room already has `MAX_PINNED_PER_ROOM` pinned messages (default 10). Hard cap — no role-based bypass. 
Unpin an existing message to free a slot. | -| `not_found` | `"message not found"` | Message does not exist, belongs to a different room, or has been deleted. | -| `internal` | `"failed to retrieve message"` | Cassandra read failed while looking up the target message. | -| `internal` | `"unable to verify room access"` | Failed to look up the caller's subscription. | -| `internal` | `"unable to verify room size"` | Failed to read the room member count. | -| `internal` | `"unable to verify pin count"` | Failed to read the current pinned-messages count for the room. | -| `internal` | `"failed to pin message"` | Write to the message store failed. | +| Code | Reason | Message | Cause | +|------|--------|---------|-------| +| `forbidden` | `pin_disabled` | `"pinning is disabled"` | Global kill-switch (`PIN_ENABLED=false`) is off. | +| `forbidden` | `not_subscribed` | `"not subscribed to room"` | Caller has no subscription to the room. | +| `forbidden` | `pin_room_too_large` | `"room is too large to pin"` | Room member count exceeds the configured `LARGE_ROOM_THRESHOLD`. Owners, admins, and bot accounts are exempt. | +| `forbidden` | `pin_limit_reached` | `"room pin limit reached"` | Room already has `MAX_PINNED_PER_ROOM` pinned messages (default 10). Hard cap — no role-based bypass. Unpin an existing message to free a slot. | +| `not_found` | — | `"message not found"` | Message does not exist, belongs to a different room, or has been deleted. | +| `internal` | — | `"internal error"` | Mongo/Cassandra read or write failed (subscription lookup, room user count, pinned-messages count, message lookup, or pin write). Specific cause appears in the server log. | ##### Triggered events — success path @@ -1564,16 +1620,13 @@ Unpins a message in the room. Idempotent — unpinning a message that is not pin See [Error envelope](#6-error-envelope-reference). 
Common errors: -| Code | Message | Cause | -|------|---------|-------| -| `forbidden` | `"pinning is disabled"` | Global kill-switch (`PIN_ENABLED=false`) is off. | -| `forbidden` | `"not subscribed to room"` | Caller has no subscription to the room. | -| `forbidden` | `"room is too large to pin"` | Room member count exceeds the configured `LARGE_ROOM_THRESHOLD`. Owners, admins, and bot accounts are exempt. | -| `not_found` | `"message not found"` | Message does not exist or belongs to a different room. Unlike pin, **soft-deleted messages are still unpinnable** — a pinned message that was later deleted retains its slot in `pinned_messages_by_room`, and unpin is the only way to free it. | -| `internal` | `"failed to retrieve message"` | Cassandra read failed while looking up the target message. | -| `internal` | `"unable to verify room access"` | Failed to look up the caller's subscription. | -| `internal` | `"unable to verify room size"` | Failed to read the room member count. | -| `internal` | `"failed to unpin message"` | Write to the message store failed. | +| Code | Reason | Message | Cause | +|------|--------|---------|-------| +| `forbidden` | `pin_disabled` | `"pinning is disabled"` | Global kill-switch (`PIN_ENABLED=false`) is off. | +| `forbidden` | `not_subscribed` | `"not subscribed to room"` | Caller has no subscription to the room. | +| `forbidden` | `pin_room_too_large` | `"room is too large to pin"` | Room member count exceeds the configured `LARGE_ROOM_THRESHOLD`. Owners, admins, and bot accounts are exempt. | +| `not_found` | — | `"message not found"` | Message does not exist or belongs to a different room. Unlike pin, **soft-deleted messages can still be unpinned** — a pinned message that was later deleted retains its slot in `pinned_messages_by_room`, and unpin is the only way to free it. | +| `internal` | — | `"internal error"` | Mongo/Cassandra read or write failed (subscription lookup, room user count, message lookup, or unpin write). 
Specific cause appears in the server log. | ##### Triggered events — success path @@ -1698,12 +1751,11 @@ The response is cursor-paginated (`cursor`/`limit` in the request, `nextCursor`/ See [Error envelope](#6-error-envelope-reference). Common errors: -| Code | Message | Cause | -|------|---------|-------| -| `forbidden` | `"not subscribed to room"` | Caller has no subscription to the room. | -| `bad_request` | `"invalid pagination cursor"` | The `cursor` value is not a valid base64 page-state token. | -| `internal` | `"unable to verify room access"` | Failed to look up the caller's subscription. | -| `internal` | `"failed to list pinned messages"` | Read from the message store failed. | +| Code | Reason | Message | Cause | +|------|--------|---------|-------| +| `forbidden` | `not_subscribed` | `"not subscribed to room"` | Caller has no subscription to the room. | +| `bad_request` | — | `"invalid pagination cursor"` | The `cursor` value is not a valid base64 page-state token. | +| `internal` | — | `"internal error"` | Mongo/Cassandra read failed (subscription lookup or pinned-messages page). Specific cause appears in the server log. | ##### Triggered events — success path @@ -2058,7 +2110,7 @@ See [Error envelope](#6-error-envelope-reference). | `bad_request` | Validation failures (`query` missing/blank, negative `size`/`offset`). | | `internal` | Backend failure (transient or permanent). The raw error is never leaked to the client. | -These are documentation categories. The wire error envelope is `{ "error": "", "code": "" }` per `pkg/model.ErrorResponse` (see §5). +These are documentation categories. The wire error envelope shape — `{ "error": "", "code": "", "reason"?: "", "metadata"?: {…} }` — is the same across every endpoint and is defined canonically in §6 (Error envelope reference). --- @@ -2196,22 +2248,23 @@ The gatekeeper does **not** populate `mentions`, `editedAt`/`updatedAt`, `tshow` Delivered on `chat.user.{account}.response.{requestId}`. 
See [Error envelope](#6-error-envelope-reference). Errors: -| Wire `error` | `code` | Cause | -|--------------|--------|-------| -| `invalid requestId "…": must be a hyphenated UUID` | — | Empty/malformed `requestId`. (Reachable only when `requestId` is non-empty but malformed; an empty `requestId` leaves no reply subject, so the client just times out.) | -| `invalid message ID "…": must be a 20-char base62 string` | — | `id` is not valid base62. | -| `invalid thread parent message ID "…": …` | — | `threadParentMessageId` is not a valid message ID. | -| `content must not be empty` | — | Empty `content`. | -| `content exceeds maximum size of 20480 bytes` | — | `content` > 20 KiB. | -| `validate thread parent fields: threadParentMessageCreatedAt is required when threadParentMessageId is set` | — | Missing thread-parent timestamp. | -| `user {account} is not subscribed to room {roomID}` | — | Sender is not a member. | -| `posting is restricted to owners and admins in this room` | `large_room_post_restricted` | Non-owner/admin/bot posting a top-level message in a room above the large-room threshold (thread replies are exempt). | -| `quoted parent {id} thread context mismatch: …` | — | A quoted message must be in the same thread context (main-room or the same thread) as the new message. | +| Wire `error` | `code` | `reason` | Cause | +|--------------|--------|----------|-------| +| `invalid requestId "…": must be a hyphenated UUID` | `bad_request` | — | Empty/malformed `requestId`. (Reachable only when `requestId` is non-empty but malformed; an empty `requestId` leaves no reply subject, so the client just times out.) | +| `invalid message ID "…": must be a 20-char base62 string` | `bad_request` | — | `id` is not valid base62. | +| `invalid thread parent message ID "…": …` | `bad_request` | — | `threadParentMessageId` is not a valid message ID. | +| `content must not be empty` | `bad_request` | — | Empty `content`. 
| +| `content exceeds maximum size of 20480 bytes` | `bad_request` | — | `content` > 20 KiB. | +| `validate thread parent fields: threadParentMessageCreatedAt is required when threadParentMessageId is set` | `bad_request` | — | Missing thread-parent timestamp. | +| `not subscribed` | `forbidden` | `not_subscribed` | Sender is not a member of the room. | +| `posting is restricted to owners and admins in this room` | `forbidden` | `large_room_post_restricted` | Non-owner/admin/bot posting a top-level message in a room above the large-room threshold (thread replies are exempt). | +| `quoted parent {id} not found` | `not_found` | — | The quoted message lookup failed (deleted, cross-room, …). | +| `quoted parent {id} thread context mismatch: …` | `bad_request` | — | A quoted message must be in the same thread context (main-room or the same thread) as the new message. | **Delivery guarantee:** every validation/authorization failure — including a `siteID` mismatch and a malformed `msg.send` subject — is replied to the client on the response subject and the JetStream message is acked (not retried). The error reply requires a routable response subject, so it can only be sent when the `{account}` segment is recoverable from the subject and the payload carries a valid hyphenated-UUID `requestId`; if either is not recoverable (a truly malformed subject or missing/invalid `requestId`) no reply is possible and the client falls back to a request timeout. **Only infrastructure failures** (store/publish errors) are nak'd and **redelivered by JetStream** — these produce no immediate reply. ```json -{ "error": "content must not be empty" } +{ "code": "bad_request", "error": "content must not be empty" } ``` #### Triggered events — success path @@ -2296,7 +2349,7 @@ A `RoomEvent` (same struct as above) published once per DM participant. 
Recipien When validation fails, the gatekeeper publishes the error envelope to `chat.user.{account}.response.{requestId}` and **no downstream events are emitted**. The client should display the error and offer a retry. ```json -{ "error": "content must not be empty" } +{ "code": "bad_request", "error": "content must not be empty" } ``` --- @@ -2412,19 +2465,73 @@ permanently gone. ## 6. Error envelope reference -Every error response — over NATS reply subjects and HTTP — uses the same envelope: +Every error response — NATS reply subjects, JetStream async results, and HTTP — uses the same envelope: ```json -{ "error": "", "code": "" } +{ + "error": "", + "code": "", + "reason": "", + "metadata": { "": "" } +} ``` -| Field | Type | Notes | -|---------|--------|-------| -| `error` | string | Human-readable, sanitized at the service boundary. Do not parse or pattern-match against the text. | -| `code` | string | Optional. Machine-readable category emitted by services using `natsrouter`'s typed `RouteError` (e.g. `bad_request`, `not_found`, `forbidden`, `conflict`, `internal`, `unavailable`). The message-send flow also emits `large_room_post_restricted`. Absent for plain `natsutil.ReplyError` responses. | - -**NATS errors** are sent on the standard reply subject (`_INBOX.>` for §3 methods, `chat.user.{account}.response.{requestID}` for §4) via `natsutil.ReplyError` (no `code`) or `natsrouter`'s typed error replies (with `code`). - -**HTTP errors** (auth-service §2.2) use the same shape with an HTTP status code in the response line. - -Clients should rely on the presence/absence of the `error` field — and on context (HTTP status, or whether a reply parses as a success-shape) — rather than on the error text. When `code` is present, prefer matching against the documented constant rather than the human-readable message. +| Field | Type | Notes | +|------------|-----------------------|-------| +| `error` | string | Human-readable, user-safe (never carries an internal cause). 
Do not parse or pattern-match against the text. | +| `code` | string | **Always present.** One of the 8 categories below. Drives HTTP status. | +| `reason` | string (optional) | Domain-specific machine code (e.g. `max_room_size_reached`, `not_subscribed`). When present, the client should branch on `reason ?? code`. | +| `metadata` | object (optional) | Free-form `string→string` map for structured detail (e.g. `{ "limit": "500" }`). | + +### Generic `code` values (always present) → HTTP status + +| `code` | HTTP | When | +|----------------------|------|------| +| `bad_request` | 400 | Malformed/invalid input or unsupported parameters. | +| `unauthenticated` | 401 | Missing/expired/invalid credentials. | +| `forbidden` | 403 | Authenticated but not permitted. | +| `not_found` | 404 | Target resource does not exist. | +| `conflict` | 409 | State conflict (duplicate, capacity exceeded, last-owner removal, …). | +| `too_many_requests` | 429 | Per-caller rate limit / quota exceeded. | +| `unavailable` | 503 | Transient server saturation/timeout (admission, expand timeout). | +| `internal` | 500 | Unclassified server-side fault. The real cause is logged server-side only and never sent to the client. 
| + +### `reason` catalog (present today) + +| `reason` | Typical `code` | Emitted by | +|--------------------------------|---------------|------------| +| `max_room_size_reached` | conflict | room-service create/add (room capacity exceeded) | +| `not_room_member` | forbidden | room-service / room-worker (actor not a member) | +| `not_room_owner` | forbidden | room-service role/admin paths | +| `last_owner_cannot_leave` | conflict | room-service leave | +| `bot_in_channel` | bad_request | room-service member-add (bot in a channel room) | +| `bot_not_available` | not_found | room-service member-add (unknown bot) | +| `user_not_found` | not_found | room-service / room-worker (account does not resolve to a user) | +| `invalid_org` | bad_request | room-service create/add (orgId does not resolve to any users) | +| `self_dm` | bad_request | room-service create (DM to yourself) | +| `last_member_cannot_remove` | conflict | room-service remove-member (would empty the room) | +| `target_not_member` | bad_request | room-service role-update (target is not a room member) | +| `already_owner` | conflict | room-service role-update (promote a current owner) | +| `cannot_demote_last_owner` | conflict | room-service role-update (demote the last owner) | +| `promote_requires_individual` | bad_request | room-service role-update (only individual members can be owners) | +| `large_room_post_restricted` | forbidden | message-gatekeeper (non-owner/admin posting in a large room) | +| `not_subscribed` | forbidden | message-gatekeeper / history-service (caller not subscribed) | +| `outside_access_window` | forbidden | history-service (subscribed but message predates HSS) | +| `pin_disabled` | forbidden | history-service pin/unpin/list (kill-switch `PIN_ENABLED=false`) | +| `pin_limit_reached` | forbidden | history-service pin (room at `MAX_PINNED_PER_ROOM` hard cap) | +| `pin_room_too_large` | forbidden | history-service pin/unpin (non-owner/admin/bot in a room above `LARGE_ROOM_THRESHOLD`) 
| +| `sso_token_expired` | unauthenticated | auth-service `POST /auth` | +| `invalid_sso_token` | unauthenticated | auth-service `POST /auth` | +| `invalid_request` | bad_request | auth-service (body parse / required field missing) | +| `invalid_nkey` | bad_request | auth-service (natsPublicKey format) | +| `missing_fields` | bad_request | auth-service (ssoToken/account/natsPublicKey missing) | + +### Where envelopes are sent + +- **NATS sync replies** — on the reply subject for §3/§4 RPCs. +- **JetStream async results** — `model.AsyncJobResult` carries the same `code` + `reason` fields when `status == "error"`, so a failed async job is surfaced the same way as a sync error. +- **HTTP** — auth-service `POST /auth` writes the envelope as the response body with the matching HTTP status from the table above. + +### Client branching guidance + +Compute the trigger as `reason ?? code` and branch on that. Use `code` for generic copy ("you don't have permission", "service unavailable, try again"), `reason` for endpoint-specific UX (open the "room is full" dialog on `max_room_size_reached`; redirect to re-login on `sso_token_expired`/`invalid_sso_token`; surface "join the room first" on `not_subscribed`). Never branch on the `error` text — message wording can change without notice. diff --git a/docs/errcode-nats-talk.md b/docs/errcode-nats-talk.md new file mode 100644 index 000000000..9d70d37a5 --- /dev/null +++ b/docs/errcode-nats-talk.md @@ -0,0 +1,419 @@ +# A Unified Error Contract over NATS Request/Reply + +## `pkg/errcode` — one envelope from handler to frontend + +> Every service speaks the same contract. Three of them as a representative cross-section: +> **room-service** uses raw core request/reply — the wiring is explicit. +> **history-service** uses `pkg/natsrouter` — the wiring is handled by the framework. +> **auth-service** uses Gin/HTTP — the same error, a different boundary adapter. +> Whatever the transport, business logic returns the *same* typed error. 
+ +--- + +## The error response structure — every field + +One shape on the wire, owned by `errcode.Error` (Go) — `cause` is unexported, so JSON **cannot** leak it: + +```go +type Error struct { + Code Code `json:"code"` // ALWAYS present — e.g. "not_found" + Reason Reason `json:"reason,omitempty"` // optional domain code — e.g. "not_subscribed" + Message string `json:"error"` // user-safe text — e.g. "message not found" + Metadata map[string]string `json:"metadata,omitempty"` // optional structured detail — e.g. {"max_size":"500"} + cause error // UNEXPORTED → never serialized, server-log only — e.g. fmt.Errorf("get room: %w", mongo.ErrNoDocuments) +} +``` + +| Field | Wire key | Always? | What it's for | +|---|---|---|---| +| `Code` | `code` | ✅ yes | Closed 8-value set (`not_found`, `forbidden`, …). Drives HTTP status + UX category. | +| `Reason` | `reason` | optional | Open per-service domain code (`not_subscribed`). **The thing the frontend keys off.** | +| `Message` | `error` | ✅ yes | Human text. **Display only — never key off it** (wording changes). | +| `Metadata` | `metadata` | optional | Structured key/values (e.g. limits) when the UI needs them. | +| `cause` | — | never | The infra error. Logged once server-side by `Classify`; **never** on the wire. | + +```jsonc +{ "error": "message not found", "code": "not_found" } +{ "error": "not subscribed to room","code": "forbidden", "reason": "not_subscribed" } +{ "error": "internal error", "code": "internal" } // fmt.Errorf(...) collapsed; cause hidden +``` + +--- + +## Mental model + +```text +Business logic returns → errcode.NotFound("...") // typed, user-facing + → fmt.Errorf("get room: %w", e) // infra → collapses to "internal" + +The boundary (errnats.Reply / router) does the rest: classify, log once, marshal envelope. +``` + +You almost only ever touch the **left column**. The boundary call is *one line, written once per handler file*. 
+ +--- + +## The eight constructors — the name is the HTTP/wire category + +```go +errcode.BadRequest("...") // 400 errcode.Conflict("...") // 409 +errcode.Unauthenticated("...") // 401 errcode.TooManyRequests("...") // 429 +errcode.Forbidden("...") // 403 errcode.Unavailable("...") // 503 +errcode.NotFound("...") // 404 errcode.Internal("...") // 500 +``` + +Options (only when needed): `WithReason(...)` · `WithCause(infraErr)` · `WithMetadata(...)` + +--- + +# Example A — room-service (raw core req/reply) + +## A1 · Subscription and handler wiring + +```go +// RegisterCRUD registers NATS request/reply handlers with a queue group. +func (h *Handler) RegisterCRUD(nc *otelnats.Conn) error { + const queue = "room-service" + if _, err := nc.QueueSubscribe(subject.MemberRoleUpdateWildcard(h.siteID), queue, h.natsUpdateRole); err != nil { + return fmt.Errorf("subscribe member role update: %w", err) + } + // ...12 more subjects, same shape... +} + +func (h *Handler) natsUpdateRole(m otelnats.Msg) { + ctx, err := wrappedCtx(m) // ← validate request-id, seed log ctx + if err != nil { + errnats.Reply(ctx, m.Msg, err) // ← THE boundary call + return + } + resp, err := h.handleUpdateRole(ctx, m.Msg.Subject, m.Msg.Data) + if err != nil { + errnats.Reply(ctx, m.Msg, err) // ← same one line on every error + return + } + if err := m.Msg.Respond(resp); err != nil { + slog.Error("failed to respond to update-role message", "error", err) + } +} +``` + +`wrappedCtx` is the only service-specific helper — strict request-id (dedup-critical path): + +```go +func wrappedCtx(m otelnats.Msg) (context.Context, error) { + ctx, id, err := natsutil.RequireRequestID(m.Context(), m.Msg.Header, m.Msg.Subject) + if err != nil { + return m.Context(), err // BadRequest → caller replies it + } + return errcode.WithLogValues(ctx, "request_id", id), nil +} +``` + +--- + +## A2 · Business logic returns typed errors + +```go +func (h *Handler) handleUpdateRole(ctx context.Context, subj string, data []byte) 
([]byte, error) { + requester, roomID, ok := subject.ParseUserRoomSubject(subj) + if !ok { + return nil, fmt.Errorf("invalid subject: %s", subj) // infra-ish → "internal", subject never leaks + } + var req model.UpdateRoleRequest + if err := json.Unmarshal(data, &req); err != nil { + return nil, errcode.BadRequest("invalid request") // 400 + } + room, err := h.store.GetRoom(ctx, roomID) + if err != nil { + return nil, fmt.Errorf("get room: %w", err) // DB down → "internal" + } + if room.Type != model.RoomTypeChannel { + return nil, errRoomTypeGuard // sentinel (see A3) + } + if !hasRole(requesterSub.Roles, model.RoleOwner) { + return nil, errOnlyOwners // 403 + reason + } + if req.NewRole == model.RoleOwner && hasRole(target.Subscription.Roles, model.RoleOwner) { + return nil, errAlreadyOwner // 409 + reason + } + // ...happy path: publish to stream, return accepted... +} +``` + +No logging, no marshalling, no status codes. Just `return`. + +--- + +## A3 · Sentinels — define repeated errors once (`room-service/helper.go`) + +```go +var ( + errInvalidRole = errcode.BadRequest("invalid role: must be owner or member") + errOnlyOwners = errcode.Forbidden("only owners can update roles", errcode.WithReason(errcode.RoomNotOwner)) + errAlreadyOwner = errcode.Conflict ("user is already an owner", errcode.WithReason(errcode.RoomAlreadyOwner)) + errCannotDemoteLast = errcode.Conflict ("cannot demote the last owner", errcode.WithReason(errcode.RoomCannotDemoteLastOwner)) + errRoomTypeGuard = errcode.BadRequest("role update is only allowed in channel rooms", + errcode.WithReason(errcode.RoomNonChannelOperation)) +) +``` + +Return the **singleton** at every site → `errors.Is(err, errOnlyOwners)` matches everywhere, and the frontend gets a stable `reason`. 
+ +--- + +# Example B — history-service (`pkg/natsrouter`) + +## B1 · Typed handler registration — no wiring required + +```go +func (s *HistoryService) RegisterHandlers(r *natsrouter.Router, siteID string) { + natsrouter.Register(r, subject.MsgHistoryPattern(siteID), s.LoadHistory) + natsrouter.Register(r, subject.MsgGetPattern(siteID), s.GetMessageByID) + natsrouter.Register(r, subject.MsgThreadPattern(siteID), s.GetThreadMessages) + // ... +} +``` + +Handler signature is `func(c *natsrouter.Context, req T) (*R, error)`. The router unmarshals the body, **calls `errnats.Reply` for you on error**, and `ReplyJSON`s on success: + +```go +// pkg/natsrouter/register.go — written ONCE, for every service +func Register[Req, Resp any](r *Router, pattern string, fn func(c *Context, req Req) (*Resp, error)) { + handler := func(c *Context) { + var req Req + if err := json.Unmarshal(c.Msg.Data, &req); err != nil { + replyErr(c, errcode.BadRequest("invalid request payload", errcode.WithCause(err))) + return + } + resp, err := fn(c, req) + if err != nil { replyErr(c, err); return } // ← the boundary, automatic + c.ReplyJSON(resp) + } + r.addRoute(pattern, []HandlerFunc{handler}) +} +``` + +--- + +## B2 · A handler — identical return style to room-service + +```go +func (s *HistoryService) GetMessageByID(c *natsrouter.Context, req models.GetMessageByIDRequest) (*models.Message, error) { + account := c.Param("account") // subject token, not a manual parse + roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) + + accessSince, err := s.getAccessSince(c, account, roomID) // c IS the context.Context + if err != nil { + return nil, err // already typed — just bubble it + } + msg, err := s.findMessage(c, roomID, req.MessageID) + if err != nil { + return nil, err + } + if accessSince != nil && msg.CreatedAt.Before(*accessSince) { + return nil, errcode.Forbidden("message is outside access window", + errcode.WithReason(errcode.MessageOutsideAccessWindow)) 
// 403 + reason + } + return msg, nil +} +``` + +--- + +## B3 · Helpers — the full vocabulary in one place + +```go +// 403 with a reason the frontend keys off +func (s *HistoryService) getAccessSince(ctx context.Context, account, roomID string) (*time.Time, error) { + accessSince, subscribed, err := s.subscriptions.GetHistorySharedSince(ctx, account, roomID) + if err != nil { + return nil, fmt.Errorf("verifying room access for %s/%s: %w", account, roomID, err) // infra → internal + } + if !subscribed { + return nil, errcode.Forbidden("not subscribed to room", errcode.WithReason(errcode.MessageNotSubscribed)) + } + return accessSince, nil +} + +// 404 — distinct "not found" from "bad input" +func (s *HistoryService) findMessage(ctx context.Context, roomID, messageID string) (*models.Message, error) { + if messageID == "" { + return nil, errcode.BadRequest("messageId is required") // 400 + } + msg, err := s.msgReader.GetMessageByID(ctx, messageID) + if err != nil { + return nil, fmt.Errorf("retrieving message %s: %w", messageID, err) // infra → internal + } + if msg == nil || msg.RoomID != roomID { + return nil, errcode.NotFound("message not found") // 404 + } + return msg, nil +} + +// WithCause: keep the parse error server-side, generic message to the client +func parsePageRequest(cursor string, limit int) (cassrepo.PageRequest, error) { + q, err := cassrepo.ParsePageRequest(cursor, limit) + if err != nil { + return cassrepo.PageRequest{}, errcode.BadRequest("invalid pagination cursor", errcode.WithCause(err)) + } + return q, nil +} +``` + +--- + +# Example C — auth-service (Gin / HTTP) + +The same `*errcode.Error` also drives the HTTP boundary — the only change is the adapter: `errhttp.Write` instead of `errnats.Reply`. The constructor name maps straight to the HTTP status. 
+ +## C1 · Routes and request-id middleware + +```go +func registerRoutes(r *gin.Engine, h *AuthHandler) { + r.POST("/auth", h.HandleAuth) + r.GET("/healthz", h.HandleHealth) +} + +// Same request-id primitive as the NATS path (idgen.ResolveRequestID), +// so one ID flows HTTP → NATS → logs. +func requestIDMiddleware() gin.HandlerFunc { + return func(c *gin.Context) { + id, _ := idgen.ResolveRequestID(c.GetHeader(natsutil.RequestIDHeader)) + c.Set("request_id", id) + c.Request = c.Request.WithContext(natsutil.WithRequestID(c.Request.Context(), id)) + c.Header(natsutil.RequestIDHeader, id) + c.Next() + } +} +``` + +--- + +## C2 · The handler — same return style, `errhttp.Write` at the boundary + +```go +func (h *AuthHandler) HandleAuth(c *gin.Context) { + ctx := errcode.WithLogValues(c.Request.Context(), "request_id", c.GetString("request_id")) + + var req authRequest + if err := c.ShouldBindJSON(&req); err != nil { + errhttp.Write(ctx, c, errcode.BadRequest("ssoToken and natsPublicKey are required", + errcode.WithReason(errcode.AuthMissingFields))) // 400 + reason + return + } + claims, err := h.validator.Validate(ctx, req.SSOToken) + if err != nil { + if errors.Is(err, pkgoidc.ErrTokenExpired) { + errhttp.Write(ctx, c, errcode.Unauthenticated("SSO token has expired, please re-login", + errcode.WithReason(errcode.AuthTokenExpired))) // 401 + reason + return + } + // WithCause keeps the real OIDC error server-side; client sees a generic message. + errhttp.Write(ctx, c, errcode.Unauthenticated("invalid SSO token", + errcode.WithReason(errcode.AuthInvalidToken), errcode.WithCause(err))) + return + } + natsJWT, err := h.signNATSJWT(req.NATSPublicKey, claims.PreferredUsername) + if err != nil { + errhttp.Write(ctx, c, fmt.Errorf("signing NATS token: %w", err)) // infra → 500 "internal" + return + } + c.JSON(http.StatusOK, authResponse{NATSJWT: natsJWT /* ... 
*/}) +} +``` + +`errhttp.Write` runs the *same* `Classify` as the NATS path — sets the HTTP status from `Code`, marshals the `{code, reason, error}` envelope, and logs the `cause` once. Frontend reads it with the identical `reason ?? code` contract. + +--- + +## Same return value, any transport + +Three services shown, but the pattern is service-wide — pick the row that matches your transport: + +| | room-service (raw core) | history-service (router) | auth-service (Gin) | +|---|---|---|---| +| Register | `nc.QueueSubscribe(subj, queue, h.natsUpdateRole)` | `natsrouter.Register(r, pat, s.GetMessageByID)` | `r.POST("/auth", h.HandleAuth)` | +| Handler sig | `func(m otelnats.Msg)` | `func(c *Context, req T) (*R, error)` | `func(c *gin.Context)` | +| Request-id | `wrappedCtx` → `RequireRequestID` (strict) | `RequestID()` middleware (auto) | `requestIDMiddleware` (resolve) | +| Reply on error | **you** call `errnats.Reply(ctx, m.Msg, err)` | router calls it **for you** | **you** call `errhttp.Write(ctx, c, err)` | +| **Business logic** | `return errcode.NotFound("...")` | `return errcode.NotFound("...")` | `errcode.NotFound("...")` | + +The thing you actually write — the `errcode.X(...)` value — is **identical** in every service. Only the adapter (`errnats.Reply` vs `errhttp.Write`) and the wiring differ. + +--- + +## Same envelope, wrapped for async jobs + +room-worker's two-phase result carries the **same `code`/`reason`/`error`** inside a job wrapper (`pkg/model.AsyncJobResult` → TS mirror): + +```ts +export interface AsyncJobResultEnvelope { + requestId: string + operation: string + status: 'ok' | 'error' + roomId?: string + error?: string // ┐ + code?: string // ├─ populated only when status === 'error' — same errcode fields + reason?: string // ┘ + timestamp: number +} +``` + +--- + +## Frontend — the `reason` is the cross-language API + +The string you stamp on the server is the *same* string the client keys off. 
One contract, two languages: + +| | Go — producer | TypeScript — consumer | +|---|---|---| +| Declare | `RoomNotOwner Reason = "not_room_owner"` | `REASON_COPY.not_room_owner` | +| Emit / read | `errcode.Forbidden(msg, WithReason(RoomNotOwner))` | `formatAsyncJobError(err)` | + +The transport parses the envelope into one typed error — the client mirror of `errcode.Error`: + +```ts +class AsyncJobError extends Error { + code?: ErrorCode // closed 8-value set: 'forbidden' | 'not_found' | … + reason?: string // open per-service: 'not_room_owner' | 'already_owner' | … +} +``` + +`reason` → friendly copy in **one** map — the only place user-facing English lives client-side: + +```ts +const REASON_COPY: Record<string, string> = { + not_room_owner: 'Only owners can do that.', + already_owner: 'That user is already an owner.', + last_owner_cannot_leave: "You're the last owner — promote someone else first.", + not_subscribed: 'You need to join this room first.', + // …one line per reason in the catalog +} +// reason → copy, else fall back to the server's message (never key off it) +formatAsyncJobError(err) // ≈ REASON_COPY[err.reason] ?? err.message +``` + +A component parses nothing — it catches, formats, and shows: + +```jsx +try { + await createRoom(nats, { name, users }) +} catch (err) { + setError(formatAsyncJobError(err)) // reason → friendly copy, automatic +} +``` + +> **Contract:** key off `reason ?? code`; only *display* `error`. An error with `code: "internal"` always carries the message `"internal error"` — the real cause never leaves the server. + +--- + +## Recap — what you write day to day + +1. **`return errcode.<Name>("msg")`** — pick the constructor name that matches the status. +2. Add **`WithReason(...)`** only when the frontend must key off the case. +3. Infra failure → **`return fmt.Errorf("doing X: %w", err)`** (becomes `internal`). +4. The **one** boundary line (`errnats.Reply` for NATS, `errhttp.Write` for Gin) is the router's job, or copied once per raw handler file. +5. 
Repeated errors → a **sentinel** in `helper.go` so `errors.Is` + `reason` stay consistent. + +> Everything else in the package (`Classify`, `Permanent`, `Parse`, `Marshal`) is boundary/worker/cross-site machinery you rarely touch. diff --git a/docs/error-handling.md b/docs/error-handling.md new file mode 100644 index 000000000..6bf063379 --- /dev/null +++ b/docs/error-handling.md @@ -0,0 +1,355 @@ +# Error Handling Guide + +How to produce client-facing errors in this codebase. The canonical source is +`pkg/errcode` (and its adapters `errnats` for NATS, `errhttp` for Gin); this +guide is a developer-facing walkthrough. + +For the client-side view of the wire envelope (what callers see and how to +branch), see `docs/client-api.md` §6. + +--- + +## 1. The contract + +Every client-facing error is an `*errcode.Error` that marshals to: + +```json +{ + "error": "<human-readable message>", + "code": "<generic category>", + "reason": "<domain reason, optional>", + "metadata": { "<key>": "<value>" } +} +``` + +- `code` is **always present** and drives HTTP status. +- `reason` is **optional**; declare it only when the frontend must distinguish + cases that the generic `code` cannot. +- `metadata` is **client-visible** structured detail (`map[string]string`). +- The cause attached via `WithCause` is **never serialized** — it is logged + server-side once by `Classify` and reachable via `Unwrap()`/`errors.Is`/`As`. 
+ +The eight generic categories and HTTP statuses: + +| Constant | Wire `code` | HTTP | +|--------------------------------|---------------------|------| +| `errcode.CodeBadRequest` | `bad_request` | 400 | +| `errcode.CodeUnauthenticated` | `unauthenticated` | 401 | +| `errcode.CodeForbidden` | `forbidden` | 403 | +| `errcode.CodeNotFound` | `not_found` | 404 | +| `errcode.CodeConflict` | `conflict` | 409 | +| `errcode.CodeTooManyRequests` | `too_many_requests` | 429 | +| `errcode.CodeUnavailable` | `unavailable` | 503 | +| `errcode.CodeInternal` | `internal` | 500 | + +`503 vs 429`: `unavailable` is server-wide saturation (admission control, +expand-timeout); `too_many_requests` is per-caller rate limiting / quota. + +--- + +## 2. Producing errors + +### The common case — a typed client error + +```go +return nil, errcode.BadRequest("name is required") +return nil, errcode.NotFound("room not found") +return nil, errcode.Forbidden("only owners can update roles") +return nil, errcode.Conflict("room is at maximum capacity", + errcode.WithReason(errcode.RoomMaxSizeReached)) +``` + +Use the **named constructor** (`BadRequest`, `Unauthenticated`, `Forbidden`, +`NotFound`, `Conflict`, `TooManyRequests`, `Unavailable`, `Internal`). There +are no `*f` variants on purpose — they would silently swallow trailing +`Option` args. For dynamic text, format the message at the call site: + +```go +return nil, errcode.BadRequest( + fmt.Sprintf("batch size %d exceeds limit %d", n, max)) +``` + +`errcode.New(code, msg, opts...)` is the escape hatch for a dynamically chosen +category; semgrep warns when you pass a literal `errcode.CodeX` to it +(prefer the named constructor in that case). 
+ +### Infra / DB / third-party errors + +Don't manually classify them — return the wrapped raw error and let `Classify` +collapse it to `internal`/"internal error" at the boundary (the real cause is +logged once, never sent): + +```go +if err := h.store.Find(ctx, id); err != nil { + return nil, fmt.Errorf("loading room: %w", err) // → client sees "internal error" +} +``` + +### Attaching a cause for server-side debugging + +```go +return nil, errcode.BadRequest("invalid ensure-room-key request", + errcode.WithCause(err)) +``` + +`WithCause` panics if `err` already contains an `*errcode.Error` — the +invariant is **one `*errcode.Error` per chain**, propagated via a single `%w`. +Never wrap a message body, token, or any secret into a cause; the cause is +included in the server log line. + +### Client-visible metadata + +```go +return nil, errcode.Conflict("room is at maximum capacity", + errcode.WithReason(errcode.RoomMaxSizeReached), + errcode.WithMetadata("limit", strconv.Itoa(max))) +``` + +`WithMetadata` is **client-visible** (ships in the envelope). For server-only +attributes — request_id, account, roomID — use `WithLogValues` (next section). +Mixing them up is a leak risk. + +--- + +## 3. Replying + +You never marshal the envelope yourself; the adapter does it (and logs once): + +| Transport | Adapter | +|----------------------|----------------------------------------------------| +| NATS sync reply | `errnats.Reply(ctx, msg, err)` | +| NATS already-logged | `errnats.ReplyQuiet(msg, err)` (panic backstop / `replyBusy`) | +| Gin HTTP | `errhttp.Write(ctx, c, err)` | + +Handlers registered via `pkg/natsrouter` are automatic: returning a typed +errcode error from the handler routes through `errnats.Reply`. JetStream +consumers / raw NATS handlers call `errnats.Reply` directly. + +--- + +## 3a. Request-ID policy: mint by default, reject on dedup-critical paths + +Every NATS and HTTP entry point in this repo enforces a rule on the inbound +`X-Request-ID` header. 
The repo runs **two** policies side by side: + +### Default — mint on missing/malformed + +Used by every entry point whose request ID is logging/tracing only — most +read paths, auth-service, gatekeeper validation reply, etc. + +- **Valid hyphenated UUID** (`idgen.IsValidUUID`) → pass through unchanged. +- **Missing** (header absent or empty) → silently mint a fresh UUIDv7 via + `idgen.GenerateRequestID`. No log line — this is the benign common case. +- **Malformed** (present but not a valid UUID) → mint a fresh UUIDv7 AND emit + a single `slog.Warn("minted request_id (inbound invalid)", ...)` carrying + the original inbound value, so a buggy client stays traceable. + +Chokepoint: `idgen.ResolveRequestID(inbound) (id, replaced bool)`. NATS +wrapper: `natsutil.StampRequestID(ctx, headers, subject) (ctx, id)`. HTTP: +auth-service `requestIDMiddleware` calls `idgen.ResolveRequestID` directly. +The `pkg/natsrouter` `RequestID()` middleware applies the default policy +automatically. + +### Strict — reject missing/malformed (dedup-critical paths) + +Some handlers in **room-service** and **room-worker** fan out to JetStream +publishes whose `Nats-Msg-Id` (via `natsutil.OutboxDedupID`, +`natsutil.CanonicalDedupID`, and the in-package `messageDedupSeed` helper) and +whose canonical message IDs (via `idgen.MessageIDFromRequestID`) are derived +from the request ID. A server-side mint there would break client-retry +deduplication: a client retrying without `X-Request-ID` (or with a malformed +value) would get a fresh server-minted ID each attempt, produce a different +dedup key each time, and silently duplicate outbox events and system +messages. + +These entry points use the **strict** helper instead: + +- **NATS**: `natsutil.RequireRequestID(ctx, headers, subject) (ctx, id, error)` + returns an `errcode.BadRequest` when the inbound header is missing or + malformed. The error flows through `errnats.Reply` as a normal envelope. 
+- **Strict callers today**: every handler in `room-service` (via the + `wrappedCtx` helper, which now returns an error) and + `room-worker.natsServerCreateDM` (sync DM endpoint). +- **The room-worker JetStream consume loop** keeps the default mint policy + defensively — by the time a message lands on the ROOMS stream, room-service + validated the header at publish time. The consume loop logs an `slog.Error` + if it ever has to mint, because that indicates an upstream contract + violation (and downstream dedup will be broken for that message). + +**Client contract**: any client calling room-service or room-worker MUST +send a stable `X-Request-ID` header (a valid hyphenated UUIDv4 or v7) and +reuse the same value across retries of the same logical operation. See +`docs/client-api.md` for the wire-level contract. + +Once stamped, `errcode.Classify(ctx, err)` and every `slog.…Context(ctx, ...)` +call automatically carries `request_id` — handlers never need to pass it +explicitly. + +## 4. Logging contract + +`errcode.Classify(ctx, err)` emits **exactly one** `slog` line per failed +request, at a **category-aware level**: + +- `internal`, `unavailable` → `ERROR` +- all expected client errors (`bad_request`, `unauthenticated`, `forbidden`, + `not_found`, `conflict`, `too_many_requests`) → `INFO` + +This keeps routine 4xx validation failures out of the ERROR stream so +error-rate alerting stays meaningful. **Handlers must not log-then-reply** — +the reply path logs. + +Attach domain context once at handler entry. The seam differs by handler style: + +- **natsrouter handler** (`*natsrouter.Context`): use the cycle-safe method + `c.WithLogValues("account", a, "roomID", r)`. +- **Gin or raw NATS** (plain `context.Context`): use the package func + `ctx = errcode.WithLogValues(ctx, "request_id", id, "account", a, ...)`. + +The `request_id`/`account`/`roomID` then appear in the centralized Classify +log line and any downstream slog usage in the chain. 
+ +> **Why two APIs?** `*natsrouter.Context` implements `context.Context` and +> delegates `Value(key)` lookups to an inner `ctx` field. Calling +> `errcode.WithLogValues(c, …)` would derive a new ctx whose parent is `c` — +> any subsequent `c.Value(otherKey)` would loop. The method (`c.WithLogValues`) +> derives from the inner field, avoiding the cycle. + +--- + +## 5. Adding a new `reason` + +Reasons are **per-service catalogs** in `pkg/errcode/codes_<service>.go` +(declared as `Reason` constants — never `errcode.Reason("...")` inline; semgrep +will reject it). + +1. Pick a `flat_snake_case` machine code (e.g. `bot_rate_limited`). +2. Add it to the right catalog: + ```go + // pkg/errcode/codes_room.go + RoomBotRateLimited Reason = "bot_rate_limited" + ``` +3. Add the constant to `allReasons` in `pkg/errcode/codes_test.go` (the + snake-case + uniqueness tests pick it up automatically). +4. Use it: `errcode.TooManyRequests("bot quota exceeded", + errcode.WithReason(errcode.RoomBotRateLimited))`. +5. Update `docs/client-api.md` §6 reason catalog AND the relevant endpoint + error table in the SAME PR (CLAUDE.md client-API rule). + +Only add a reason when the frontend genuinely needs to distinguish it from +other errors of the same category. Most cases are generic. + +--- + +## 6. Wrapping invariant — allowed vs forbidden + +**Invariant:** at most one `*errcode.Error` per error chain, propagated via a +single `%w`. + +**Allowed:** + +```go +return errcode.BadRequest("name is required") +return errcode.NotFound("x", errcode.WithReason(errcode.RoomNotMember)) +return errcode.Internal("x", errcode.WithCause(rawDBErr)) // RAW cause only +return fmt.Errorf("checking room: %w", typedErr) // typed survives +return typedErr // bare propagation +``` + +**Forbidden (semgrep flags the literal forms; `WithCause` on an errcode error also panics at runtime):** + +```go +return errcode.Internal("x", errcode.WithCause(anotherErrcodeErr)) // PANIC +return fmt.Errorf("%w and %w", errcodeA, errcodeB) // Classify picks one +``` + +--- + +## 7. 
Lint enforcement + +`.semgrep/errcode.yml` (wired into `make sast`) enforces: + +| Rule | Severity | What it catches | +|--------------------------------------------|----------|-----------------| +| `errcode-no-reason-literal-outside-catalog`| ERROR | Inline `errcode.Reason("...")` outside `codes_*.go` | +| `errcode-withcause-must-not-wrap-errcode` | ERROR | `errcode.WithCause(errcode.X(...))` literal | +| `errcode-no-multi-wrap-errcode` | ERROR | `fmt.Errorf("%w … %w")` mixing typed errors | +| `errcode-prefer-named-constructor` | WARNING | `errcode.New(errcode.CodeX, msg)` literal | + +CI runs `make sast` on every PR. + +--- + +## 8. Testing + +Use `pkg/errcode/errtest` to assert on a decoded reply payload: + +```go +import "github.com/hmchangw/chat/pkg/errcode/errtest" + +errtest.AssertCode(t, replyBytes, errcode.CodeNotFound) +errtest.AssertReason(t, replyBytes, errcode.RoomNotMember) +e := errtest.Decode(t, replyBytes) // for ad-hoc checks +``` + +For in-process matching on chained errors: + +```go +if errcode.HasReason(err, errcode.RoomNotMember) { /* … */ } +r := errcode.ReasonOf(err) // "" if no errcode error in chain +``` + +--- + +## 9. JetStream consumers — `errcode.Permanent` + +JetStream handlers face a different question than request/reply handlers: on +failure, do we **Ack** (drop the message) or **Nak** (let JetStream redeliver)? +The category alone can't answer it — an `Internal` from a deterministic bug +should drop, while a transient infra `Internal` should retry. The marker is +**explicit**: + +```go +if err := json.Unmarshal(data, &req); err != nil { + // Malformed payload: redelivery won't help. Ack via Permanent. + return errcode.Permanent(errcode.BadRequest("unmarshal X")) +} +// Transient infra failure: bare error → consumer Naks for redelivery. 
+if err := h.store.Save(ctx, &row); err != nil { + return fmt.Errorf("save row: %w", err) +} +``` + +The consume loop in `main.go` reads the marker: + +```go +if _, ok := errcode.IsPermanent(err); ok { + msg.Ack() // poison-pill drop; client already got the AsyncJobResult. + return +} +msg.Nak() // transient — retry. +``` + +`Permanent` wraps an `*errcode.Error` so `fillAsyncError` can still extract +`Code` / `Reason` for the `AsyncJobResult` envelope; the wrapper is invisible +to clients (it isn't serialized). `errors.Is(err, errcode.ErrPermanent)` is +the sentinel-style match if you don't need the wrapped `*Error`. + +**Don't** infer permanence from `Code`: an `Internal` can be either a poison- +pill (bad payload classified to internal by Classify) or a retryable +infra-down condition. Wrap explicitly at the call site. + +--- + +## 10. Migration history + +This package replaced four legacy patterns (all removed in `pkg/natsrouter` +cleanup): + +- `pkg/natsrouter`'s `RouteError` + `Err*` constructors + `Code*` consts +- `pkg/natsutil`'s `MarshalError` / `MarshalErrorWithCode` / `ReplyError` / + `TryParseError` +- `pkg/model.ErrorResponse` +- `auth-service`'s ad-hoc `gin.H{"error": ...}` + +See `docs/superpowers/specs/2026-05-28-centralized-error-codes-design.md` for +the design rationale and the per-service error contract. diff --git a/docs/superpowers/plans/2026-05-28-centralized-error-codes.md b/docs/superpowers/plans/2026-05-28-centralized-error-codes.md new file mode 100644 index 000000000..2de97b306 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-centralized-error-codes.md @@ -0,0 +1,2441 @@ +# Centralized Error Codes Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +> **Design spec:** `docs/superpowers/specs/2026-05-28-centralized-error-codes-design.md` — read it first for the contract, API surface, and locked decisions. This plan is the step-by-step implementation of that spec. + +**Goal:** Replace the four incompatible error-reply patterns across the repo with one transport-neutral `pkg/errcode` package that produces a single wire envelope `{error, code, reason?, metadata?}`, centralizes server-side logging, and makes cause-leakage and code/reason-mixing structurally impossible. + +**Architecture:** A core `pkg/errcode` package owns two distinct types — `Code` (closed set of 7 generic codes; the wire `code`) and `Reason` (open set of domain codes; the wire `reason`) — plus the `*Error` type (unexported, non-serializable `cause`), functional-option constructors, a `Classify` boundary that collapses unknown errors to `internal` and logs the full cause chain once, and a `Parse` helper for RPC clients. Two thin transport adapters — `pkg/errcode/errnats` and `pkg/errcode/errhttp` — marshal the envelope for NATS replies and Gin responses. Domain reasons live in `pkg/errcode/codes_<service>.go` as typed `Reason` constants. A `logctx` helper plus a `natsrouter.Context.WithLogValues` seam let handlers attach domain attributes to the logger without ctx-cycle hazards. + +**Tech Stack:** Go 1.25, `log/slog`, `github.com/nats-io/nats.go`, `github.com/gin-gonic/gin`, `go.uber.org/mock`, `stretchr/testify`, semgrep (custom local rules). + +--- + +## Design Decisions (locked during brainstorming + spec review) + +Do not relitigate these while executing. If a task seems to contradict one, stop and ask. + +1. **Wire envelope:** `{"error": "<message>", "code": "<category>", "reason": "<reason>"?, "metadata": {}?}`. `error` = human message (existing field name). `code` = one of 7 generic categories, always present. `reason` = optional specific machine code the frontend switches on. `metadata` = optional `map[string]string`. +2. 
**Two distinct Go types:** `type Code string` (the 7 generics) and `type Reason string` (domain codes). The compiler rejects `New(SomeReason, …)` and `WithReason(SomeCode)`. This is the type-safety guarantee chosen over a single `Code` type. +3. **Generic categories live in core; domain reasons live in `pkg/errcode/codes_.go`** as `Reason` constants (importable across `package main` services, compiler-unique, single catalog). +4. **Infra/DB/third-party errors always collapse to `internal`** with message `"internal error"`, done automatically by `Classify`. +5. **Cause never leaks:** the `cause` field is unexported → `encoding/json` cannot serialize it. Reachable only via `Unwrap()` for logging / `errors.Is`/`As`. +6. **Centralized, level-aware logging:** `Classify` emits exactly one `slog` line (request_id + domain attrs from ctx + code + reason + full cause chain). The level is **category-aware**: `internal`/`unavailable` → ERROR; all expected client errors (`bad_request`/`unauthenticated`/`forbidden`/`not_found`/`conflict`) → INFO. This keeps routine validation failures out of the ERROR stream so error-rate alerting stays meaningful. Handlers do NOT log-then-reply. +7. **One way to format:** named constructors (`errcode.BadRequest(msg, opts...)`) are the entire constructor API; there are **no `*f` variants** (they silently dropped options — a `Conflictf("…%s", id, WithReason(r))` would pass the option as a format arg and lose the reason). For dynamic text use `errcode.BadRequest(fmt.Sprintf(...), opts...)`. Fixed strings never go through `Sprintf`, so literal `%` is safe. +8. **Footgun guard:** `WithCause` panics if the cause already carries an `*errcode.Error`. semgrep flags the literal form at lint time. Invariant: **at most one `*errcode.Error` per chain via single-`%w` propagation.** (Multi-`%w` `fmt.Errorf("%w … %w", ecA, ecB)` can defeat this — `Classify` then picks the first in traversal order; semgrep rule `errcode-no-multi-wrap-errcode` flags it.) +9. 
**Options carry a trust boundary:** `WithMetadata` is **client-visible** (ships in the envelope); `WithLogValues` is **server-only** (never serialized). Never put server-internal detail in `WithMetadata`, never put client data you wouldn't log in `WithLogValues`. +10. **DM-already-exists is reclassified as a success response** (returns the existing room ID via `model.CreateRoomReply`), not an error. +11. **New generic category `unauthenticated` (HTTP 401)** beyond the original 6 — `auth-service` needs 401-vs-403. **PM gate before Chapter 16.** +12. **Migration uses shims, not mid-plan deletion.** `natsrouter.Err*`/`RouteError` are converted to thin delegating shims in Chapter 10 and deleted only in Chapter 17. The shims keep *production* callers compiling; natsrouter's own tests and ~40 cross-service `.Code`-as-string test assertions are migrated **in the same chapter that introduces the change** (see Ch 10) so each commit is green and bisectable. + +### Open items to confirm + +- **PM gate (Ch 16):** confirm the `unauthenticated` (401) category. If rejected, `auth-service` token errors fold into `forbidden` (403). +- **`unavailable` HTTP mapping:** currently 503. Admission-control "service busy" is arguably **429**. This plan keeps 503 (the NATS services don't care; only matters if admission surfaces over HTTP). Revisit if/when an HTTP service needs rate-limit semantics. +- **Double-logging is intentional but level-managed:** a failed request emits the `Logging()` middleware access line (`"nats request"`, info) AND the `Classify` line (`"request failed"`). These serve different purposes (access vs error log). With the category-aware level (Decision 6), a routine 4xx produces two INFO-ish lines, not an ERROR — so this does not pollute error alerting. Already-logged transport paths (panic backstop, `replyBusy`) use a **non-logging** marshal (`errnats.MarshalQuiet`) so they don't emit a second redundant `Classify` line; see Ch 8 / Ch 10. 
+ +--- + +## File Structure + +**New (core):** `pkg/errcode/{category.go, reason.go, error.go, options.go, classify.go, parse.go, match.go, logctx.go, doc.go, codes_room.go, codes_message.go, codes_search.go, codes_auth.go}` + `*_test.go`. +**New (adapters):** `pkg/errcode/errnats/{reply.go,reply_test.go}`, `pkg/errcode/errhttp/{write.go,write_test.go}`. +**New (test helper):** `pkg/errcode/errtest/{assert.go,assert_test.go}` (decode-and-assert helper for the ~60 migrated test sites). +**New (lint/docs):** `.semgrep/errcode.yml`, `docs/error-handling.md`. + +**Modified (foundation):** `pkg/natsrouter/{errors.go (shim then delete), register.go, router.go, context.go, middleware.go}`, `pkg/model/{event.go, error.go}`, `pkg/natsutil/reply.go`, `Makefile`. + +**Modified (migrations):** `history-service/*`, `search-service/*` (incl. `metrics.go`), `mock-user-service/*`, `message-gatekeeper/*` (incl. `fetcher_history.go`), `room-service/*` (incl. `memberlist_client.go`), `room-worker/*`, `auth-service/*`, `docs/client-api.md`, `chat-frontend/*`. + +--- + +## Chapter 0 — Core types: `Code` and `Reason` + +### Task 0.1: `Code` with `HTTPStatus` + +**Files:** Create `pkg/errcode/category.go`; Test `pkg/errcode/category_test.go`. + +- [ ] **Step 1: Failing test** + +```go +package errcode + +import "testing" + +func TestCode_HTTPStatus(t *testing.T) { + cases := map[Code]int{ + CodeBadRequest: 400, + CodeUnauthenticated: 401, + CodeForbidden: 403, + CodeNotFound: 404, + CodeConflict: 409, + CodeUnavailable: 503, + CodeInternal: 500, + Code("weird"): 500, + } + for c, want := range cases { + if got := c.HTTPStatus(); got != want { + t.Errorf("%s.HTTPStatus() = %d, want %d", c, got, want) + } + } +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `go test ./pkg/errcode/ -run TestCode -v` → `undefined: CodeBadRequest`. + +- [ ] **Step 3: Implement** + +```go +// Package errcode is the single source of client-facing error envelopes for +// every transport in the chat system. 
See doc.go for the wrapping invariant +// and leak guarantee. +package errcode + +// Code is the closed set of generic error classifications. It is the wire +// `code` field and drives HTTP status. Only the constants below are valid. +type Code string + +const ( + CodeBadRequest Code = "bad_request" + CodeUnauthenticated Code = "unauthenticated" + CodeForbidden Code = "forbidden" + CodeNotFound Code = "not_found" + CodeConflict Code = "conflict" + CodeUnavailable Code = "unavailable" + CodeInternal Code = "internal" +) + +// HTTPStatus maps a category to its HTTP status. Unknown values map to 500 so +// a misclassification never leaks as a 2xx. +func (c Code) HTTPStatus() int { + switch c { + case CodeBadRequest: + return 400 + case CodeUnauthenticated: + return 401 + case CodeForbidden: + return 403 + case CodeNotFound: + return 404 + case CodeConflict: + return 409 + case CodeUnavailable: + return 503 + default: + return 500 + } +} +``` + +- [ ] **Step 4: Run, expect PASS.** +- [ ] **Step 5: Commit** — `git add pkg/errcode/category.go pkg/errcode/category_test.go && git commit -m "feat(errcode): add Code type with HTTP status mapping"` + +### Task 0.2: `Reason` type + +**Files:** Create `pkg/errcode/reason.go`; Test `pkg/errcode/reason_test.go`. + +- [ ] **Step 1: Failing test** + +```go +package errcode + +import "testing" + +func TestReason_IsString(t *testing.T) { + var r Reason = "max_room_size_reached" + if string(r) != "max_room_size_reached" { + t.Fatal("Reason must be a string-backed type") + } +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `undefined: Reason`. +- [ ] **Step 3: Implement** + +```go +package errcode + +// Reason is an open set of domain-specific machine codes the frontend switches +// on. It is the wire `reason` field. Concrete reasons are declared as typed +// constants in codes_.go. Reason is deliberately distinct from +// Code so the compiler rejects passing one where the other is expected. 
// Reason is an open set of domain-specific machine codes the frontend switches
// on. It is the wire `reason` field. Concrete reasons are declared as typed
// constants in the per-service catalog files codes_<service>.go (codes_room.go,
// codes_message.go, codes_search.go, codes_auth.go). Reason is deliberately
// distinct from Code so the compiler rejects passing one where the other is
// expected.
type Reason string
(&Error{Code: CodeNotFound}).HTTPStatus() != 404 { + t.Fatal("HTTPStatus should delegate to Code.HTTPStatus") + } +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `undefined: Error`. +- [ ] **Step 3: Implement** + +```go +package errcode + +// Error is the canonical client-facing error. It marshals to the wire envelope +// {error, code, reason?, metadata?}. cause is UNEXPORTED and therefore cannot +// be serialized by encoding/json — it exists only for server-side logging and +// errors.Is/As traversal. See doc.go. +type Error struct { + Code Code `json:"code"` + Reason Reason `json:"reason,omitempty"` + Message string `json:"error"` + Metadata map[string]string `json:"metadata,omitempty"` + cause error +} + +// Error returns ONLY the user-safe message, never the cause. +func (e *Error) Error() string { return e.Message } + +// Unwrap exposes the wrapped cause for errors.Is/As and logging. JSON +// marshalling does not call Unwrap, so this does not leak the cause to clients. +func (e *Error) Unwrap() error { return e.cause } + +// HTTPStatus returns the HTTP status for this error's category. +func (e *Error) HTTPStatus() int { return e.Code.HTTPStatus() } +``` + +- [ ] **Step 4: Run, expect PASS.** +- [ ] **Step 5: Commit** — `git add pkg/errcode/error.go pkg/errcode/error_test.go && git commit -m "feat(errcode): add Error type with unexported, non-serializable cause"` + +--- + +## Chapter 2 — Core: `logctx` + +### Task 2.1: logger-in-context helpers + +**Files:** Create `pkg/errcode/logctx.go`; Test `pkg/errcode/logctx_test.go`. 
// loggerCtxKey is the private context key under which the request logger is
// stored; being an unexported type, it cannot collide with keys from other
// packages.
type loggerCtxKey struct{}

// WithLogger returns a child of ctx that carries l as its logger. It is mainly
// useful in tests that want to capture log output.
func WithLogger(ctx context.Context, l *slog.Logger) context.Context {
	key := loggerCtxKey{}
	return context.WithValue(ctx, key, l)
}

// WithLogValues returns a child of ctx whose logger is the current context
// logger extended with the given key/value pairs (same argument convention as
// slog.Logger.With). Repeated calls accumulate attributes.
func WithLogValues(ctx context.Context, args ...any) context.Context {
	enriched := loggerFrom(ctx).With(args...)
	return WithLogger(ctx, enriched)
}

// loggerFrom extracts the logger stored in ctx. It never returns nil: when no
// logger was stored, or a nil logger was stored, slog.Default() is returned.
func loggerFrom(ctx context.Context) *slog.Logger {
	// A failed type assertion leaves stored nil, which is handled the same way
	// as an explicitly stored nil logger.
	stored, _ := ctx.Value(loggerCtxKey{}).(*slog.Logger)
	if stored == nil {
		return slog.Default()
	}
	return stored
}
+ +- [ ] **Step 1: Failing test** + +```go +package errcode + +import ( + "errors" + "fmt" + "testing" +) + +func TestNamedConstructors(t *testing.T) { + if e := BadRequest("name is required"); e.Code != CodeBadRequest || e.Message != "name is required" { + t.Fatalf("BadRequest: %+v", e) + } + if e := NotFound("gone"); e.Code != CodeNotFound { + t.Fatal("NotFound") + } + for _, e := range []*Error{ + Unauthenticated("x"), Forbidden("x"), Conflict("x"), Unavailable("x"), Internal("x"), + } { + if e.Message != "x" { + t.Fatalf("constructor message: %+v", e) + } + } +} + +func TestConstructorDoesNotFormat_LiteralPercentIsSafe(t *testing.T) { + if got := BadRequest("100% full").Message; got != "100% full" { + t.Fatalf("constructor must not format: %q", got) + } +} + +func TestFormattingPlusOptionUsesSprintfAtCallSite(t *testing.T) { + // The supported pattern for dynamic text + a reason: caller formats, options stay first-class. + e := Conflict(fmt.Sprintf("room %s is full", "r1"), WithReason("max_room_size_reached")) + if e.Message != "room r1 is full" || e.Reason != "max_room_size_reached" { + t.Fatalf("got %+v", e) + } +} + +func TestWithReason(t *testing.T) { + e := BadRequest("room full", WithReason("max_room_size_reached")) + if e.Reason != "max_room_size_reached" { + t.Fatalf("reason = %q", e.Reason) + } +} + +func TestWithMetadata_Pairs(t *testing.T) { + e := Conflict("dm exists", WithMetadata("roomId", "r1", "kind", "dm")) + if e.Metadata["roomId"] != "r1" || e.Metadata["kind"] != "dm" { + t.Fatalf("meta = %v", e.Metadata) + } +} + +func TestWithMetadata_OddArgsPanics(t *testing.T) { + defer func() { + if recover() == nil { + t.Fatal("odd WithMetadata args must panic") + } + }() + BadRequest("x", WithMetadata("lonely")) +} + +func TestWithCause_RawError(t *testing.T) { + root := errors.New("mongo down") + if e := Internal("internal error", WithCause(root)); !errors.Is(e, root) { + t.Fatal("cause not attached") + } +} + +func 
TestWithCause_PanicsOnNestedErrcode(t *testing.T) { + inner := NotFound("room not found") + defer func() { + if recover() == nil { + t.Fatal("WithCause(errcode.Error) must panic — invariant: one *Error per chain") + } + }() + Internal("x", WithCause(inner)) +} + +func TestWithCause_PanicsOnWrappedNestedErrcode(t *testing.T) { + inner := NotFound("room not found") + wrapped := fmt.Errorf("ctx: %w", inner) + defer func() { + if recover() == nil { + t.Fatal("WithCause must detect *Error even when wrapped") + } + }() + Internal("x", WithCause(wrapped)) +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `undefined: BadRequest`. +- [ ] **Step 3: Implement** + +```go +package errcode + +import "errors" + +// Option configures an *Error during construction. +type Option func(*Error) + +// New builds an *Error with a generic category and message, applying options. +// Prefer the named constructors below; use New only for a dynamically chosen +// category. +func New(code Code, message string, opts ...Option) *Error { + e := &Error{Code: code, Message: message} + for _, opt := range opts { + opt(e) + } + return e +} + +// Named constructors are the entire constructor API: one per category, each +// taking a fixed message and options. They never format — for dynamic text the +// caller passes fmt.Sprintf(...) as msg, keeping options first-class. There are +// deliberately no *f variants (they would swallow trailing Option args). +func BadRequest(msg string, opts ...Option) *Error { return New(CodeBadRequest, msg, opts...) } +func Unauthenticated(msg string, opts ...Option) *Error { return New(CodeUnauthenticated, msg, opts...) } +func Forbidden(msg string, opts ...Option) *Error { return New(CodeForbidden, msg, opts...) } +func NotFound(msg string, opts ...Option) *Error { return New(CodeNotFound, msg, opts...) } +func Conflict(msg string, opts ...Option) *Error { return New(CodeConflict, msg, opts...) 
} +func Unavailable(msg string, opts ...Option) *Error { return New(CodeUnavailable, msg, opts...) } +func Internal(msg string, opts ...Option) *Error { return New(CodeInternal, msg, opts...) } + +// WithReason attaches the specific machine code the frontend switches on. +// Accepts only Reason — the compiler rejects a Code here. +func WithReason(r Reason) Option { return func(e *Error) { e.Reason = r } } + +// WithMetadata attaches CLIENT-VISIBLE string key/value metadata (it ships in +// the wire envelope). Never put server-internal detail here — use WithLogValues +// for that. Args must be even; an odd count is a programmer error and panics. +func WithMetadata(kv ...string) Option { + return func(e *Error) { + if len(kv)%2 != 0 { + panic("errcode: WithMetadata requires an even number of args (key/value pairs)") + } + if e.Metadata == nil { + e.Metadata = make(map[string]string, len(kv)/2) + } + for i := 0; i < len(kv); i += 2 { + e.Metadata[kv[i]] = kv[i+1] + } + } +} + +// WithCause attaches an underlying error for server-side logging. The cause is +// NEVER serialized. It PANICS if the cause already carries an *errcode.Error, +// preserving the "at most one *Error per chain" invariant. Pass only raw +// infra/third-party errors. See doc.go. +func WithCause(err error) Option { + return func(e *Error) { + var nested *Error + if errors.As(err, &nested) { + panic("errcode: WithCause must not wrap another *errcode.Error; " + + `propagate it with "return err" or fmt.Errorf("...: %w", err) instead`) + } + e.cause = err + } +} +``` + +- [ ] **Step 4: Run, expect PASS.** +- [ ] **Step 5: Commit** — `git add pkg/errcode/options.go pkg/errcode/options_test.go && git commit -m "feat(errcode): add constructors and options with WithCause panic guard"` + +--- + +## Chapter 4 — Core: `Classify` + +### Task 4.1: boundary classifier + centralized log + +**Files:** Create `pkg/errcode/classify.go`; Test `pkg/errcode/classify_test.go`. 
+ +- [ ] **Step 1: Failing test** + +```go +package errcode + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "strings" + "testing" +) + +func newCapture() (context.Context, *bytes.Buffer) { + var buf bytes.Buffer + l := slog.New(slog.NewJSONHandler(&buf, &slog.HandlerOptions{Level: slog.LevelInfo})) + return WithLogger(context.Background(), l), &buf +} + +func TestClassify_NilReturnsNil(t *testing.T) { + ctx, _ := newCapture() + if Classify(ctx, nil) != nil { + t.Fatal("nil → nil") + } +} + +func TestClassify_UnknownBecomesInternalAndLogsCause(t *testing.T) { + ctx, buf := newCapture() + raw := fmt.Errorf("load room: %w", errors.New("mongo: connection refused 10.0.0.5")) + e := Classify(ctx, raw) + if e.Code != CodeInternal || e.Message != "internal error" { + t.Fatalf("got %+v", e) + } + if !strings.Contains(buf.String(), "mongo: connection refused") { + t.Fatalf("cause not logged: %s", buf.String()) + } + b, _ := json.Marshal(e) + if strings.Contains(string(b), "mongo") { + t.Fatalf("cause leaked into reply: %s", b) + } +} + +func TestClassify_TypedErrorPreservedThroughWrapping(t *testing.T) { + ctx, _ := newCapture() + typed := NotFound("room not found", WithReason("room_not_found")) + e := Classify(ctx, fmt.Errorf("checking room: %w", typed)) + if e.Code != CodeNotFound || e.Reason != "room_not_found" { + t.Fatalf("typed lost: %+v", e) + } +} + +func TestClassify_LogsCtxValues(t *testing.T) { + ctx, buf := newCapture() + ctx = WithLogValues(ctx, "request_id", "req-123", "account", "alice") + Classify(ctx, errors.New("boom")) + if l := buf.String(); !strings.Contains(l, "req-123") || !strings.Contains(l, "alice") { + t.Fatalf("ctx values missing: %s", l) + } +} + +func TestClassify_LevelIsCategoryAware(t *testing.T) { + level := func(err error) string { + ctx, buf := newCapture() + Classify(ctx, err) + var line map[string]any + _ = json.Unmarshal(buf.Bytes(), &line) + return line["level"].(string) + } + // Expected client 
errors must NOT log at ERROR (would pollute alerting). + if got := level(BadRequest("name is required")); got != "INFO" { + t.Fatalf("4xx level = %s, want INFO", got) + } + if got := level(NotFound("gone")); got != "INFO" { + t.Fatalf("not_found level = %s, want INFO", got) + } + // Server/infra errors log at ERROR. + if got := level(errors.New("mongo down")); got != "ERROR" { + t.Fatalf("internal level = %s, want ERROR", got) + } + if got := level(Unavailable("service busy")); got != "ERROR" { + t.Fatalf("unavailable level = %s, want ERROR", got) + } +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `undefined: Classify`. +- [ ] **Step 3: Implement** + +```go +package errcode + +import ( + "context" + "errors" + "log/slog" +) + +// Classify converts any error into a client-safe *Error and logs it exactly +// once on the server side. It is the single boundary every transport adapter +// calls before replying. +// +// - nil → nil. +// - *errcode.Error in the chain (via errors.As) → that error. +// - anything else → Internal "internal error", original chain as cause. +// +// The log line carries request_id and domain attrs stashed via WithLogValues, +// plus the full cause chain, and its LEVEL is category-aware: server faults +// (internal/unavailable) at ERROR, expected client errors at INFO — so routine +// 4xx validation failures don't pollute the ERROR stream / break alerting. +// The cause is never part of the returned *Error's serialized form. The cause +// chain is logged via err.Error(); callers MUST NOT wrap raw message bodies or +// tokens into a cause (see doc.go logging contract). 
+func Classify(ctx context.Context, err error) *Error { + if err == nil { + return nil + } + var e *Error + if !errors.As(err, &e) { + e = &Error{Code: CodeInternal, Message: "internal error", cause: err} + } + loggerFrom(ctx).Log(ctx, e.logLevel(), "request failed", + "code", string(e.Code), + "reason", string(e.Reason), + "cause", err.Error(), + ) + return e +} + +// logLevel maps a category to a server-log level: server faults are ERROR, +// expected client errors are INFO. +func (e *Error) logLevel() slog.Level { + switch e.Code { + case CodeInternal, CodeUnavailable: + return slog.LevelError + default: + return slog.LevelInfo + } +} +``` + +- [ ] **Step 4: Run, expect PASS.** +- [ ] **Step 5: Commit** — `git add pkg/errcode/classify.go pkg/errcode/classify_test.go && git commit -m "feat(errcode): add Classify boundary with centralized logging"` + +--- + +## Chapter 5 — Core: `Parse` (for RPC clients) + +Needed by `message-gatekeeper/fetcher_history.go` and `room-service/memberlist_client.go`, which decode *remote* error replies. Replaces `natsutil.TryParseError`. + +### Task 5.1: `Parse` + +**Files:** Create `pkg/errcode/parse.go`; Test `pkg/errcode/parse_test.go`. + +- [ ] **Step 1: Failing test** + +```go +package errcode + +import "testing" + +func TestParse_ErrorEnvelope(t *testing.T) { + e, ok := Parse([]byte(`{"code":"forbidden","reason":"not_room_member","error":"only room members can list members"}`)) + if !ok || e.Code != CodeForbidden || e.Reason != "not_room_member" { + t.Fatalf("parse failed: %+v ok=%v", e, ok) + } +} + +func TestParse_NonErrorJSON(t *testing.T) { + if _, ok := Parse([]byte(`{"roomId":"r1","status":"accepted"}`)); ok { + t.Fatal("payload without non-empty error must not parse as error") + } +} + +func TestParse_Malformed(t *testing.T) { + if _, ok := Parse([]byte(`not json`)); ok { + t.Fatal("malformed must not parse") + } +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `undefined: Parse`. 
+- [ ] **Step 3: Implement** + +```go +package errcode + +import "encoding/json" + +// Parse decodes a reply payload into an *Error iff it is an error envelope +// (non-empty "error" field). Used by RPC clients to detect remote failures and +// branch on code/reason. Returns (nil, false) for success payloads or garbage. +func Parse(data []byte) (*Error, bool) { + var e Error + if err := json.Unmarshal(data, &e); err != nil || e.Message == "" { + return nil, false + } + return &e, true +} +``` + +- [ ] **Step 4: Run, expect PASS.** +- [ ] **Step 5: Commit** — `git add pkg/errcode/parse.go pkg/errcode/parse_test.go && git commit -m "feat(errcode): add Parse for RPC-client error detection"` + +### Task 5.2: in-process reason matching (`match.go`) + +In-process callers (e.g. `room-service/memberlist_client.go` Ch 14.2) need to branch on the reason of an error without hand-rolling `errors.As`. Provide one helper each for the value and the boolean test. + +**Files:** Create `pkg/errcode/match.go`, `match_test.go`. + +- [ ] **Step 1: Failing test** +```go +package errcode + +import ( + "errors" + "fmt" + "testing" +) + +func TestReasonOf(t *testing.T) { + err := fmt.Errorf("ctx: %w", NotFound("x", WithReason(RoomNotMember))) + if ReasonOf(err) != RoomNotMember { + t.Fatalf("ReasonOf = %q", ReasonOf(err)) + } + if ReasonOf(errors.New("plain")) != "" { + t.Fatal("non-errcode error must yield empty reason") + } +} + +func TestHasReason(t *testing.T) { + if !HasReason(NotFound("x", WithReason(RoomNotMember)), RoomNotMember) { + t.Fatal("HasReason should match") + } + if HasReason(NotFound("x"), RoomNotMember) { + t.Fatal("HasReason must not match an absent reason") + } +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `undefined: ReasonOf`. +- [ ] **Step 3: Implement** +```go +package errcode + +import "errors" + +// ReasonOf returns the Reason of the first *Error in err's chain, or "" if +// there is none. 
+func ReasonOf(err error) Reason { + var e *Error + if errors.As(err, &e) { + return e.Reason + } + return "" +} + +// HasReason reports whether err's chain carries an *Error with reason r. +func HasReason(err error, r Reason) bool { return ReasonOf(err) == r } +``` + +- [ ] **Step 4: Run, expect PASS.** +- [ ] **Step 5: Commit** — `git add pkg/errcode/match.go pkg/errcode/match_test.go && git commit -m "feat(errcode): add ReasonOf/HasReason matchers"` + +### Task 5.3: test helper (`errtest`) + +~60 test sites across services migrate from `RouteError.Code == "..."` to decoding the reply envelope. A tiny shared helper avoids hand-rolling JSON-decode-and-assert in each, and keeps the migration chapters mechanical. + +**Files:** Create `pkg/errcode/errtest/assert.go`, `assert_test.go`. + +- [ ] **Step 1: Failing test** +```go +package errtest + +import ( + "encoding/json" + "testing" + + "github.com/hmchangw/chat/pkg/errcode" +) + +func TestAssertEnvelope(t *testing.T) { + data, _ := json.Marshal(errcode.NotFound("room not found", errcode.WithReason(errcode.RoomNotMember))) + AssertCode(t, data, errcode.CodeNotFound) + AssertReason(t, data, errcode.RoomNotMember) +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `undefined: AssertCode`. +- [ ] **Step 3: Implement** (helpers call `t.Helper()`; live in a non-`_test.go` file so other packages' tests can import them — allowed by CLAUDE.md "shared test utilities used by multiple packages may live in a dedicated package", and this package is import-only-by-tests): +```go +// Package errtest provides assertions for errcode wire envelopes in tests. +package errtest + +import ( + "testing" + + "github.com/hmchangw/chat/pkg/errcode" +) + +// Decode parses an error envelope from a reply payload, failing the test if it +// is not one. 
+func Decode(t *testing.T, data []byte) *errcode.Error { + t.Helper() + e, ok := errcode.Parse(data) + if !ok { + t.Fatalf("payload is not an error envelope: %s", data) + } + return e +} + +// AssertCode fails unless data is an error envelope with the given code. +func AssertCode(t *testing.T, data []byte, want errcode.Code) { + t.Helper() + if got := Decode(t, data).Code; got != want { + t.Fatalf("code = %q, want %q (payload %s)", got, want, data) + } +} + +// AssertReason fails unless data is an error envelope with the given reason. +func AssertReason(t *testing.T, data []byte, want errcode.Reason) { + t.Helper() + if got := Decode(t, data).Reason; got != want { + t.Fatalf("reason = %q, want %q (payload %s)", got, want, data) + } +} +``` + +- [ ] **Step 4: Run, expect PASS.** Migration chapters (11–16) SHOULD use `errtest.AssertCode`/`AssertReason` instead of bespoke decoding. +- [ ] **Step 5: Commit** — `git add pkg/errcode/errtest/ && git commit -m "feat(errcode/errtest): add envelope assertion helpers for tests"` + +--- + +## Chapter 6 — Core: `doc.go` + +### Task 6.1: package documentation + +**Files:** Create `pkg/errcode/doc.go`. + +- [ ] **Step 1: Write the doc** + +```go +// Package errcode is the single source of client-facing error envelopes for +// every transport (NATS request/reply, JetStream replies, Gin HTTP). +// +// # Wire envelope +// +// {"error":"","code":"","reason":""?,"metadata":{…}?} +// +// - error — human-readable, user-safe message. +// - code — one Code (bad_request, unauthenticated, forbidden, +// not_found, conflict, unavailable, internal). Always present. +// - reason — optional Reason (domain code, e.g. "max_room_size_reached"), +// declared in codes_.go. Frontend logic: trigger = reason ?? code. +// - metadata — optional map[string]string for structured detail. 
+// +// # Two types, by design +// +// Code (the 7 generics) and Reason (open domain set) are distinct types so +// the compiler rejects New(SomeReason, …) and WithReason(SomeCode). +// +// # Leak guarantee +// +// Error.cause is unexported; encoding/json cannot serialize it. The cause is +// reachable only server-side via Unwrap()/errors.Is/As and is logged exactly +// once by Classify. +// +// # Wrapping invariant: at most one *errcode.Error per chain +// +// Allowed: +// +// return errcode.BadRequest("name is required") +// return errcode.NotFound("x", errcode.WithReason(RoomNotMember)) +// return errcode.Internal("x", errcode.WithCause(rawDBErr)) // RAW cause only +// return fmt.Errorf("checking room: %w", typedErr) // typed survives +// return typedErr +// +// Forbidden (WithCause panics; semgrep-flagged): +// +// return errcode.Internal("x", errcode.WithCause(anotherErrcodeErr)) +// +// Also forbidden (defeats the invariant; semgrep-flagged): putting two errcode +// errors in one chain via multi-verb fmt.Errorf — +// +// return fmt.Errorf("%w and %w", errcodeA, errcodeB) // Classify picks the first +// +// Propagate with a single %w only. +// +// # Logging +// +// Classify logs each error exactly once, at a category-aware level (server +// faults ERROR, expected client errors INFO). Handlers must NOT log-then-reply. +// +// Attach domain context once at handler entry, choosing by handler style: +// - natsrouter handler (has *Context): c.WithLogValues("account", a) +// - Gin / raw NATS (has context.Context): ctx = errcode.WithLogValues(ctx, …) +// +// Never call the package func errcode.WithLogValues with a *natsrouter.Context +// as parent — use the method, which derives from the inner ctx and avoids the +// Value-delegation cycle. +// +// Trust boundary: WithLogValues attributes are SERVER-ONLY (never serialized); +// WithMetadata is CLIENT-VISIBLE (ships in the envelope). 
A cause attached via +// WithCause is logged through err.Error() — never wrap raw message bodies, +// tokens, or secrets into a cause, or the central log becomes a leak vector. +package errcode +``` + +- [ ] **Step 2: Run** `go vet ./pkg/errcode/` → clean. +- [ ] **Step 3: Commit** — `git add pkg/errcode/doc.go && git commit -m "docs(errcode): document envelope, type split, leak guarantee, invariant"` + +--- + +## Chapter 7 — Domain reason catalogs + +### Task 7.1: per-service `Reason` catalogs + +**Files:** Create `pkg/errcode/codes_room.go`, `codes_message.go`, `codes_search.go`, `codes_auth.go`; Test `pkg/errcode/codes_test.go`. + +- [ ] **Step 1: Failing test (uniqueness + snake_case)** + +```go +package errcode + +import ( + "regexp" + "testing" +) + +var allReasons = []Reason{ + RoomMaxSizeReached, RoomDMAlreadyExists, RoomNotMember, RoomNotOwner, + RoomLastOwnerCannotLeave, RoomBotInChannel, RoomBotNotAvailable, + MessageLargeRoomPostRestricted, MessageNotSubscribed, + AuthTokenExpired, AuthInvalidToken, +} + +func TestReasons_SnakeCase(t *testing.T) { + re := regexp.MustCompile(`^[a-z][a-z0-9_]*[a-z0-9]$`) + for _, r := range allReasons { + if !re.MatchString(string(r)) { + t.Errorf("reason %q is not flat snake_case", r) + } + } +} + +func TestReasons_Unique(t *testing.T) { + seen := map[Reason]bool{} + for _, r := range allReasons { + if seen[r] { + t.Errorf("duplicate reason: %q", r) + } + seen[r] = true + } +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `undefined: RoomMaxSizeReached`. +- [ ] **Step 3: Create catalogs** + +`pkg/errcode/codes_room.go`: +```go +package errcode + +// Reasons emitted by room-service and room-worker. 
+const ( + RoomMaxSizeReached Reason = "max_room_size_reached" + RoomDMAlreadyExists Reason = "dm_already_exists" + RoomNotMember Reason = "not_room_member" + RoomNotOwner Reason = "not_room_owner" + RoomLastOwnerCannotLeave Reason = "last_owner_cannot_leave" + RoomBotInChannel Reason = "bot_in_channel" + RoomBotNotAvailable Reason = "bot_not_available" +) +``` + +`pkg/errcode/codes_message.go`: +```go +package errcode + +// Reasons emitted by message-gatekeeper. +const ( + MessageLargeRoomPostRestricted Reason = "large_room_post_restricted" + MessageNotSubscribed Reason = "not_subscribed" +) +``` + +`pkg/errcode/codes_search.go`: +```go +package errcode + +// Reasons emitted by search-service. None require frontend branching today; +// this file is the per-service home for future search reasons. +``` + +`pkg/errcode/codes_auth.go`: +```go +package errcode + +// Reasons emitted by auth-service. +const ( + AuthTokenExpired Reason = "sso_token_expired" + AuthInvalidToken Reason = "invalid_sso_token" +) +``` + +- [ ] **Step 4: Run** `go test ./pkg/errcode/ -run TestReasons -v` → PASS. +- [ ] **Step 5: Full package gate** — `make test SERVICE=pkg/errcode && go vet ./pkg/errcode/...` → PASS. +- [ ] **Step 6: Commit** — `git add pkg/errcode/codes_*.go pkg/errcode/codes_test.go && git commit -m "feat(errcode): add per-service Reason catalogs"` + +--- + +## Chapter 8 — Adapter: `errnats` + +### Task 8.1: `errnats.Marshal` and `errnats.Reply` + +**Files:** Create `pkg/errcode/errnats/reply.go`, `reply_test.go`. 
+ +- [ ] **Step 1: Failing test** + +```go +package errnats + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "log/slog" + "testing" + + "github.com/hmchangw/chat/pkg/errcode" +) + +func ctxQuiet() context.Context { + return errcode.WithLogger(context.Background(), slog.New(slog.NewJSONHandler(&bytes.Buffer{}, nil))) +} + +func TestMarshal_TypedError(t *testing.T) { + data := Marshal(ctxQuiet(), errcode.NotFound("room not found", errcode.WithReason(errcode.RoomNotMember))) + var got map[string]any + _ = json.Unmarshal(data, &got) + if got["code"] != "not_found" || got["reason"] != "not_room_member" || got["error"] != "room not found" { + t.Fatalf("envelope = %v", got) + } +} + +func TestMarshal_UnknownCollapsesToInternal(t *testing.T) { + data := Marshal(ctxQuiet(), errors.New("mongo down")) + var got map[string]any + _ = json.Unmarshal(data, &got) + if got["code"] != "internal" || got["error"] != "internal error" { + t.Fatalf("envelope = %v", got) + } + if _, leaked := got["reason"]; leaked { + t.Fatal("reason should be absent") + } +} + +func TestMarshalQuiet_DoesNotLogButStillCollapses(t *testing.T) { + var buf bytes.Buffer + // Default logger must not receive a line from MarshalQuiet. + old := slog.Default() + slog.SetDefault(slog.New(slog.NewJSONHandler(&buf, nil))) + defer slog.SetDefault(old) + + data := MarshalQuiet(errors.New("mongo down")) + var got map[string]any + _ = json.Unmarshal(data, &got) + if got["code"] != "internal" || got["error"] != "internal error" { + t.Fatalf("envelope = %v", got) + } + if buf.Len() != 0 { + t.Fatalf("MarshalQuiet must not log; got %s", buf.String()) + } +} +``` + +(`Reply`/`ReplyQuiet` call `msg.Respond`; covered by service integration tests. Unit-test `Marshal`/`MarshalQuiet`, which hold the logic.) + +- [ ] **Step 2: Run, expect FAIL** — `undefined: Marshal`. +- [ ] **Step 3: Implement** + +```go +// Package errnats adapts errcode.Error to NATS request/reply responses. 
+package errnats + +import ( + "context" + "encoding/json" + "errors" + "log/slog" + + "github.com/nats-io/nats.go" + + "github.com/hmchangw/chat/pkg/errcode" +) + +const fallback = `{"code":"internal","error":"internal error"}` + +// Marshal classifies err (logging it once) and returns the JSON envelope. +func Marshal(ctx context.Context, err error) []byte { + data, mErr := json.Marshal(errcode.Classify(ctx, err)) + if mErr != nil { + return []byte(fallback) + } + return data +} + +// MarshalQuiet returns the envelope WITHOUT logging. Use only on paths that have +// already logged the failure (panic backstop, admission/replyBusy) to avoid a +// redundant second log line. Unknown errors still collapse to internal and the +// cause is never serialized. +func MarshalQuiet(err error) []byte { + var e *errcode.Error + if !errors.As(err, &e) { + e = errcode.Internal("internal error") + } + data, mErr := json.Marshal(e) + if mErr != nil { + return []byte(fallback) + } + return data +} + +// Reply classifies err (logging once) and sends the envelope on msg's reply subject. +func Reply(ctx context.Context, msg *nats.Msg, err error) { + if rErr := msg.Respond(Marshal(ctx, err)); rErr != nil { + slog.ErrorContext(ctx, "error reply failed", "error", rErr, "subject", msg.Subject) + } +} + +// ReplyQuiet sends the envelope WITHOUT logging the failure (see MarshalQuiet). +func ReplyQuiet(msg *nats.Msg, err error) { + if rErr := msg.Respond(MarshalQuiet(err)); rErr != nil { + slog.Error("error reply failed", "error", rErr, "subject", msg.Subject) + } +} +``` + +- [ ] **Step 4: Run, expect PASS.** +- [ ] **Step 5: Commit** — `git add pkg/errcode/errnats/ && git commit -m "feat(errcode/errnats): add NATS reply adapter"` + +--- + +## Chapter 9 — Adapter: `errhttp` + +### Task 9.1: `errhttp.Write` for Gin + +**Files:** Create `pkg/errcode/errhttp/write.go`, `write_test.go`. 
+ +- [ ] **Step 1: Failing test** + +```go +package errhttp + +import ( + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + + "github.com/hmchangw/chat/pkg/errcode" +) + +func TestWrite_StatusAndEnvelope(t *testing.T) { + gin.SetMode(gin.TestMode) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(http.MethodPost, "/auth", nil) + Write(c.Request.Context(), c, errcode.Unauthenticated("token expired", errcode.WithReason(errcode.AuthTokenExpired))) + if w.Code != http.StatusUnauthorized { + t.Fatalf("status = %d, want 401", w.Code) + } + var got map[string]any + _ = json.Unmarshal(w.Body.Bytes(), &got) + if got["code"] != "unauthenticated" || got["reason"] != "sso_token_expired" { + t.Fatalf("envelope = %v", got) + } +} + +func TestWrite_UnknownIs500(t *testing.T) { + gin.SetMode(gin.TestMode) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(http.MethodGet, "/x", nil) + Write(c.Request.Context(), c, errors.New("db exploded")) + if w.Code != http.StatusInternalServerError || !json.Valid(w.Body.Bytes()) { + t.Fatalf("status=%d body=%q", w.Code, w.Body.String()) + } +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `undefined: Write`. +- [ ] **Step 3: Implement** + +```go +// Package errhttp adapts errcode.Error to Gin HTTP responses. +package errhttp + +import ( + "context" + + "github.com/gin-gonic/gin" + + "github.com/hmchangw/chat/pkg/errcode" +) + +// Write classifies err (logging it once) and writes the envelope with the +// category's HTTP status. 
+func Write(ctx context.Context, c *gin.Context, err error) { + e := errcode.Classify(ctx, err) + c.JSON(e.HTTPStatus(), e) +} +``` + +- [ ] **Step 4: Run, expect PASS.** +- [ ] **Step 5: Commit** — `git add pkg/errcode/errhttp/ && git commit -m "feat(errcode/errhttp): add Gin response adapter"` + +--- + +## Chapter 10 — natsrouter integration (shims, not deletion) + +This wires natsrouter to `errcode`/`errnats`, adds the logging seam, and converts `RouteError`/`Err*` to **thin shims** so all existing callers keep compiling until Chapter 17. After this, `pkg/natsrouter` depends on `errcode` + `errnats` (never Gin). + +### Task 10.1: Add `Context.WithLogValues` seam (fixes the ctx-cycle bug) + +**Files:** Modify `pkg/natsrouter/context.go`; Test `pkg/natsrouter/context_test.go`. + +- [ ] **Step 1: Failing test** + +```go +func TestContext_WithLogValues_NoCycleAndEnriches(t *testing.T) { + var buf bytes.Buffer + c := NewContext(map[string]string{}) + c.SetContext(errcode.WithLogger(c.ctx, slog.New(slog.NewJSONHandler(&buf, nil)))) + + c.WithLogValues("account", "alice") // must not hang (no ctx cycle) + + // A value lookup must terminate (would loop forever on a cycle): + _ = c.Value("anything") + + errcode.Classify(c, errors.New("boom")) + if !strings.Contains(buf.String(), "alice") { + t.Fatalf("log values not applied: %s", buf.String()) + } +} +``` + +- [ ] **Step 2: Run, expect FAIL** — `c.WithLogValues undefined`. +- [ ] **Step 3: Implement the seam** + +Add to `pkg/natsrouter/context.go`: +```go +// WithLogValues enriches the context logger with key/value pairs for the +// centralized errcode log line (account, roomID, …). It derives from c.ctx +// (the unexported underlying context), never from c itself, avoiding the +// Value-delegation cycle documented on SetContext. Call once at handler entry. +func (c *Context) WithLogValues(args ...any) { + c.SetContext(errcode.WithLogValues(c.ctx, args...)) +} +``` +Add import `"github.com/hmchangw/chat/pkg/errcode"`. 
+ +- [ ] **Step 4: Run, expect PASS.** +- [ ] **Step 5: Commit** — `git add pkg/natsrouter/context.go pkg/natsrouter/context_test.go && git commit -m "feat(natsrouter): add Context.WithLogValues seam (cycle-safe)"` + +### Task 10.2: Route handler errors through `errnats`; convert `RouteError`/`Err*` to shims + +**Files:** Modify `pkg/natsrouter/register.go`, `errors.go`, `context.go`, `router.go`, `middleware.go`, `params.go`; AND the package's own tests `register_test.go` (reply-envelope cases added in Step 1), `errors_test.go`, `router_test.go`, `example_test.go`, `integration_test.go` (the latter four assert on `.Code` and must be migrated in THIS task — see Step 1). + +> **Why the tests change here, not later:** `type RouteError = errcode.Error` makes `RouteError.Code` an `errcode.Code`, not a `string`. natsrouter's own tests compare it to string literals (`assert.Equal(t, "not_found", result.Code)`); with testify those *compile* but **fail at runtime** (`errcode.Code("not_found") != "not_found"`). And `TestRouteError_Error` asserts the old `"not_found: room not found"` format, but `errcode.Error.Error()` returns message-only. So these tests MUST move with the shim, in this commit. + +- [ ] **Step 1: Update/add tests (use `errtest` helpers from Task 5.3)** + + - `register_test.go`: a handler returning `errcode.NotFound("x", errcode.WithReason(errcode.RoomNotMember))` replies `{code:"not_found",reason:"not_room_member"}` (`errtest.AssertCode`/`AssertReason`); an unknown error replies `{code:"internal"}`; the deserialize-failure path (`register.go:20`, see Step 3) replies `{code:"bad_request"}`. + - `router_test.go`: admission-saturation test asserts `code=="unavailable"`; panic-backstop test asserts `code=="internal"`. **Delete/rewrite `TestRouteError_Error`** — the `code: message` string format no longer exists; if a message-only check is still wanted, assert `err.Error()=="room not found"`. 
Rewrite the three `result.Code == "..."` string assertions (router_test.go:257,276,361) to `errtest.AssertCode` on the reply bytes, or compare to `errcode.Code*`. + - `errors_test.go`: rewrite the two `err.Code == "..."`/`CodeUnavailable` assertions to compare against `errcode.Code*` (the shim consts stay plain `string`s — they are declared as `string(errcode.Code*)` — so they no longer match the `errcode.Code`-typed `.Code` field). + - `example_test.go` (line ~93) and `integration_test.go` (line ~247): rewrite `.Code == "..."` runtime comparisons to `errtest.AssertCode`. + +- [ ] **Step 2: Run, expect FAIL.** + +- [ ] **Step 3: Apply changes** + +`register.go` — replace `replyErr`: +```go +func replyErr(c *Context, err error) { + errnats.Reply(c, c.Msg, err) +} +``` +Drop now-unused `errors`/`slog` imports only if `RegisterVoid` no longer needs them (it still uses `slog` — keep). Add `errnats` import. + +`errors.go` — convert `RouteError` and constructors to shims over `errcode` (do NOT delete yet): +```go +package natsrouter + +import ( + "fmt" + + "github.com/hmchangw/chat/pkg/errcode" +) + +// Deprecated: use pkg/errcode directly. Retained as a shim during migration; +// removed in the error-codes cleanup chapter. 
+type RouteError = errcode.Error + +func Err(message string) *RouteError { return errcode.BadRequest(message) } +func Errf(f string, a ...any) *RouteError { return errcode.BadRequest(fmt.Sprintf(f, a...)) } +func ErrWithCode(code, message string) *RouteError { return errcode.New(errcode.Code(code), message) } + +const ( + CodeBadRequest = string(errcode.CodeBadRequest) + CodeNotFound = string(errcode.CodeNotFound) + CodeForbidden = string(errcode.CodeForbidden) + CodeConflict = string(errcode.CodeConflict) + CodeInternal = string(errcode.CodeInternal) + CodeUnavailable = string(errcode.CodeUnavailable) +) + +func ErrBadRequest(m string) *RouteError { return errcode.BadRequest(m) } +func ErrNotFound(m string) *RouteError { return errcode.NotFound(m) } +func ErrForbidden(m string) *RouteError { return errcode.Forbidden(m) } +func ErrConflict(m string) *RouteError { return errcode.Conflict(m) } +func ErrInternal(m string) *RouteError { return errcode.Internal(m) } +func ErrUnavailable(m string) *RouteError { return errcode.Unavailable(m) } +``` +Note: `RouteError = errcode.Error` is a type alias, so `var rerr *RouteError; errors.As(err,&rerr); return rerr` keeps compiling. But `rerr.Code` is now `errcode.Code`, not `string`, so **production** code that assigns/compares `.Code` to a string breaks the build. There is exactly one such production site — `search-service/metrics.go` (`status = rerr.Code`) — fixed in Step 5 below so `go build` stays green. (`search-service/handler.go:148` only does `errors.As` + `return rerr`, which the alias keeps valid.) Test-only `.Code` comparisons are migrated in Chapters 11–12. 
+ +`context.go` — BOTH `Context` error-reply methods become errcode-backed (the plan previously missed `ReplyError`, which is called at `register.go:20` and by Recovery middleware; leaving it on `natsutil.ReplyError` would emit a `code`-less envelope AND break compilation when Ch 17 deletes `natsutil.ReplyError`): +```go +func (c *Context) ReplyRouteError(e *RouteError) { errnats.Reply(c, c.Msg, e) } + +// ReplyError replies with a bad_request envelope for the given message. (Kept +// for the deserialize-failure path; new code should return a typed errcode.) +func (c *Context) ReplyError(message string) { errnats.Reply(c, c.Msg, errcode.BadRequest(message)) } +``` +Note `register.go:20` (`c.ReplyError("invalid request payload")`, the payload-deserialize path) now routes through errcode and yields `{code:"bad_request","error":"invalid request payload"}` — assert this in Step 1. + +`router.go` — `replyBusy` and the panic backstop have ALREADY logged (saturation warn / panic recovery), so they use the **non-logging** `errnats.ReplyQuiet` to avoid a redundant `Classify` line (and to keep per-event saturation replies out of the ERROR stream): +```go +func (r *Router) replyBusy(msg *nats.Msg) { + if msg.Reply == "" { + slog.Warn("natsrouter: dropped fire-and-forget message under saturation", "subject", msg.Subject) + return + } + errnats.ReplyQuiet(msg, errcode.Unavailable("service busy")) +} +``` +Panic backstop (router.go:~204) — `recover()` already logged the panic: +```go +if m.Msg.Reply != "" { + errnats.ReplyQuiet(m.Msg, errcode.Internal("internal error")) +} +``` + +`middleware.go` Recovery (~54): the recovery handler already logs `"panic recovered"`, so use the quiet reply: `c.ReplyError(...)` → `errnats.ReplyQuiet(c.Msg, errcode.Internal("internal error"))`. + +`params.go:41` — `ErrBadRequest("missing required param ...")` keeps working via the shim; no change needed, but verify it compiles. 
+ +Add `errcode`/`errnats` imports where used; drop `natsutil`/`errors` where now unused (goimports via `make fmt` will catch leftovers in `register.go`). + +- [ ] **Step 4: Run** `make test SERVICE=pkg/natsrouter` → PASS (this runs the package's own tests with `-race`; they were migrated in Step 1, so the package is fully green here — NOT deferred). +- [ ] **Step 5: Fold the one production-code break, then build.** Before building, fix the single production site that assigns `errcode.Code` to a string — `search-service/metrics.go` (~:100-104). Minimal in-place change (full handler migration still happens in Ch 12; this is just to keep `go build` green now): + ```go + status := string(errcode.CodeInternal) + var ee *errcode.Error + if errors.As(err, &ee) { status = string(ee.Code) } + ``` + (Drop the `natsrouter.CodeInternal`/`RouteError` references in that block; add `errcode` import.) Then: + ```bash + go build ./... # expect: clean + ``` + Now enumerate the test-only breakage (compiles, fails at runtime — `go build` skips other packages' `_test.go`) so each chapter's scope is exact: + ```bash + grep -rn '\.Code ==\|== natsrouter.Code\|RouteError\|natsrouter.Err' --include='*_test.go' \ + search-service history-service mock-user-service + ``` + Expected hits (migrated in their chapters): `search-service` handler_test ×18, query_rooms_test ×2, integration_*_test ×6 (incl. one `CodeInternal` at `integration_users_test.go:100`); `history-service/internal/service` messages_test ×15, threads_test ×1; `mock-user-service` handler_test ×1. Record the list; do not fix here. +- [ ] **Step 6: Update CLAUDE.md NOW (not in Ch 18).** The Section 3 Error-Handling bullet "Use `model.ErrorResponse` via `natsutil.ReplyError` for all NATS reply errors" is contradicted from the next chapter onward; leaving it until Ch 18 means every intermediate commit violates a current OVERRIDING rule. 
Replace that bullet with: "Use `pkg/errcode` for all client-facing errors; reply via `errnats.Reply` (NATS) / `errhttp.Write` (Gin). Never hand-build `model.ErrorResponse` or call `natsutil.ReplyError`." (Full `docs/error-handling.md` still lands in Ch 18.) +- [ ] **Step 7: Commit** — `git add pkg/natsrouter/ search-service/metrics.go CLAUDE.md && git commit -m "refactor(natsrouter): emit errcode envelopes; RouteError/Err* now shims over errcode"` + +--- + +## Chapter 11 — history-service (reference migration) + +### Task 11.1: Swap `natsrouter.Err*` for `errcode` + +**Files:** Modify `history-service/*.go`, `history-service/**/*_test.go`, `docs/client-api.md`. + +- [ ] **Step 1: Enumerate** — `grep -rn "natsrouter.Err\|ErrWithCode\|RouteError\|ReplyRouteError\|natsrouter.Code" history-service/` (include subpackages, e.g. `internal/service/`). Record each site + current code/message. Audit found **30 client-facing `natsrouter.Err*` reply sites** (messages.go ×11, threads.go ×7, utils.go ×6, room_times.go ×2, …) and **16 test `.Code` asserts** (`messages_test.go` ×15 + `threads_test.go:452` ×1 — not "~22"). All use BadRequest/Forbidden/NotFound/Internal; Conflict/Unavailable/Unauthenticated are unused here. + +- [ ] **Step 2: Update one handler test** to decode the reply and assert `code` (and `reason` if applicable) instead of `RouteError{Code:...}`. + +- [ ] **Step 3: Run, expect FAIL.** + +- [ ] **Step 4: Migrate call sites** using this table (apply to every site from Step 1): + +| Old | New | +|-----|-----| +| `natsrouter.ErrBadRequest(m)` / `natsrouter.Err(m)` | `errcode.BadRequest(m)` | +| `natsrouter.Errf(f, a...)` | `errcode.BadRequest(fmt.Sprintf(f, a...))` | +| `natsrouter.ErrNotFound(m)` | `errcode.NotFound(m)` | +| `natsrouter.ErrForbidden(m)` | `errcode.Forbidden(m)` | +| `natsrouter.ErrConflict(m)` | `errcode.Conflict(m)` | +| `natsrouter.ErrInternal(m)` | prefer returning the wrapped raw error: `fmt.Errorf("…: %w", err)` (Classify → internal). 
Use `errcode.Internal(m)` only when there is no underlying error. | + +Add `reason`s only where history-service has a case the frontend must distinguish (most are generic — confirm against the endpoints). Update imports. + +- [ ] **Step 5: Enrich log context** at each handler entry: `c.WithLogValues("request_id", natsutil.RequestIDFromContext(c), "account", account, "roomID", c.Param("roomID"))`. Every history handler has `roomID` from the route param, so include it unconditionally (not "where available"). The reply path (`errnats.Reply(c, …)`) then logs with these attrs. + +- [ ] **Step 6: Update all remaining old-shape test assertions** found in Step 1 (the full `messages_test.go`/`threads_test.go` set), decoding to `code`/`reason`. + +- [ ] **Step 7: Run** `make test SERVICE=history-service` → PASS. + +- [ ] **Step 8: Update `docs/client-api.md`** — per history-service endpoint, add/refresh the "Possible errors" table (`code`, `reason`, when). + +- [ ] **Step 9: Commit** — `git add history-service/ docs/client-api.md && git commit -m "refactor(history-service): adopt errcode envelopes"` + +--- + +## Chapter 12 — search-service (incl. metrics.go) and mock-user-service + +### Task 12.1: search-service handlers AND the Prometheus label path + +**Files:** Modify `search-service/handler.go`, `query_rooms.go`, **`metrics.go`**, `*_test.go`, `integration_*_test.go`, `docs/client-api.md`. + +- [ ] **Step 1: Enumerate** — `grep -rn "natsrouter.Err\|RouteError\|natsrouter.Code\|rerr.Code" search-service/`. Sites: handler.go (`natsrouter.Err*` constructors); `handler.go:148-150` (`var rerr *natsrouter.RouteError` passthrough); `query_rooms.go:74` — the **signature** of the unexported helper `func roomTypeFilterClause(...) (map[string]any, *natsrouter.RouteError)` (its return type names `RouteError`) plus its doc comment (`query_rooms.go:20`); `metrics.go:100-104` (already minimally fixed in Ch 10 Step 5 — finalize here). Plus the test sites enumerated in Ch 10 Step 5. 
+ +- [ ] **Step 2: Update a representative handler test** (`searchMessages` "query is required") to assert `errtest.AssertCode(t, reply, errcode.CodeBadRequest)`. + +- [ ] **Step 3: Run, expect FAIL.** + +- [ ] **Step 4: Migrate handlers** per Task 11.1's table. `natsrouter.ErrInternal("unable to build search query")` → `fmt.Errorf("building search query: %w", err)`. + +- [ ] **Step 5: Change `query_rooms.go:74` return type** `*natsrouter.RouteError` → `*errcode.Error` (and update the `query_rooms.go:20` doc comment + the caller in `handler.go`). This is required because Ch 17 deletes the `RouteError` alias; the alias only masks it until then. Update `query_rooms_test.go` (×2 `.Code` asserts) to `errtest`. + +- [ ] **Step 6: Finalize `metrics.go`** — Ch 10 left the minimal `errors.As`-based status label. Keep that form (it deliberately avoids a second `Classify` log line, since the reply path already logs); just confirm it reads cleanly and drop any leftover `natsrouter` import. + +- [ ] **Step 7: Migrate `handler.go:148-150`** passthrough: replace the `*natsrouter.RouteError` assertion with `*errcode.Error`. + +- [ ] **Step 8: Enrich log ctx** at handler entry (as Task 11.1 Step 5). Migrate ALL test `.Code` assertions to `errtest`: `handler_test.go` has **18** (not "~17"), `query_rooms_test.go` ×2, and integration tests asserting `model.ErrorResponse.Code` against **both** `natsrouter.CodeBadRequest` AND `natsrouter.CodeInternal` (esp. `integration_users_test.go:100`, the one internal assertion) — = 26 total. The internal one must migrate too, or it fails at runtime and breaks compile when Ch 17 removes `model.ErrorResponse`. + +- [ ] **Step 9: Run + docs + commit** +```bash +make test SERVICE=search-service +git add search-service/ docs/client-api.md +git commit -m "refactor(search-service): adopt errcode envelopes incl. metrics label" +``` + +### Task 12.2: mock-user-service + +**Files:** Modify `mock-user-service/handler.go`, `*_test.go`. 
+ +- [ ] **Step 1:** `grep -n "natsrouter.Err" mock-user-service/handler.go` → `checkSite` returns `natsrouter.ErrNotFound("unknown site")`. +- [ ] **Step 2:** Update the `checkSite` test (`handler_test.go:28-30`) to assert decoded `code=="not_found"`. +- [ ] **Step 3:** Run, expect FAIL. +- [ ] **Step 4:** Replace with `errcode.NotFound("unknown site")`; update imports; enrich log ctx. +- [ ] **Step 5:** `make test SERVICE=mock-user-service && git add mock-user-service/ && git commit -m "refactor(mock-user-service): adopt errcode envelopes"` + +--- + +## Chapter 13 — message-gatekeeper (incl. fetcher_history.go) + +Preserve the infra-vs-validation **ack/nak** decision (a JetStream retry concern, independent of the envelope). Only the reply payload and the remote-error parse change. + +### Task 13.1: Replace `codedError`; resolve category to `forbidden` + reason + +**Files:** Modify `message-gatekeeper/store.go`, `handler.go`, `*_test.go`, `docs/client-api.md`. + +Resolve the spec contradiction: both gatekeeper validation errors are **`forbidden`** (the user is not permitted to post), with the specific case in `reason`. + +- [ ] **Step 1: Fix the tests that reference the deleted reply path.** The plan previously cited the wrong range. The actual sites: + - `handler_test.go:1159-1188 TestHandler_marshalErrorReply` — 3 subtests calling the soon-deleted `marshalErrorReply` and reading `errLargeRoomPostRestricted.Code`. **Delete/rewrite** this whole test (the method is gone). + - `handler_test.go:686-688` — the real large-room reply assertion inside the `processMessage` table. Update to decode the reply and assert `code=="forbidden"`, `reason=="large_room_post_restricted"` via `errtest`. + - `handler_test.go:477,563,687` — `assert.ErrorIs(t, err, errLargeRoomPostRestricted)`: these match by identity and KEEP working (the sentinel is still returned directly at handler.go:220). Leave them. 
+ +- [ ] **Step 2: Run, expect FAIL.** + +- [ ] **Step 3: Replace coded errors in `store.go`** (delete `codedError`, `codeLargeRoomPostRestricted`, the old `errLargeRoomPostRestricted`, and `errNotSubscribed`'s `errors.New` form): +```go +var ( + errNotSubscribed = errcode.Forbidden("not subscribed", errcode.WithReason(errcode.MessageNotSubscribed)) + errLargeRoomPostRestricted = errcode.Forbidden( + "posting is restricted to owners and admins in this room", + errcode.WithReason(errcode.MessageLargeRoomPostRestricted)) +) +``` +`errors.Is(err, errNotSubscribed)` (handler.go:194) still works by identity. The `infraError` detection (`errors.As(err,&ie)`) is independent of these values, so ack/nak is unchanged. + +- [ ] **Step 4: CRITICAL — re-home the inline validation errors so they don't collapse to `internal`.** Step 5 routes ALL validation replies through `errnats.Marshal`→`Classify`, which collapses any non-`*errcode.Error` to `{"code":"internal","error":"internal error"}`. Today these reply with their real message (documented at `docs/client-api.md:1886-1894`). Convert each plain `fmt.Errorf` validation return to a typed errcode: + - `handler.go:149,155,164,169,173,178,183,188,277,281,283` (missing/malformed fields, bad subject, invalid payload) → `errcode.BadRequest(...)` (or `errcode.NotFound(...)` for the quote/message-missing case — check each message). Preserve the exact user message text. + - `handler.go:195` — the `not_subscribed` reply path returns a FRESH `fmt.Errorf("user %s is not subscribed…")` that does NOT wrap the sentinel, so it would collapse to internal and lose the reason. Change to `return nil, errNotSubscribed` (or `fmt.Errorf("…: %w", errNotSubscribed)` if the dynamic account text matters — but the sentinel message is preferred so the reason survives). + Audit the whole handler: any error path that REPLIES to the client (validation/Ack branch) must be an `*errcode.Error`; only infra errors (Nak branch) stay raw. 
+ +- [ ] **Step 5: Replace `marshalErrorReply`** in `handler.go` — delete it; build the reply bytes with `errnats.Marshal`: +```go +h.sendReply(ctx, account, msg.Data(), errnats.Marshal(ctx, err)) +``` +The `"invalid message subject"` path → `errnats.Marshal(ctx, errcode.BadRequest("invalid message subject"))`. Keep the infra→Nak / validation→Ack+reply branch exactly as-is. **Enrich ctx AFTER parsing** (not "at entry" — `account`/`roomID` come from `ParseUserRoomSiteSubject` and `reqID` from the payload `req.RequestID`, parsed inside `HandleJetStreamMsg`, not at the `main.go:143` call site): once subject+payload are parsed, `ctx = errcode.WithLogValues(ctx, "request_id", req.RequestID, "account", account, "roomID", roomID)`. + +- [ ] **Step 6: Migrate `fetcher_history.go:61`** — it decodes a remote history-service reply via `model.ErrorResponse` to detect upstream errors. Replace with `errcode.Parse`: +```go +if ee, ok := errcode.Parse(replyData); ok { + return nil, fmt.Errorf("history fetch: %s", ee.Message) +} +``` +Update `fetcher_history_test.go:88` accordingly (it currently builds a `model.ErrorResponse` payload — switch to an errcode envelope; the `Contains("message not found")` assertion still holds since `ee.Message` is the `error` field). + +- [ ] **Step 7: Run** `make test SERVICE=message-gatekeeper` → PASS. +- [ ] **Step 8: docs + commit** — Update `docs/client-api.md:1886-1894`: fill the `code` column (currently `—`) for every now-`bad_request`/`forbidden`/`not_found` validation row, and add the `reason` for the two forbidden cases. +```bash +git add message-gatekeeper/ docs/client-api.md +git commit -m "refactor(message-gatekeeper): errcode envelopes + errcode.Parse for remote errors" +``` + +--- + +## Chapter 14 — room-service (delete `sanitizeError`, incl. 
memberlist_client.go + DM-exists) + +### Task 14.1: Convert sentinels to errcode; delete `sanitizeError` + +**Files:** Modify `room-service/helper.go`, `handler.go`, `*_test.go`, `docs/client-api.md`. + +Mapping (message text from the current sentinels): + +| Sentinel | Code | Reason | +|----------|----------|--------| +| `errInvalidRole` | BadRequest | — | +| `errOnlyOwners` | Forbidden | — | +| `errAlreadyOwner` | Conflict | — | +| `errNotOwner` | Forbidden | `RoomNotOwner` | +| `errCannotDemoteLast` | Conflict | — | +| `errRoomTypeGuard` | BadRequest | — | +| `errTargetNotMember` | BadRequest | — | +| `errNotRoomMember` | Forbidden | `RoomNotMember` | +| `errInvalidOrg` | BadRequest | — | +| `errInvalidThreadID` | BadRequest | — | +| `errThreadSubNotFound` | NotFound | — | +| `errPromoteRequiresIndividual` | BadRequest | — | +| `errEmptyCreateRequest` | BadRequest | — | +| `errSelfDM` | BadRequest | — | +| `errBotInChannel` | BadRequest | `RoomBotInChannel` | +| `errBotNotAvailable` | NotFound | `RoomBotNotAvailable` | +| `errInvalidUserData` | BadRequest | — | +| `errMissingRequestID` | BadRequest | — | +| `errInvalidRequestID` | BadRequest | — | +| `errChannelNameRequired` | BadRequest | — | +| `errChannelNameTooLong` | BadRequest | — | +| `errUserNotFound` | NotFound | — | +| `errMessageNotFound` | NotFound | — | +| `errMessageRoomMismatch` | BadRequest | — | +| `errNotMessageSender` | Forbidden | — | +| `errRemoveTargetAmbiguous` | BadRequest | — | +| `errCannotRemoveLastMember` | Conflict | — | +| `errLastOwnerCannotLeave` | Conflict | `RoomLastOwnerCannotLeave` | +| `errOrgMemberCannotLeaveSolo` | Forbidden | — | +| `errRoomIDMismatch` | BadRequest | — | +| `errRemoveChannelOnly` | BadRequest | — | +| `errListLimitInvalid` | BadRequest | — | +| `errListOffsetInvalid` | BadRequest | — | +| room-capacity ("at maximum capacity"/"exceeds maximum capacity") | Conflict | `RoomMaxSizeReached` | +| `channelExpandTimeoutError` | Unavailable | — | +| `dmExistsError` 
| → **success reply** (Task 14.3) | — | + +- [ ] **Step 1: Convert sentinel defs in `helper.go`** (each `errors.New(msg)` → `errcode.{Code}(msg[, errcode.WithReason(...)])`, where `{Code}` is the named constructor from the table's Code column). Example: +```go +var ( + errInvalidRole = errcode.BadRequest("invalid role: must be owner or member") + errOnlyOwners = errcode.Forbidden("only owners can update roles") + errNotRoomMember = errcode.Forbidden("only room members can list members", errcode.WithReason(errcode.RoomNotMember)) + errNotOwner = errcode.Forbidden("user is not an owner", errcode.WithReason(errcode.RoomNotOwner)) + // … all sentinels per the table +) +``` + +- [ ] **Step 2: CRITICAL — re-home the inline allowlist-passthrough errors BEFORE deleting `sanitizeError`.** `sanitizeError`'s substring allowlist (`helper.go:227`) does not just sanitize the named sentinels — it passes through ~6 classes of **inline `fmt.Errorf`** errors that are NOT in the sentinel table. Deleting the allowlist (Step 3) makes every one of them collapse to `internal error` at `errnats.Reply` unless it has been re-homed here first. The load-bearing passthrough is proven by `helper_test.go:63-69`. Convert each inline site to a typed errcode AT THE SOURCE (preserve the user message text): + - `handler.go:502,522` `"only owners can remove members"`, `:660` `"only owners can add members to a restricted room"` → `errcode.Forbidden(...)` (allowlist prefix `"only owners can"`). + - `handler.go:657` `"cannot add members to a non-channel room"` → `errcode.BadRequest(...)` (prefix `"cannot add members"`). + - `handler.go:648` `"requester not in room: %w"` → `errcode.Forbidden(...)` (prefix `"requester not in room"`). + - `handler.go:155,437,462,561,564,666,669,889,1139,1142` — the `"invalid request…"` family incl. `"room ID mismatch"`, `"messageId is required"` (prefix `"invalid request"`, the widest leak — 10 sites) → `errcode.BadRequest(...)`. + - `handler.go:1408` `"invalid mute-toggle subject"` → `errcode.BadRequest(...)`. 
+ Grep `sanitizeError`'s allowlist for the exact prefixes and verify every passthrough string has a re-homed errcode; the test at `helper_test.go:63-69` enumerates them — use it as the checklist. +- [ ] **Step 3: Delete `sanitizeError` (helper.go:176-234)** including the substring allowlist. Where the capacity strings originate (`handler.go:320,717`), return `errcode.Conflict("room is at maximum capacity", errcode.WithReason(errcode.RoomMaxSizeReached))` at the source. Where `channelExpandTimeoutError` is constructed, return `errcode.Unavailable(fmt.Sprintf("expanding channels timed out for room %s on site %s", roomID, siteID))`; delete the custom type if unreferenced. + +- [ ] **Step 4: Update reply sites in `handler.go`** — `natsutil.ReplyError(m.Msg, sanitizeError(err))` → `errnats.Reply(ctx, m.Msg, err)`. room-service uses raw `nc.QueueSubscribe`/`otelnats.Msg` and an existing `wrappedCtx(m)` helper used at **11 call sites** (`handler.go:108,371,384,…`). **Fold the log enrichment into `wrappedCtx`** rather than hand-rolling per handler, so all 11 sites are consistent: +```go +// inside wrappedCtx(m), after extracting requestID/account/roomID: +return errcode.WithLogValues(ctx, "request_id", requestID, "account", account, "roomID", roomID) +``` +(If `account`/`roomID` aren't known inside `wrappedCtx`, add them at each handler via `c`-less `errcode.WithLogValues(ctx, …)`; do NOT use `c.WithLogValues` — this is not natsrouter.) + +- [ ] **Step 5: Delete/rewrite the `sanitizeError` test suite** (`helper_test.go:38-234` — the suite STARTS at 38, not 75; it includes `RemoteMemberListPrefix`/`WithContext`/`TransportFailureStillOpaque` at 131-146 and `NewSentinelErrorsExist`/`DMExistsErrorWraps` at 149-174, all referencing deleted symbols — remove `TestDMExistsErrorWrapsCorrectly` explicitly). Rewrite handler/integration assertions (`handler_test.go:1437-1458,2651-2701`, `integration_test.go:1334-1339`) to decode `code`/`reason` via `errtest`. 
+ +- [ ] **Step 6: Run** `make test SERVICE=room-service` (DM-exists + memberlist tests temporarily expected to fail until 14.2/14.3). + +### Task 14.2: Migrate `memberlist_client.go` to `errcode.Parse` + reason matching + +**Files:** Modify `room-service/memberlist_client.go`, `memberlist_client_test.go`. + +The current code (`memberlist_client.go:65-70`) uses `natsutil.TryParseError` + `errResp.Error == errNotRoomMember.Error()` (brittle message-string equality across sites) to remap a remote not-member error. + +- [ ] **Step 1: Update the test** (`memberlist_client_test.go:68-94`) so the simulated remote reply is an errcode envelope `{code:"forbidden",reason:"not_room_member",error:"…"}`, and assert the client remaps it to the local `errNotRoomMember` (or returns an `*errcode.Error` with `reason==RoomNotMember`). + +- [ ] **Step 2: Run, expect FAIL.** + +- [ ] **Step 3: Rewrite the decode**: +```go +if ee, ok := errcode.Parse(replyData); ok { + if ee.Reason == errcode.RoomNotMember { + return errNotRoomMember + } + // Preserve remote message for other remote errors (replaces the old + // "remote member.list:" allowlist passthrough). + return errcode.New(ee.Code, ee.Message, errcode.WithReason(ee.Reason)) +} +``` +Remove the `errResp.Error == errNotRoomMember.Error()` string comparison and the dependence on the deleted `sanitizeError` "remote member.list:" prefix. **Rollout note:** during a mixed-version window a legacy remote room-service still replies via `natsutil.MarshalError` (no `code`). `errcode.Parse` still succeeds (it only requires a non-empty `error`) but yields `Code==""` → `HTTPStatus()` 500 and no reason match. That's an acceptable degradation (the not-member remap simply doesn't fire until both sides are upgraded); call it out so it isn't mistaken for a bug. + +- [ ] **Step 4: Run** the memberlist tests → PASS. 
+ +### Task 14.3: DM-already-exists → success reply + +**Files:** Modify `room-service/handler.go` (DM-exists block, ~125-137), `pkg/model/event.go` (add status const), `handler_test.go`, `integration_test.go`, `pkg/model/model_test.go`, `docs/client-api.md`. + +- [ ] **Step 1: Add the status constant** in `pkg/model/event.go` near `CreateRoomReply`: +```go +// CreateRoomStatusExists indicates the requested DM already existed; RoomID is +// the existing room. Clients treat it as success and open the room. +const CreateRoomStatusExists = "exists" +``` + +- [ ] **Step 2: Update the DM-exists handler test** to expect a SUCCESS reply `model.CreateRoomReply{Status: "exists", RoomID: existing}` (decode the reply; assert `status=="exists"` and `roomId==existing`), not an error envelope. + +- [ ] **Step 3: Run, expect FAIL.** + +- [ ] **Step 4: Replace the error reply** — instead of marshalling `model.ErrorResponse{Error:"dm already exists", RoomID: existingRoomID}`: +```go +natsutil.ReplyJSON(m.Msg, model.CreateRoomReply{Status: model.CreateRoomStatusExists, RoomID: existingRoomID}) +return +``` +Remove the `dmExistsError` type, `newDMExistsError` (handler.go:258), and ALL `errors.As/Is(err, *dmExistsError)` checks — `handleCreateRoom` must stop returning it. **Confirm `model.CreateRoomReply.RoomType`** (`pkg/model/event.go:316`) — the success-exists reply leaves it `""` (the old error path didn't send it either); verify the frontend does not read `roomType` on the exists branch, else populate it. + +- [ ] **Step 5: Rewrite the `*dmExistsError` routing tests** — the type is gone, so these no longer compile: `integration_test.go:1588-1594` (`errors.As(err, *dmExistsError)`), `handler_test.go:2333,2427-2442,2662-2681`. Rewrite each to assert the SUCCESS reply (`status=="exists"`, `roomId==existing`) instead of an error. (Step 2's test covers the happy path; these are the routing/`errors.As` sites the plan previously missed.) 
+ +- [ ] **Step 6: Remove the now-obsolete model test** `TestErrorResponseRoomIDOmitempty` (`model_test.go:2002`) since `model.ErrorResponse.RoomID` is removed in Chapter 17; if Chapter 17 hasn't run yet, leave the field but stop using it here. (Field removal happens in Ch 17.) + +- [ ] **Step 7: Update `docs/client-api.md:235-238`** — rewrite the DM-exists block from the old `{"error":"dm already exists","roomId":…}` (documented-as-success-error) to the new success shape `{"status":"exists","roomId":…}`. Note the semantics flip in the changelog section. + +- [ ] **Step 8: BREAKING-CONTRACT GATE (must be checked before merge/deploy).** DM-exists flips from an error the old client treats as success (keys on `.error` + `.roomId`) to an explicit `{status:"exists"}` success with NO `.error`. An old frontend would mis-handle the new reply. Therefore: + - The frontend create-DM change (Ch 19 Step 3) MUST land in the **same release** as this commit. Do not merge room-service ahead of the frontend. + - Record this coupling in the PR description and the `docs/client-api.md` changelog (not just here). + - If the team cannot co-release, STOP and ask: the fallback is to keep emitting the legacy `{error,roomId}` shape behind a temporary flag until the frontend ships — but the default plan is co-release. + +- [ ] **Step 9: Run + commit** +```bash +make test SERVICE=room-service +git add room-service/ pkg/model/ docs/client-api.md +git commit -m "refactor(room-service): replace sanitizeError with errcode; DM-exists now returns success; remote member-list via errcode.Parse" +``` + +--- + +## Chapter 15 — room-worker (explicit permanence + AsyncJobResult) + +### Task 15.1: Add `Code`/`Reason` to `AsyncJobResult` + +**Files:** Modify `pkg/model/event.go:280-287`, `pkg/model/model_test.go`. 
+ +- [ ] **Step 1: Update `TestAsyncJobResultShape`** to set `Code`/`Reason` on the error case and assert round-trip, AND assert the success case (`Status:"ok"`) marshals WITHOUT `code`/`reason` (omitempty). + +- [ ] **Step 2: Run, expect FAIL.** + +- [ ] **Step 3: Add fields** +```go +type AsyncJobResult struct { + RequestID string `json:"requestId"` + Operation string `json:"operation"` + Status string `json:"status"` + RoomID string `json:"roomId,omitempty"` + Error string `json:"error,omitempty"` + Code string `json:"code,omitempty"` // string, not errcode.Code: pkg/model must not import errcode + Reason string `json:"reason,omitempty"` + Timestamp int64 `json:"timestamp"` +} +``` + +- [ ] **Step 4: Run + commit** +```bash +go test ./pkg/model/ -run TestAsyncJobResult -v +git add pkg/model/event.go pkg/model/model_test.go +git commit -m "feat(model): add code/reason to AsyncJobResult" +``` + +### Task 15.2: Convert sanitizers; make permanence EXPLICIT (not category-inferred) + +**Files:** Modify `room-worker/handler.go`, `*_test.go`, `docs/client-api.md`. + +**Critical correction from review:** do NOT infer permanence from "non-internal category". Many real permanent errors (room-ID collision, user-not-found, unknown room type, invalid request ID, room key absent) naturally classify to `internal` and would be Nak'd/redelivered forever. Keep an **explicit permanent marker** alongside the errcode value. + +- [ ] **Step 1: Define a permanent wrapper that carries an errcode payload** +```go +// permanentError marks a job failure as non-retryable (Ack, don't Nak) AND +// carries the client-facing errcode payload. The errcode value may be any +// category, including internal — permanence is explicit, never inferred. 
+type permanentError struct{ ec *errcode.Error } + +func (e *permanentError) Error() string { return e.ec.Error() } +func (e *permanentError) Unwrap() error { return e.ec } // so errors.As finds the *errcode.Error + +func permanent(ec *errcode.Error) error { return &permanentError{ec: ec} } +``` +Migrate EVERY current explicit-permanent site — there are **20 call sites, not "≈10"** (18 `newPermanent` + 2 `newPermanentAbsent`; enumerate via `grep -n "newPermanent\|newPermanentAbsent" room-worker/handler.go` and reconcile against this list before migrating — the grep also matches the two helper definitions): `newPermanent` at lines 182, 309, 469, 674, 747, 750, 766, 839, 847, 965, 1214, 1217, 1222, 1241, 1267, 1269, 1310, 1351; `newPermanentAbsent` at 1235, 1846. Each becomes `permanent(errcode.<Category>("…"[, WithReason(...)]))`. Suggested categories: collision(182)→`Internal`; wrong-room-type(309,766,1310)→`BadRequest`/`Internal`; user-not-found(469,674,839,847,965,1241,1267,1269,1351)→`NotFound` or `Internal`; request-ID(747,750,1214,1217)→`BadRequest`; unmarshal(1222)→`BadRequest`; key-absent(1235,1846)→`Internal`+`WithCause` (below). **All 20 must stay wrapped in `permanent(...)`** — several (collision, key-absent, unknown-room-type) classify to `internal`, so if permanence were category-inferred they'd be Nak'd forever; the explicit marker is exactly what prevents that. + +**Locked design for `newPermanentAbsent`** (resolving the earlier "OR"): attach the alert sentinel as the errcode cause — +```go +permanent(errcode.Internal("room key absent", errcode.WithCause(errRoomKeyAbsent))) +``` +This yields the chain `permanentError →(Unwrap)→ *errcode.Error →(Unwrap)→ errRoomKeyAbsent`. Note: the production `KeyAbsentErrors` alert metric is incremented **inline at the call site** (handler.go:1234,1845), BEFORE the error is built — so the alert does not actually depend on this chain. The `errors.Is(err, errRoomKeyAbsent)` resolution is relied on only by tests (handler_test.go:3478,3558). 
Still, TEST in Step 2 that BOTH resolve in one pass: +- `errors.As(err, &ee)` finds the `*errcode.Error` (for the reply payload), AND +- `errors.Is(err, errRoomKeyAbsent)` still matches, because `errcode.Error.Unwrap()` returns its cause. + +`errRoomKeyAbsent` is confirmed a raw `errors.New(...)` (handler.go:34), not an `*errcode.Error`, so `WithCause` does NOT panic. + +- [ ] **Step 1b: Migrate the two paths the plan previously missed.** + - **`errRoomIDCollision` + sync-DM reconcile branch** (`handler.go:1593`, and the `errors.Is(reconcileErr, errPermanent)` branch at `~1689-1696`). Once `errPermanent`/`permanentError` semantics change, rewrite that branch to `errors.As(reconcileErr, &pe *permanentError)`, and convert `errRoomIDCollision` → `errcode.Conflict("room id collision", …)`. Without this, the sync-DM collision path either won't compile or silently returns raw internal. + - **`processRoleUpdate` (handler.go:224-299) has NO AsyncJobResult and returns 6+ bare `fmt.Errorf`** (lines 228,237,242,245,248,254,266,269,275,289,295). Line 248 (`"unsupported role"`) is a real permanent error currently **Nak'd forever — a pre-existing bug**. Wrap the validation returns as typed errcode, and wrap the permanent ones (esp. :248) in `permanent(errcode.BadRequest("unsupported role"))` so they Ack. If role-update is supposed to publish an AsyncJobResult like its siblings, add it; if not, confirm that's intentional (docs:476 says it doesn't today). 
+ +- [ ] **Step 2: Write the failing test** — cover: + - permanent `errcode.Forbidden(..., WithReason(RoomNotMember))` → `AsyncJobResult{Status:"error", Error:"…", Code:"forbidden", Reason:"not_room_member"}` AND Ack'd (not Nak'd); + - raw infra error → `Code:"internal", Error:"internal error"` AND Nak'd; + - permanent `errcode.Internal("collision")` → Ack'd (permanence is explicit, not category-inferred) with `Code:"internal"`; + - **the `newPermanentAbsent` chain**: assert `errors.Is(err, errRoomKeyAbsent)` is true AND `errors.As(err, &ee)` yields `ee.Code=="internal"` on the SAME error value (guards the locked Step-1 design); + - **`processRoleUpdate` unsupported-role** (handler.go:248) is now Ack'd, not Nak'd forever; + - **rewrite the existing suites that reference deleted functions**: `handler_test.go:2644-2657` (`sanitizeAsyncJobError`) and the `sanitizeSyncDMError` tables at `2713-2723,3008-3151` must move to `fillAsyncError`/`errnats`, not merely be added to. + +- [ ] **Step 3: Run, expect FAIL.** + +- [ ] **Step 4: Replace `sanitizeAsyncJobError`** with a populator: +```go +func (h *Handler) fillAsyncError(ctx context.Context, result *model.AsyncJobResult, jobErr error) { + e := errcode.Classify(ctx, jobErr) + result.Status = model.AsyncJobStatusError + result.Error, result.Code, result.Reason = e.Message, string(e.Code), string(e.Reason) +} +``` +The Ack/Nak decision stays keyed on `errors.As(jobErr, &pe *permanentError)` (Ack) vs not (Nak) — independent of category. Where the result was built with `result.Error = sanitizeAsyncJobError(jobErr)`, call `h.fillAsyncError(ctx, &result, jobErr)`. + +- [ ] **Step 5: Replace `sanitizeSyncDMError`** — `natsutil.ReplyError(m.Msg, sanitizeSyncDMError(err))` → `errnats.Reply(ctx, m.Msg, err)`. 
Convert sync-DM sentinels (`errMissingRequestID`, `errInvalidRequestID`, `errInvalidSyncDMRequest`, `errUserLookupFailed`, `errCrossSiteRequester`) to errcode: mostly `errcode.BadRequest(...)`; `errUserLookupFailed` → return the wrapped raw error (Classify → internal). Enrich ctx at entry. + +- [ ] **Step 6: REQUIRED — add panic recovery to the async consumer goroutine.** The async path runs in a JetStream consumer goroutine NOT under natsrouter's recovery middleware. A stray panic crashes the worker. The surface is broader than `WithCause`: `WithMetadata` with odd args also panics. Verified absent today — `room-worker/main.go` (~:156-163) only defers semaphore release + `wg.Done()`. Add recovery inside that goroutine (write the failing test first — a handler that panics must Nak + not crash): + ```go + go func() { + defer func() { + if r := recover(); r != nil { + slog.Error("panic in async job handler", "panic", r, "subject", msg.Subject()) + _ = msg.Nak() + } + <-sem + wg.Done() + }() + // ... existing job processing ... + }() + ``` + (Match the existing semaphore/`wg` mechanics exactly; the recover must run before the semaphore release so ordering is preserved. Follow the reference worker pattern if one already has recovery.) + +- [ ] **Step 7: Run + docs + commit.** Docs updates (`docs/client-api.md`): add `code`/`reason` rows to the `AsyncJobResult` schema table (~:338) and refresh the example JSON; update §5 (~:2045-2057) which currently says "Absent for plain `natsutil.ReplyError`" — the sync-DM reply now emits `code`; update the create.dm error section accordingly. 
+```bash +make test SERVICE=room-worker +git add room-worker/ docs/client-api.md +git commit -m "refactor(room-worker): errcode async results + sync DM; explicit permanence marker" +``` + +--- + +## Chapter 16 — auth-service (HTTP) + +**GATED on PM confirmation of the `unauthenticated` (401) category.** + +### Task 16.1: Replace `gin.H{"error":...}` with `errhttp.Write` + +**Files:** Modify `auth-service/handler.go:79-148`, `handler_test.go`, `docs/client-api.md`. + +| Site | New | +|------|-----| +| :79 | `errhttp.Write(ctx, c, errcode.BadRequest("ssoToken and natsPublicKey are required"))` | +| :84,:141 | `errcode.BadRequest("invalid natsPublicKey format")` | +| :92 | `errcode.Unauthenticated("SSO token has expired, please re-login", errcode.WithReason(errcode.AuthTokenExpired))` | +| :96 | `errcode.Unauthenticated("invalid SSO token", errcode.WithReason(errcode.AuthInvalidToken))` | +| :108,:148 | `errhttp.Write(ctx, c, fmt.Errorf("generating NATS token: %w", err))` (→ internal; real error logged) | +| :136 | `errcode.BadRequest("account and natsPublicKey are required")` | + +Audit confirmed all **8 error sites** above are the complete set (no `c.String`/`fmt.Errorf`/`errors.New` response sites), the `errors.Is(err, pkgoidc.ErrTokenExpired)` discriminator at :90 is preserved, and `middleware.go:21` does set `c.Set("request_id", …)` (36-char UUIDv7, CLAUDE.md-compliant). The two `authResponse` 200s and healthz are untouched. + +- [ ] **Step 1: Update the error tests — all five, not just one.** Besides the token-expired test (`handler_test.go:135-150` → expect 401, `code=="unauthenticated"`, `reason=="sso_token_expired"` via `errtest`), update: `:166` (invalid SSO token → 401, `reason=="invalid_sso_token"`), `:182`/`:305` (invalid natsPublicKey → 400 `bad_request`), `:289` (missing account → 400). **Add a 500-path test** — there is none today, and the 500 message changes (see Step 3), so it MUST be covered. 
+- [ ] **Step 2: Run, expect FAIL.** +- [ ] **Step 3: Apply the table.** Enrich ctx at entry: `ctx := errcode.WithLogValues(c.Request.Context(), "request_id", c.GetString("request_id"), "account", account)`. Replace each `c.JSON(status, gin.H{"error":...})` with `errhttp.Write(ctx, c, <err>)`, where `<err>` is the table's "New" value for that site. **NOTE the visible behavior change:** the 500 sites (:108,:148) currently return `{"error":"failed to generate NATS token"}`; after Classify they return `{"code":"internal","error":"internal error"}` — the real cause is logged, not sent. This is intended (no internal leak) but is a client-visible message change to document. +- [ ] **Step 4: Verify** healthz (`:199`), the CORS-preflight 204 (`middleware.go:39`), and the two success `c.JSON(http.StatusOK, authResponse{...})` paths are untouched. +- [ ] **Step 5: Run + docs + commit.** Docs (`docs/client-api.md:165-169`): rewrite every auth error row to the envelope shape — 401 rows show `"code":"unauthenticated"` + the right `reason`; 400 rows `"code":"bad_request"`; the 500 row's body changes from `"failed to generate NATS token"` to `{"code":"internal","error":"internal error"}` (currently documented as the old message at :169 — must be edited). +```bash +make test SERVICE=auth-service +git add auth-service/ docs/client-api.md +git commit -m "refactor(auth-service): adopt errcode HTTP envelopes" +``` + +> **PM-gate fallback:** if PM rejects the `unauthenticated`/401 category, the :92/:96 rows fold to `errcode.Forbidden(...)` (403) with the same reasons, and `codes_auth.go` + the errhttp test must use 403. The spec carries no separate 403 mapping table — derive it from this one by swapping Unauthenticated→Forbidden if the gate fails. + +--- + +## Chapter 17 — Cleanup: delete shims and legacy helpers + +Now that all callers use `errcode`, remove the natsrouter shims, `model.ErrorResponse`, and the legacy `natsutil` helpers. 
+ +### Task 17.1: Delete natsrouter `RouteError`/`Err*` shims + +**Files:** Modify/delete `pkg/natsrouter/errors.go`, `pkg/natsrouter/context.go` (`ReplyRouteError`). + +- [ ] **Step 1:** `grep -rn "natsrouter.Err\|RouteError\|ReplyRouteError\|natsrouter.Code" --include="*.go" .` → expect only `pkg/natsrouter/errors.go`, `context.go`, and natsrouter's own tests/examples. If any service still references them, migrate it now (it was missed). +- [ ] **Step 2:** Delete the shim contents of `errors.go` (the whole `RouteError` alias, constructors, `Code*` consts) and `ReplyRouteError`. Update `example_test.go`/`router_test.go` to construct errors via `errcode` directly. +- [ ] **Step 3:** `go build ./... && go test ./pkg/natsrouter/ -v` → PASS. +- [ ] **Step 4:** Commit — `git add pkg/natsrouter/ && git commit -m "refactor(natsrouter): remove RouteError shims"` + +### Task 17.2: Retire `model.ErrorResponse` and legacy `natsutil` helpers + +**Files:** Modify `pkg/model/error.go`, `pkg/natsutil/reply.go`, callers. + +- [ ] **Step 1:** `grep -rn "model.ErrorResponse\|MarshalError\|MarshalErrorWithCode\|natsutil.ReplyError\|TryParseError" --include="*.go" .` (exclude tests last). Confirm all production callers are migrated (`fetcher_history.go`, `memberlist_client.go` already moved to `errcode.Parse` in Ch 13/14). +- [ ] **Step 2:** Delete `TryParseError`, `MarshalError`, `MarshalErrorWithCode`, `ReplyError` from `pkg/natsutil/reply.go` (keep `MarshalResponse`/`ReplyJSON`). Delete `model.ErrorResponse` (or, if any non-migrated test still needs it, migrate that test first). Remove `RoomID` reference sites. +- [ ] **Step 3:** `go build ./... && make test` → PASS. 
+- [ ] **Step 4:** Commit — `git add -A && git commit -m "refactor: retire model.ErrorResponse and legacy natsutil error helpers"` + +--- + +## Chapter 18 — semgrep, docs, final verification (backend) + +> **Frontend cutover split out:** the TypeScript work, browser verification, and the breaking DM-exists co-release coupling now live in Chapter 19 as a separate release task — different toolchain (npm), different release gate. Ch 18 covers only the Go/lint/docs side; Ch 19 ships with the frontend release. + +### Task 18.1: semgrep rules + +**Files:** Create `.semgrep/errcode.yml`; Modify `Makefile:44`. + +- [ ] **Step 1: Write rules** +```yaml +rules: + - id: errcode-no-reason-literal-outside-catalog + languages: [go] + severity: ERROR + message: > + Declare Reason codes as typed constants in pkg/errcode/codes_.go, + not inline. Use an existing errcode.Reason constant. + paths: + exclude: + - "pkg/errcode/codes_*.go" + - "**/*_test.go" + patterns: + - pattern: errcode.Reason("...") + + - id: errcode-withcause-must-not-wrap-errcode + languages: [go] + severity: ERROR + message: > + WithCause must wrap a raw error, never another errcode error. Propagate a + typed error with `return err` or `fmt.Errorf("...: %w", err)`. + patterns: + - pattern: errcode.WithCause(errcode.$F(...)) + + - id: errcode-no-multi-wrap-errcode + languages: [go] + severity: ERROR + message: > + Multiple %w verbs can place two errcode errors in one chain, defeating the + "one *Error per chain" invariant (Classify picks the first). Use a single %w. + patterns: + - pattern-regex: 'fmt\.Errorf\([^)]*%w[^)]*%w' + + - id: errcode-prefer-named-constructor + languages: [go] + severity: WARNING + message: > + Prefer the named constructor (errcode.NotFound(msg)) over + errcode.New(errcode.CodeX, msg) for a literal category. Reserve New for a + category chosen at runtime. + paths: + exclude: + - "pkg/errcode/**" + - "**/*_test.go" + patterns: + - pattern: errcode.New(errcode.$CODE, ...) 
+ - metavariable-regex: + metavariable: $CODE + regex: '^Code[A-Z].*' +``` + +- [ ] **Step 2: Wire into Makefile** — append `--config=.semgrep/errcode.yml` to `SEMGREP_FLAGS` (line ~44). +- [ ] **Step 3: Verify each rule** — plant, run `make sast-semgrep`, confirm the expected rule fires, remove the plant, confirm clean. Cover: (a) `_ = errcode.Reason("oops")` in a non-catalog file → `errcode-no-reason-literal-outside-catalog`; (b) `errcode.WithCause(errcode.NotFound("x"))` → `errcode-withcause-must-not-wrap-errcode`; (c) `fmt.Errorf("%w %w", a, b)` → `errcode-no-multi-wrap-errcode`; (d) `errcode.New(errcode.CodeNotFound, "x")` → `errcode-prefer-named-constructor` (WARNING). Confirm the real codebase is clean against all four. +- [ ] **Step 4: Commit** — `git add .semgrep/errcode.yml Makefile && git commit -m "ci(semgrep): enforce Reason location, WithCause + single-%w invariants, named constructors"` + +### Task 18.2: docs/client-api.md per-service envelope rows (consolidated docs pass) + +The per-service migration chapters intentionally deferred the `docs/client-api.md` updates to a single pass here, so the doc edits are coherent across endpoints. CLAUDE.md requires `docs/client-api.md` updates in the same PR as client-facing handler changes — this pass discharges that obligation for Chapters 11–16. + +- [ ] **Step 1: Inventory the deferred rows.** For each migrated service, list the affected endpoints and what the envelope shape now is. Per-service notes (cross-check against the per-service audit findings): + - **history-service:** add a "Possible errors" table per endpoint (history/next/surrounding/get/edit/delete/thread/thread.parent) with `code` (the generic) and `when`. No reasons today. + - **search-service:** error tables for search endpoints; `code` only (codes_search.go is the empty placeholder, no reasons). + - **mock-user-service:** the `checkSite` 404 envelope. 
+ - **message-gatekeeper:** the doc table around `docs/client-api.md:1886-1894` — fill the `code` column (currently `—`) for every now-`bad_request`/`forbidden`/`not_found` row, and add `reason` for the two forbidden cases (`large_room_post_restricted`, `not_subscribed`). + - **room-service:** rewrite the DM-exists block (~`docs/client-api.md:235-238`) from the old documented-as-success-error `{"error":"dm already exists","roomId":…}` to the new success shape `{"status":"exists","roomId":…}`; add a changelog entry noting the semantics flip and the **Ch 19 co-release coupling**. For other room-service endpoints, fill in code + reason where applicable (RoomNotMember/RoomNotOwner/RoomLastOwnerCannotLeave/RoomBotInChannel/RoomBotNotAvailable/RoomMaxSizeReached). + - **room-worker:** AsyncJobResult schema table (~`docs/client-api.md:338`) gains `code`/`reason` rows; example JSON refreshed. §5 (~`:2045-2057`) currently says "Absent for plain `natsutil.ReplyError`" — update: sync-DM reply now emits `code`. Update the create.dm error section accordingly. + - **auth-service:** `docs/client-api.md:165-169` — rewrite the 4 auth error rows to envelope shape (401 rows show `"code":"unauthenticated"` + the right `reason`; 400 rows `"code":"bad_request"`); the 500 row body changes from `{"error":"failed to generate NATS token"}` to `{"code":"internal","error":"internal error"}`. + +- [ ] **Step 2: Apply the edits** in one pass through `docs/client-api.md`. Add a top-of-file changelog entry summarizing: envelope shape is now `{error, code, reason?, metadata?}`; the DM-exists contract flip (co-releases with the frontend in Ch 19); the 500 message homogenization to `"internal error"`. + +- [ ] **Step 3: Commit** — `git add docs/client-api.md && git commit -m "docs(client-api): refresh error envelopes for the errcode migration"` + +### Task 18.3: Repo-wide gates + error-handling guide + +- [ ] **Step 1:** `go build ./...` → clean. +- [ ] **Step 2:** `make lint` → clean. 
+- [ ] **Step 3:** `make test` → all unit tests pass. +- [ ] **Step 4:** `make sast` → clean (incl. errcode rules). +- [ ] **Step 5:** `grep -rn "sanitizeError\|RouteError\|codedError\|MarshalErrorWithCode\|model.ErrorResponse\|TryParseError" --include="*.go" .` → no production references. +- [ ] **Step 6:** Write `docs/error-handling.md` (envelope, categories + HTTP map incl. the 503-vs-429 note + the new `too_many_requests`/429 category, how to add a Reason, the wrapping invariant + allowed/forbidden table mirroring `doc.go`, the logging contract incl. `Context.WithLogValues` vs `errcode.WithLogValues`, the semgrep rules). Link from `CLAUDE.md` Section 3 "Error Handling". +- [ ] **Step 7:** Commit — `git add docs/error-handling.md CLAUDE.md && git commit -m "docs: add repo-wide error-handling guide"` +- [ ] **Step 8:** Push — `git push -u origin claude/sharp-hopper-qzm6W` (retry with backoff on network errors). + +### Task 18.4: Retire the last stale references + adjacent worker bugs + +Final repo-wide consistency sweep. A focused audit (one reviewer per non-migrated service + a cross-cutting `pkg/` scan) confirmed the 5 remaining JetStream-only worker services (`broadcast-`, `message-`, `notification-`, `inbox-`, `search-sync-worker`) have NO client-facing error surface — they correctly stay `pkg/errcode`-free. But 4 doc sites still reference deleted symbols, and 3 adjacent worker bugs were surfaced (out of scope for the migration, but small enough to fold into the same PR). + +- [ ] **Step 1: Retire the 4 stale doc references.** Each currently points at deleted symbols; fix in place. + - `CLAUDE.md:224` — bullet still reads "Use `natsutil.ReplyJSON` for success responses, `natsutil.ReplyError` for errors". Replace the second half: "use `errnats.Reply` for errors (typed errcode envelope; see `docs/error-handling.md`)". 
+ - `docs/client-api.md:1806` — "…per `pkg/model.ErrorResponse` (see §5)" — `ErrorResponse` is deleted; rewrite to point at §6 (Error envelope reference) and drop the type reference. + - `docs/superpowers/spec.md:233` — `**ErrorResponse**: error (string)` type entry — `ErrorResponse` is deleted; remove or replace with a note that error envelopes are owned by `pkg/errcode` (see `docs/error-handling.md`). + - `docs/superpowers/spec.md:439` — package table row `pkg/natsutil | ReplyJSON, ReplyError, MarshalResponse, MarshalError, HeaderCarrier` — `ReplyError`/`MarshalError` deleted; trim to the surviving helpers and add a `pkg/errcode` row referencing the canonical owner. + +> **Historical implementation plans** under `docs/superpowers/plans/2026-04-*`, `2026-05-07-*`, `2026-05-13-*` etc. still mention `RouteError`/`model.ErrorResponse`/`natsutil.ReplyError` — these are point-in-time SNAPSHOT records from earlier PRs (the cross-cutting reviewer explicitly flagged them as "out of scope; acceptable to leave"). Do NOT touch them; they document state at the time the prior plan landed. + +- [ ] **Step 2: Adjacent worker bug fixes** (flagged during the audit; small enough to bundle): + - `notification-worker/main.go:48,54` — `mongoMemberLookup.ListSubscriptions` returns bare `err` twice. CLAUDE.md §3 violation. Wrap with `fmt.Errorf("find subscriptions for room %s: %w", roomID, err)` and `fmt.Errorf("decode subscriptions: %w", err)` respectively. + - `inbox-worker/handler.go:187` — `return fmt.Errorf("role_updated event has empty roles")` on a permanently-malformed event causes infinite NAK redelivery of a poison message. Change to `slog.Warn(...) + return nil` (silent-drop pattern, consistent with the `default:` branch at `handler.go:73`). + - `notification-worker/handler.go:67` — `slog.Error("publish notification failed", "error", err, "account", subs[i].User.Account)` logs the recipient account (PII-adjacent). 
Operators need the identifier to debug "why didn't user X get a notification", so don't drop it — add a one-line comment acknowledging the trade-off so the choice is explicit. + +- [ ] **Step 3: Verify** — `go build ./...` clean; `go test -race ./notification-worker/ ./inbox-worker/` green; `golangci-lint` clean. + +- [ ] **Step 4: Repo-wide grep gate** (extends Task 18.3 Step 5 to docs): + ```bash + grep -rn "natsutil\.\(ReplyError\|MarshalError\|MarshalErrorWithCode\|TryParseError\)\|model\.ErrorResponse\|natsrouter\.\(RouteError\|Err[A-Z]\|Code[A-Z]\|ReplyRouteError\)" \ + --include="*.md" --include="*.go" \ + docs/ CLAUDE.md pkg/ auth-service/ history-service/ search-service/ mock-user-service/ \ + message-gatekeeper/ room-service/ room-worker/ broadcast-worker/ message-worker/ \ + notification-worker/ inbox-worker/ search-sync-worker/ + ``` + Expect: zero hits outside (a) the `docs/error-handling.md` "Migration history" tombstone list, (b) the `pkg/natsutil/reply.go` package-doc tombstone comment, and (c) the historical implementation plans under `docs/superpowers/plans/2026-03-*`/`2026-04-*`/`2026-05-07-*`/`2026-05-13-*`. + +- [ ] **Step 5: Commit** — `git add CLAUDE.md docs/client-api.md docs/superpowers/spec.md notification-worker/ inbox-worker/ && git commit -m "docs+chore: retire last stale references; fix notification/inbox worker bugs"` +- [ ] **Step 6: Push.** + +--- + +## Chapter 19 — Follow-up: Frontend Cutover (separate release task) + +> **Why split from Ch 18:** different toolchain (TypeScript / npm vs Go), and it ships under a **release coupling** — the room-service DM-exists reply has flipped from `{error:"dm already exists",roomId}` to `{status:"exists",roomId}` (a contract change the old frontend would mishandle), so this chapter's commit MUST land in the same release as the backend. Until it ships, the backend is paused at the gate (or the frontend rolls out FIRST so it tolerates both shapes — see Step 0). 
+ +**Pre-merge gate (must be checked before merging the backend release):** +- [ ] **Step 0: Confirm rollout order.** + - **Preferred:** ship frontend + backend in the same release; deploy frontend first (it already tolerates the new shape — backward-compatible by Steps 2-3), then deploy room-service. + - **Fallback:** if co-release is impossible, the room-service DM-exists change must be reverted behind a temporary flag emitting the legacy shape until the frontend ships. STOP and ask if you reach this case. + +**Files:** the canonical seams (verified by a focused frontend-section review): +- `chat-frontend/src/api/_transport/asyncJob.ts` — `SyncReplyEnvelope`/`AsyncReplyEnvelope` types (~lines 88-99); the `AsyncJobError` class (~40-48); the sync-envelope error path (~169-177); the `'operation failed'` fallback (~204). +- **`chat-frontend/src/context/NatsContext/NatsContext.jsx`** — **the CANONICAL sync error decoder** (`NatsContext.request`, ~line 92: `if (parsed.error) throw new Error(parsed.error)`). Every sync RPC (search, member.list, getRoom, etc.) flows through here; without rewriting this, NO sync error sees `reason`. ALSO the auth HTTP error path (~:51-53 reading `errBody.error`) for the new `errhttp.Write` envelope (Ch 16). +- **`chat-frontend/src/lib/constants.js`** — the `isDMExistsReply` predicate (~line 21: `reply.error === 'dm already exists'`). MUST be rewritten or DM-exists silently 30s-timeouts after the contract flip. +- `chat-frontend/src/api/types.ts` — the TS mirror of `pkg/model.AsyncJobResult` (~278-282); strict mirror rule requires adding `code?` / `reason?` fields. +- `chat-frontend/src/api/createRoom/index.ts` + `src/components/MainApp/Sidebar/CreateRoomDialog/CreateRoomDialog.jsx` — the create-DM call site + dialog driver. +- `chat-frontend/src/components/MainApp/ChatPage/ManageMembersDialog/{MemberRoster/MemberRoster.jsx, AddMembersForm/AddMembersForm.jsx}` — member-management surfaces currently rendering raw error strings. 
+- Tests: `asyncJob.test.js`, `CreateRoomDialog.test.jsx` (currently substring-matches `/exceeds maximum capacity/i` — must move to `reason === 'max_room_size_reached'`). + +- [ ] **Step 1: Enumerate seams + reason inventory.** `grep -rn "\.code\|\.reason\|\.error\|AsyncError\|ErrorResponse\|dm already exists\|roomId" chat-frontend/src/` (whole `src/`, not just `api/`). Build an inventory mapping each emitted Reason to its UI driver: `max_room_size_reached`→CreateRoomDialog; `sso_token_expired`/`invalid_sso_token`→redirect to re-login (currently falls through ErrorBoundary); `not_subscribed`→send/edit "join the room first" copy; `not_room_member`/`not_room_owner`/`last_owner_cannot_leave`→member-management dialogs (replace raw strings); `large_room_post_restricted`→send-failed toast; generic `forbidden`/`not_found`/`bad_request`→default copy. Anything in this inventory without an explicit step below is a gap to close in Step 6. + +- [ ] **Step 2: Extend the transport types + rewrite both decoders.** + - Extend `SyncReplyEnvelope`/`AsyncReplyEnvelope` (`asyncJob.ts:88-99`) with `code?: string`, `reason?: string`, `metadata?: Record`. + - Extend `AsyncJobError` (`asyncJob.ts:40-48`) with `code?` and `reason?` so consumers can branch without re-parsing `.message`. + - Rewrite `NatsContext.request` (`NatsContext.jsx:~92`): instead of throwing `new Error(parsed.error)`, throw a structured `RequestError`/`AsyncJobError` carrying `{message, code, reason, metadata}`. Backward-compatible: an `error`-only legacy payload still throws (with `code=undefined`), so the frontend can deploy first. + - Rewrite the auth HTTP error path in `NatsContext.jsx:~51-53` to read `errBody.code`/`errBody.reason` and surface `sso_token_expired`/`invalid_sso_token` as the re-login redirect, not a raw login-UI string. + - UI branches must use `reason ?? code` for trigger logic; keep generic copy keyed on `code`. 
+ +- [ ] **Step 3: Fix DM-exists (CRITICAL) — `isDMExistsReply` accepts BOTH shapes.** In `src/lib/constants.js:~21`, rewrite the predicate to: + ```js + export const isDMExistsReply = (reply) => + Boolean(reply?.roomId) && + (reply?.status === 'exists' || // new backend + reply?.error === 'dm already exists'); // legacy backend during rollout + ``` + The `Boolean(...)` coercion matters: a bare `cond && reply?.roomId` returns the `roomId` string rather than `true`, which would fail the `=== true` assertions in Step 5. Without this, the new `{status:"exists", roomId}` falls through to the sync error branch (which also doesn't match since there's no `.error`), and the request hangs until the 30s `AsyncTimeout`. The create-DM call site already routes through `isDMExistsReply`; no per-call changes needed once the predicate is fixed. Remove the legacy branch in a follow-up release once the backend is everywhere. + +- [ ] **Step 4: AsyncJobResult — mirror the Go fields in TS AND decode them.** Update `chat-frontend/src/api/types.ts` (~:278-282) to add `code?: string` and `reason?: string` to the AsyncJobResult mirror (CLAUDE.md strict-mirror rule). Then add `code`/`reason` handling to `asyncJob.ts`'s async decoder so a failed async job throws an `AsyncJobError` carrying them; replace the hard-coded `'operation failed'` fallback (`asyncJob.ts:~204`) with a humanized lookup keyed off `reason` (or `code` when reason is absent). Regenerate any test fixtures (`api/_transport/__fixtures__/*` if applicable) per the frontend CLAUDE.md fixture rule. + +- [ ] **Step 5: Frontend gates** — actual `chat-frontend/package.json` scripts: `npm run typecheck`, `npm test`, `npm run build`, `npm run smoke`, `npm run smoke:asyncjob`, `npm run smoke:livestack`. Run ALL of them until green — the three `smoke*` scripts exercise the wire contract end-to-end against a real stack and are the **required regression net** for a contract-flip, not optional. Document anything skipped + why. 
Add explicit unit tests: (a) `isDMExistsReply({status:'exists', roomId:'r1'})===true`; (b) legacy `{error:'dm already exists', roomId:'r1'}` STILL true (rollout window); (c) real-failure `{error:'something else'}` falls through to SyncError; (d) `AsyncJobError` exposes the new `.code`/`.reason` fields. + +- [ ] **Step 6: Rewrite reason-driven UX from message substrings to reason matches.** Walk the Step-1 inventory and replace every "English-string-substring" branch with a `reason === '<reason>'` branch. Specifically: `CreateRoomDialog.test.jsx:~265` `/exceeds maximum capacity/i` → match `reason === 'max_room_size_reached'`; member-management dialogs rendering raw strings → use mapped copy per reason; auth re-login redirect on `sso_token_expired`/`invalid_sso_token`; send/edit "join the room first" copy on `not_subscribed`. Browser verification (CLAUDE.md UI rule): start the dev server; exercise (a) generic validation, (b) `max_room_size_reached` dialog, (c) DM-exists navigates to the existing room, (d) async-failed `{Code:"forbidden", Reason:"not_room_member"}` surfaces correctly, (e) expired SSO redirects to re-login. If browser unavailable, state so and rely on unit tests + staging. + +- [ ] **Step 7: Commit + push** — `git add chat-frontend/ && git commit -m "feat(frontend): consume errcode reason + DM-exists success reply"` then push for review. + +- [ ] **Step 8: Release coordination** — PR description records: backend release SHA this targets; rollout order (frontend first); when the legacy DM-exists fallback (Step 3) + legacy sync-error-only path (Step 2) are removed (follow-up release once backend is everywhere). Note: `ErrorBoundary` render-time errors are intentionally left generic; only `formatAsyncJobError`/`NatsContext.request` consumers gain reason-aware copy. 
+ +--- + +## Chapter 20 — Migration consequences (in-PR completion sweep) + +> **Scope rule.** This chapter captures EVERY adjustment that exists ONLY because the errcode migration changed how errors are produced, shaped, logged, or consumed. After this PR ships, no future engineer should have to "go back and finish errcode". The one pre-existing bug surfaced during audits that is **NOT** errcode-driven and stays on its backlog is the gatekeeper `fakeJSMsg` Ack/Nak test gap (JetStream consumer semantics). The other three originally-listed items — notification-worker bare-`err` returns, notification-worker account PII log, inbox-worker NAK-forever — were already addressed by Task 18.4 (commit `a619863`). + +> **Locked execution policy.** All in-PR tasks (20.1 → 20.8 plus 20.11 → 20.20) execute in a single batch in one focused session — verify after each (build + per-package test + lint), then a single end-of-batch verification gate (Task 20.10) before one consolidated push. 20.9 stays as a planned follow-up release (gated on backend rollout). Task 20.1 takes the "drop sentinels, add Reasons, use `HasReason` for identity matches" approach (the new identity primitive — aligns with the migration's design). Tasks 20.11–20.20 are the branch_review findings folded in (per user decision 2026-06-01: all 3 HIGH+ + all 4 aesthetic + 3 medium items land in this PR). + +### Task 20.1: room-service — restore wrapped name in client envelope (REGRESSION FIX) + +**The migration broke client UX.** Pre-migration, `sanitizeError`'s substring allowlist passed `fmt.Errorf("user %q: %w", a, errUserNotFound).Error()` verbatim to the client (user saw `user "alice": user not found`). Post-migration, `Classify` walks `errors.As`, finds the sentinel, and returns only `e.Message` (`"user not found"`) — the account/org name is gone. Sites: `room-service/handler.go:758, :785`; `room-service/store_mongo.go:730`. 
The reviewer's room-service report rated this **medium** but it IS a real client-visible regression caused by this migration. + +- [ ] **Step 1: Add reasons** in `pkg/errcode/codes_room.go`: + ```go + RoomUserNotFound Reason = "user_not_found" + RoomInvalidOrg Reason = "invalid_org" + ``` + **Also update `pkg/errcode/codes_test.go:8` `allReasons` list manually** — it's a hand-maintained slice (not auto-discovered); the snake-case + uniqueness tests run over it. +- [ ] **Step 2: Emit fresh errcode values at EVERY producer site** (do this **BEFORE** dropping sentinels in Step 3 so the build never breaks). Plan-review found 6 producers, not 3: + - `room-service/handler.go:151` → `return nil, errcode.NotFound("user not found", errcode.WithReason(errcode.RoomUserNotFound))` (requester-check; no account is in scope to format into the message — the bare reason carries the intent). + - `room-service/handler.go:223` → same shape for the counterpart-check. + - `room-service/handler.go:404` (the re-emit after the `errors.Is` at `:403`) → `errcode.BadRequest("invalid org", errcode.WithReason(errcode.RoomInvalidOrg))`. + - `room-service/handler.go:758` → `errcode.NotFound(fmt.Sprintf("user %q not found", a), errcode.WithReason(errcode.RoomUserNotFound))`. + - `room-service/handler.go:785` → `errcode.BadRequest(fmt.Sprintf("invalid org %q", id), errcode.WithReason(errcode.RoomInvalidOrg))`. + - `room-service/store_mongo.go:730` → `errcode.BadRequest(fmt.Sprintf("list org members for %q", orgID), errcode.WithReason(errcode.RoomInvalidOrg))`. +- [ ] **Step 3: Drop the package-level sentinels** in `room-service/helper.go:26,45` — `errInvalidOrg`, `errUserNotFound`. After Step 2 they have no producer; this step makes them disappear. +- [ ] **Step 4: Convert identity matches to reason matches.** Production: `room-service/handler.go:403` does `errors.Is(err, errInvalidOrg)` today — rewrite to `errcode.HasReason(err, errcode.RoomInvalidOrg)`. 
Tests (the plan previously listed only `:2262` + 2 integration sites — that's incomplete): + - `room-service/handler_test.go:1121, 1130, 1149, 1227, 1236` — `wantErrSentinel: errInvalidOrg/errUserNotFound` table-cells (asserted via `errors.Is` at `:1198, :1275`). Rename the column to `wantReason errcode.Reason` (keep a separate `wantStoreFailure` column if `errStoreFailure` is still in play). + - `room-service/handler_test.go:1825, 1828, 1830` — `wantErrSentinel: errInvalidOrg/errUserNotFound`, plus the mock-return at `:1828` (`store.EXPECT().ListOrgMembers(...).Return(nil, errInvalidOrg)`) needs to call the same fresh constructor as production. + - `room-service/handler_test.go:2262` (the single previously-listed site). + - `room-service/helper_test.go:95, 108, 139` — identity assertions over the sentinel values; rewrite or delete (the symbols vanish in Step 3). + - `room-service/integration_test.go:805, 820` — `errors.Is` over the sentinels → `errcode.HasReason`. +- [ ] **Step 5: Refresh doc comments** that reference the dropped sentinels: + - `room-service/store.go:66` — "returns a `RoomInvalidOrg`-reason errcode" (was: "returns `errInvalidOrg`"). + - `room-service/store_mongo.go:705` — same. +- [ ] **Step 6: Docs.** + - `docs/client-api.md` §6 reason catalog — add `user_not_found` (not_found, room-service) and `invalid_org` (bad_request, room-service). + - `chat-frontend/CLAUDE.md:119-122` reason-catalog list — add the two new reasons under the room-service bullet so other frontend Claudes can branch on them. +- [ ] **Step 7: Verify** `make test SERVICE=room-service`, `make test SERVICE=pkg/errcode`, `golangci-lint run ./room-service/... ./pkg/errcode/...` all clean. + +### Task 20.2: `errnats.Reply` + `errnats.ReplyQuiet` direct unit tests + +`pkg/errcode/errnats/reply.go:43, :50` are exported but 0% direct coverage — only indirectly exercised via `pkg/natsrouter/router_test.go`. 
CLAUDE.md §4: "Every exported function in `pkg/` must have corresponding test cases". + +**Plan-review finding:** the original "fake `*nats.Msg`" approach **does not work** — `nats.Msg` is a concrete struct (not an interface) and `Respond(data)` requires `m.Sub` to be a non-nil `*nats.Subscription` (also concrete). You cannot subclass the struct or interpose `Respond`. The fix: use the same in-memory NATS server pattern as `pkg/natsrouter/router_test.go` (`natsserver.RunRandClientPortServer`), subscribe to a reply subject, and capture the published bytes. + +- [ ] **Step 1: Test scaffolding** in `pkg/errcode/errnats/reply_test.go`: + - Reuse the `startTestNATS(t *testing.T) *nats.Conn` helper pattern from `pkg/natsrouter/helpers_test.go` (or duplicate it locally; if duplicated, file an internal follow-up note to extract to `pkg/testutil`). Spin up the server in `t.Cleanup`. + - Helper: `requestAndCaptureReply(t, nc, replyTo, fn func(*nats.Msg))` — open a `nats.Sub` on `replyTo`, call `fn(msg)` with a real `*nats.Msg` whose `Reply` is set, capture the responded bytes. +- [ ] **Step 2: Cases.** + - `TestReply_RespondsWithEnvelopeAndLogsOnce` — pass an `errcode.Forbidden("x", WithReason(RoomNotMember))`, ctx with a capturing `slog.Handler` (JSON to `bytes.Buffer`); assert: (a) captured reply bytes decode to `{code:"forbidden", reason:"not_room_member", error:"x"}`, (b) exactly one `"request failed"` log line at `INFO` level. + - `TestReply_LogsAtErrorLevelOnInternal` — pass `errcode.Internal("x")`; assert the log line is at `ERROR` level (category-aware level pin). + - `TestReply_UnknownErrorCollapsesToInternal` — pass `errors.New("mongo down")`; assert wire bytes carry `code:"internal", error:"internal error"` and the raw cause appears in the LOG but NOT the wire. + - `TestReplyQuiet_RespondsButEmitsNoClassifyLine` — pass an `errcode.Unavailable(...)`; assert envelope sent, zero `"request failed"` lines (`ReplyQuiet` is for already-logged paths). 
+ - `TestReply_LogsTransportFailure` — drop the subscriber (subscribe + immediately `Drain()`), call `Reply`, assert the `"error reply failed"` operational slog line fires when `msg.Respond` errors. +- [ ] **Step 3:** verify `go test -race -cover ./pkg/errcode/errnats/` shows ≥80% line coverage on `reply.go` (the existing `Marshal`/`MarshalQuiet` tests already pad some of it; these add the `Reply`/`ReplyQuiet` paths). + +### Task 20.3: `errtest` negative-branch coverage + +`pkg/errcode/errtest/assert.go` lines 16, 25, 33 (`t.Fatalf` failure branches) are uncovered (66.7%). The migration introduced `errtest`; close the gap so future regressions in the failure-message format are caught. + +- [ ] Add tests in `pkg/errcode/errtest/assert_test.go` using a `testing.T` capture (a thin `mockT` that records `Fatalf` calls — pattern used in stdlib `testing/iotest`): + - `TestDecode_FailsOnNonEnvelope` — non-error JSON triggers `t.Fatalf` with the documented message. + - `TestAssertCode_FailsOnMismatch` — wrong code triggers `t.Fatalf`. + - `TestAssertReason_FailsOnMismatch` — wrong reason triggers `t.Fatalf`. + +### Task 20.4: `request_id` slog-key standardization + +Migration's `WithLogValues` introduced `"request_id"` (snake_case, slog-canonical) as the convention. Pre-existing call sites still use `"requestID"` (camelCase) at multiple spots; same JSON log now mixes both styles and observability dashboards keyed on `request_id` miss the legacy emitters. + +**Plan-review correction:** the original "enumerate workers" framing was wrong. Verified scope: +- **`pkg/natsrouter/middleware.go:44`** — THE central request-attr emitter; the migration's `WithLogValues` convention applies repo-wide, and skipping the middleware leaves the very emitter that's supposed to be canonical inconsistent. +- **`room-worker/handler.go`** — 8 sites (lines 119, 569, 1051, 1069, 1463, 1675, 1740, 1745). 
+- **All other worker dirs** (`message-`, `inbox-`, `broadcast-`, `notification-`, `search-sync-`) — ZERO `"requestID"` slog sites; they're already clean. + +**CRITICAL distinction — string literal vs Go identifier vs Gin-context-key:** at `pkg/natsrouter/middleware.go:19` there is `const requestIDKey = "requestID"` — this constant is the **Gin context key** (read by `pkg/natsrouter/router_test.go:417, 437, 540, 551, 571` via `c.Get("requestID")` and used by service handlers via `c.Get(requestIDKey)`). Changing it would break every caller. The slog literal at `:44` is a SEPARATE string that happens to share the value. The plan must NOT touch the const or anything reading it; it changes only the slog literal at `:44` to `"request_id"`. + +- [ ] **Step 1:** verify the exact scope with one grep — should match the audit (~9 sites total): + ```bash + grep -rnE '(slog\.[A-Z][a-zA-Z]+(Context)?\(.*"requestID"|WithLogValues\(.*"requestID")' --include="*.go" . + ``` + Anything matching `c.Get("requestID")`, `c.Set("requestID", ...)`, `c.MustGet("requestID")`, or `requestIDKey` (the const) is **NOT in scope** — those are Gin context keys. +- [ ] **Step 2: `pkg/natsrouter/middleware.go:44`** — change ONLY the slog literal (`"requestID"` → `"request_id"`). The const at `:19` stays as-is. +- [ ] **Step 3: `room-worker/handler.go`** — replace the 8 `"requestID"` slog literals with `"request_id"`. Identifier `requestID` (Go variable) stays. +- [ ] **Step 4: Tests.** + - `pkg/natsrouter/router_test.go:417, 437, 540, 551, 571` use `c.Get("requestID")` (Gin context key) — **do NOT change**. + - Any test that asserts on the slog literal — search `"requestID"` across `*_test.go` after Step 2/3 land and confirm only context-key sites remain. +- [ ] **Step 5:** verify `go build ./...`, the changed packages' tests, and `golangci-lint run ./pkg/natsrouter/... ./room-worker/...`. 
+ +### Task 20.5: `errcode.New` code-set validation + +`pkg/errcode/options.go` `New(code Code, msg string, opts ...Option)` is exported but doesn't validate `code` is one of the 8 canonical constants. The "named constructors are the only validated entry points" claim in `parse.go` is loose; the only in-tree caller (`room-service/memberlist_client.go:78`) feeds it a `Code` parsed from a remote envelope — a foreign Code would silently pass through. + +- [ ] **Step 1:** in `New`, validate the input is in the closed set; **panic with a clear message** if not. This matches the `WithCause` invariant-guard style (Decision 8). Rationale: `New` is the dynamic escape hatch; passing a non-canonical Code is a programmer error that should fail loudly. +- [ ] **Step 2:** add a test asserting the panic (`TestNew_PanicsOnUnknownCategory`). +- [ ] **Step 3: `memberlist_client.go:71-78`** — when the remote `Code` isn't canonical (legacy site during rollout), fall back to `errcode.Internal(ee.Message)` AND log a single `slog.Warn("legacy peer emitted non-canonical errcode", "code", ee.Code, "message", ee.Message)` so SREs can see legacy peers explicitly (instead of silently collapsing). **Update the existing comment block at `:71-74`** to describe the new fallback behavior — do not just delete the comment. +- [ ] **Step 4:** document the runtime contract in two places: + - `docs/error-handling.md` near where `New` is described (around line 72) — add a sentence: "Passing a non-canonical `Code` to `errcode.New` panics — `New` is for dynamic but well-known categories, not for arbitrary strings." + - `docs/error-handling.md` §7 (semgrep rules table) — note that the existing `errcode-prefer-named-constructor` WARNING is now backed at runtime by the panic so unknown-Code bugs surface fast. + +### Task 20.6: Frontend UI — convert english-text branching to reason branching + +The transport now carries `code`/`reason` (Ch 19 done). 
Several UI sites still substring-match the English error text; convert them so the contract becomes text-agnostic. This is the second half of Ch 19's Step 6 that I deferred — undeferring per the "every change in this PR" scope rule. + +- [ ] **Step 1:** enumerate UI sites — `grep -rnE "toMatch\(.*(?:capacity|exists|owner|subscribed|forbidden|exceeds|requires)|err\.message\.(includes|match|startsWith)|setError\(err.*\.message" chat-frontend/src/` (incl. tests). Known sites the plan-review verified: `CreateRoomDialog.jsx:112` + `.test.jsx:254,265`, `LeaveRoomButton.jsx:14` + `.test.jsx:61,65`, `MemberRoster.jsx:61` (raw `err.message`) + `:102, :126` (via `formatAsyncJobError`) + `.test.jsx:266, :352`, `AddMembersForm.jsx:57` + `.test.jsx:93, :98`, plus `MessageActionMenu.jsx:69` and `OidcCallback.jsx:41` (the latter folds into Task 20.7). +- [ ] **Step 2: `CreateRoomDialog`** — branch on `err instanceof AsyncJobError && err.reason === 'max_room_size_reached'`. Surface humanized copy ("This room is at capacity — owners can raise the limit"). **Update `CreateRoomDialog.test.jsx:254`** to construct the mock rejection as `new AsyncJobError('exceeds maximum capacity (50)', 'sync-error', { reason: 'max_room_size_reached' })` instead of `new Error(...)` — otherwise `REASON_COPY[err.reason]` falls back to `err.message` and the assertion at `:265` passes on the legacy text, masking the change. Update `:265` to assert on the humanized reason copy. +- [ ] **Step 3: `LeaveRoomButton`** — same shape; branch on `err.reason === 'last_owner_cannot_leave'`. Surface "You're the last owner — promote someone else first or delete the room." Update `.test.jsx:61` mock to `new AsyncJobError('cannot leave: you are the last owner', 'sync-error', { reason: 'last_owner_cannot_leave' })` and `:65` assertion to the humanized copy. 
+- [ ] **Step 4: `MemberRoster.jsx`** — three sites: + - `:61` currently does `setError(err.message)` — swap to `setError(formatAsyncJobError(err))` (consistency with `:102` / `:126`; otherwise the roster-load failure path stays inconsistent and bypasses 20.8's lookup). + - `:102` and `:126` already use `formatAsyncJobError` — verify they pick up the humanized copy from Task 20.8 once tests' mocks carry `.reason`. + - `.test.jsx:266, :352` — update the mock rejections to `AsyncJobError` with `.reason: 'not_room_member'` (or 'not_room_owner', whichever matches the scenario); assertions follow the humanized REASON_COPY copy from 20.8. +- [ ] **Step 5: `AddMembersForm.jsx:57`** — already calls `formatAsyncJobError`. Update `.test.jsx:93` mock to `AsyncJobError({ reason: 'not_room_owner' })` and `:98` to assert on the humanized copy. +- [ ] **Step 6: `MessageActionMenu.jsx:69`** (minor uniformity fix) — `setError(err?.message || 'Failed to load read receipts')` → `setError(formatAsyncJobError(err) || 'Failed to load read receipts')`. Read-receipts likely never carry an actionable reason today, but routing through `formatAsyncJobError` keeps the contract uniform. +- [ ] **Step 7:** verify `npm run typecheck && npm test && npm run build` clean. (End-of-batch smoke runs are in Task 20.10.) + +### Task 20.7: Frontend — auth re-login redirect on token-expired + +`sso_token_expired` / `invalid_sso_token` reasons are now carried into the `AsyncJobError` thrown by `NatsContext.connect` (Ch 19 done). Today they fall through to the surface that catches them with raw text — the migration's enabling change isn't yet realized as UX. + +**Plan-review correction:** there is **no App-level catch handler.** `App.jsx` only mounts `<ErrorBoundary>`, which by design catches render errors only (per `chat-frontend/CLAUDE.md`: "boundary does NOT catch event-handler errors"). The real catch sites are the components/contexts that call `useNats()`. 
There are two trigger surfaces — initial login and mid-session token expiry — and both need handling. + +- [ ] **Step 1: Initial-login surface** — `chat-frontend/src/pages/LoginPage/LoginPage.jsx:28` (`handleDevSubmit`) and `:44` (`handleKeycloakLogin`). Both `catch` and just call `setError(err.message)`. Add: + ```js + if (err instanceof AsyncJobError && + (err.reason === 'sso_token_expired' || err.reason === 'invalid_sso_token')) { + // Clear any partial session, then redirect to the OIDC sign-in flow + // (sso) or surface a "session expired, please log in again" prompt (dev). + clearSession() + if (mode === 'sso') getOidcManager().signinRedirect() + return + } + setError(formatAsyncJobError(err)) + ``` +- [ ] **Step 2: OIDC callback surface** — `chat-frontend/src/pages/OidcCallback/OidcCallback.jsx:41` currently does `setError(err.message || String(err))`. Apply the same `reason`-aware branch (an expired-token error from auth-service during callback should redirect to re-login, not display). +- [ ] **Step 3: Mid-session surface** — `NatsContext.request` (`chat-frontend/src/context/NatsContext/NatsContext.jsx:100`) is the throw site for sync RPCs that hit an expired token mid-session. The cleanest fix is centralizing the redirect in `NatsContext` itself: if a thrown error's `.reason` is `sso_token_expired` / `invalid_sso_token`, trigger the redirect side-effect AND continue to throw (so callers don't see a phantom-success). Alternatively each consumer (e.g. `MemberRoster.jsx:61`) checks; the centralized approach is preferred for a single source of truth. +- [ ] **Step 4: Tests.** + - Add a `LoginPage.test.jsx` case asserting that an `AsyncJobError({ reason: 'sso_token_expired' })` thrown by `connectToNats` triggers `clearSession()` + redirect (mock the OIDC manager). + - Same for `invalid_sso_token`. 
+ - If Step 3 centralizes in `NatsContext`, add a `NatsContext.test.jsx` case mocking a request whose reply carries `reason: 'sso_token_expired'`; assert the redirect side-effect fires. + +### Task 20.8: Frontend — `formatAsyncJobError` reason-keyed humanization + +`formatAsyncJobError` (`chat-frontend/src/api/_transport/asyncJob.ts:59`) currently returns `err.message` verbatim for `SyncError`/`AsyncError`. Now that `reason` is carried, prefer a humanized copy lookup keyed off reason; fall back to `err.message` for unmapped cases (and bare `Error` callers). + +- [ ] **Step 1:** add a `REASON_COPY: Record<string, string>` map next to `formatAsyncJobError`. Seed with the catalog reasons (each from `pkg/errcode/codes_*.go`): + ```ts + const REASON_COPY: Record<string, string> = { + max_room_size_reached: 'This room is at capacity.', + not_room_member: "You're not a member of this room.", + not_room_owner: 'Only owners can do that.', + last_owner_cannot_leave: "You're the last owner — promote someone else first.", + bot_in_channel: "Bots can't join channels.", + bot_not_available: "This bot isn't available right now.", + large_room_post_restricted: 'Only owners and admins can post here.', + not_subscribed: 'You need to join this room first.', + // sso_token_expired / invalid_sso_token are intentionally absent — they + // drive a redirect (Task 20.7), not a user-facing message. + } + ``` +- [ ] **Step 2:** `formatAsyncJobError` returns `REASON_COPY[err.reason] ?? err.message` for `SyncError`/`AsyncError` (only — wire-level kinds keep their existing copy at lines 69-72 of `asyncJob.ts`). +- [ ] **Step 3: Tests.** The existing tests at `chat-frontend/src/api/_transport/asyncJob.test.js:223-230` ("returns the raw message for SyncError" + "exceeds maximum capacity") stay green **only because** their mock errors construct `new Error(...)` with no `.reason` — the lookup falls back to message text. Plan-review explicitly flagged that this masks the change. 
Action: + - Keep the existing two tests as the fall-back-path proof (rename them to `..._fallsBackToMessageWhenNoReason` so the contract is explicit). + - **Add new test cases** that construct `new AsyncJobError(rawMessage, 'sync-error', { reason: '<reason>' })` and assert `formatAsyncJobError(err)` returns the humanized REASON_COPY copy. Cover at least: `max_room_size_reached`, `not_room_member`, `not_subscribed`, `large_room_post_restricted` (the highest-traffic reasons). +- [ ] **Step 4:** update `chat-frontend/CLAUDE.md` "Error envelope" section — under the reasons-emitted-today list (around line 119), append a short note that `formatAsyncJobError` is now the reason-keyed lookup so consumers don't need to map themselves. Cross-link to the REASON_COPY constant. + +### Task 20.9: Frontend — remove `isDMExistsReply` legacy fallback (FOLLOW-UP RELEASE) + +Once the room-service-with-the-flip is deployed everywhere, the legacy `error: 'dm already exists'` branch in `isDMExistsReply` is dead. Plan-review found the original task missed several sites that exercise/document the legacy shape — these all retire together. + +- [ ] `chat-frontend/src/lib/constants.js` — drop the legacy `error: ERR_DM_ALREADY_EXISTS && roomId` branch in `isDMExistsReply`; drop the `ERR_DM_ALREADY_EXISTS` constant (and the "Legacy" comment on `:30`). +- [ ] `chat-frontend/src/lib/constants.test.js` — drop the "legacy shape true" case (the new-shape case stays). +- [ ] `chat-frontend/src/api/_transport/asyncJob.test.js:104, :107, :109` — `treatAsSuccess: (reply) => reply.error === 'dm already exists' && !!reply.roomId` uses the legacy predicate inline. Switch to `treatAsSuccess: (reply) => reply.status === 'exists' && !!reply.roomId` (or import `isDMExistsReply`). +- [ ] `chat-frontend/src/api/_transport/asyncJob.ts:217` — code comment references the legacy shape ("DM-exists and similar `200-with-error+roomId` replies…"); update to describe only the new success-envelope shape. 
+- [ ] `chat-frontend/src/components/MainApp/Sidebar/CreateRoomDialog/CreateRoomDialog.test.jsx:145` — mock sync reply uses `sync: { error: 'dm already exists', roomId: 'r-existing' }`. Switch to `sync: { status: 'exists', roomId: 'r-existing' }`. +- [ ] `chat-frontend/scripts/asyncJob.smoke.mjs:110, :130` — produces / asserts the legacy shape; switch to the new shape. +- [ ] `chat-frontend/scripts/liveStack.smoke.mjs:141, :148-149` — same. +- [ ] `chat-frontend/CLAUDE.md` — drop the legacy-fallback paragraph (the long sentence noting `isDMExistsReply` accepts both shapes during the rollout window). Keep the canonical `{status:"exists", roomId}` description. + +**Release gate:** schedule for ONE release AFTER the backend has been deployed to every site. Not for this PR; lives in the plan so it isn't forgotten. + +### Task 20.10: End-of-batch verification gate (must pass before push) + +Locked execution policy is "one batch → one push". Mirror Ch 18.3's repo-wide gate, updated for the new Ch 20 sites. **Run from a clean checkout of the post-20.1-through-20.8 state.** + +- [ ] **Backend gates.** + - `go build ./...` → clean. + - `make lint` → clean (no SA1019 from any leftover legacy reference). + - `make test` → all unit tests pass (`-race`). + - `make sast` → clean (the 4 errcode semgrep rules + the new 20.5 unknown-Code surface). +- [ ] **Frontend gates.** + - `cd chat-frontend && npm run typecheck` → clean. + - `npm test` → green. + - `npm run build` → succeeds (pre-existing chunk-size warning OK). + - `npm run smoke && npm run smoke:asyncjob && npm run smoke:livestack` → green. **These are the wire-contract regression net for the reason-driven branching changes in 20.6–20.8** (per Ch 19 reviewer); even though the contract flip itself shipped in Ch 19, the consumer-side reshaping in 20.6–20.8 should re-prove the round-trip. 
+- [ ] **Repo-wide grep — no leftover legacy references.** + ```bash + grep -rnE 'errUserNotFound|errInvalidOrg' --include="*.go" room-service/ pkg/ + # expect: 0 hits (sentinels are deleted) + + grep -rnE '(slog\.[A-Z][a-zA-Z]+(Context)?\(.*"requestID"|WithLogValues\(.*"requestID")' --include="*.go" . + # expect: 0 hits (slog literals all switched; Gin-context-key sites remain — only the slog literals were in scope) + + grep -rn "fakeJSMsg\|errcode\.Reason(" --include="*.go" . + # expect: same as the established Ch 18.3 baseline (errtest helpers don't tip the second one) + ``` +- [ ] **Docs consistency.** Confirm the four documents that name the reason catalog all agree: + - `pkg/errcode/codes_room.go` lists the two new reasons. + - `docs/client-api.md` §6 reason catalog lists them. + - `docs/error-handling.md` reason-catalog reference, if any, is accurate. + - `chat-frontend/CLAUDE.md` "Error envelope" reasons list includes them. +- [ ] **Commit + push.** Single consolidated commit covering 20.1–20.8 + 20.11–20.20 + the gate proof, followed by `git push origin claude/sharp-hopper-qzm6W`. Use a structured commit message that lists each task by number so reviewers can navigate the diff. + +### Task 20.11: message-gatekeeper — collapse triple-unmarshal to a single decode (CRITICAL perf) + +Branch-review CRITICAL finding (performance lens). `message-gatekeeper/handler.go` currently unmarshals the inbound JetStream message body three times per request: once for routing, once for the validation context, once for the outbox publish. At `MAX_WORKERS=100` on MESSAGES_CANONICAL this triples GC pressure on the hottest path in the system. + +- [ ] **Step 1: Identify the three decode sites** — grep `json.Unmarshal` in `message-gatekeeper/handler.go`. Expected: lines ~74 (initial routing decode), ~129 (validation-context decode), ~168 (outbox publish decode). Confirm each decodes the same `model.Message` (or equivalent) struct. 
+- [ ] **Step 2: Decode once at entry.** At the first decode site, store the decoded struct in a local (or pass through the per-message handler scope). Delete the second and third `json.Unmarshal` calls — refer to the in-scope variable instead. +- [ ] **Step 3: Outbox publish reuse.** The outbox publish site needs the original raw bytes (it republishes), not a re-decode. If the function consumes `[]byte`, pass `msg.Data` directly (already in scope from JetStream); if it consumes the struct, pass the struct decoded in Step 2. Either way, no second `json.Unmarshal` call. +- [ ] **Step 4: Verify** `make build SERVICE=message-gatekeeper` + `make test SERVICE=message-gatekeeper` — no test fixture changes expected (the behavior is unchanged; only the number of allocations drops). + +### Task 20.12: WithCause audit — drop payload-shape leaks (HIGH leak) + +Branch-review HIGH finding (observability lens). `room-service/handler.go:1353` introduced `WithCause(json.Unmarshal err)` in commit `d8ef62b`. The unmarshal error's text includes a byte-offset and a payload prefix; via `WithCause` that string lands in the server log. The same shape was deliberately dropped from `message-gatekeeper:169` in the same review-fix batch — this is the symmetric regression. + +- [ ] **Step 1: Repo-wide audit.** Grep all `WithCause` call sites: + ```bash + grep -rnE 'errcode\.With(Cause|cause)\(' --include="*.go" . + ``` + For each match, verify the wrapped error is NOT one of: `json.Unmarshal` result, `proto.Unmarshal` result, raw `msg.Data` slice, OIDC token string, anything containing user input bytes. +- [ ] **Step 2: Fix `room-service/handler.go:1353`** — drop `errcode.WithCause(err)` from the chain. Keep the typed `errcode.BadRequest("invalid payload", errcode.WithReason(...))`. The Unmarshal error already gets logged by the caller via `slog.Error("decode failed", "err", err)` at a sanitized level — no need to second-channel it through `WithCause`. 
+- [ ] **Step 3: Any other site found in Step 1** — apply the same surgery. If a site legitimately needs the cause for ops debugging (e.g. wrapping a Mongo driver error), keep it — the audit is about user-input payloads only. +- [ ] **Step 4: Verify** `make test SERVICE=room-service` + `make lint` clean. Confirm via `grep` that no `WithCause(json.Unmarshal` patterns remain. + +### Task 20.13: message-gatekeeper — hoist duplicate WithLogValues (HIGH perf) + +Branch-review HIGH finding (performance lens). `message-gatekeeper/handler.go:73` calls `errcode.WithLogValues(ctx, ...)` early; `:140` calls it again with the same room/user fields. The second call re-allocates a fresh inner ctx and a new `logValues` map for fields already in scope. Round-2 already removed one redundancy; this is the other. + +- [ ] **Step 1: Confirm the two sites.** Grep `WithLogValues` in `message-gatekeeper/handler.go`. Expected: 2 sites (or more — find all). Read each and identify which fields are passed. +- [ ] **Step 2: Hoist into one early call.** Move all field assignments to the first call at :73 (or wherever the earliest scope-fully-known site is). Delete subsequent calls that add only already-present fields. +- [ ] **Step 3: Edge case** — if a downstream call adds a field that's only known later in the flow (e.g. a derived msgID after dedup), keep that one but assert (via grep) it's not re-adding fields from the earlier call. +- [ ] **Step 4: Verify** `make test SERVICE=message-gatekeeper` clean. + +### Task 20.14: search-service — pin metrics status-label cardinality (medium observability) + +Branch-review MEDIUM finding (observability + per-service lens). `search-service/metrics.go:96-105` status-label cardinality widened from {ok, internal, bad_request, not_found, forbidden, conflict} (5-6 values) to "any non-empty `errcode.Code`" (up to 9). Bounded today, but future Code additions auto-create new series and Prometheus has no allowlist guard. 
+ +- [ ] **Step 1: Pin the allowed label set.** In `search-service/metrics.go`, define a package-level `var allowedStatusLabels = map[string]struct{}{ "ok": {}, "bad_request": {}, "not_found": {}, "forbidden": {}, "conflict": {}, "unauthenticated": {}, "too_many_requests": {}, "unavailable": {}, "internal": {} }` (the 8 errcode Codes + "ok"). +- [ ] **Step 2: Guard the labeling site.** Before passing the status string to `.WithLabelValues(...)`, check `if _, ok := allowedStatusLabels[status]; !ok { status = "internal" }`. This forces unexpected labels into the "internal" bucket rather than creating a new time series. +- [ ] **Step 3: Refresh doc comments.** The doc comments at `:93-94` currently enumerate only 4 codes — stale. Rewrite the comment to enumerate all 9 allowed values (or reference the `allowedStatusLabels` var). +- [ ] **Step 4: Test.** Add `TestStatusLabel_RejectsUnknown` in `search-service/metrics_test.go` (or extend the existing metrics test) that calls the labeling helper with a synthetic `"made_up_code"` and asserts the counter increments on `"internal"`, not `"made_up_code"`. +- [ ] **Step 5: Verify** `make test SERVICE=search-service` + `make lint` clean. + +### Task 20.15: pkg/errcode — consolidate room-worker permanentError marker + add nil-guard (medium) + +Branch-review MEDIUM finding (Go lens). `room-worker/handler.go:129` `permanentError` is a single-field marker (`ec *errcode.Error`) with a dereference in `Error()` that has no nil-guard. The marker is room-worker-private but the pattern (explicit permanence for JetStream Nak/Ack discrimination) is general — consolidate into `pkg/errcode` so other workers can reuse it. + +- [ ] **Step 1: Create `pkg/errcode/permanent.go`** with: + ```go + // PermanentError marks an *Error as non-retryable: the JetStream consumer + // should Ack (drop) rather than Nak (redeliver). Workers wrap a classified + // errcode in Permanent() at the call site to make the policy explicit. 
+ type PermanentError struct{ ec *Error } + + func Permanent(ec *Error) *PermanentError { + if ec == nil { panic("errcode.Permanent: nil *Error") } + return &PermanentError{ec: ec} + } + func (p *PermanentError) Error() string { return p.ec.Error() } + func (p *PermanentError) Unwrap() error { return p.ec } + ``` + The nil-guard panics at construction (not deref) — same invariant style as `WithCause`. +- [ ] **Step 2: Add `IsPermanent(err error) (*Error, bool)`** convenience: `var p *PermanentError; if errors.As(err, &p) { return p.ec, true }; return nil, false`. Tests for both. +- [ ] **Step 3: Tests** in `pkg/errcode/permanent_test.go`: + - `TestPermanent_PanicsOnNil` + - `TestPermanent_UnwrapReachesErrcode` (verify `errors.As(p, &*Error)` works) + - `TestIsPermanent_DetectsWrapper` + - `TestIsPermanent_FalseOnPlainErrcode` +- [ ] **Step 4: Migrate room-worker.** Replace `room-worker/handler.go`'s local `permanentError` + `permanent()` with `errcode.PermanentError` + `errcode.Permanent`. Update the `errors.As(&pe)` consumer (search the worker for `*permanentError` and `permanent(`) to use `errcode.IsPermanent`. All 21 `permanent(errcode...)` call sites swap to `errcode.Permanent(errcode...)`. +- [ ] **Step 5: Delete the local marker** from `room-worker/handler.go:129` once Step 4 makes it unused. Verify with `grep "permanentError\b\|\bpermanent(" room-worker/`. +- [ ] **Step 6: Verify** `make test SERVICE=room-worker` + `make test SERVICE=pkg/errcode` + `make lint` clean. + +### Task 20.16: pkg/errcode — validate non-empty Message at construction (medium) + +Branch-review MEDIUM finding (Go lens). `errcode.New(code, msg, opts...)` accepts empty `msg`. The wire payload then carries `"error": ""` — a contract violation (`error` is documented as always-populated, user-safe text). The named constructors (`NotFound`, `Forbidden`, …) all call `New` internally; an empty message at the constructor leaks through. 
+ +- [ ] **Step 1: Add the guard** at the top of `New` in `pkg/errcode/options.go`: `if msg == "" { panic("errcode: empty message — every constructor requires user-safe text") }`. Panic, not return-error: same invariant-guard style as the existing `WithCause` panic on `*Error` (Decision 8). +- [ ] **Step 2: Audit existing constructors.** Grep `errcode\.(NotFound|Forbidden|BadRequest|Conflict|Unauthenticated|TooManyRequests|Unavailable|Internal)\(""` — should be zero matches in tree. If any match exists, fix the caller (the panic would crash the service at first request). +- [ ] **Step 3: Test.** `TestNew_PanicsOnEmptyMessage` in `pkg/errcode/options_test.go`. +- [ ] **Step 4: Doc.** `docs/error-handling.md` near the `New` description — add "Every `*Error` must carry a non-empty user-safe message. Passing `""` panics at construction time. This is enforced because the wire envelope's `error` field is documented as always-populated." +- [ ] **Step 5: Verify** `make test SERVICE=pkg/errcode` + `make lint` clean. + +### Task 20.17: room-service/helper.go — document sentinel non-mutation contract (aesthetic) + +Branch-review LOW finding (Go lens). After Task 20.1 retires `errUserNotFound` / `errInvalidOrg`, the only remaining sentinels in `room-service/helper.go` are package-level singletons. Today's Options return fresh `*Error` values (mutation-safe), but a future option that mutates in place would silently alias state across callers. + +- [ ] **Step 1: Add doc-comment** on the var block (after 20.1 leaves the remaining sentinels, e.g. `errStoreFailure`): + ```go + // Package-level errcode sentinels. SHARED across all goroutines. + // Callers MUST NOT mutate; use errors.Is for identity, errcode.HasReason + // for reason matching, and construct fresh *Error values via the named + // constructors when callers need a wrapped message or extra metadata. 
+ ``` +- [ ] **Step 2: Verify** `golangci-lint run ./room-service/...` clean (doc-comment-only change; no behavior change). + +### Task 20.18: pkg/errcode/parse.go — guard empty error text (aesthetic) + +Branch-review LOW finding (Go lens). `Parse` accepts a struct with empty `error` text. Strictly per the wire contract, `error` is always populated server-side. If a malformed envelope arrives, returning a synthetic placeholder is safer than silently round-tripping `""`. + +- [ ] **Step 1: Add the guard** in `Parse`: after JSON-decoding, if `env.Error == ""`, set `env.Error = "(no message)"`. This keeps the returned `*Error` invariant-compliant with Task 20.16's New-panics-on-empty rule. +- [ ] **Step 2: Test.** `TestParse_FillsPlaceholderOnEmptyMessage` — assert the returned `*Error.Message == "(no message)"` when the input envelope has `"error": ""`. +- [ ] **Step 3: Doc.** Inline comment on the guard: `// Defensive: if a peer ships an empty error string (contract violation), fill a placeholder so the resulting *Error satisfies the non-empty-message invariant enforced by New (Task 20.16).` +- [ ] **Step 4: Verify** `make test SERVICE=pkg/errcode` clean. + +### Task 20.19: pkg/errcode/match.go — delete dead Is shim (aesthetic) + +Branch-review MEDIUM finding (Go lens). `*Error.Is(target error) bool` is dead in-tree — `errors.Is(*Error, ...)` is only ever used against sentinels via the `cause` chain, which the default unwrap handles. Custom `Is` increases the maintenance surface (future maintainers must wonder "does Is collapse code+reason or only code?"). + +- [ ] **Step 1: Verify dead.** `grep -rn 'errors\.Is.*\*errcode\.Error\|errors\.Is(.*&\?errcode\.Error{' --include="*.go" .` — confirm no caller relies on the custom semantic (only sentinel-identity comparisons, handled by default unwrap). +- [ ] **Step 2: Delete the method** in `pkg/errcode/match.go`. 
+- [ ] **Step 3: Delete any test** in `pkg/errcode/match_test.go` that pinned the custom semantic (the sentinel-identity tests stay — they exercise default unwrap). +- [ ] **Step 4: Verify** `make test SERVICE=pkg/errcode` clean (any test that broke means the method WAS load-bearing — back out the delete and document instead). + +### Task 20.20: pkg/errcode/classify.go — reduce cause-string concat allocation (aesthetic perf) + +Branch-review MEDIUM finding (performance lens). `Classify` builds the log-line cause via `cause := cause + ": " + e.cause.Error()`. On every classified-error path. Two intermediate strings allocated per call. Replace with separate slog fields — zero string concat, more queryable in log aggregators. + +- [ ] **Step 1: Refactor `Classify`** to log the cause as two distinct slog fields: + ```go + attrs := []any{"code", string(e.Code), "reason", string(e.Reason), "cause", cause} + if e.cause != nil { + attrs = append(attrs, "underlying", e.cause.Error()) + } + loggerFrom(ctx).Log(ctx, e.logLevel(), "request failed", attrs...) + ``` + The `cause` field now carries the outer-error's text (`err.Error()` minus any wrapped `*Error.Message`); `underlying` carries the unexported-cause's text. Both are independently queryable. +- [ ] **Step 2: Update tests.** Any existing classify_test that asserts on `cause` field shape needs the assertion split between `cause` and `underlying`. Add a regression test that pins both fields land in the log when `WithCause` was used. +- [ ] **Step 3: Update doc** in `pkg/errcode/doc.go` (or `docs/error-handling.md`) describing the two-field log shape — log aggregators that pivot on `cause` see the same data; those that want the unexported underlying separately query `underlying`. +- [ ] **Step 4: Verify** `make test SERVICE=pkg/errcode` clean. Spot-check one downstream service (e.g. `make test SERVICE=room-service`) — no test should depend on the old single-field shape. 
+ +--- + +## Self-Review Notes + +- **Spec coverage:** wire format (Ch 0–1), infra→internal (Ch 4), remove sanitizeError (Ch 14), extract to pkg (Ch 0–9), server log before reply (Ch 4 + per-service ctx enrichment), specific-vs-general via Code+Reason (Ch 0, 3, 7). Self-found items: natsrouter decoupling + cycle-safe seam (Ch 10), full caller sweep incl. `params.go`/`metrics.go`/`fetcher_history.go`/`memberlist_client.go` (Ch 10/12/13/14), AsyncJobResult (Ch 15), explicit permanence (Ch 15), auth HTTP (Ch 16), shim-then-delete ordering (Ch 10/17), legacy cleanup (Ch 17), semgrep (Ch 18), consolidated docs/client-api.md pass (Ch 18.2), error-handling guide (Ch 18.3), frontend cutover split out (Ch 19 — separate release task, gated on co-release with the room-service DM-exists flip). +- **Type consistency:** `Code`/`Reason`, `New`, named constructors (no `*f`), `WithReason(Reason)`, `WithMetadata`, `WithCause`, `Classify`/`logLevel`, `Parse`, `ReasonOf`/`HasReason`, `errnats.Reply/Marshal/ReplyQuiet/MarshalQuiet`, `errhttp.Write`, `errtest.AssertCode/AssertReason/Decode`, `WithLogger`/`WithLogValues`/`loggerFrom`, `Context.WithLogValues`, `permanent`/`permanentError` used consistently. +- **Confirm-before-execution:** (a) PM gate on `unauthenticated` (Ch 16); (b) frontend test/script names (Ch 18); (c) DM-exists co-release gate (Ch 14.3 Step 7). +- **Review round 1 fixes applied:** ctx-cycle seam, `natsutil.ReplyJSON` for DM-exists, complete caller sweep, shim ordering, explicit permanence, Code/Reason split, constructor naming, Ch 13 contradiction resolved (forbidden+reason), 503/429 noted, AsyncJobResult string fields + omitempty test. 
+- **Review round 2 fixes applied:** (Critical) natsrouter's own tests + ~40 cross-service `.Code` test asserts migrated in-chapter (Ch 10 Step 1/5); `Context.ReplyError` + `register.go:20` covered (Ch 10 Step 3); `query_rooms.go:74` return-type change (Ch 12 Step 5); one production `metrics.go` break folded into Ch 10 to keep `go build` green; room-worker async `recover()` now a REQUIRED task (Ch 15 Step 6). (High) category-aware `Classify` log level (Ch 4); `*f` constructors removed (Ch 3); CLAUDE.md rule updated in Ch 10 not Ch 18. (Medium) `WithMetadata`/`WithLogValues` trust-boundary documented (Decision 9, doc.go); `errnats.MarshalQuiet`/`ReplyQuiet` for already-logged paths (Ch 8/10); single-`%w` invariant + semgrep rule; `newPermanentAbsent` chain locked + tested (Ch 15); DM-exists co-release gate (Ch 14.3). (Low) `errtest` helper (Ch 5.3); `ReasonOf`/`HasReason` (Ch 5.2); semgrep `prefer-named-constructor`. +- **Review round 3 (per-service exhaustive audits) fixes applied:** (Critical) message-gatekeeper — ~10 inline validation `fmt.Errorf` re-homed so they don't collapse to internal, and the `not_subscribed` reply now returns the sentinel not a fresh error (Ch 13 Step 4); room-service — the `sanitizeError` allowlist's ~14 inline passthrough sites (`"only owners can"`, `"invalid request"`, `"cannot add members"`, `"requester not in room"`, mute-toggle) re-homed at source BEFORE deleting the allowlist (Ch 14 Step 2); room-worker — 21 (not "≈10") `newPermanent` sites enumerated, `errRoomIDCollision`/sync-DM reconcile branch + `processRoleUpdate`'s Nak-forever bug covered (Ch 15 Step 1/1b). (High) gatekeeper test line-range corrected (`TestHandler_marshalErrorReply` 1159-1188 vs real assertion 686-688); room-service `*dmExistsError` routing tests (`integration_test.go:1588`, `handler_test.go:2333/2427/2662`) added (Ch 14.3 Step 5); search-service test count 18 (not ~17) + the `CodeInternal` integration assert (Ch 12 Step 8). 
(Medium) `wrappedCtx` log-enrichment fold (Ch 14 Step 4); gatekeeper ctx-enrich placed after parse (Ch 13 Step 5); memberlist legacy-remote empty-code rollout note (Ch 14.2); auth 500-message change + missing 500 test + all five error tests + docs rows (Ch 16); per-service docs row specifics. (Low) history test count 16 (not ~22) + mandatory roomID (Ch 11); CreateRoomReply.RoomType confirm (Ch 14.3); errRoomKeyAbsent alert is call-site-driven, framing corrected (Ch 15). mock-user-service: audited clean, no change. + +--- + +## Post-Plan Amendments (implemented after plan was written) + +### PA-1 — `Code.Valid()` + `New()` panic on bad code/empty message + +`Code.Valid()` added to `pkg/errcode/category.go`. `New()` in `options.go` now panics on a non-canonical `Code` or empty `Message` (programmer errors surfaced at init time rather than producing silent broken envelopes). `Parse` uses `Code.Valid()` and message-emptiness to detect legacy/non-canonical remote envelopes (see PA-5). + +### PA-2 — `TooManyRequests` (429) added + +Resolved the spec open-item: `CodeTooManyRequests Code = "too_many_requests"` (HTTP 429) and the named constructor `TooManyRequests(msg, opts...)` were added alongside the other 7 categories. `Code.Valid()` covers it. `too_many_requests` = per-caller quota/rate-limit; `unavailable` = server-wide saturation. Both are INFO-level in `logLevel()`. + +### PA-3 — Request-ID policy split: `StampRequestID` vs `RequireRequestID` + +The uniform "mint-on-missing" approach in the original plan was discovered to break dedup for room-service and room-worker handlers that derive `Nats-Msg-Id` / message IDs from the inbound request ID. Two policies were implemented: + +**`natsutil.StampRequestID(ctx, headers, subject) (ctx, id)`** — default; mints on missing, warns on malformed. Applied by all other handlers and the `pkg/natsrouter` RequestID middleware. 
+ +**`natsutil.RequireRequestID(ctx, headers, subject) (ctx, id, error)`** — strict; returns `errcode.BadRequest` on missing or malformed. Applied by: +- All room-service NATS handlers via `wrappedCtx(m otelnats.Msg) (context.Context, error)` (signature changed from `context.Context` to `(context.Context, error)`) +- `room-worker.natsServerCreateDM` via `requireDedupRequestID` wrapper + +The room-worker JetStream consume loop keeps `StampRequestID` defensively but logs `slog.Error` on a forced mint. + +Tests added: `TestRequireRequestID` (pkg/natsutil), `TestWrappedCtx_*` (room-service), `TestRequireDedupRequestID` (room-worker). + +Documented in `docs/error-handling.md` §3a. + +### PA-4 — Cross-site memberlist: propagate X-Request-ID + +`room-service/memberlist_client.go` constructed a bare `nats.Msg` with empty headers, so `X-Request-ID` was never forwarded to the remote site. Since remote room-service uses `RequireRequestID` (strict — PA-3), this caused integration tests to fail with `bad_request`. Fixed by using `natsutil.NewMsg(reqCtx, subject, body)`. + +Integration tests `TestRoomsInfoBatchRPC` and `TestAddMembers_TwoSiteEndToEnd` in `room-service/integration_test.go` were updated to stamp valid UUIDs on the request context before calling handlers. + +### PA-5 — Legacy remote peer handling in `memberlist_client` + +When `errcode.Parse` returns an envelope with a non-canonical `Code` or empty `Message` (old peer), `memberlist_client` now falls back to `errcode.Internal("remote site returned an error")` and emits `slog.Warn("legacy peer emitted non-canonical errcode", ...)` instead of panicking in `errcode.New`. This implements safe mixed-version rollout. + +### PA-6 — `errRoomKeyAbsent` (room-service) converted to typed errcode + +The `errRoomKeyAbsent` sentinel in `room-service/helper.go` (introduced by main's room-key-fetch RPC feature) was converted from `errors.New(...)` to `errcode.NotFound("room key not available")`. 
The `handleGetRoomKey` inline `fmt.Errorf("invalid request: %w", err)` was also converted to `errcode.BadRequest("invalid request")`. `natsGetRoomKey` handler updated to use `errnats.Reply` + the new two-return `wrappedCtx`. + +Note: room-worker's own `errRoomKeyAbsent = errors.New(...)` is intentionally a raw sentinel — it is wrapped via `errcode.WithCause(errRoomKeyAbsent)` so both `errors.Is` (alert path) and `errors.As` (errcode classification) resolve in the same chain. + +### PA-7 — Worker-side logging boundary (designed, deferred) + +This plan standardized the **client-facing** error boundary and left JetStream worker error logging out of scope. A follow-on `errcode.LogJobError` helper to unify worker logs was designed but **considered and deferred (YAGNI)** after measuring that the candidate workers are ~100% raw `fmt.Errorf` with no `WithCause` — so the unification would standardize things that don't actually differ. The one concrete gap (`message-worker` missing `request_id`) was fixed directly. Rationale + retained design: `docs/superpowers/specs/2026-06-02-unified-worker-error-logging-design.md`. diff --git a/docs/superpowers/spec.md b/docs/superpowers/spec.md index bb9f31bc1..ea9f521d3 100644 --- a/docs/superpowers/spec.md +++ b/docs/superpowers/spec.md @@ -230,7 +230,7 @@ Roles: `"owner"`, `"member"` **HistoryResponse**: `messages` ([]Message), `hasMore` (bool) -**ErrorResponse**: `error` (string) +**Error envelope** (every transport — NATS reply, HTTP, AsyncJobResult): owned by `pkg/errcode`; shape `{error, code, reason?, metadata?}`. See `docs/error-handling.md` and `docs/client-api.md` §6. 
--- @@ -436,7 +436,8 @@ All client publishes are under `chat.user.{account}.>`: |---------|---------| | `pkg/model` | All domain structs with `json` + `bson` tags | | `pkg/subject` | NATS subject builder functions and wildcard patterns | -| `pkg/natsutil` | `ReplyJSON`, `ReplyError`, `MarshalResponse`, `MarshalError`, `HeaderCarrier` (OTel) | +| `pkg/natsutil` | `ReplyJSON`, `MarshalResponse`, `HeaderCarrier` (OTel) — success-reply mechanics only | +| `pkg/errcode` | `Code`/`Reason` types, `Error` (the wire envelope, leak-safe), named constructors (`BadRequest`, `NotFound`, …), `Classify` boundary, `Parse` for remote replies. Adapters: `errnats.Reply` (NATS) and `errhttp.Write` (Gin). Test helper: `errtest.AssertCode`/`AssertReason`. See `docs/error-handling.md`. | | `pkg/stream` | JetStream `StreamConfig` builders for all 5 streams | | `pkg/mongoutil` | `Connect`, `Disconnect` wrappers | | `pkg/cassutil` | `Connect`, `Close` wrappers (LocalQuorum, 10s timeout) | diff --git a/docs/superpowers/specs/2026-05-28-centralized-error-codes-design.md b/docs/superpowers/specs/2026-05-28-centralized-error-codes-design.md new file mode 100644 index 000000000..a77491a48 --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-centralized-error-codes-design.md @@ -0,0 +1,453 @@ +# Centralized Error Codes — Design Spec + +**Date:** 2026-05-28 +**Updated:** 2026-06-02 (post-implementation amendments — see §Amendment Log) +**Status:** Implemented + +--- + +## Overview + +A shared Go package at `pkg/errcode/` that is the single source of client-facing error envelopes for every transport in the chat system (NATS request/reply, JetStream replies, Gin HTTP). It replaces four incompatible error-reply patterns that exist today: + +1. `pkg/natsrouter` — `RouteError` + `Err*` constructors + `Code*` string consts. +2. `pkg/natsutil` — `MarshalError` / `MarshalErrorWithCode` / `ReplyError` / `TryParseError`. +3. `pkg/model.ErrorResponse` — hand-built reply struct. +4. 
`auth-service` — ad-hoc `gin.H{"error": ...}`. + +The package produces one wire envelope, `{error, code, reason?, metadata?}`, centralizes server-side logging at a single classification boundary, and makes two classes of bug structurally impossible: leaking an internal cause to a client, and mixing the generic code with the specific reason. + +## Goals + +- One transport-neutral error type marshalling to a stable envelope `{error, code, reason?, metadata?}`. +- A closed set of generic categories (`code`) that map to standard REST/HTTP status, plus an open set of domain `reason`s the frontend switches on. +- Compile-time separation of `code` and `reason` (distinct Go types). +- Infra/DB/third-party errors always collapse to `internal` with a safe message — never leak a cause. +- Exactly one server-side log line per failed request, at a category-appropriate level. +- Thin transport adapters for NATS (`errnats`) and Gin (`errhttp`); core stays transport-neutral. +- Lint-enforced invariants (semgrep) so the guarantees survive future code. +- A clean migration that keeps every intermediate commit compiling and bisectable. + +## Non-Goals + +- Changing the JetStream Ack/Nak retry semantics of any worker (the envelope is independent of the retry decision; permanence stays an explicit, separate signal). +- i18n / localized error messages (messages remain English, server-authored). +- Error codes for purely internal (non-client-facing) errors — those stay raw `fmt.Errorf` and collapse to `internal` at the boundary. +- A registry/enum of every possible reason across services in one file — reasons live per-service. + +--- + +## Wire Envelope + +```json +{ + "error": "room is full", + "code": "conflict", + "reason": "max_room_size_reached", + "metadata": { "limit": "500" } +} +``` + +| Field | Required | Meaning | +|-------|----------|---------| +| `error` | yes | Human-readable, user-safe message. Existing field name, preserved. 
| +| `code` | yes | One of 8 generic categories. Drives HTTP status. Always present. | +| `reason` | no | Domain-specific machine code the frontend branches on. Omitted when absent. | +| `metadata` | no | `map[string]string` of structured, client-visible detail. Omitted when empty. | + +**Frontend rule:** the trigger a client switches on is `reason ?? code` — specific when present, generic otherwise. + +The 8 categories and their HTTP status: + +| `code` | HTTP | Use | +|--------|------|-----| +| `bad_request` | 400 | Malformed/invalid input | +| `unauthenticated` | 401 | Missing/expired/invalid credentials | +| `forbidden` | 403 | Authenticated but not permitted | +| `not_found` | 404 | Target does not exist | +| `conflict` | 409 | State conflict (duplicate, capacity, last-owner) | +| `too_many_requests` | 429 | Per-caller rate limiting / quota exceeded | +| `unavailable` | 503 | Transient server saturation/timeout (admission, expand timeout) | +| `internal` | 500 | Anything unclassified; the default collapse target | + +> **Open item (`unavailable` mapping):** admission-control "service busy" is arguably HTTP 429. This design keeps 503 (the NATS services don't surface HTTP; only matters if an HTTP service later needs rate-limit semantics). Revisit then. + +--- + +## Design Decisions (locked) + +1. **Two distinct Go types** — `type Code string` (the 8 generics; the wire `code`) and `type Reason string` (open domain set; the wire `reason`). The compiler rejects `New(SomeReason, …)` and `WithReason(SomeCode)`. Chosen over a single `Code` type for compile-time safety. +2. **Generic categories live in core; domain reasons live in `codes_.go`** as `Reason` constants — importable across the flat `package main` services, compiler-unique, one catalog per service. +3. **Cause never leaks** — `Error.cause` is unexported, so `encoding/json` cannot serialize it. Reachable only via `Unwrap()` for logging and `errors.Is`/`As`. +4. 
**Infra errors collapse to `internal`** — `Classify` maps any non-`*errcode.Error` to `internal` with message `"internal error"`, keeping the original chain as the (unserialized) cause. +5. **Centralized, level-aware logging** — `Classify` emits exactly one `slog` line; level is category-aware (`internal`/`unavailable` → ERROR, expected client errors → INFO) so routine 4xx don't pollute error alerting. Handlers never log-then-reply. +6. **One way to construct** — named constructors (`errcode.BadRequest(msg, opts...)`), one per category; no `*f` variants (they silently swallow trailing options). Dynamic text uses `errcode.BadRequest(fmt.Sprintf(...), opts...)`. +7. **Functional options** — `WithReason`, `WithMetadata`, `WithCause`. +8. **Single `*errcode.Error` per chain** — `WithCause` panics if the cause already carries an `*errcode.Error` (semgrep also flags the literal form). Propagate typed errors with a single `%w` or bare `return`. Multi-`%w` mixing is forbidden and semgrep-flagged. +9. **Trust boundary on options** — `WithMetadata` is client-visible (ships in the envelope); `WithLogValues` is server-only (never serialized). Causes attached via `WithCause` are logged via `err.Error()`, so they must never wrap raw message bodies, tokens, or secrets. +10. **DM-already-exists is a success, not an error** — room-service returns `model.CreateRoomReply{Status:"exists", RoomID:…}` instead of an error envelope the client treats as success. (Breaking contract change; co-released with the frontend.) +11. **Migration uses shims, not mid-plan deletion** — `natsrouter.RouteError`/`Err*` become thin delegating shims, deleted only in the cleanup chapter, so every commit compiles. 
+ +--- + +## Package Structure + +```text +pkg/errcode/ +├── category.go # type Code + 8 Code constants + HTTPStatus() +├── reason.go # type Reason +├── error.go # type Error{Code,Reason,Message,Metadata,cause} + Error/Unwrap/HTTPStatus +├── options.go # Option, New, named constructors, WithReason/WithMetadata/WithCause +├── classify.go # Classify(ctx,err) + category-aware logLevel +├── parse.go # Parse([]byte) for RPC clients +├── match.go # ReasonOf / HasReason +├── logctx.go # WithLogger / WithLogValues / loggerFrom +├── doc.go # package contract + invariants +├── codes_room.go # Reason consts for room-service / room-worker +├── codes_message.go # Reason consts for message-gatekeeper +├── codes_search.go # (placeholder; none needed today) +├── codes_auth.go # Reason consts for auth-service +├── *_test.go +├── errnats/ # NATS adapter: Marshal/Reply + MarshalQuiet/ReplyQuiet +├── errhttp/ # Gin adapter: Write +└── errtest/ # test helper: Decode/AssertCode/AssertReason +``` + +Dependency direction: `errnats`/`errhttp`/`errtest` → `errcode` → stdlib only. `pkg/natsrouter` → `errcode` + `errnats` (never Gin). `pkg/model` must NOT import `errcode` (it stores `code`/`reason` as plain strings where needed). + +--- + +## API Surface + +### Code and Reason + +```go +// Code is the closed set of generic classifications; the wire `code` field. +type Code string + +const ( + CodeBadRequest Code = "bad_request" + CodeUnauthenticated Code = "unauthenticated" + CodeForbidden Code = "forbidden" + CodeNotFound Code = "not_found" + CodeConflict Code = "conflict" + CodeTooManyRequests Code = "too_many_requests" + CodeUnavailable Code = "unavailable" + CodeInternal Code = "internal" +) + +// HTTPStatus maps a Code to its HTTP status; unknown → 500. +func (c Code) HTTPStatus() int + +// Reason is an open set of domain machine codes; the wire `reason` field. +// Concrete values are declared per-service in codes_.go. 
+type Reason string +``` + +### Error + +```go +// Error is the canonical client-facing error. It marshals to +// {code, reason?, error, metadata?}. cause is UNEXPORTED — encoding/json +// cannot serialize it; it exists only for server-side logging and +// errors.Is/As traversal. +type Error struct { + Code Code `json:"code"` + Reason Reason `json:"reason,omitempty"` + Message string `json:"error"` + Metadata map[string]string `json:"metadata,omitempty"` + cause error +} + +func (e *Error) Error() string // message only, never the cause +func (e *Error) Unwrap() error // the cause (not serialized) +func (e *Error) HTTPStatus() int +``` + +### Constructors and options + +```go +// New builds an Error with a dynamic Code. Prefer the named constructors. +func New(code Code, message string, opts ...Option) *Error + +// Named constructors — the primary API. One per category. +func BadRequest(msg string, opts ...Option) *Error +func Unauthenticated(msg string, opts ...Option) *Error +func Forbidden(msg string, opts ...Option) *Error +func NotFound(msg string, opts ...Option) *Error +func Conflict(msg string, opts ...Option) *Error +func Unavailable(msg string, opts ...Option) *Error +func Internal(msg string, opts ...Option) *Error + +type Option func(*Error) + +// WithReason attaches the domain code (accepts only Reason). +func WithReason(r Reason) Option +// WithMetadata attaches CLIENT-VISIBLE key/value pairs (even count; panics on odd). +func WithMetadata(kv ...string) Option +// WithCause attaches a RAW underlying error for logging. Panics if the cause +// already carries an *errcode.Error (one-Error-per-chain invariant). +func WithCause(err error) Option +``` + +### Classification, parsing, matching + +```go +// Classify converts any error to a client-safe *Error and logs it exactly once +// (category-aware level). nil→nil; *errcode.Error in chain→that; else→internal. +// The single boundary every adapter calls before replying. 
+func Classify(ctx context.Context, err error) *Error + +// Parse decodes a reply payload into an *Error iff it is an error envelope +// (non-empty "error"). Used by RPC clients to detect remote failures. +func Parse(data []byte) (*Error, bool) + +// ReasonOf returns the Reason of the first *Error in err's chain, or "". +func ReasonOf(err error) Reason +// HasReason reports whether err's chain carries an *Error with reason r. +func HasReason(err error, r Reason) bool +``` + +### Logging context + +```go +// WithLogger stores an explicit logger (mainly tests). +func WithLogger(ctx context.Context, l *slog.Logger) context.Context +// WithLogValues returns ctx whose logger carries the given SERVER-ONLY attrs. +// Call once at handler entry; Classify's log line then includes them. +func WithLogValues(ctx context.Context, args ...any) context.Context +``` + +natsrouter handlers use the cycle-safe method instead of the package func: + +```go +// (*natsrouter.Context).WithLogValues enriches the handler logger, deriving +// from the inner ctx (never from the Context itself, which would cycle). +func (c *Context) WithLogValues(args ...any) +``` + +### Transport adapters + +```go +// errnats +func Marshal(ctx context.Context, err error) []byte // classify+log, return envelope +func Reply(ctx context.Context, msg *nats.Msg, err error) // classify+log, respond +func MarshalQuiet(err error) []byte // NO log (already-logged paths) +func ReplyQuiet(msg *nats.Msg, err error) // NO log + +// errhttp +func Write(ctx context.Context, c *gin.Context, err error) // classify+log, c.JSON(status, env) + +// errtest +func Decode(t *testing.T, data []byte) *errcode.Error +func AssertCode(t *testing.T, data []byte, want errcode.Code) +func AssertReason(t *testing.T, data []byte, want errcode.Reason) +``` + +--- + +## Behavioral Specifications + +### Classification and logging + +- `Classify(ctx, nil)` returns `nil`. 
+- If any `*errcode.Error` is in the chain (`errors.As`), it is returned verbatim — category, reason, metadata preserved through `fmt.Errorf("…: %w", typed)` wrapping. +- Otherwise the error becomes `internal` / `"internal error"`, with the original chain kept as the unserialized cause. +- Exactly one `slog` line per call, keyed `code`, `reason`, `cause` (the full chain via `err.Error()`), plus any `WithLogValues` attrs (request_id, account, roomID, …). +- Level: `internal`/`unavailable` → ERROR; `bad_request`/`unauthenticated`/`forbidden`/`not_found`/`conflict` → INFO. +- Already-logged transport paths (natsrouter panic backstop, `replyBusy`) use `MarshalQuiet`/`ReplyQuiet` to avoid a redundant second line. + +### Leak guarantee + +`Error.cause` is unexported ⇒ `json.Marshal` omits it. A round-trip test asserts a marshalled envelope never contains the cause string. `Error.Error()` returns the message only. + +### Wrapping invariant + +At most one `*errcode.Error` per chain, propagated by a single `%w` or bare `return`. `WithCause` panics on a nested `*errcode.Error`; semgrep flags both the literal `WithCause(errcode.X(...))` and multi-`%w` mixing. + +--- + +## Per-Service Error Contract + +This is the externally observable contract each migrated service emits. (The implementation plan carries the exhaustive per-site mapping; this is the summary.) 
+ +### room-service (33 sentinels + inline errors) + +All `helper.go` sentinels map to a category; a subset also carries a reason: + +| Reason | Category | Condition | +|--------|----------|-----------| +| `not_room_member` | forbidden | actor not a member | +| `not_room_owner` | forbidden | actor not an owner | +| `last_owner_cannot_leave` | conflict | last owner leaving | +| `bot_in_channel` | bad_request | bot in a channel room | +| `bot_not_available` | not_found | bot lookup miss | +| `max_room_size_reached` | conflict | capacity exceeded | +| `dm_already_exists` | — | *(removed — now a success reply, see below)* | + +Sentinels without a reason map to generic categories (invalid input → bad_request; permission → forbidden; duplicate/last-member → conflict; missing → not_found; channel-expand timeout → unavailable). **Critical:** the `sanitizeError` allowlist (slated for deletion) currently passes through ~14 *inline* `fmt.Errorf` sites ("only owners can…", the "invalid request" family, "cannot add members", "requester not in room", mute-toggle) that are NOT sentinels — these are re-homed to typed errcodes at the source before the allowlist is deleted, or they would silently collapse to `internal`. + +**DM-already-exists:** returns `model.CreateRoomReply{Status:"exists", RoomID:…}` (success), not an error. + +**Cross-service:** `memberlist_client.go` decodes the remote room-service reply via `errcode.Parse` and remaps `reason==not_room_member` to the local sentinel (replacing brittle message-string equality). Mixed-version rollout: a legacy remote without `code` degrades to `internal`/no-reason — acceptable. 
+ +### message-gatekeeper + +| Reason | Category | Condition | +|--------|----------|-----------| +| `large_room_post_restricted` | forbidden | non-owner/admin posting in a large room | +| `not_subscribed` | forbidden | sender not subscribed to the room | + +All other client-facing validation errors (missing/malformed fields, bad subject, invalid payload) become explicit `bad_request`/`not_found` — they must be typed errcodes, not raw `fmt.Errorf`, or they collapse to `internal`. The infra-vs-validation **Ack/Nak** decision is unchanged (keyed on `infraError`/sentinel identity, independent of the envelope). + +### auth-service (HTTP) + +| Reason | Category (HTTP) | Condition | +|--------|-----------------|-----------| +| `sso_token_expired` | unauthenticated (401) | expired SSO token | +| `invalid_sso_token` | unauthenticated (401) | invalid SSO token | +| — | bad_request (400) | missing/invalid params | +| — | internal (500) | NATS-token generation failure | + +The 500 body changes from `"failed to generate NATS token"` to `"internal error"` (cause logged, not sent). Success and `/healthz` responses are untouched. **Gated** on PM confirmation of the new `unauthenticated` category; fallback folds 401→403 (`forbidden`) with the same reasons. + +### room-worker (async + sync-DM) + +- `AsyncJobResult` gains `Code string` / `Reason string` (json, omitempty; `pkg/model` stays errcode-free). +- **Permanence is explicit, never inferred from category.** A `permanentError` wrapper carries the `*errcode.Error` and drives Ack (permanent) vs Nak (retryable). Many genuinely permanent errors (collision, key-absent, unknown room type) classify to `internal`, so category-inference would Nak them forever — the explicit marker prevents that. +- The room-key-absent alert sentinel is attached via `WithCause(errRoomKeyAbsent)`, so both `errors.As` (find the errcode) and `errors.Is` (alert) resolve in one chain. 
+- The async consumer goroutine gains a `recover()` (it runs outside natsrouter's recovery; a `WithCause`/`WithMetadata` misuse would otherwise crash the worker). + +### history-service, search-service, mock-user-service + +Straight mechanical mapping of `natsrouter.Err*` → `errcode` named constructors; no domain reasons required today (codes_search.go is a placeholder). search-service's Prometheus status-label path reads the code via `errors.As` (no second log); `query_rooms.go`'s exported `*natsrouter.RouteError` return type changes to `*errcode.Error`. + +--- + +## natsrouter Decoupling + +All error semantics move out of `pkg/natsrouter` into `pkg/errcode`. natsrouter becomes a transport that calls `errnats.Reply`. During migration `RouteError` is a type alias (`= errcode.Error`) and `Err*`/`Code*` are delegating shims, so production callers keep compiling; the shims (and `ReplyRouteError`) are deleted in the cleanup chapter. A new cycle-safe `Context.WithLogValues` seam lets natsrouter handlers attach domain attrs. `Context.ReplyError` and the deserialize-failure path are migrated to errcode too (previously they emitted a `code`-less body). + +--- + +## Enforcement (semgrep) + +Custom rules at `.semgrep/errcode.yml`, wired into `make sast` (blocking CI gate): + +- `errcode-no-reason-literal-outside-catalog` — `errcode.Reason("...")` only in `codes_*.go`. +- `errcode-withcause-must-not-wrap-errcode` — `WithCause(errcode.X(...))` forbidden. +- `errcode-no-multi-wrap-errcode` — `fmt.Errorf("…%w…%w…")` mixing forbidden. +- `errcode-prefer-named-constructor` (warning) — steer `New(CodeX, …)` literals to the named constructor. + +--- + +## Frontend Contract + +- The transport error type gains `reason?: string` and `metadata?: Record<string, string>`. +- UI logic branches on `reason ?? code`; generic handling keys on `code`. 
+- Create-DM handles the new `{status:"exists", roomId}` success (navigate to the room) — **must ship in the same release as room-service** (the old client keyed on `.error`). +- The `AsyncJobResult` decoder reads `code`/`reason`. + +--- + +## Testing Strategy + +- **Core (`pkg/errcode`):** unit tests for `HTTPStatus`, leak guarantee (marshalled envelope never contains the cause), `Unwrap`/`errors.Is`, constructor + option behavior (incl. `WithCause`/`WithMetadata` panics), `Classify` (nil, unknown→internal, typed-through-wrapping, ctx values, category-aware level), `Parse`, `ReasonOf`/`HasReason`. ≥80% coverage; ≥90% for the core logic. +- **Adapters:** `errnats.Marshal`/`MarshalQuiet` and `errhttp.Write` unit-tested for status + envelope + (non-)logging; `Reply` paths covered by service integration tests. +- **Per-service migration:** every handler test that asserted `RouteError.Code` (string) moves to `errtest.AssertCode`/`AssertReason` on the decoded reply. The plan enumerates the exact counts (e.g. search ×26, history ×16) so none are missed. +- **TDD throughout** (Red-Green-Refactor), per repo CLAUDE.md. + +--- + +## Migration Overview + +Sequenced as a clean DAG (full step-by-step in the plan): + +1. **Core (Ch 0–9):** types, `Error`, logctx, constructors/options, `Classify`, `Parse`, `match`, `doc`, reason catalogs, `errnats`, `errhttp`, `errtest`. +2. **natsrouter (Ch 10):** route errors through `errnats`, add the `WithLogValues` seam, convert `Err*`/`RouteError` to shims, update CLAUDE.md's error-handling rule. +3. **Per-service (Ch 11–16):** history, search + mock-user, message-gatekeeper, room-service, room-worker, auth-service — each with reply-path migration, reason assignment, test migration, and `docs/client-api.md` updates. +4. **Cleanup (Ch 17):** delete natsrouter shims, retire `model.ErrorResponse` and the legacy `natsutil` error helpers. +5. 
**Enforcement + frontend + docs (Ch 18):** semgrep rules, frontend cutover, repo-wide gates, `docs/error-handling.md`. + +--- + +## Files Changed + +**New (core + adapters + helper):** +- `pkg/errcode/{category,reason,error,options,classify,parse,match,logctx,doc}.go` + tests +- `pkg/errcode/codes_{room,message,search,auth}.go` + tests +- `pkg/errcode/errnats/{reply.go,reply_test.go}` +- `pkg/errcode/errhttp/{write.go,write_test.go}` +- `pkg/errcode/errtest/{assert.go,assert_test.go}` + +**New (lint/docs):** +- `.semgrep/errcode.yml` +- `docs/error-handling.md` + +**Modified (foundation):** +- `pkg/natsrouter/{errors.go (shim→delete), register.go, router.go, context.go, middleware.go, params.go}` + tests +- `pkg/model/{event.go (AsyncJobResult code/reason; CreateRoomStatusExists), error.go (ErrorResponse removed)}` +- `pkg/natsutil/reply.go` (legacy error helpers removed) +- `Makefile` (semgrep wiring) +- `CLAUDE.md` (error-handling rule) + +**Modified (service migrations):** +- `history-service/*`, `search-service/*` (incl. `metrics.go`, `query_rooms.go`), `mock-user-service/*` +- `message-gatekeeper/*` (incl. `fetcher_history.go`) +- `room-service/*` (incl. `helper.go`, `memberlist_client.go`) +- `room-worker/*`, `auth-service/*` +- `docs/client-api.md`, `chat-frontend/*` + +**Deleted (cleanup chapter):** +- `pkg/natsrouter` `RouteError`/`Err*`/`Code*`/`ReplyRouteError` +- `pkg/model.ErrorResponse` +- `pkg/natsutil` `MarshalError`/`MarshalErrorWithCode`/`ReplyError`/`TryParseError` +- room-service `sanitizeError` + allowlist; message-gatekeeper `codedError`/`marshalErrorReply`; room-worker `sanitizeAsyncJobError`/`sanitizeSyncDMError` + +--- + +## Amendment Log (post-implementation decisions) + +### A1 — `Code.Valid()` and `New()` panic guards (implemented) + +`Code.Valid()` was added to `pkg/errcode/category.go` — it reports whether a value is one of the canonical `Code*` constants (necessary for the `Parse` path to detect non-canonical remote envelopes). 
`New()` in `options.go` now panics on both a non-canonical `Code` and an empty `Message`; these are programmer errors and fail-fast is preferable to silently producing a broken envelope. `Parse` treats a non-canonical code or empty message in a remote reply as a legacy/non-canonical envelope (see A5 below). + +### A2 — `TooManyRequests` constructor and HTTP 429 (implemented) + +`CodeTooManyRequests` (`too_many_requests`, HTTP 429) and its named constructor `TooManyRequests(msg, opts...)` were added alongside the other 7 categories, resolving the "open item" in the original spec. The distinction from `unavailable` (503) is preserved: `too_many_requests` is per-caller quota/rate-limiting; `unavailable` is server-wide saturation. + +### A3 — Request-ID policy split: StampRequestID vs RequireRequestID (implemented) + +Implementation revealed that a uniform "mint-on-missing" policy breaks client-retry deduplication for handlers that derive JetStream `Nats-Msg-Id` keys and canonical message IDs from the inbound request ID. Two policies now coexist (documented in `docs/error-handling.md` §3a): + +**Default (mint-on-missing):** `natsutil.StampRequestID(ctx, headers, subject) (ctx, id)` — if the header is absent, mint a fresh UUIDv7 silently; if malformed, mint and emit a single `slog.Warn`. Used by all handlers where the request ID is logging/tracing only. + +**Strict (reject-on-missing):** `natsutil.RequireRequestID(ctx, headers, subject) (ctx, id, error)` — returns `errcode.BadRequest` on missing or malformed `X-Request-ID`. Used by: +- All room-service handlers (via the `wrappedCtx(m otelnats.Msg) (context.Context, error)` helper) +- `room-worker.natsServerCreateDM` (sync DM endpoint) + +Rationale: room-service fans out to JetStream publishes whose `Nats-Msg-Id` (via `OutboxDedupID`, `CanonicalDedupID`, `messageDedupSeed`) and message IDs (`idgen.MessageIDFromRequestID`) are derived from the request ID. 
A silently-minted server-side ID across client retries produces a different dedup key each attempt, silently duplicating outbox events and system messages. + +The room-worker JetStream consume loop keeps the default mint policy defensively (messages arrive from room-service which already validated the header) but logs `slog.Error` if forced to mint, signalling an upstream contract violation. + +**Client contract:** callers targeting room-service or room-worker MUST send a stable `X-Request-ID` header (valid hyphenated UUIDv4 or v7) and reuse it across retries. + +### A4 — Cross-site memberlist: propagate X-Request-ID to remote handler (implemented) + +`room-service/memberlist_client.go` previously constructed a bare `nats.Msg` with an empty header, so the inbound `X-Request-ID` was never forwarded when making cross-site `member.list` NATS requests. Because remote room-service uses `RequireRequestID` (strict mode per A3), the remote handler rejected with `bad_request`. Fixed by using `natsutil.NewMsg(reqCtx, subject, body)` which copies the `X-Request-ID` from the context into the outbound message header. Integration tests (`TestAddMembers_TwoSiteEndToEnd`, `TestRoomsInfoBatchRPC`) were updated correspondingly. + +### A5 — Legacy peer handling in `Parse` / `memberlist_client` (implemented) + +`errcode.Parse` returns `(*Error, bool)`. When `memberlist_client` receives a remote envelope with a non-canonical `Code` (old peer) or empty `Message`, it falls back to `errcode.Internal("remote site returned an error")` and emits a single `slog.Warn("legacy peer emitted non-canonical errcode", ...)`. This ensures graceful mixed-version rollout without a hard dependency on the remote being up-to-date. 
+ +### A6 — `errRoomKeyAbsent` converted to typed errcode (implemented) + +The `errRoomKeyAbsent` sentinel in `room-service/helper.go` (introduced by the room-key-fetch RPC) was converted from `errors.New(...)` to `errcode.NotFound("room key not available")` so it flows through `errnats.Reply` without requiring `sanitizeError`. The room-worker package retains its own `errRoomKeyAbsent = errors.New(...)` sentinel specifically so `errors.Is(err, errRoomKeyAbsent)` can trigger an operational alert path (wrapped via `errcode.Internal(..., errcode.WithCause(errRoomKeyAbsent))`). + +### A7 — `history-service` infra errors intentionally use `fmt.Errorf` (clarification) + +Several `fmt.Errorf("...: %w", err)` calls in `history-service/internal/service/messages.go` are correct by design. They wrap Cassandra read/write errors (infra tier) and collapse to `internal error` at the handler boundary via `Classify`. Client-visible logic (access window, not-found, forbidden) correctly uses `errcode.*` constructors. This two-tier pattern is the intended usage per CLAUDE.md §3 and `docs/error-handling.md` §2. + +### A8 — Worker-side logging boundary (follow-on) + +The errcode boundary (`Classify` → one structured log line + leak-safe envelope) covers only client-facing paths (NATS req/rep, Gin HTTP). JetStream worker paths log errors in several ad-hoc shapes. A follow-on `errcode.LogJobError` helper to unify them was designed but **considered and deferred (YAGNI)**: measurement showed the candidate workers are ~100% raw `fmt.Errorf` with no `WithCause`, so the unification would standardize things that don't actually differ. Only the one concrete gap (`message-worker` missing `request_id`) was fixed directly. See `docs/superpowers/specs/2026-06-02-unified-worker-error-logging-design.md` for the design and the deferral rationale; revisit if workers adopt typed errcode errors. 
diff --git a/history-service/cmd/main.go b/history-service/cmd/main.go index cacafbc51..285397e53 100644 --- a/history-service/cmd/main.go +++ b/history-service/cmd/main.go @@ -158,6 +158,9 @@ func main() { svc := service.New(cassRepo, subSource, roomSource, pub, threadRoomRepo, &cfg) router := natsrouter.New(nc, "history-service") router.Use(natsrouter.Recovery()) + // RequestID must precede any handler that reads request_id from ctx — + // otherwise Classify's log line records an empty value. + router.Use(natsrouter.RequestID()) router.Use(natsrouter.Logging()) svc.RegisterHandlers(router, cfg.SiteID) diff --git a/history-service/internal/service/messages.go b/history-service/internal/service/messages.go index 194739bef..96b03039b 100644 --- a/history-service/internal/service/messages.go +++ b/history-service/internal/service/messages.go @@ -3,6 +3,7 @@ package service import ( "encoding/json" "errors" + "fmt" "log/slog" "strings" "time" @@ -11,6 +12,7 @@ import ( "github.com/hmchangw/chat/history-service/internal/cassrepo" "github.com/hmchangw/chat/history-service/internal/models" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" "github.com/hmchangw/chat/pkg/natsutil" @@ -27,6 +29,7 @@ const ( func (s *HistoryService) LoadHistory(c *natsrouter.Context, req models.LoadHistoryRequest) (*models.LoadHistoryResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) accessSince, err := s.getAccessSince(c, account, roomID) if err != nil { @@ -84,15 +87,14 @@ func (s *HistoryService) LoadHistory(c *natsrouter.Context, req models.LoadHisto g.Go(func() error { t, rErr := s.rooms.GetMinUserLastSeenAt(gctx, roomID) if rErr != nil { - slog.Warn("loading minUserLastSeenAt", "error", rErr, "request_id", natsutil.RequestIDFromContext(c), "roomID", roomID) + slog.Warn("loading minUserLastSeenAt", "error", rErr, "room_id", roomID) 
return nil } lastSeenFloor = t return nil }) if err := g.Wait(); err != nil { - slog.Error("loading history", "error", err, "request_id", natsutil.RequestIDFromContext(c), "roomID", roomID) - return nil, natsrouter.ErrInternal("failed to load message history") + return nil, fmt.Errorf("loading history: %w", err) } var minMs *int64 @@ -111,6 +113,7 @@ func (s *HistoryService) LoadHistory(c *natsrouter.Context, req models.LoadHisto func (s *HistoryService) LoadNextMessages(c *natsrouter.Context, req models.LoadNextMessagesRequest) (*models.LoadNextMessagesResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) accessSince, err := s.getAccessSince(c, account, roomID) if err != nil { @@ -148,8 +151,7 @@ func (s *HistoryService) LoadNextMessages(c *natsrouter.Context, req models.Load page, err = s.msgReader.GetMessagesAfter(c, roomID, lowerBound, ceiling, pageReq) } if err != nil { - slog.Error("loading next messages", "error", err, "request_id", natsutil.RequestIDFromContext(c), "roomID", roomID) - return nil, natsrouter.ErrInternal("failed to load messages") + return nil, fmt.Errorf("loading next messages: %w", err) } redactUnavailableQuotes(page.Data, accessSince) @@ -163,6 +165,7 @@ func (s *HistoryService) LoadNextMessages(c *natsrouter.Context, req models.Load func (s *HistoryService) LoadSurroundingMessages(c *natsrouter.Context, req models.LoadSurroundingMessagesRequest) (*models.LoadSurroundingMessagesResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) accessSince, err := s.getAccessSince(c, account, roomID) if err != nil { @@ -174,7 +177,7 @@ func (s *HistoryService) LoadSurroundingMessages(c *natsrouter.Context, req mode return nil, err } if accessSince != nil && centralMsg.CreatedAt.Before(*accessSince) { - return nil, natsrouter.ErrForbidden("message is outside access window") + return nil, 
errcode.Forbidden("message is outside access window", errcode.WithReason(errcode.MessageOutsideAccessWindow)) } now := time.Now().UTC() @@ -225,20 +228,21 @@ func (s *HistoryService) LoadSurroundingMessages(c *natsrouter.Context, req mode beforePage, berr = s.msgReader.GetMessagesBetweenDesc(gctx, roomID, *accessSince, centralMsg.CreatedAt, beforePageReq) } if berr != nil { - slog.Error("loading surrounding messages", "error", berr, "request_id", natsutil.RequestIDFromContext(c), "roomID", roomID, "direction", "before") + return fmt.Errorf("loading surrounding messages (before): %w", berr) } - return berr + return nil }) g.Go(func() error { var aerr error afterPage, aerr = s.msgReader.GetMessagesAfter(gctx, roomID, centralMsg.CreatedAt, ceiling, afterPageReq) if aerr != nil { - slog.Error("loading surrounding messages", "error", aerr, "request_id", natsutil.RequestIDFromContext(c), "roomID", roomID, "direction", "after") + return fmt.Errorf("loading surrounding messages (after): %w", aerr) } - return aerr + return nil }) if err := g.Wait(); err != nil { - return nil, natsrouter.ErrInternal("failed to load surrounding messages") + // errgroup error already carries the (before|after) direction. + return nil, err } // Assemble in ASC order: reverse the DESC before-page, append central, then after-page. 
@@ -260,6 +264,7 @@ func (s *HistoryService) LoadSurroundingMessages(c *natsrouter.Context, req mode func (s *HistoryService) GetMessageByID(c *natsrouter.Context, req models.GetMessageByIDRequest) (*models.Message, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) accessSince, err := s.getAccessSince(c, account, roomID) if err != nil { @@ -272,7 +277,7 @@ func (s *HistoryService) GetMessageByID(c *natsrouter.Context, req models.GetMes } if accessSince != nil && msg.CreatedAt.Before(*accessSince) { - return nil, natsrouter.ErrForbidden("message is outside access window") + return nil, errcode.Forbidden("message is outside access window", errcode.WithReason(errcode.MessageOutsideAccessWindow)) } redactUnavailableQuote(msg, accessSince) @@ -284,6 +289,7 @@ func (s *HistoryService) GetMessageByID(c *natsrouter.Context, req models.GetMes func (s *HistoryService) EditMessage(c *natsrouter.Context, siteID string, req models.EditMessageRequest) (*models.EditMessageResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) if _, err := s.getAccessSince(c, account, roomID); err != nil { return nil, err @@ -296,18 +302,18 @@ func (s *HistoryService) EditMessage(c *natsrouter.Context, siteID string, req m // Editing a soft-deleted message would emit updated after deleted, which consumers can't reconcile. 
if msg.Deleted { - return nil, natsrouter.ErrNotFound("message not found") + return nil, errcode.NotFound("message not found") } if !canModify(msg, account) { - return nil, natsrouter.ErrForbidden("only the sender can edit") + return nil, errcode.Forbidden("only the sender can edit") } if strings.TrimSpace(req.NewMsg) == "" { - return nil, natsrouter.ErrBadRequest("newMsg must not be empty") + return nil, errcode.BadRequest("newMsg must not be empty") } if len(req.NewMsg) > maxContentBytes { - return nil, natsrouter.ErrBadRequest("newMsg exceeds maximum size") + return nil, errcode.BadRequest("newMsg exceeds maximum size") } editedAt := time.Now().UTC() @@ -317,10 +323,9 @@ func (s *HistoryService) EditMessage(c *natsrouter.Context, siteID string, req m // the repo. Map it to 4xx so it doesn't pollute 5xx telemetry — // it's a benign race, not a server fault. if errors.Is(err, cassrepo.ErrMessageNotFound) { - return nil, natsrouter.ErrNotFound("message not found") + return nil, errcode.NotFound("message not found") } - slog.Error("edit: update content", "error", err, "messageID", req.MessageID) - return nil, natsrouter.ErrInternal("failed to edit message") + return nil, fmt.Errorf("editing message %s: %w", req.MessageID, err) } editedAtMs := editedAt.UnixMilli() @@ -354,6 +359,7 @@ func (s *HistoryService) EditMessage(c *natsrouter.Context, siteID string, req m func (s *HistoryService) DeleteMessage(c *natsrouter.Context, siteID string, req models.DeleteMessageRequest) (*models.DeleteMessageResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) if _, err := s.getAccessSince(c, account, roomID); err != nil { return nil, err @@ -365,7 +371,7 @@ func (s *HistoryService) DeleteMessage(c *natsrouter.Context, siteID string, req } if !canModify(msg, account) { - return nil, natsrouter.ErrForbidden("only the sender can delete") + return nil, errcode.Forbidden("only the sender can delete") } if 
msg.Deleted { @@ -382,8 +388,7 @@ func (s *HistoryService) DeleteMessage(c *natsrouter.Context, siteID string, req deletedAt := time.Now().UTC() actualDeletedAt, applied, err := s.msgWriter.SoftDeleteMessage(c, msg, deletedAt) if err != nil { - slog.Error("delete: soft-delete", "error", err, "messageID", req.MessageID) - return nil, natsrouter.ErrInternal("failed to delete message") + return nil, fmt.Errorf("deleting message %s: %w", req.MessageID, err) } if !applied { // Concurrent delete won the CAS — skip publish to avoid a duplicate event. @@ -421,11 +426,11 @@ func (s *HistoryService) publishCanonicalBestEffort(c *natsrouter.Context, subj payload, err := json.Marshal(evt) if err != nil { slog.Warn("canonical marshal failed", - "error", err, "subject", subj, "messageID", evt.Message.ID, "roomID", evt.Message.RoomID) + "error", err, "subject", subj, "messageID", evt.Message.ID, "room_id", evt.Message.RoomID) return } if err := s.publisher.Publish(c, subj, payload, natsutil.CanonicalDedupID(evt)); err != nil { slog.Warn("canonical publish failed", - "error", err, "subject", subj, "messageID", evt.Message.ID, "roomID", evt.Message.RoomID) + "error", err, "subject", subj, "messageID", evt.Message.ID, "room_id", evt.Message.RoomID) } } diff --git a/history-service/internal/service/messages_test.go b/history-service/internal/service/messages_test.go index 45bf19d31..0fb9a71cf 100644 --- a/history-service/internal/service/messages_test.go +++ b/history-service/internal/service/messages_test.go @@ -18,6 +18,7 @@ import ( "github.com/hmchangw/chat/history-service/internal/models" "github.com/hmchangw/chat/history-service/internal/service" "github.com/hmchangw/chat/history-service/internal/service/mocks" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" "github.com/hmchangw/chat/pkg/natsutil" @@ -83,36 +84,43 @@ func newServiceWithRoomMock(t *testing.T) (*service.HistoryService, *mocks.MockM return 
service.New(msgs, subs, rooms, pub, threadRooms, cfg), msgs, subs, rooms, pub, threadRooms } -func assertInternalErr(t *testing.T, err error, wantMsg string) { +// assertInternalErr verifies err collapses to the internal category. Internal +// failures are now propagated as raw wrapped errors (fmt.Errorf("...: %w", err)) +// that errcode.Classify turns into a generic "internal error" envelope at the +// transport boundary, so the test classifies the error the same way. wantCause +// is asserted against the (server-only) wrapped chain, never the client message. +func assertInternalErr(t *testing.T, err error, wantCause string) { t.Helper() - var routeErr *natsrouter.RouteError - require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeInternal, routeErr.Code) - assert.Equal(t, wantMsg, routeErr.Message) + require.Error(t, err) + assert.Contains(t, err.Error(), wantCause) + ec := errcode.Classify(context.Background(), err) + require.NotNil(t, ec) + assert.Equal(t, errcode.CodeInternal, ec.Code) + assert.Equal(t, "internal error", ec.Message) } func assertForbiddenErr(t *testing.T, err error, wantMsg string) { t.Helper() - var routeErr *natsrouter.RouteError - require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeForbidden, routeErr.Code) - assert.Equal(t, wantMsg, routeErr.Message) + var ec *errcode.Error + require.ErrorAs(t, err, &ec) + assert.Equal(t, errcode.CodeForbidden, ec.Code) + assert.Equal(t, wantMsg, ec.Message) } func assertBadRequestErr(t *testing.T, err error, wantMsg string) { t.Helper() - var routeErr *natsrouter.RouteError - require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeBadRequest, routeErr.Code) - assert.Equal(t, wantMsg, routeErr.Message) + var ec *errcode.Error + require.ErrorAs(t, err, &ec) + assert.Equal(t, errcode.CodeBadRequest, ec.Code) + assert.Equal(t, wantMsg, ec.Message) } func assertNotFoundErr(t *testing.T, err error, wantMsg string) { t.Helper() - var routeErr *natsrouter.RouteError - 
require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeNotFound, routeErr.Code) - assert.Equal(t, wantMsg, routeErr.Message) + var ec *errcode.Error + require.ErrorAs(t, err, &ec) + assert.Equal(t, errcode.CodeNotFound, ec.Code) + assert.Equal(t, wantMsg, ec.Message) } func makePage(msgs []models.Message, hasNext bool) cassrepo.Page[models.Message] { @@ -154,7 +162,7 @@ func TestHistoryService_LoadHistory_StoreError(t *testing.T) { _, err := svc.LoadHistory(c, models.LoadHistoryRequest{}) require.Error(t, err) - assertInternalErr(t, err, "failed to load message history") + assertInternalErr(t, err, "loading history") } func TestHistoryService_LoadHistory_SubscriptionError(t *testing.T) { @@ -165,7 +173,7 @@ func TestHistoryService_LoadHistory_SubscriptionError(t *testing.T) { _, err := svc.LoadHistory(c, models.LoadHistoryRequest{}) require.Error(t, err) - assertInternalErr(t, err, "unable to verify room access") + assertInternalErr(t, err, "verifying room access") } func TestHistoryService_LoadHistory_EmptyResult(t *testing.T) { @@ -381,7 +389,7 @@ func TestHistoryService_LoadNextMessages_SubscriptionStoreError(t *testing.T) { _, err := svc.LoadNextMessages(c, models.LoadNextMessagesRequest{}) require.Error(t, err) - assertInternalErr(t, err, "unable to verify room access") + assertInternalErr(t, err, "verifying room access") } func TestHistoryService_LoadNextMessages_StoreErrorAfter(t *testing.T) { @@ -394,7 +402,7 @@ func TestHistoryService_LoadNextMessages_StoreErrorAfter(t *testing.T) { _, err := svc.LoadNextMessages(c, models.LoadNextMessagesRequest{}) require.Error(t, err) - assertInternalErr(t, err, "failed to load messages") + assertInternalErr(t, err, "loading next messages") } func TestHistoryService_LoadNextMessages_StoreErrorLatest(t *testing.T) { @@ -407,7 +415,7 @@ func TestHistoryService_LoadNextMessages_StoreErrorLatest(t *testing.T) { _, err := svc.LoadNextMessages(c, models.LoadNextMessagesRequest{}) require.Error(t, err) - 
assertInternalErr(t, err, "failed to load messages") + assertInternalErr(t, err, "loading next messages") } func TestHistoryService_LoadNextMessages_HasNext(t *testing.T) { @@ -523,7 +531,7 @@ func TestHistoryService_GetMessageByID_StoreError(t *testing.T) { _, err := svc.GetMessageByID(c, models.GetMessageByIDRequest{MessageID: "m1"}) require.Error(t, err) - assertInternalErr(t, err, "failed to retrieve message") + assertInternalErr(t, err, "retrieving message") } func TestHistoryService_GetMessageByID_NoHSS(t *testing.T) { @@ -668,7 +676,7 @@ func TestHistoryService_LoadSurroundingMessages_SubscriptionError(t *testing.T) MessageID: "m5", Limit: 6, }) require.Error(t, err) - assertInternalErr(t, err, "unable to verify room access") + assertInternalErr(t, err, "verifying room access") } func TestHistoryService_LoadSurroundingMessages_WrongRoom(t *testing.T) { @@ -729,7 +737,7 @@ func TestHistoryService_LoadSurroundingMessages_StoreError(t *testing.T) { MessageID: "m5", Limit: 6, }) require.Error(t, err) - assertInternalErr(t, err, "failed to retrieve message") + assertInternalErr(t, err, "retrieving message") } func TestHistoryService_LoadSurroundingMessages_BeforePageError(t *testing.T) { @@ -747,7 +755,7 @@ func TestHistoryService_LoadSurroundingMessages_BeforePageError(t *testing.T) { MessageID: "m5", Limit: 6, }) require.Error(t, err) - assertInternalErr(t, err, "failed to load surrounding messages") + assertInternalErr(t, err, "loading surrounding messages") } func TestHistoryService_LoadSurroundingMessages_BeforePageError_NoHSS(t *testing.T) { @@ -765,7 +773,7 @@ func TestHistoryService_LoadSurroundingMessages_BeforePageError_NoHSS(t *testing MessageID: "m5", Limit: 6, }) require.Error(t, err) - assertInternalErr(t, err, "failed to load surrounding messages") + assertInternalErr(t, err, "loading surrounding messages") } func TestHistoryService_LoadSurroundingMessages_AfterPageError(t *testing.T) { @@ -783,7 +791,7 @@ func 
TestHistoryService_LoadSurroundingMessages_AfterPageError(t *testing.T) { MessageID: "m5", Limit: 6, }) require.Error(t, err) - assertInternalErr(t, err, "failed to load surrounding messages") + assertInternalErr(t, err, "loading surrounding messages") } func TestHistoryService_LoadSurroundingMessages_Limit1_OnlyCentral(t *testing.T) { @@ -911,9 +919,9 @@ func TestHistoryService_EditMessage_NotSubscribed(t *testing.T) { resp, err := svc.EditMessage(c, "site-test", models.EditMessageRequest{MessageID: "m-abc", NewMsg: "x"}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeForbidden, routeErr.Code) + assert.Equal(t, errcode.CodeForbidden, routeErr.Code) assert.Equal(t, "not subscribed to room", routeErr.Message) } @@ -934,9 +942,9 @@ func TestHistoryService_EditMessage_NotSender(t *testing.T) { resp, err := svc.EditMessage(c, "site-test", models.EditMessageRequest{MessageID: "m-abc", NewMsg: "x"}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeForbidden, routeErr.Code) + assert.Equal(t, errcode.CodeForbidden, routeErr.Code) assert.Equal(t, "only the sender can edit", routeErr.Message) } @@ -950,9 +958,9 @@ func TestHistoryService_EditMessage_NotFound(t *testing.T) { resp, err := svc.EditMessage(c, "site-test", models.EditMessageRequest{MessageID: "missing", NewMsg: "x"}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeNotFound, routeErr.Code) + assert.Equal(t, errcode.CodeNotFound, routeErr.Code) } func TestHistoryService_EditMessage_WrongRoom(t *testing.T) { @@ -972,9 +980,9 @@ func TestHistoryService_EditMessage_WrongRoom(t *testing.T) { resp, err := svc.EditMessage(c, "site-test", models.EditMessageRequest{MessageID: "m-abc", NewMsg: "x"}) 
assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeNotFound, routeErr.Code) + assert.Equal(t, errcode.CodeNotFound, routeErr.Code) } func TestHistoryService_EditMessage_AlreadyDeleted(t *testing.T) { @@ -997,9 +1005,9 @@ func TestHistoryService_EditMessage_AlreadyDeleted(t *testing.T) { resp, err := svc.EditMessage(c, "site-test", models.EditMessageRequest{MessageID: "m-abc", NewMsg: "x"}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeNotFound, routeErr.Code) + assert.Equal(t, errcode.CodeNotFound, routeErr.Code) } func TestHistoryService_EditMessage_EmptyNewMsg(t *testing.T) { @@ -1018,9 +1026,9 @@ func TestHistoryService_EditMessage_EmptyNewMsg(t *testing.T) { resp, err := svc.EditMessage(c, "site-test", models.EditMessageRequest{MessageID: "m-abc", NewMsg: " "}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeBadRequest, routeErr.Code) + assert.Equal(t, errcode.CodeBadRequest, routeErr.Code) assert.Equal(t, "newMsg must not be empty", routeErr.Message) } @@ -1043,9 +1051,9 @@ func TestHistoryService_EditMessage_TooLarge(t *testing.T) { resp, err := svc.EditMessage(c, "site-test", models.EditMessageRequest{MessageID: "m-abc", NewMsg: oversize}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeBadRequest, routeErr.Code) + assert.Equal(t, errcode.CodeBadRequest, routeErr.Code) assert.Equal(t, "newMsg exceeds maximum size", routeErr.Message) } @@ -1071,7 +1079,7 @@ func TestHistoryService_EditMessage_UpdateFails(t *testing.T) { resp, err := svc.EditMessage(c, "site-test", models.EditMessageRequest{MessageID: "m-abc", NewMsg: "new content"}) 
assert.Nil(t, resp) - assertInternalErr(t, err, "failed to edit message") + assertInternalErr(t, err, "editing message") } // TestHistoryService_EditMessage_RaceWithDelete_MapsToNotFound verifies the @@ -1237,9 +1245,9 @@ func TestHistoryService_DeleteMessage_NotSubscribed(t *testing.T) { resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-abc"}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeForbidden, routeErr.Code) + assert.Equal(t, errcode.CodeForbidden, routeErr.Code) assert.Equal(t, "not subscribed to room", routeErr.Message) } @@ -1259,9 +1267,9 @@ func TestHistoryService_DeleteMessage_NotSender(t *testing.T) { resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-abc"}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeForbidden, routeErr.Code) + assert.Equal(t, errcode.CodeForbidden, routeErr.Code) assert.Equal(t, "only the sender can delete", routeErr.Message) } @@ -1275,9 +1283,9 @@ func TestHistoryService_DeleteMessage_NotFound(t *testing.T) { resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "missing"}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeNotFound, routeErr.Code) + assert.Equal(t, errcode.CodeNotFound, routeErr.Code) } func TestHistoryService_DeleteMessage_WrongRoom(t *testing.T) { @@ -1297,9 +1305,9 @@ func TestHistoryService_DeleteMessage_WrongRoom(t *testing.T) { resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-abc"}) assert.Nil(t, resp) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, 
natsrouter.CodeNotFound, routeErr.Code) + assert.Equal(t, errcode.CodeNotFound, routeErr.Code) } func TestHistoryService_DeleteMessage_SoftDeleteFails(t *testing.T) { @@ -1322,7 +1330,7 @@ func TestHistoryService_DeleteMessage_SoftDeleteFails(t *testing.T) { resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-abc"}) assert.Nil(t, resp) - assertInternalErr(t, err, "failed to delete message") + assertInternalErr(t, err, "deleting message") } // TestHistoryService_DeleteMessage_ConcurrentDeleteSkipsPublish covers the diff --git a/history-service/internal/service/pin.go b/history-service/internal/service/pin.go index 3fcc13fe9..6813049b3 100644 --- a/history-service/internal/service/pin.go +++ b/history-service/internal/service/pin.go @@ -1,11 +1,12 @@ package service import ( - "log/slog" + "fmt" "regexp" "time" "github.com/hmchangw/chat/history-service/internal/models" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" "github.com/hmchangw/chat/pkg/subject" @@ -31,16 +32,15 @@ func canBypassLargeRoomPin(sub *model.Subscription) bool { // UnpinMessage intentionally accepts it so a soft-deleted pin can still be unpinned to free its slot. 
func (s *HistoryService) pinPreCheck(c *natsrouter.Context, account, roomID, messageID string) (*models.Message, *model.Subscription, error) { if !s.pinEnabled { - return nil, nil, natsrouter.ErrForbidden("pinning is disabled") + return nil, nil, errcode.Forbidden("pinning is disabled", errcode.WithReason(errcode.PinDisabled)) } sub, err := s.subscriptions.GetSubscription(c, account, roomID) if err != nil { - slog.Error("get subscription", "error", err, "account", account, "roomID", roomID) - return nil, nil, natsrouter.ErrInternal("unable to verify room access") + return nil, nil, fmt.Errorf("get subscription: %w", err) } if sub == nil { - return nil, nil, natsrouter.ErrForbidden("not subscribed to room") + return nil, nil, errcode.Forbidden("not subscribed to room", errcode.WithReason(errcode.MessageNotSubscribed)) } msg, err := s.findMessage(c, roomID, messageID) @@ -56,8 +56,7 @@ func (s *HistoryService) pinPreCheck(c *natsrouter.Context, account, roomID, mes func (s *HistoryService) enforcePinLimit(c *natsrouter.Context, roomID, messageID string) (*time.Time, error) { pinned, err := s.msgReader.GetAllPinnedMessages(c, roomID) if err != nil { - slog.Error("count pinned messages", "error", err, "roomID", roomID) - return nil, natsrouter.ErrInternal("unable to verify pin count") + return nil, fmt.Errorf("count pinned messages: %w", err) } for i := range pinned { if pinned[i].MessageID == messageID { @@ -65,7 +64,7 @@ func (s *HistoryService) enforcePinLimit(c *natsrouter.Context, roomID, messageI } } if len(pinned) >= s.maxPinnedPerRoom { - return nil, natsrouter.ErrForbidden("room pin limit reached") + return nil, errcode.Forbidden("room pin limit reached", errcode.WithReason(errcode.PinLimitReached)) } return nil, nil } @@ -75,11 +74,10 @@ func (s *HistoryService) enforceLargeRoomPin(c *natsrouter.Context, roomID strin if !canBypassLargeRoomPin(sub) { count, err := s.rooms.GetRoomUserCount(c, roomID) if err != nil { - slog.Error("get room user count", "error", 
err, "roomID", roomID) - return natsrouter.ErrInternal("unable to verify room size") + return fmt.Errorf("get room user count: %w", err) } if count > s.largeRoomThreshold { - return natsrouter.ErrForbidden("room is too large to pin") + return errcode.Forbidden("room is too large to pin", errcode.WithReason(errcode.PinRoomTooLarge)) } } return nil @@ -89,13 +87,14 @@ func (s *HistoryService) enforceLargeRoomPin(c *natsrouter.Context, roomID strin func (s *HistoryService) PinMessage(c *natsrouter.Context, siteID string, req models.PinMessageRequest) (*models.PinMessageResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) msg, sub, err := s.pinPreCheck(c, account, roomID, req.MessageID) if err != nil { return nil, err } if msg.Deleted { - return nil, natsrouter.ErrNotFound("message not found") + return nil, errcode.NotFound("message not found") } // Already pinned: echo existing pinnedAt, no write/publish/large-room check. 
@@ -119,8 +118,7 @@ func (s *HistoryService) PinMessage(c *natsrouter.Context, siteID string, req mo } pinnedBy := models.Participant{ID: sub.User.ID, Account: sub.User.Account} if err := s.msgWriter.PinMessage(c, msg, pinnedAt, pinnedBy); err != nil { - slog.Error("pin: write", "error", err, "messageID", req.MessageID) - return nil, natsrouter.ErrInternal("failed to pin message") + return nil, fmt.Errorf("pin message %s: %w", req.MessageID, err) } pinnedAtMs := pinnedAt.UnixMilli() @@ -147,6 +145,7 @@ func (s *HistoryService) PinMessage(c *natsrouter.Context, siteID string, req mo func (s *HistoryService) UnpinMessage(c *natsrouter.Context, siteID string, req models.UnpinMessageRequest) (*models.UnpinMessageResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) msg, sub, err := s.pinPreCheck(c, account, roomID, req.MessageID) if err != nil { @@ -163,8 +162,7 @@ func (s *HistoryService) UnpinMessage(c *natsrouter.Context, siteID string, req } if err := s.msgWriter.UnpinMessage(c, msg); err != nil { - slog.Error("unpin: write", "error", err, "messageID", req.MessageID) - return nil, natsrouter.ErrInternal("failed to unpin message") + return nil, fmt.Errorf("unpin message %s: %w", req.MessageID, err) } evt := model.MessageEvent{ @@ -189,6 +187,7 @@ func (s *HistoryService) UnpinMessage(c *natsrouter.Context, siteID string, req func (s *HistoryService) ListPinnedMessages(c *natsrouter.Context, req models.ListPinnedMessagesRequest) (*models.ListPinnedMessagesResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) accessSince, err := s.getAccessSince(c, account, roomID) if err != nil { @@ -202,8 +201,7 @@ func (s *HistoryService) ListPinnedMessages(c *natsrouter.Context, req models.Li page, err := s.msgReader.GetPinnedMessages(c, roomID, pageReq) if err != nil { - slog.Error("list pinned messages", "error", err, 
"roomID", roomID) - return nil, natsrouter.ErrInternal("failed to list pinned messages") + return nil, fmt.Errorf("list pinned messages: %w", err) } // Stub pre-access pins, then stub pre-access quoted parents inside survivors. diff --git a/history-service/internal/service/pin_test.go b/history-service/internal/service/pin_test.go index 5c2de86b2..e3a640ac4 100644 --- a/history-service/internal/service/pin_test.go +++ b/history-service/internal/service/pin_test.go @@ -523,7 +523,7 @@ func TestPinMessage_SubscriptionError(t *testing.T) { _, err := svc.PinMessage(testContext(), "site-a", models.PinMessageRequest{MessageID: "m1"}) - assertInternalErr(t, err, "unable to verify room access") + assertInternalErr(t, err, "get subscription") } func TestPinMessage_RoomUserCountError(t *testing.T) { @@ -534,7 +534,7 @@ func TestPinMessage_RoomUserCountError(t *testing.T) { _, err := svc.PinMessage(testContext(), "site-a", models.PinMessageRequest{MessageID: "m1"}) - assertInternalErr(t, err, "unable to verify room size") + assertInternalErr(t, err, "get room user count") } func TestPinMessage_WriteError(t *testing.T) { @@ -548,7 +548,7 @@ func TestPinMessage_WriteError(t *testing.T) { _, err := svc.PinMessage(testContext(), "site-a", models.PinMessageRequest{MessageID: "m1"}) - assertInternalErr(t, err, "failed to pin message") + assertInternalErr(t, err, "pin message m1") } func TestUnpinMessage_WriteError(t *testing.T) { @@ -564,7 +564,7 @@ func TestUnpinMessage_WriteError(t *testing.T) { _, err := svc.UnpinMessage(testContext(), "site-a", models.UnpinMessageRequest{MessageID: "m1"}) - assertInternalErr(t, err, "failed to unpin message") + assertInternalErr(t, err, "unpin message m1") } func TestListPinnedMessages_StoreError(t *testing.T) { @@ -575,7 +575,7 @@ func TestListPinnedMessages_StoreError(t *testing.T) { _, err := svc.ListPinnedMessages(testContext(), models.ListPinnedMessagesRequest{}) - assertInternalErr(t, err, "failed to list pinned messages") + 
assertInternalErr(t, err, "list pinned messages") } func TestPinMessage_BotAccountBypassesLargeRoom(t *testing.T) { @@ -686,7 +686,7 @@ func TestPinMessage_PinLimitJustUnderSucceeds(t *testing.T) { } func TestPinMessage_PinLimitCountError(t *testing.T) { - // GetAllPinnedMessages error → internal "unable to verify pin count"; no write/publish. + // GetAllPinnedMessages error → wrapped "count pinned messages" → internal at boundary; no write/publish. svc, msgs, subs, rooms, _, _ := newPinTestService(t) subs.EXPECT().GetSubscription(gomock.Any(), "u1", "r1").Return(subFor(model.RoleMember), nil) msgs.EXPECT().GetMessageByID(gomock.Any(), "m1").Return(pinnableMsg(), nil) @@ -696,7 +696,7 @@ func TestPinMessage_PinLimitCountError(t *testing.T) { _, err := svc.PinMessage(testContext(), "site-a", models.PinMessageRequest{MessageID: "m1"}) - assertInternalErr(t, err, "unable to verify pin count") + assertInternalErr(t, err, "count pinned messages") } func TestPinMessage_PinLimitSkippedOnIdempotentRepin(t *testing.T) { diff --git a/history-service/internal/service/room_times.go b/history-service/internal/service/room_times.go index ad761ef91..a6584d807 100644 --- a/history-service/internal/service/room_times.go +++ b/history-service/internal/service/room_times.go @@ -4,20 +4,16 @@ import ( "context" "errors" "fmt" - "log/slog" "time" "go.mongodb.org/mongo-driver/v2/mongo" "github.com/hmchangw/chat/history-service/internal/models" - "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/errcode" ) -// resolveRoomTimesOrError invokes resolveRoomTimes and translates the result -// into a natsrouter error suitable for handler return: a wrapped -// mongo.ErrNoDocuments becomes ErrNotFound (the room genuinely does not -// exist), anything else becomes ErrInternal. The raw error is logged -// server-side; only the sanitized RouteError is returned to clients. 
+// resolveRoomTimesOrError calls resolveRoomTimes and translates the result for +// handler return: mongo.ErrNoDocuments → errcode.NotFound; anything else wraps. func (s *HistoryService) resolveRoomTimesOrError( ctx context.Context, roomID string, @@ -28,11 +24,10 @@ func (s *HistoryService) resolveRoomTimesOrError( if err == nil { return lastMsgAt, createdAt, nil } - slog.Error("resolve room times", "error", err, "roomID", roomID) if errors.Is(err, mongo.ErrNoDocuments) { - return time.Time{}, time.Time{}, natsrouter.ErrNotFound("room not found") + return time.Time{}, time.Time{}, errcode.NotFound("room not found") } - return time.Time{}, time.Time{}, natsrouter.ErrInternal("failed to resolve room metadata") + return time.Time{}, time.Time{}, fmt.Errorf("resolving room metadata for %s: %w", roomID, err) } // clockSkewTolerance allows clients with mildly out-of-sync clocks to still diff --git a/history-service/internal/service/threads.go b/history-service/internal/service/threads.go index 891d209fc..f6035657c 100644 --- a/history-service/internal/service/threads.go +++ b/history-service/internal/service/threads.go @@ -7,6 +7,7 @@ import ( "time" "github.com/hmchangw/chat/history-service/internal/models" + "github.com/hmchangw/chat/pkg/errcode" pkgmodel "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/mongoutil" "github.com/hmchangw/chat/pkg/natsrouter" @@ -24,9 +25,10 @@ func emptyThreadResponse() *models.GetThreadMessagesResponse { func (s *HistoryService) GetThreadMessages(c *natsrouter.Context, req models.GetThreadMessagesRequest) (*models.GetThreadMessagesResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) if req.ThreadMessageID == "" { - return nil, natsrouter.ErrBadRequest("threadMessageId is required") + return nil, errcode.BadRequest("threadMessageId is required") } accessSince, err := s.getAccessSince(c, account, roomID) @@ -65,18 +67,18 @@ func (s 
*HistoryService) GetThreadMessages(c *natsrouter.Context, req models.Get } if msg.ThreadParentID != "" { - return nil, natsrouter.ErrBadRequest("threadMessageId must be a top-level message, not a reply") + return nil, errcode.BadRequest("threadMessageId must be a top-level message, not a reply") } if accessSince != nil && msg.CreatedAt.Before(*accessSince) { - return nil, natsrouter.ErrForbidden("thread is outside access window") + return nil, errcode.Forbidden("thread is outside access window", errcode.WithReason(errcode.MessageOutsideAccessWindow)) } // Empty ThreadRoomID means no replies yet or a silently-failed stamp in message-worker. if msg.ThreadRoomID == "" { slog.Warn("thread fetch: parent has empty thread_room_id, returning no replies", "request_id", natsutil.RequestIDFromContext(c), - "roomID", roomID, + "room_id", roomID, "messageID", req.ThreadMessageID, "messageCreatedAt", msg.CreatedAt, "account", account, @@ -136,8 +138,7 @@ func (s *HistoryService) GetThreadMessages(c *natsrouter.Context, req models.Get page, err := s.msgReader.GetThreadMessages(c, msg.ThreadRoomID, ceiling, floor, pageReq) if err != nil { - slog.Error("loading thread messages", "error", err, "request_id", natsutil.RequestIDFromContext(c), "roomID", roomID, "threadRoomID", msg.ThreadRoomID) - return nil, natsrouter.ErrInternal("failed to load thread messages") + return nil, fmt.Errorf("loading thread messages: %w", err) } redactUnavailableQuotes(page.Data, accessSince) @@ -156,7 +157,7 @@ func validateThreadFilter(filter models.ThreadFilter) (models.ThreadFilter, erro case models.ThreadFilterFollowing, models.ThreadFilterUnread: return filter, nil default: - return "", natsrouter.ErrBadRequest(fmt.Sprintf("invalid thread filter: %q", filter)) + return "", errcode.BadRequest(fmt.Sprintf("invalid thread filter: %q", filter)) } } @@ -164,6 +165,7 @@ func validateThreadFilter(filter models.ThreadFilter) (models.ThreadFilter, erro func (s *HistoryService) GetThreadParentMessages(c 
*natsrouter.Context, req models.GetThreadParentMessagesRequest) (*models.GetThreadParentMessagesResponse, error) { account := c.Param("account") roomID := c.Param("roomID") + c.WithLogValues("account", account, "room_id", roomID) accessSince, err := s.getAccessSince(c, account, roomID) if err != nil { @@ -186,12 +188,11 @@ func (s *HistoryService) GetThreadParentMessages(c *natsrouter.Context, req mode case models.ThreadFilterUnread: threadPage, err = s.threadRooms.GetUnreadThreadRooms(c, roomID, account, accessSince, pageReq) default: - slog.Error("unhandled thread filter", "filter", filter) - return nil, natsrouter.ErrInternal("unhandled thread filter") + return nil, errcode.Internal("unhandled thread filter", + errcode.WithCause(fmt.Errorf("unhandled thread filter: %q", filter))) } if err != nil { - slog.Error("loading thread rooms from MongoDB", "error", err, "request_id", natsutil.RequestIDFromContext(c), "roomID", roomID, "filter", filter) - return nil, natsrouter.ErrInternal("failed to load thread parent messages") + return nil, fmt.Errorf("loading thread rooms (filter %s): %w", filter, err) } if len(threadPage.Data) == 0 { @@ -211,8 +212,7 @@ func (s *HistoryService) GetThreadParentMessages(c *natsrouter.Context, req mode cassMessages, err := s.msgReader.GetMessagesByIDs(c, parentIDs) if err != nil { - slog.Error("hydrating thread parent messages from Cassandra", "error", err, "request_id", natsutil.RequestIDFromContext(c), "roomID", roomID) - return nil, natsrouter.ErrInternal("failed to load thread parent messages") + return nil, fmt.Errorf("hydrating thread parent messages: %w", err) } msgByID := make(map[string]models.Message, len(cassMessages)) diff --git a/history-service/internal/service/threads_test.go b/history-service/internal/service/threads_test.go index 004c77674..9f9ef3f12 100644 --- a/history-service/internal/service/threads_test.go +++ b/history-service/internal/service/threads_test.go @@ -12,9 +12,9 @@ import ( 
"github.com/hmchangw/chat/history-service/internal/cassrepo" "github.com/hmchangw/chat/history-service/internal/models" "github.com/hmchangw/chat/history-service/internal/service" + "github.com/hmchangw/chat/pkg/errcode" pkgmodel "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/mongoutil" - "github.com/hmchangw/chat/pkg/natsrouter" ) func makeThreadRooms() []pkgmodel.ThreadRoom { @@ -114,7 +114,7 @@ func TestHistoryService_GetThreadMessages_ParentLookupError(t *testing.T) { _, err := svc.GetThreadMessages(c, models.GetThreadMessagesRequest{ThreadMessageID: "m-parent"}) require.Error(t, err) - assertInternalErr(t, err, "failed to retrieve message") + assertInternalErr(t, err, "retrieving message") } func TestHistoryService_GetThreadMessages_NotSubscribed(t *testing.T) { @@ -136,7 +136,7 @@ func TestHistoryService_GetThreadMessages_SubscriptionStoreError(t *testing.T) { _, err := svc.GetThreadMessages(c, models.GetThreadMessagesRequest{ThreadMessageID: "m-parent"}) require.Error(t, err) - assertInternalErr(t, err, "unable to verify room access") + assertInternalErr(t, err, "verifying room access") } func TestHistoryService_GetThreadMessages_ParentBeforeAccessSince(t *testing.T) { @@ -244,7 +244,7 @@ func TestHistoryService_GetThreadMessages_RepoError(t *testing.T) { _, err := svc.GetThreadMessages(c, models.GetThreadMessagesRequest{ThreadMessageID: "m-parent"}) require.Error(t, err) - assertInternalErr(t, err, "failed to load thread messages") + assertInternalErr(t, err, "loading thread messages") } func TestHistoryService_GetThreadMessages_Limits(t *testing.T) { @@ -400,7 +400,7 @@ func TestHistoryService_GetThreadParentMessages_SubscriptionError(t *testing.T) _, err := svc.GetThreadParentMessages(c, models.GetThreadParentMessagesRequest{Limit: 20}) require.Error(t, err) - assertInternalErr(t, err, "unable to verify room access") + assertInternalErr(t, err, "verifying room access") } func TestHistoryService_GetThreadParentMessages_ThreadRoomError(t 
*testing.T) { @@ -414,7 +414,7 @@ func TestHistoryService_GetThreadParentMessages_ThreadRoomError(t *testing.T) { _, err := svc.GetThreadParentMessages(c, models.GetThreadParentMessagesRequest{Limit: 20}) require.Error(t, err) - assertInternalErr(t, err, "failed to load thread parent messages") + assertInternalErr(t, err, "loading thread rooms") } func TestHistoryService_GetThreadParentMessages_CassandraError(t *testing.T) { @@ -427,7 +427,7 @@ func TestHistoryService_GetThreadParentMessages_CassandraError(t *testing.T) { _, err := svc.GetThreadParentMessages(c, models.GetThreadParentMessagesRequest{Limit: 20}) require.Error(t, err) - assertInternalErr(t, err, "failed to load thread parent messages") + assertInternalErr(t, err, "hydrating thread parent messages") } func TestHistoryService_GetThreadParentMessages_MissingParentIgnored(t *testing.T) { @@ -483,9 +483,9 @@ func TestHistoryService_GetThreadParentMessages_InvalidFilter(t *testing.T) { _, err := svc.GetThreadParentMessages(c, models.GetThreadParentMessagesRequest{Filter: "bogus", Limit: 20}) require.Error(t, err) - var routeErr *natsrouter.RouteError + var routeErr *errcode.Error require.ErrorAs(t, err, &routeErr) - assert.Equal(t, natsrouter.CodeBadRequest, routeErr.Code) + assert.Equal(t, errcode.CodeBadRequest, routeErr.Code) } // ============================================================ diff --git a/history-service/internal/service/utils.go b/history-service/internal/service/utils.go index 8bae0fa6f..ffcfaa5dd 100644 --- a/history-service/internal/service/utils.go +++ b/history-service/internal/service/utils.go @@ -2,23 +2,24 @@ package service import ( "context" - "log/slog" + "fmt" "time" "github.com/hmchangw/chat/history-service/internal/cassrepo" "github.com/hmchangw/chat/history-service/internal/models" - "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/errcode" ) // getAccessSince checks subscription and returns the historySharedSince lower bound (nil = full access). 
func (s *HistoryService) getAccessSince(ctx context.Context, account, roomID string) (*time.Time, error) { accessSince, subscribed, err := s.subscriptions.GetHistorySharedSince(ctx, account, roomID) if err != nil { - slog.Error("checking subscription", "error", err, "account", account, "roomID", roomID) - return nil, natsrouter.ErrInternal("unable to verify room access") + return nil, fmt.Errorf("verifying room access for %s/%s: %w", account, roomID, err) } if !subscribed { - return nil, natsrouter.ErrForbidden("not subscribed to room") + // Parity with message-gatekeeper's identical condition: same reason + // lets the frontend branch consistently without service-by-service text matching. + return nil, errcode.Forbidden("not subscribed to room", errcode.WithReason(errcode.MessageNotSubscribed)) } return accessSince, nil } @@ -33,26 +34,26 @@ func millisToTime(millis *int64) time.Time { func parsePageRequest(cursor string, limit int) (cassrepo.PageRequest, error) { q, err := cassrepo.ParsePageRequest(cursor, limit) if err != nil { - slog.Error("invalid pagination cursor", "error", err, "cursor", cursor) - return cassrepo.PageRequest{}, natsrouter.ErrBadRequest("invalid pagination cursor") + // Cause is the parse error (cursor format/decode) — server-only; + // the user-safe message stays generic. 
+ return cassrepo.PageRequest{}, errcode.BadRequest("invalid pagination cursor", errcode.WithCause(err)) } return q, nil } func (s *HistoryService) findMessage(ctx context.Context, roomID, messageID string) (*models.Message, error) { if messageID == "" { - return nil, natsrouter.ErrBadRequest("messageId is required") + return nil, errcode.BadRequest("messageId is required") } msg, err := s.msgReader.GetMessageByID(ctx, messageID) if err != nil { - slog.Error("finding message", "error", err, "messageID", messageID) - return nil, natsrouter.ErrInternal("failed to retrieve message") + return nil, fmt.Errorf("retrieving message %s: %w", messageID, err) } if msg == nil { - return nil, natsrouter.ErrNotFound("message not found") + return nil, errcode.NotFound("message not found") } if msg.RoomID != roomID { - return nil, natsrouter.ErrNotFound("message not found") + return nil, errcode.NotFound("message not found") } return msg, nil } diff --git a/inbox-worker/handler.go b/inbox-worker/handler.go index 8e1dae09c..abc83e933 100644 --- a/inbox-worker/handler.go +++ b/inbox-worker/handler.go @@ -10,6 +10,7 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" ) @@ -184,7 +185,11 @@ func (h *Handler) handleRoleUpdated(ctx context.Context, evt *model.OutboxEvent) roomID := subEvt.Subscription.RoomID roles := subEvt.Subscription.Roles if len(roles) == 0 { - return fmt.Errorf("role_updated event has empty roles") + // Poison message — return errcode.Permanent so main.go's consume loop + // Acks (vs Nak-forever on a malformed payload). 
+ slog.WarnContext(ctx, "role_updated event has empty roles", + "account", account, "room_id", roomID) + return errcode.Permanent(errcode.BadRequest("role_updated event has empty roles")) } if err := h.store.UpdateSubscriptionRoles(ctx, account, roomID, roles); err != nil { return fmt.Errorf("update subscription roles: %w", err) diff --git a/inbox-worker/handler_test.go b/inbox-worker/handler_test.go index 2cf117e24..e5efd2639 100644 --- a/inbox-worker/handler_test.go +++ b/inbox-worker/handler_test.go @@ -3,6 +3,7 @@ package main import ( "context" "encoding/json" + "errors" "fmt" "sync" "testing" @@ -12,6 +13,7 @@ import ( "github.com/stretchr/testify/require" "go.mongodb.org/mongo-driver/v2/mongo" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" ) @@ -767,6 +769,38 @@ func TestHandleEvent_RoleUpdated_InvalidPayload(t *testing.T) { } } +// Empty-roles payload is a poison message. Handler returns errcode.Permanent +// so main.go's consume loop Acks (not Nak) — preventing infinite redelivery. +// Store is not called. 
+func TestHandleEvent_RoleUpdated_EmptyRoles(t *testing.T) { + store := &stubInboxStore{} + h := NewHandler(store) + subEvt := model.SubscriptionUpdateEvent{ + Subscription: model.Subscription{ + User: model.SubscriptionUser{ID: "u1", Account: "alice"}, + RoomID: "r1", + Roles: nil, + }, + } + payload, _ := json.Marshal(subEvt) + evt := model.OutboxEvent{Type: "role_updated", SiteID: "site-a", DestSiteID: "site-b", Payload: payload} + evtData, _ := json.Marshal(evt) + + err := h.HandleEvent(context.Background(), evtData) + if err == nil { + t.Fatal("expected errcode.Permanent for empty-roles payload") + } + if _, ok := errcode.IsPermanent(err); !ok { + t.Fatalf("expected errcode.Permanent, got %T: %v", err, err) + } + if !errors.Is(err, errcode.ErrPermanent) { + t.Fatalf("expected errors.Is(err, ErrPermanent), got %v", err) + } + if len(store.getRoleUpdates()) != 0 { + t.Error("store should NOT be called on empty-roles event") + } +} + func TestHandleEvent_MemberRemoved(t *testing.T) { store := &stubInboxStore{} h := NewHandler(store) diff --git a/inbox-worker/main.go b/inbox-worker/main.go index 6d155d393..1d9de469f 100644 --- a/inbox-worker/main.go +++ b/inbox-worker/main.go @@ -15,6 +15,7 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/oteljetstream" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/mongoutil" "github.com/hmchangw/chat/pkg/natsutil" @@ -296,8 +297,17 @@ func main() { handler := NewHandler(store) cctx, err := cons.Consume(func(m oteljetstream.Msg) { - handlerCtx := natsutil.ContextWithRequestIDFromHeaders(m.Context(), m.Headers()) + handlerCtx, _ := natsutil.StampRequestID(m.Context(), m.Headers(), m.Subject()) if err := handler.HandleEvent(handlerCtx, m.Data()); err != nil { + // Permanent failures (poison messages) Ack so JetStream stops + // redelivering; transient infra errors Nak for redelivery. 
+ if _, isPermanent := errcode.IsPermanent(err); isPermanent { + slog.Warn("permanent event failure — dropping (Ack)", "error", err, "request_id", natsutil.RequestIDFromContext(handlerCtx)) + if err := m.Ack(); err != nil { + slog.Error("failed to ack permanent message", "error", err) + } + return + } slog.Error("handle event failed", "error", err, "request_id", natsutil.RequestIDFromContext(handlerCtx)) if err := m.Nak(); err != nil { slog.Error("failed to nak message", "error", err) diff --git a/message-gatekeeper/fetcher_history.go b/message-gatekeeper/fetcher_history.go index 55a5cf1ce..90a86b293 100644 --- a/message-gatekeeper/fetcher_history.go +++ b/message-gatekeeper/fetcher_history.go @@ -8,7 +8,7 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" - "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model/cassandra" "github.com/hmchangw/chat/pkg/subject" ) @@ -55,12 +55,12 @@ func (f *historyParentFetcher) FetchQuotedParent( return nil, fmt.Errorf("history request: %w", err) } - // natsrouter encodes errors as {"error":"...","code":"..."}. Detect that - // shape first; a successful Message has no top-level "error" field, so - // this can't false-positive on a real response. - var errEnv model.ErrorResponse - if jsonErr := json.Unmarshal(msg.Data, &errEnv); jsonErr == nil && errEnv.Error != "" { - return nil, fmt.Errorf("history response error: %s", errEnv.Error) + // Detect the errcode error envelope first; a real Message has no top-level + // "error" field so this cannot false-positive. Propagate the typed remote + // errcode so the caller can preserve the upstream classification (a + // transient infra failure stays unavailable, not collapsed to not_found). 
+ if ee, ok := errcode.Parse(msg.Data); ok && ee.Code.Valid() { + return nil, ee } var parent cassandra.Message diff --git a/message-gatekeeper/fetcher_history_test.go b/message-gatekeeper/fetcher_history_test.go index 6876fa3aa..39b63e81f 100644 --- a/message-gatekeeper/fetcher_history_test.go +++ b/message-gatekeeper/fetcher_history_test.go @@ -12,7 +12,7 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" - "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model/cassandra" "github.com/hmchangw/chat/pkg/subject" ) @@ -81,11 +81,11 @@ func TestHistoryParentFetcher_FetchQuotedParent(t *testing.T) { assert.Equal(t, threadParentCreatedAt, got.ThreadParentCreatedAt.UTC()) }) - t.Run("history returns natsrouter error envelope — returns error", func(t *testing.T) { + t.Run("history returns errcode error envelope — returns error", func(t *testing.T) { nc := startTestNATS(t) _, err := nc.Subscribe(subject.MsgGet(account, roomID, siteID), func(m otelnats.Msg) { - data, _ := json.Marshal(model.ErrorResponse{Error: "message not found"}) + data, _ := json.Marshal(errcode.NotFound("message not found")) _ = m.Msg.Respond(data) }) require.NoError(t, err) diff --git a/message-gatekeeper/handler.go b/message-gatekeeper/handler.go index f50ce31c7..0fdeace5f 100644 --- a/message-gatekeeper/handler.go +++ b/message-gatekeeper/handler.go @@ -6,12 +6,15 @@ import ( "errors" "fmt" "log/slog" + "strconv" "strings" "time" "github.com/nats-io/nats.go" "github.com/nats-io/nats.go/jetstream" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errnats" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/model/cassandra" @@ -21,19 +24,6 @@ import ( const maxContentBytes = 20 * 1024 // 20 KB -// infraError represents a transient failure that should be nack'd for retry. 
-type infraError struct { - cause error -} - -func (e *infraError) Error() string { - return e.cause.Error() -} - -func (e *infraError) Unwrap() error { - return e.cause -} - // replyFunc is the function signature for publishing a reply to a NATS subject. type replyFunc func(ctx context.Context, msg *nats.Msg) error @@ -65,57 +55,77 @@ func NewHandler(store Store, publish publishFunc, reply replyFunc, siteID string // HandleJetStreamMsg processes a JetStream message from the MESSAGES stream. func (h *Handler) HandleJetStreamMsg(ctx context.Context, msg jetstream.Msg) { + // Parse the body once; reused for log enrichment, reply routing, and + // processMessage validation (was triple-decoded on the hot path). + rawData := msg.Data() + var req model.SendMessageRequest + parseErr := json.Unmarshal(rawData, &req) + + // Enrich the logger before the subject parse so even the malformed-subject + // path carries request_id + a best-effort account. roomID is added later. + ctx = errcode.WithLogValues(ctx, + "request_id", req.RequestID, + "account", accountFromSubject(msg.Subject())) + account, roomID, siteID, ok := subject.ParseUserRoomSiteSubject(msg.Subject()) if !ok { slog.Warn("invalid subject", "subject", msg.Subject()) - // Best-effort error reply so the client doesn't hang waiting for a - // response it will never get. Recover the account segment if the - // subject is at least chat.user.{account}.…; sendReply no-ops when the - // account or requestId is unusable. Ack regardless — a malformed - // subject is not retryable, so JetStream must not redeliver it. - h.sendReply(ctx, accountFromSubject(msg.Subject()), msg.Data(), natsutil.MarshalError("invalid message subject")) + // Best-effort error reply so the client doesn't hang; sendReply no-ops + // when account or requestId is unusable. Ack — malformed is not retryable. 
+ h.sendReply(ctx, accountFromSubject(msg.Subject()), &req, errnats.Marshal(ctx, errcode.BadRequest("invalid message subject"))) if err := msg.Ack(); err != nil { slog.Error("failed to ack message", "error", err) } return } - replyData, err := h.processMessage(ctx, account, roomID, siteID, msg.Data()) + ctx = errcode.WithLogValues(ctx, "room_id", roomID) + + if parseErr != nil { + // Do not WithCause(parseErr) — json.SyntaxError strings embed the + // offending substring from an unauthenticated entry-point (see doc.go). + bad := errcode.BadRequest("unmarshal send message request") + h.sendReply(ctx, account, &req, errnats.Marshal(ctx, bad)) + if err := msg.Ack(); err != nil { + slog.Error("failed to ack message", "error", err) + } + return + } + + replyData, err := h.processMessage(ctx, account, roomID, siteID, &req) if err != nil { - slog.Error("process message failed", "error", err, "account", account, "roomID", roomID) - var ie *infraError - if errors.As(err, &ie) { - if err := msg.Nak(); err != nil { - slog.Error("failed to nack message", "error", err) - } - } else { - // Validation error: reply with error and ack. - h.sendReply(ctx, account, msg.Data(), h.marshalErrorReply(err)) + // Typed *errcode.Error → client-facing validation/permanence: reply + Ack. + // Bare error (raw fmt.Errorf) → transient infra failure: Nak for redelivery. + // errnats.Marshal runs Classify which logs once at category-aware level — + // validation branch must NOT also log here. Infra branch owns its log. 
+ var ee *errcode.Error + if errors.As(err, &ee) { + h.sendReply(ctx, account, &req, errnats.Marshal(ctx, err)) if err := msg.Ack(); err != nil { slog.Error("failed to ack message", "error", err) } + } else { + slog.ErrorContext(ctx, "process message failed (infra)", "error", err, "account", account, "room_id", roomID) + if err := msg.Nak(); err != nil { + slog.Error("failed to nack message", "error", err) + } } return } - h.sendReply(ctx, account, msg.Data(), replyData) + h.sendReply(ctx, account, &req, replyData) if err := msg.Ack(); err != nil { slog.Error("failed to ack message", "err", err) } } -// sendReply extracts the requestID from the raw message data and publishes the -// reply payload to the user's response subject. -func (h *Handler) sendReply(ctx context.Context, account string, rawData []byte, replyData []byte) { +// sendReply publishes the reply payload to the user's response subject. Pass +// a zero-value *req when parsing failed — the empty RequestID gate no-ops. +func (h *Handler) sendReply(ctx context.Context, account string, req *model.SendMessageRequest, replyData []byte) { if account == "" { return } - var req model.SendMessageRequest - if err := json.Unmarshal(rawData, &req); err != nil { - slog.Error("unmarshal request for reply", "error", err) - return - } // Skip when requestId is missing or not a valid hyphenated UUID — the reply // subject chat.user.{account}.response.{requestId} would be unroutable, and // processMessage already rejects such requests upstream. @@ -140,19 +150,13 @@ func accountFromSubject(subj string) string { return "" } -// processMessage validates a SendMessageRequest and publishes a MessageEvent to MESSAGES_CANONICAL. -// Returns the serialized Message on success, or an error. -// Validation errors (bad input) are plain errors; transient failures are *infraError. 
-func (h *Handler) processMessage(ctx context.Context, account, roomID, siteID string, data []byte) ([]byte, error) { +// processMessage validates a SendMessageRequest and publishes a MessageEvent +// to MESSAGES_CANONICAL. Validation errors are typed *errcode.Error (reply + +// Ack); transient infra failures are bare fmt.Errorf (Nak for redelivery). +func (h *Handler) processMessage(ctx context.Context, account, roomID, siteID string, req *model.SendMessageRequest) ([]byte, error) { // Validate siteID matches this service's siteID if siteID != h.siteID { - return nil, fmt.Errorf("siteID mismatch: got %s, want %s", siteID, h.siteID) - } - - // Unmarshal request - var req model.SendMessageRequest - if err := json.Unmarshal(data, &req); err != nil { - return nil, fmt.Errorf("unmarshal send message request: %w", err) + return nil, errcode.BadRequest(fmt.Sprintf("siteID mismatch: got %s, want %s", siteID, h.siteID)) } // Validate requestId is a hyphenated UUID. It is required: the async reply @@ -161,40 +165,45 @@ func (h *Handler) processMessage(ctx context.Context, account, roomID, siteID st // the reply. Rejecting here fails fast instead of publishing an // unacknowledgeable message to MESSAGES_CANONICAL. 
if !idgen.IsValidUUID(req.RequestID) { - return nil, fmt.Errorf("invalid requestId %q: must be a hyphenated UUID", req.RequestID) + return nil, errcode.BadRequest(fmt.Sprintf("invalid requestId %q: must be a hyphenated UUID", req.RequestID)) } // Validate ID is a valid 20-char base62 message ID if !idgen.IsValidMessageID(req.ID) { - return nil, fmt.Errorf("invalid message ID %q: must be a 20-char base62 string", req.ID) + return nil, errcode.BadRequest(fmt.Sprintf("invalid message ID %q: must be a 20-char base62 string", req.ID)) } if req.ThreadParentMessageID != "" && !idgen.IsValidMessageID(req.ThreadParentMessageID) { - return nil, fmt.Errorf("invalid thread parent message ID %q: must be a 20-char base62 string", req.ThreadParentMessageID) + return nil, errcode.BadRequest(fmt.Sprintf("invalid thread parent message ID %q: must be a 20-char base62 string", req.ThreadParentMessageID)) } // Validate content is non-empty if req.Content == "" { - return nil, fmt.Errorf("content must not be empty") + return nil, errcode.BadRequest("content must not be empty") } // Validate content does not exceed 20KB if len(req.Content) > maxContentBytes { - return nil, fmt.Errorf("content exceeds maximum size of %d bytes", maxContentBytes) + return nil, errcode.BadRequest( + fmt.Sprintf("content exceeds maximum size of %d bytes", maxContentBytes), + errcode.WithMetadata("maxContentBytes", strconv.Itoa(maxContentBytes), "attempted", strconv.Itoa(len(req.Content))), + ) } // Validate thread parent fields are paired if req.ThreadParentMessageID != "" && req.ThreadParentMessageCreatedAt == nil { - return nil, fmt.Errorf("validate thread parent fields: threadParentMessageCreatedAt is required when threadParentMessageId is set") + return nil, errcode.BadRequest("validate thread parent fields: threadParentMessageCreatedAt is required when threadParentMessageId is set") } // Verify subscription sub, err := h.store.GetSubscription(ctx, account, roomID) if err != nil { if errors.Is(err, 
errNotSubscribed) { - return nil, fmt.Errorf("user %s is not subscribed to room %s", account, roomID) + // Return the wrapped err so server-side logs keep the full chain + // (store wrapped it with %w; errors.Is upstream still matches). + return nil, err } - return nil, &infraError{cause: fmt.Errorf("get subscription for user %s in room %s: %w", account, roomID, err)} + return nil, fmt.Errorf("get subscription for user %s in room %s: %w", account, roomID, err) } // Large-room post restriction: in rooms with more than the configured @@ -207,13 +216,13 @@ func (h *Handler) processMessage(ctx context.Context, account, roomID, siteID st if !isThreadReply && !canBypassLargeRoomCap(sub) { meta, err := h.store.GetRoomMeta(ctx, roomID) if err != nil { - return nil, &infraError{cause: fmt.Errorf("get room meta for %s: %w", roomID, err)} + return nil, fmt.Errorf("get room meta for %s: %w", roomID, err) } if meta.UserCount > h.largeRoomThreshold { slog.Info("send blocked", - "reason", codeLargeRoomPostRestricted, + "reason", string(errcode.MessageLargeRoomPostRestricted), "account", account, - "roomID", roomID, + "room_id", roomID, "userCount", meta.UserCount, "threshold", h.largeRoomThreshold, ) @@ -251,13 +260,13 @@ func (h *Handler) processMessage(ctx context.Context, account, roomID, siteID st evt := model.MessageEvent{Event: model.EventCreated, Message: msg, SiteID: siteID, Timestamp: now.UnixMilli()} evtData, err := json.Marshal(evt) if err != nil { - return nil, &infraError{cause: fmt.Errorf("marshal message event: %w", err)} + return nil, fmt.Errorf("marshal message event: %w", err) } canonicalSubj := subject.MsgCanonicalCreated(siteID) canonicalMsg := natsutil.NewMsg(ctx, canonicalSubj, evtData) if _, err := h.publish(ctx, canonicalMsg, jetstream.WithMsgID(natsutil.CanonicalDedupID(&evt))); err != nil { - return nil, &infraError{cause: fmt.Errorf("publish to MESSAGES_CANONICAL: %w", err)} + return nil, fmt.Errorf("publish to MESSAGES_CANONICAL: %w", err) } return 
json.Marshal(msg) @@ -274,14 +283,24 @@ func (h *Handler) resolveQuoteSnapshot(ctx context.Context, account, roomID, sit snap, err := h.parentFetcher.FetchQuotedParent(ctx, account, roomID, siteID, quotedParentMessageID) switch { case err != nil: - return nil, fmt.Errorf("fetch quoted parent %s: %w", quotedParentMessageID, err) + // Preserve upstream errcode classification (transient → Unavailable, + // real 404 → NotFound). For non-errcode infra failures (NATS timeout, + // no-responders, unmarshal), classify as Unavailable — a transient + // quoted-parent fetch failure shouldn't surface to the client as 404. + var ee *errcode.Error + if errors.As(err, &ee) { + return nil, ee + } + return nil, errcode.Unavailable(fmt.Sprintf("fetch quoted parent %s", quotedParentMessageID), errcode.WithCause(err)) case snap == nil: - // Treat the fetcher's contract violation as a hard failure rather than - // silently dereferencing snap.ThreadParentID below. + // A nil snapshot with no error is a fetcher contract violation, not a + // genuine missing parent. Return a bare error so the caller's branch + // classifies this as infra (Nak for redelivery + log) rather than + // permanently dropping the message via a 404 reply+Ack. return nil, fmt.Errorf("fetch quoted parent %s: fetcher returned nil snapshot", quotedParentMessageID) case snap.ThreadParentID != newMessageThreadID: - return nil, fmt.Errorf("quoted parent %s thread context mismatch: parent thread %q, new message thread %q", - quotedParentMessageID, snap.ThreadParentID, newMessageThreadID) + return nil, errcode.BadRequest(fmt.Sprintf("quoted parent %s thread context mismatch: parent thread %q, new message thread %q", + quotedParentMessageID, snap.ThreadParentID, newMessageThreadID)) default: return snap, nil } @@ -301,14 +320,3 @@ func canBypassLargeRoomCap(sub *model.Subscription) bool { } return isBot(sub.User.Account) } - -// marshalErrorReply produces the JSON reply payload for a validation error. 
-// If the error is (or wraps) a *codedError, the reply carries the code; -// otherwise the reply is the legacy uncoded shape. -func (h *Handler) marshalErrorReply(err error) []byte { - var ce *codedError - if errors.As(err, &ce) { - return natsutil.MarshalErrorWithCode(ce.Message, ce.Code) - } - return natsutil.MarshalError(err.Error()) -} diff --git a/message-gatekeeper/handler_test.go b/message-gatekeeper/handler_test.go index fc3943cc0..e8dd0496b 100644 --- a/message-gatekeeper/handler_test.go +++ b/message-gatekeeper/handler_test.go @@ -16,6 +16,9 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo" "go.uber.org/mock/gomock" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errnats" + "github.com/hmchangw/chat/pkg/errcode/errtest" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/model/cassandra" @@ -709,16 +712,21 @@ func TestHandler_ProcessMessage(t *testing.T) { largeRoomThreshold: threshold, } - data, err := h.processMessage(context.Background(), tc.account, tc.roomID, tc.siteID, tc.buildData()) + var req model.SendMessageRequest + _ = json.Unmarshal(tc.buildData(), &req) // tests build valid payloads; ignore parse errors here + data, err := h.processMessage(context.Background(), tc.account, tc.roomID, tc.siteID, &req) if tc.wantErr { require.Error(t, err) + // Post-infraError-retirement: infra = bare error (no *errcode.Error + // in chain), validation = typed *errcode.Error. Handler routes Nak + // vs Ack on this distinction. 
+ var ee *errcode.Error + hasErrcode := errors.As(err, &ee) if tc.wantInfra { - var ie *infraError - assert.True(t, errors.As(err, &ie), "expected infraError, got %T: %v", err, err) + assert.False(t, hasErrcode, "expected infra error (no *errcode.Error), got %T: %v", err, err) } else { - var ie *infraError - assert.False(t, errors.As(err, &ie), "expected non-infra error, got infraError: %v", err) + assert.True(t, hasErrcode, "expected validation *errcode.Error, got %T: %v", err, err) } if tc.checkErr != nil { tc.checkErr(t, err) @@ -759,8 +767,7 @@ func TestHandler_processMessage_RejectsInvalidThreadParentMessageID(t *testing.T ThreadParentMessageID: "not-a-valid-msg-id", ThreadParentMessageCreatedAt: &parentTs, } - data, _ := json.Marshal(req) - _, err := h.processMessage(context.Background(), "alice", "room-1", "site1", data) + _, err := h.processMessage(context.Background(), "alice", "room-1", "site1", &req) require.Error(t, err) assert.Contains(t, err.Error(), "invalid thread parent message ID") } @@ -784,9 +791,8 @@ func TestHandler_processMessage_PropagatesRequestIDOnCanonicalPublish(t *testing ctx := natsutil.WithRequestID(context.Background(), "req-mg-test-id") req := model.SendMessageRequest{ID: idgen.GenerateMessageID(), Content: "hello", RequestID: "01970a4f-8c2d-7c9a-abcd-e0123456789f"} - data, _ := json.Marshal(req) - _, err := h.processMessage(ctx, "alice", "room-1", "site1", data) + _, err := h.processMessage(ctx, "alice", "room-1", "site1", &req) require.NoError(t, err) require.NotNil(t, capturedHeader, "publish must propagate header from ctx") assert.Equal(t, "req-mg-test-id", capturedHeader.Get(natsutil.RequestIDHeader)) @@ -1077,7 +1083,9 @@ func TestHandler_ProcessMessage_WithQuote(t *testing.T) { largeRoomThreshold: 500, } - data, err := h.processMessage(context.Background(), validAccount, validRoomID, validSiteID, tc.buildData()) + var req model.SendMessageRequest + _ = json.Unmarshal(tc.buildData(), &req) + data, err := 
h.processMessage(context.Background(), validAccount, validRoomID, validSiteID, &req) if tc.wantErr { require.Error(t, err) @@ -1156,34 +1164,35 @@ func TestCanBypassLargeRoomCap(t *testing.T) { } } -func TestHandler_marshalErrorReply(t *testing.T) { - h := &Handler{} +func TestHandler_errorReplyEnvelope(t *testing.T) { + ctx := context.Background() - t.Run("plain error produces uncoded reply", func(t *testing.T) { - data := h.marshalErrorReply(errors.New("user alice is not subscribed to room R")) - var got model.ErrorResponse - require.NoError(t, json.Unmarshal(data, &got)) - assert.Equal(t, "user alice is not subscribed to room R", got.Error) - assert.Empty(t, got.Code) - // omitempty: the wire bytes must not contain a "code" key. - assert.NotContains(t, string(data), `"code"`) + t.Run("validation error produces bad_request envelope", func(t *testing.T) { + data := errnats.Marshal(ctx, errcode.BadRequest("content must not be empty")) + e := errtest.Decode(t, data) + assert.Equal(t, errcode.CodeBadRequest, e.Code) + assert.Equal(t, "content must not be empty", e.Message) + assert.Empty(t, e.Reason) }) - t.Run("codedError produces coded reply", func(t *testing.T) { - data := h.marshalErrorReply(errLargeRoomPostRestricted) - var got model.ErrorResponse - require.NoError(t, json.Unmarshal(data, &got)) - assert.Equal(t, "posting is restricted to owners and admins in this room", got.Error) - assert.Equal(t, "large_room_post_restricted", got.Code) + t.Run("large-room sentinel produces forbidden envelope with reason", func(t *testing.T) { + data := errnats.Marshal(ctx, errLargeRoomPostRestricted) + errtest.AssertCode(t, data, errcode.CodeForbidden) + errtest.AssertReason(t, data, errcode.MessageLargeRoomPostRestricted) + assert.Equal(t, "posting is restricted to owners and admins in this room", errtest.Decode(t, data).Message) }) - t.Run("wrapped codedError still dispatches", func(t *testing.T) { + t.Run("wrapped large-room sentinel still carries forbidden + reason", 
func(t *testing.T) { wrapped := fmt.Errorf("context: %w", errLargeRoomPostRestricted) - data := h.marshalErrorReply(wrapped) - var got model.ErrorResponse - require.NoError(t, json.Unmarshal(data, &got)) - assert.Equal(t, "posting is restricted to owners and admins in this room", got.Error) - assert.Equal(t, "large_room_post_restricted", got.Code) + data := errnats.Marshal(ctx, wrapped) + errtest.AssertCode(t, data, errcode.CodeForbidden) + errtest.AssertReason(t, data, errcode.MessageLargeRoomPostRestricted) + }) + + t.Run("not-subscribed sentinel produces forbidden envelope with reason", func(t *testing.T) { + data := errnats.Marshal(ctx, errNotSubscribed) + errtest.AssertCode(t, data, errcode.CodeForbidden) + errtest.AssertReason(t, data, errcode.MessageNotSubscribed) }) } @@ -1215,9 +1224,8 @@ func TestHandler_sendReply(t *testing.T) { return NewHandler(nil, nil, reply, "site-a", nil, 500) } - mk := func(requestID string) []byte { - b, _ := json.Marshal(model.SendMessageRequest{ID: "id", Content: "c", RequestID: requestID}) - return b + mk := func(requestID string) *model.SendMessageRequest { + return &model.SendMessageRequest{ID: "id", Content: "c", RequestID: requestID} } t.Run("valid UUID requestId publishes a reply", func(t *testing.T) { @@ -1249,3 +1257,59 @@ func TestHandler_sendReply(t *testing.T) { assert.Empty(t, captured) }) } + +// ---- HandleJetStreamMsg coverage ---- + +type fakeJSMsg struct { + subject string + data []byte + headers nats.Header + acked bool + naked bool +} + +func (m *fakeJSMsg) Metadata() (*jetstream.MsgMetadata, error) { return nil, nil } +func (m *fakeJSMsg) Data() []byte { return m.data } +func (m *fakeJSMsg) Headers() nats.Header { return m.headers } +func (m *fakeJSMsg) Subject() string { return m.subject } +func (m *fakeJSMsg) Reply() string { return "" } +func (m *fakeJSMsg) Ack() error { m.acked = true; return nil } +func (m *fakeJSMsg) DoubleAck(context.Context) error { m.acked = true; return nil } +func (m *fakeJSMsg) 
Nak() error { m.naked = true; return nil } +func (m *fakeJSMsg) NakWithDelay(time.Duration) error { m.naked = true; return nil } +func (m *fakeJSMsg) InProgress() error { return nil } +func (m *fakeJSMsg) Term() error { return nil } +func (m *fakeJSMsg) TermWithReason(string) error { return nil } + +// Malformed body Acks (not retryable). A bad_request reply is attempted, but +// sendReply drops it because no valid requestId can be recovered from the body. +func TestHandleJetStreamMsg_MalformedBody_Acks(t *testing.T) { + var captured []*nats.Msg + reply := func(_ context.Context, m *nats.Msg) error { + captured = append(captured, m) + return nil + } + h := NewHandler(nil, nil, reply, "site-A", nil, 500) + + msg := &fakeJSMsg{ + subject: "chat.user.alice.room.r1.site-A.msg.send", + data: []byte(`{not json`), + } + h.HandleJetStreamMsg(context.Background(), msg) + assert.True(t, msg.acked, "malformed body must Ack — never retryable") + assert.False(t, msg.naked) + // Reply is skipped (no valid requestId in a body that didn't parse). + assert.Empty(t, captured, "no reply when requestId can't be recovered") +} + +// Invalid subject Acks (not retryable) and sends a best-effort reply. 
+func TestHandleJetStreamMsg_InvalidSubject_Acks(t *testing.T) { + h := NewHandler(nil, nil, func(context.Context, *nats.Msg) error { return nil }, "site-A", nil, 500) + msg := &fakeJSMsg{ + subject: "chat.garbage", + data: []byte(`{}`), + } + h.HandleJetStreamMsg(context.Background(), msg) + assert.True(t, msg.acked, "invalid subject must Ack — not retryable") + assert.False(t, msg.naked) +} diff --git a/message-gatekeeper/main.go b/message-gatekeeper/main.go index adb3f92f7..713793224 100644 --- a/message-gatekeeper/main.go +++ b/message-gatekeeper/main.go @@ -140,7 +140,7 @@ func main() { <-sem wg.Done() }() - handlerCtx := natsutil.ContextWithRequestIDFromHeaders(msgCtx, msg.Headers()) + handlerCtx, _ := natsutil.StampRequestID(msgCtx, msg.Headers(), msg.Subject()) handler.HandleJetStreamMsg(handlerCtx, msg) }() } diff --git a/message-gatekeeper/store.go b/message-gatekeeper/store.go index 783ef36bd..7893fb133 100644 --- a/message-gatekeeper/store.go +++ b/message-gatekeeper/store.go @@ -2,8 +2,8 @@ package main import ( "context" - "errors" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/model/cassandra" "github.com/hmchangw/chat/pkg/roommetacache" @@ -11,30 +11,16 @@ import ( //go:generate mockgen -destination=mock_store_test.go -package=main . Store,ParentMessageFetcher -// errNotSubscribed is returned when the user is not subscribed to the room. -var errNotSubscribed = errors.New("not subscribed") +var ( + // errNotSubscribed: matched by identity (errors.Is); forbidden/not_subscribed survives to the client. + errNotSubscribed = errcode.Forbidden("not subscribed", errcode.WithReason(errcode.MessageNotSubscribed)) -// codedError pairs a stable wire code with a user-safe message. Returned by -// validation paths that want the reply to carry a machine-readable code. 
-type codedError struct { - Code string - Message string -} - -func (e *codedError) Error() string { return e.Message } - -// codeLargeRoomPostRestricted is the wire code emitted when a non-bypass -// sender hits the cap. Shared between the error sentinel and the slog -// "reason" field so log queries and the wire payload stay aligned. -const codeLargeRoomPostRestricted = "large_room_post_restricted" - -// errLargeRoomPostRestricted is returned when a sender without bypass -// privileges (owner, admin, or bot account) attempts to post a top-level -// message in a room whose userCount exceeds the configured threshold. -var errLargeRoomPostRestricted = &codedError{ - Code: codeLargeRoomPostRestricted, - Message: "posting is restricted to owners and admins in this room", -} + // errLargeRoomPostRestricted: returned when a sender without bypass privileges + // (owner/admin/bot) posts to a room whose userCount exceeds the threshold. + errLargeRoomPostRestricted = errcode.Forbidden( + "posting is restricted to owners and admins in this room", + errcode.WithReason(errcode.MessageLargeRoomPostRestricted)) +) type Store interface { GetSubscription(ctx context.Context, account, roomID string) (*model.Subscription, error) diff --git a/message-worker/handler.go b/message-worker/handler.go index f8d08e863..8753d909d 100644 --- a/message-worker/handler.go +++ b/message-worker/handler.go @@ -42,15 +42,15 @@ func NewHandler(store Store, userStore userstore.UserStore, threadStore ThreadSt func (h *Handler) HandleJetStreamMsg(ctx context.Context, msg jetstream.Msg) { if err := h.processMessage(ctx, msg.Data()); err != nil { - slog.Error("process message failed", "error", err) + slog.ErrorContext(ctx, "process message failed", "error", err, "request_id", natsutil.RequestIDFromContext(ctx)) if nakErr := msg.Nak(); nakErr != nil { - slog.Error("failed to nack message", "error", nakErr) + slog.ErrorContext(ctx, "failed to nack message", "error", nakErr, "request_id", 
natsutil.RequestIDFromContext(ctx)) } return } if err := msg.Ack(); err != nil { - slog.Error("failed to ack message", "err", err) + slog.ErrorContext(ctx, "failed to ack message", "error", err, "request_id", natsutil.RequestIDFromContext(ctx)) } } @@ -71,8 +71,9 @@ func (h *Handler) processMessage(ctx context.Context, data []byte) error { if err != nil { if evt.Message.Type != "" { // System messages may have no real user; proceed with nil sender. - slog.Warn("user not found for system message, using nil sender", - "userID", evt.Message.UserID, "type", evt.Message.Type) + slog.WarnContext(ctx, "user not found for system message, using nil sender", + "user_id", evt.Message.UserID, "type", evt.Message.Type, + "request_id", natsutil.RequestIDFromContext(ctx)) } else { return fmt.Errorf("lookup user %s: %w", evt.Message.UserID, err) } @@ -153,9 +154,10 @@ func (h *Handler) handleFirstThreadReply(ctx context.Context, msg *model.Message parentSender, err := h.store.GetMessageSender(ctx, msg.ThreadParentMessageID) if err != nil { if errors.Is(err, errMessageNotFound) { - slog.Warn("thread reply parent not found — skipping subscription creation", + slog.WarnContext(ctx, "thread reply parent not found — skipping subscription creation", "parentMessageID", msg.ThreadParentMessageID, - "replyID", msg.ID) + "replyID", msg.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) return nil } return fmt.Errorf("get parent message sender: %w", err) @@ -194,12 +196,12 @@ func (h *Handler) handleFirstThreadReply(ctx context.Context, msg *model.Message return fmt.Errorf("stamp thread_room_id on parent message: %w", err) } } else { - slog.Error("first thread reply: ThreadParentMessageCreatedAt is nil, parent thread_room_id stamp skipped", + slog.ErrorContext(ctx, "first thread reply: ThreadParentMessageCreatedAt is nil, parent thread_room_id stamp skipped", "request_id", natsutil.RequestIDFromContext(ctx), "replyID", msg.ID, "parentMessageID", msg.ThreadParentMessageID, 
"threadRoomID", threadRoomID, - "roomID", msg.RoomID, + "room_id", msg.RoomID, ) } @@ -246,9 +248,10 @@ func (h *Handler) handleSubsequentThreadReply(ctx context.Context, msg *model.Me } case errors.Is(err, errMessageNotFound): parentFound = false - slog.Warn("thread reply parent not found — skipping parent subscription upsert", + slog.WarnContext(ctx, "thread reply parent not found — skipping parent subscription upsert", "parentMessageID", msg.ThreadParentMessageID, - "replyID", msg.ID) + "replyID", msg.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) if replier != nil { replierSub := h.buildThreadSubscription(msg, existingRoom.ID, msg.UserID, msg.UserAccount, eventSiteID, now) if err := h.threadStore.UpsertThreadSubscription(ctx, replierSub); err != nil { @@ -274,20 +277,20 @@ func (h *Handler) handleSubsequentThreadReply(ctx context.Context, msg *model.Me return "", fmt.Errorf("stamp thread_room_id on parent message: %w", err) } case !parentFound: - slog.Error("subsequent thread reply: parent not found in messages_by_id, thread_room_id stamp skipped", + slog.ErrorContext(ctx, "subsequent thread reply: parent not found in messages_by_id, thread_room_id stamp skipped", "request_id", natsutil.RequestIDFromContext(ctx), "replyID", msg.ID, "parentMessageID", msg.ThreadParentMessageID, "threadRoomID", existingRoom.ID, - "roomID", msg.RoomID, + "room_id", msg.RoomID, ) default: // msg.ThreadParentMessageCreatedAt == nil - slog.Error("subsequent thread reply: ThreadParentMessageCreatedAt is nil, parent thread_room_id stamp skipped", + slog.ErrorContext(ctx, "subsequent thread reply: ThreadParentMessageCreatedAt is nil, parent thread_room_id stamp skipped", "request_id", natsutil.RequestIDFromContext(ctx), "replyID", msg.ID, "parentMessageID", msg.ThreadParentMessageID, "threadRoomID", existingRoom.ID, - "roomID", msg.RoomID, + "room_id", msg.RoomID, ) } @@ -302,8 +305,9 @@ func (h *Handler) lookupOwnerSiteID(ctx context.Context, userID, role string) (s user, err 
:= h.userStore.FindUserByID(ctx, userID) if err != nil { if errors.Is(err, userstore.ErrUserNotFound) { - slog.Warn("owner user not found — skipping cross-site outbox publish; local thread subscription insert/upsert continues", - "userID", userID, "role", role) + slog.WarnContext(ctx, "owner user not found — skipping cross-site outbox publish; local thread subscription insert/upsert continues", + "user_id", userID, "role", role, + "request_id", natsutil.RequestIDFromContext(ctx)) return "", nil } return "", fmt.Errorf("lookup user %s: %w", userID, err) @@ -370,8 +374,9 @@ func (h *Handler) markThreadMentions(ctx context.Context, msg *model.Message, th // absorbs duplicates within the dedup window. func (h *Handler) publishThreadSubOutboxIfRemote(ctx context.Context, sub *model.ThreadSubscription, ownerSiteID, msgID string) error { if ownerSiteID == "" { - slog.Warn("owner siteID empty, skipping outbox publish", - "threadRoomID", sub.ThreadRoomID, "userID", sub.UserID, "msgID", msgID) + slog.WarnContext(ctx, "owner siteID empty, skipping outbox publish", + "threadRoomID", sub.ThreadRoomID, "user_id", sub.UserID, "msgID", msgID, + "request_id", natsutil.RequestIDFromContext(ctx)) return nil } if ownerSiteID == h.siteID { diff --git a/message-worker/main.go b/message-worker/main.go index 96e364640..ce0e4d46c 100644 --- a/message-worker/main.go +++ b/message-worker/main.go @@ -171,7 +171,7 @@ func main() { <-sem wg.Done() }() - handlerCtx := natsutil.ContextWithRequestIDFromHeaders(msgCtx, msg.Headers()) + handlerCtx, _ := natsutil.StampRequestID(msgCtx, msg.Headers(), msg.Subject()) handler.HandleJetStreamMsg(handlerCtx, msg) }() } diff --git a/message-worker/store_cassandra.go b/message-worker/store_cassandra.go index e17579d78..999aa7c7a 100644 --- a/message-worker/store_cassandra.go +++ b/message-worker/store_cassandra.go @@ -404,7 +404,7 @@ func (s *CassandraStore) UpdateParentMessageThreadRoomID(ctx context.Context, pa slog.Error("thread_room_id stamp on 
messages_by_room missed: parent row not found at the given (room_id, bucket, created_at, message_id) coordinates", "request_id", natsutil.RequestIDFromContext(ctx), "messageID", parentMessageID, - "roomID", roomID, + "room_id", roomID, "bucket", parentBucket, "parentCreatedAt", parentCreatedAt, "threadRoomID", threadRoomID, diff --git a/mock-user-service/handler.go b/mock-user-service/handler.go index 78ea07ca0..5821ecc6d 100644 --- a/mock-user-service/handler.go +++ b/mock-user-service/handler.go @@ -3,6 +3,7 @@ package main import ( "time" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" "github.com/hmchangw/chat/pkg/subject" @@ -107,7 +108,7 @@ func NewHandler(siteID string) *Handler { func (h *Handler) checkSite(c *natsrouter.Context) error { if c.Param("siteID") != h.siteID { - return natsrouter.ErrNotFound("unknown site") + return errcode.NotFound("unknown site") } return nil } diff --git a/mock-user-service/handler_test.go b/mock-user-service/handler_test.go index 54caee70c..9d70e6228 100644 --- a/mock-user-service/handler_test.go +++ b/mock-user-service/handler_test.go @@ -7,6 +7,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/natsrouter" ) @@ -22,12 +23,12 @@ func TestHandler_CheckSite(t *testing.T) { assert.NoError(t, err) }) - t.Run("mismatch returns ErrNotFound", func(t *testing.T) { + t.Run("mismatch returns CodeNotFound", func(t *testing.T) { err := h.checkSite(newCtx(map[string]string{"siteID": "site-other"})) require.Error(t, err) - var routeErr *natsrouter.RouteError - require.True(t, errors.As(err, &routeErr), "want *natsrouter.RouteError, got %T", err) - assert.Equal(t, natsrouter.CodeNotFound, routeErr.Code) + var ee *errcode.Error + require.True(t, errors.As(err, &ee), "want *errcode.Error, got %T", err) + assert.Equal(t, errcode.CodeNotFound, ee.Code) }) } diff 
--git a/notification-worker/handler.go b/notification-worker/handler.go index 36df12943..ac311877a 100644 --- a/notification-worker/handler.go +++ b/notification-worker/handler.go @@ -64,6 +64,7 @@ func (h *Handler) HandleMessage(ctx context.Context, data []byte) error { } subj := subject.Notification(subs[i].User.Account) if err := h.pub.Publish(ctx, subj, notifData); err != nil { + // account is intentionally logged for operability; do NOT add message body / token fields. slog.Error("publish notification failed", "error", err, "account", subs[i].User.Account) } } diff --git a/notification-worker/main.go b/notification-worker/main.go index bf2dcf723..752327328 100644 --- a/notification-worker/main.go +++ b/notification-worker/main.go @@ -45,13 +45,13 @@ func (m *mongoMemberLookup) ListSubscriptions(ctx context.Context, roomID string filter := map[string]string{"roomId": roomID} cursor, err := m.col.Find(ctx, filter) if err != nil { - return nil, err + return nil, fmt.Errorf("find subscriptions for room %s: %w", roomID, err) } defer cursor.Close(ctx) var subs []model.Subscription if err := cursor.All(ctx, &subs); err != nil { - return nil, err + return nil, fmt.Errorf("decode subscriptions for room %s: %w", roomID, err) } return subs, nil } @@ -131,7 +131,7 @@ func main() { <-sem wg.Done() }() - handlerCtx := natsutil.ContextWithRequestIDFromHeaders(msgCtx, msg.Headers()) + handlerCtx, _ := natsutil.StampRequestID(msgCtx, msg.Headers(), msg.Subject()) if err := handler.HandleMessage(handlerCtx, msg.Data()); err != nil { slog.Error("handle message failed", "error", err, "request_id", natsutil.RequestIDFromContext(handlerCtx)) if err := msg.Nak(); err != nil { diff --git a/pkg/errcode/category.go b/pkg/errcode/category.go new file mode 100644 index 000000000..38236d16c --- /dev/null +++ b/pkg/errcode/category.go @@ -0,0 +1,49 @@ +package errcode + +// Code is the closed set of generic error classifications; drives HTTP status. 
+type Code string + +const ( + CodeBadRequest Code = "bad_request" + CodeUnauthenticated Code = "unauthenticated" + CodeForbidden Code = "forbidden" + CodeNotFound Code = "not_found" + CodeConflict Code = "conflict" + CodeTooManyRequests Code = "too_many_requests" + CodeUnavailable Code = "unavailable" + CodeInternal Code = "internal" +) + +// Valid reports whether c is one of the canonical Code* constants. +func (c Code) Valid() bool { + switch c { + case CodeBadRequest, CodeUnauthenticated, CodeForbidden, CodeNotFound, + CodeConflict, CodeTooManyRequests, CodeUnavailable, CodeInternal: + return true + default: + return false + } +} + +// HTTPStatus maps a code to its HTTP status; unknown values map to 500 so a +// misclassification never leaks as 2xx. +func (c Code) HTTPStatus() int { + switch c { + case CodeBadRequest: + return 400 + case CodeUnauthenticated: + return 401 + case CodeForbidden: + return 403 + case CodeNotFound: + return 404 + case CodeConflict: + return 409 + case CodeTooManyRequests: + return 429 + case CodeUnavailable: + return 503 + default: + return 500 + } +} diff --git a/pkg/errcode/category_test.go b/pkg/errcode/category_test.go new file mode 100644 index 000000000..01ab50664 --- /dev/null +++ b/pkg/errcode/category_test.go @@ -0,0 +1,22 @@ +package errcode + +import "testing" + +func TestCode_HTTPStatus(t *testing.T) { + cases := map[Code]int{ + CodeBadRequest: 400, + CodeUnauthenticated: 401, + CodeForbidden: 403, + CodeNotFound: 404, + CodeConflict: 409, + CodeTooManyRequests: 429, + CodeUnavailable: 503, + CodeInternal: 500, + Code("weird"): 500, + } + for c, want := range cases { + if got := c.HTTPStatus(); got != want { + t.Errorf("%s.HTTPStatus() = %d, want %d", c, got, want) + } + } +} diff --git a/pkg/errcode/classify.go b/pkg/errcode/classify.go new file mode 100644 index 000000000..761f5840e --- /dev/null +++ b/pkg/errcode/classify.go @@ -0,0 +1,51 @@ +package errcode + +import ( + "context" + "errors" + "log/slog" +) + +// 
Classify converts any error into a client-safe *Error and logs it exactly once. +// Server faults log at ERROR, expected client errors at INFO. See doc.go. +func Classify(ctx context.Context, err error) *Error { + if err == nil { + return nil + } + var e *Error + var underlying string + hasErrcode := errors.As(err, &e) + if hasErrcode { + if e.cause != nil { + underlying = e.cause.Error() + } + } else { + e = &Error{Code: CodeInternal, Message: "internal error", cause: err} + } + // Only compute the cause string when it's actually distinct from the + // message — for a bare errcode.BadRequest("x") it would just duplicate the + // "code"/"reason" attrs and waste the err.Error() allocation per 4xx. + cause := e.Message + if !hasErrcode || err != e { + cause = err.Error() + } + attrs := []any{ + "code", string(e.Code), + "reason", string(e.Reason), + "cause", cause, + } + if underlying != "" { + attrs = append(attrs, "underlying", underlying) + } + loggerFrom(ctx).Log(ctx, e.logLevel(), "request failed", attrs...) 
+ return e +} + +func (e *Error) logLevel() slog.Level { + switch e.Code { + case CodeInternal, CodeUnavailable: + return slog.LevelError + default: + return slog.LevelInfo + } +} diff --git a/pkg/errcode/classify_test.go b/pkg/errcode/classify_test.go new file mode 100644 index 000000000..e53f33adf --- /dev/null +++ b/pkg/errcode/classify_test.go @@ -0,0 +1,143 @@ +package errcode + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "strings" + "testing" +) + +func newCapture() (context.Context, *bytes.Buffer) { + var buf bytes.Buffer + l := slog.New(slog.NewJSONHandler(&buf, &slog.HandlerOptions{Level: slog.LevelInfo})) + return WithLogger(context.Background(), l), &buf +} + +func TestClassify_NilReturnsNil(t *testing.T) { + ctx, _ := newCapture() + if Classify(ctx, nil) != nil { + t.Fatal("nil → nil") + } +} + +func TestClassify_UnknownBecomesInternalAndLogsCause(t *testing.T) { + ctx, buf := newCapture() + raw := fmt.Errorf("load room: %w", errors.New("mongo: connection refused 10.0.0.5")) + e := Classify(ctx, raw) + if e.Code != CodeInternal || e.Message != "internal error" { + t.Fatalf("got %+v", e) + } + if !strings.Contains(buf.String(), "mongo: connection refused") { + t.Fatalf("cause not logged: %s", buf.String()) + } + b, _ := json.Marshal(e) + if strings.Contains(string(b), "mongo") { + t.Fatalf("cause leaked into reply: %s", b) + } +} + +func TestClassify_TypedErrorPreservedThroughWrapping(t *testing.T) { + ctx, _ := newCapture() + typed := NotFound("room not found", WithReason("room_not_found")) + e := Classify(ctx, fmt.Errorf("checking room: %w", typed)) + if e.Code != CodeNotFound || e.Reason != "room_not_found" { + t.Fatalf("typed lost: %+v", e) + } +} + +func TestClassify_LogsCtxValues(t *testing.T) { + ctx, buf := newCapture() + ctx = WithLogValues(ctx, "request_id", "req-123", "account", "alice") + Classify(ctx, errors.New("boom")) + if l := buf.String(); !strings.Contains(l, "req-123") || !strings.Contains(l, 
"alice") { + t.Fatalf("ctx values missing: %s", l) + } +} + +func TestClassify_LogsAttachedCause(t *testing.T) { + // The whole point of WithCause: the raw underlying error must appear in the + // server log, even though the client only sees "internal error". + raw := errors.New("mongo: connection refused 10.0.0.5") + + // Direct: errcode error with a cause. + ctx, buf := newCapture() + e := Classify(ctx, Internal("internal error", WithCause(raw))) + if !strings.Contains(buf.String(), "mongo: connection refused") { + t.Fatalf("direct WithCause not logged: %s", buf.String()) + } + b, _ := json.Marshal(e) + if strings.Contains(string(b), "mongo") { + t.Fatalf("cause leaked into reply: %s", b) + } + + // Wrapped: outer context + the hidden cause must both survive. + ctx, buf = newCapture() + Classify(ctx, fmt.Errorf("handler save: %w", Internal("internal error", WithCause(raw)))) + if l := buf.String(); !strings.Contains(l, "handler save") || !strings.Contains(l, "mongo: connection refused") { + t.Fatalf("wrapped cause/context lost: %s", l) + } +} + +func TestClassify_CauseAndUnderlyingAreSeparateLogFields(t *testing.T) { + // Task 20.20: the underlying-cause text is logged as its own slog field so + // log aggregators can pivot on it independently and Classify avoids the + // per-request string-concat allocation. 
+ ctx, buf := newCapture() + raw := errors.New("mongo: connection refused 10.0.0.5") + Classify(ctx, fmt.Errorf("handler save: %w", Internal("internal error", WithCause(raw)))) + var line map[string]any + if err := json.Unmarshal(buf.Bytes(), &line); err != nil { + t.Fatalf("log not JSON: %v", err) + } + cause, _ := line["cause"].(string) + underlying, _ := line["underlying"].(string) + if !strings.Contains(cause, "handler save") { + t.Fatalf(`cause field missing outer message: %q`, cause) + } + if !strings.Contains(underlying, "mongo: connection refused") { + t.Fatalf(`underlying field missing raw cause: %q`, underlying) + } + if strings.Contains(cause, "mongo: connection refused") { + t.Fatalf("raw cause must NOT be concatenated into cause field: %q", cause) + } +} + +func TestClassify_NoUnderlyingFieldWhenNoCause(t *testing.T) { + ctx, buf := newCapture() + Classify(ctx, NotFound("room not found")) + var line map[string]any + if err := json.Unmarshal(buf.Bytes(), &line); err != nil { + t.Fatalf("log not JSON: %v", err) + } + if _, present := line["underlying"]; present { + t.Fatalf("underlying field present without WithCause: %s", buf.String()) + } +} + +func TestClassify_LevelIsCategoryAware(t *testing.T) { + level := func(err error) string { + ctx, buf := newCapture() + Classify(ctx, err) + var line map[string]any + _ = json.Unmarshal(buf.Bytes(), &line) + return line["level"].(string) + } + // Expected client errors must NOT log at ERROR (would pollute alerting). + if got := level(BadRequest("name is required")); got != "INFO" { + t.Fatalf("4xx level = %s, want INFO", got) + } + if got := level(NotFound("gone")); got != "INFO" { + t.Fatalf("not_found level = %s, want INFO", got) + } + // Server/infra errors log at ERROR. 
+ if got := level(errors.New("mongo down")); got != "ERROR" { + t.Fatalf("internal level = %s, want ERROR", got) + } + if got := level(Unavailable("service busy")); got != "ERROR" { + t.Fatalf("unavailable level = %s, want ERROR", got) + } +} diff --git a/pkg/errcode/codes_auth.go b/pkg/errcode/codes_auth.go new file mode 100644 index 000000000..28293ac00 --- /dev/null +++ b/pkg/errcode/codes_auth.go @@ -0,0 +1,10 @@ +package errcode + +// Reasons emitted by auth-service. +const ( + AuthTokenExpired Reason = "sso_token_expired" + AuthInvalidToken Reason = "invalid_sso_token" + AuthInvalidRequest Reason = "invalid_request" + AuthInvalidNKey Reason = "invalid_nkey" + AuthMissingFields Reason = "missing_fields" +) diff --git a/pkg/errcode/codes_message.go b/pkg/errcode/codes_message.go new file mode 100644 index 000000000..e56328ad8 --- /dev/null +++ b/pkg/errcode/codes_message.go @@ -0,0 +1,17 @@ +package errcode + +// Reasons emitted by message-gatekeeper and history-service. +const ( + MessageLargeRoomPostRestricted Reason = "large_room_post_restricted" + MessageNotSubscribed Reason = "not_subscribed" + // MessageOutsideAccessWindow distinguishes "caller IS subscribed but the + // message predates HSS" from MessageNotSubscribed — the frontend renders + // different UX ("history hidden before you joined"). + MessageOutsideAccessWindow Reason = "outside_access_window" + // Pin-feature reasons — all three are "forbidden" cases the frontend needs + // to distinguish to render the right copy (kill-switch vs hard cap vs + // large-room gate). "not subscribed" reuses MessageNotSubscribed above. 
+ PinDisabled Reason = "pin_disabled" + PinLimitReached Reason = "pin_limit_reached" + PinRoomTooLarge Reason = "pin_room_too_large" +) diff --git a/pkg/errcode/codes_platform.go b/pkg/errcode/codes_platform.go new file mode 100644 index 000000000..30fcb61f8 --- /dev/null +++ b/pkg/errcode/codes_platform.go @@ -0,0 +1,11 @@ +package errcode + +// Reasons emitted by cross-cutting platform middleware (pkg/natsutil, pkg/natsrouter) +// rather than a single domain service. +const ( + // RequestIDRequired marks a rejected request that arrived without a valid + // X-Request-ID on a dedup-critical path (see natsutil.RequireRequestID). + // Clients should special-case it by retrying with a freshly minted + // hyphenated UUID rather than surfacing a generic "bad request". + RequestIDRequired Reason = "request_id_required" +) diff --git a/pkg/errcode/codes_room.go b/pkg/errcode/codes_room.go new file mode 100644 index 000000000..7b833b9d7 --- /dev/null +++ b/pkg/errcode/codes_room.go @@ -0,0 +1,24 @@ +package errcode + +// Reasons emitted by room-service and room-worker. +const ( + RoomMaxSizeReached Reason = "max_room_size_reached" + RoomNotMember Reason = "not_room_member" + RoomNotOwner Reason = "not_room_owner" + RoomLastOwnerCannotLeave Reason = "last_owner_cannot_leave" + RoomBotInChannel Reason = "bot_in_channel" + RoomBotNotAvailable Reason = "bot_not_available" + RoomUserNotFound Reason = "user_not_found" + RoomInvalidOrg Reason = "invalid_org" + RoomSelfDM Reason = "self_dm" + RoomLastMemberCannotRemove Reason = "last_member_cannot_remove" + RoomTargetNotMember Reason = "target_not_member" + RoomAlreadyOwner Reason = "already_owner" + RoomCannotDemoteLastOwner Reason = "cannot_demote_last_owner" + RoomPromoteRequiresIndividual Reason = "promote_requires_individual" + // RoomNonChannelOperation marks operations that are only supported on + // channel rooms (add-member, remove-member, role update) but were invoked + // against a DM or bot-DM. 
The frontend uses it to render a "this only + // works in channels" hint instead of a generic 400. + RoomNonChannelOperation Reason = "non_channel_operation" +) diff --git a/pkg/errcode/codes_test.go b/pkg/errcode/codes_test.go new file mode 100644 index 000000000..f78912e82 --- /dev/null +++ b/pkg/errcode/codes_test.go @@ -0,0 +1,38 @@ +package errcode + +import ( + "regexp" + "testing" +) + +var allReasons = []Reason{ + RoomMaxSizeReached, RoomNotMember, RoomNotOwner, + RoomLastOwnerCannotLeave, RoomBotInChannel, RoomBotNotAvailable, + RoomUserNotFound, RoomInvalidOrg, + RoomSelfDM, RoomLastMemberCannotRemove, RoomTargetNotMember, + RoomAlreadyOwner, RoomCannotDemoteLastOwner, RoomPromoteRequiresIndividual, + RoomNonChannelOperation, + MessageLargeRoomPostRestricted, MessageNotSubscribed, MessageOutsideAccessWindow, + PinDisabled, PinLimitReached, PinRoomTooLarge, + AuthTokenExpired, AuthInvalidToken, AuthInvalidRequest, AuthInvalidNKey, AuthMissingFields, + RequestIDRequired, +} + +func TestReasons_SnakeCase(t *testing.T) { + re := regexp.MustCompile(`^[a-z][a-z0-9_]*[a-z0-9]$`) + for _, r := range allReasons { + if !re.MatchString(string(r)) { + t.Errorf("reason %q is not flat snake_case", r) + } + } +} + +func TestReasons_Unique(t *testing.T) { + seen := map[Reason]bool{} + for _, r := range allReasons { + if seen[r] { + t.Errorf("duplicate reason: %q", r) + } + seen[r] = true + } +} diff --git a/pkg/errcode/doc.go b/pkg/errcode/doc.go new file mode 100644 index 000000000..19c4062e5 --- /dev/null +++ b/pkg/errcode/doc.go @@ -0,0 +1,63 @@ +// Package errcode is the single source of client-facing error envelopes for +// every transport (NATS request/reply, JetStream replies, Gin HTTP). +// +// # Wire envelope +// +// {"error":"","code":"","reason":""?,"metadata":{…}?} +// +// - error — human-readable, user-safe message. +// - code — one Code (bad_request, unauthenticated, forbidden, +// not_found, conflict, too_many_requests, unavailable, internal). 
Always present. +// - reason — optional Reason (domain code, e.g. "max_room_size_reached"), +// declared in codes_.go. Frontend logic: trigger = reason ?? code. +// - metadata — optional map[string]string for structured detail. +// +// # Two types, by design +// +// Code and Reason are distinct types so the compiler rejects +// New(SomeReason, …) and WithReason(SomeCode). +// +// # Leak guarantee +// +// Error.cause is unexported; encoding/json cannot serialize it. The cause is +// reachable only server-side via Unwrap()/errors.Is/As and is logged exactly +// once by Classify. +// +// # Wrapping invariant: at most one *errcode.Error per chain +// +// Allowed: +// +// return errcode.BadRequest("name is required") +// return errcode.NotFound("x", errcode.WithReason(RoomNotMember)) +// return errcode.Internal("x", errcode.WithCause(rawDBErr)) // RAW cause only +// return fmt.Errorf("checking room: %w", typedErr) // typed survives +// return typedErr +// +// Forbidden (WithCause panics; semgrep-flagged): +// +// return errcode.Internal("x", errcode.WithCause(anotherErrcodeErr)) +// +// Also forbidden (defeats the invariant; semgrep-flagged): two errcode errors +// in one chain via multi-verb fmt.Errorf — Classify picks the first. +// +// return fmt.Errorf("%w and %w", errcodeA, errcodeB) +// +// Propagate with a single %w only. +// +// # Logging +// +// Classify logs each error exactly once at a category-aware level (server +// faults ERROR, expected client errors INFO). Handlers must NOT log-then-reply. +// +// Attach domain context once at handler entry: +// - natsrouter handler (has *Context): c.WithLogValues("account", a) +// - Gin / raw NATS (has context.Context): ctx = errcode.WithLogValues(ctx, …) +// +// Never call the package func errcode.WithLogValues with a *natsrouter.Context +// as parent — use the method, which derives from the inner ctx and avoids the +// Value-delegation cycle. 
+// +// Trust boundary: WithLogValues attributes are SERVER-ONLY (never serialized); +// WithMetadata is CLIENT-VISIBLE (ships in the envelope). Never wrap raw +// message bodies, tokens, or secrets into a cause — WithCause logs err.Error(). +package errcode diff --git a/pkg/errcode/errhttp/write.go b/pkg/errcode/errhttp/write.go new file mode 100644 index 000000000..2b8e576a4 --- /dev/null +++ b/pkg/errcode/errhttp/write.go @@ -0,0 +1,16 @@ +// Package errhttp adapts errcode.Error to Gin HTTP responses. +package errhttp + +import ( + "context" + + "github.com/gin-gonic/gin" + + "github.com/hmchangw/chat/pkg/errcode" +) + +// Write classifies err (logging once) and writes the envelope with its HTTP status. +func Write(ctx context.Context, c *gin.Context, err error) { + e := errcode.Classify(ctx, err) + c.JSON(e.HTTPStatus(), e) +} diff --git a/pkg/errcode/errhttp/write_test.go b/pkg/errcode/errhttp/write_test.go new file mode 100644 index 000000000..faa4d0049 --- /dev/null +++ b/pkg/errcode/errhttp/write_test.go @@ -0,0 +1,40 @@ +package errhttp + +import ( + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + + "github.com/hmchangw/chat/pkg/errcode" +) + +func TestWrite_StatusAndEnvelope(t *testing.T) { + gin.SetMode(gin.TestMode) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(http.MethodPost, "/auth", nil) + Write(c.Request.Context(), c, errcode.Unauthenticated("token expired", errcode.WithReason(errcode.AuthTokenExpired))) + if w.Code != http.StatusUnauthorized { + t.Fatalf("status = %d, want 401", w.Code) + } + var got map[string]any + _ = json.Unmarshal(w.Body.Bytes(), &got) + if got["code"] != "unauthenticated" || got["reason"] != "sso_token_expired" { + t.Fatalf("envelope = %v", got) + } +} + +func TestWrite_UnknownIs500(t *testing.T) { + gin.SetMode(gin.TestMode) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = 
httptest.NewRequest(http.MethodGet, "/x", nil) + Write(c.Request.Context(), c, errors.New("db exploded")) + if w.Code != http.StatusInternalServerError || !json.Valid(w.Body.Bytes()) { + t.Fatalf("status=%d body=%q", w.Code, w.Body.String()) + } +} diff --git a/pkg/errcode/errnats/reply.go b/pkg/errcode/errnats/reply.go new file mode 100644 index 000000000..f60d2126b --- /dev/null +++ b/pkg/errcode/errnats/reply.go @@ -0,0 +1,52 @@ +// Package errnats adapts errcode.Error to NATS request/reply responses. +package errnats + +import ( + "context" + "encoding/json" + "errors" + "log/slog" + + "github.com/nats-io/nats.go" + + "github.com/hmchangw/chat/pkg/errcode" +) + +const fallback = `{"code":"internal","error":"internal error"}` + +// Marshal classifies err (logging it once) and returns the JSON envelope. +func Marshal(ctx context.Context, err error) []byte { + data, mErr := json.Marshal(errcode.Classify(ctx, err)) + if mErr != nil { + return []byte(fallback) + } + return data +} + +// MarshalQuiet returns the envelope WITHOUT logging. Use only on paths that +// already logged the failure (panic backstop, admission/replyBusy). +func MarshalQuiet(err error) []byte { + var e *errcode.Error + if !errors.As(err, &e) { + e = errcode.Internal("internal error") + } + data, mErr := json.Marshal(e) + if mErr != nil { + return []byte(fallback) + } + return data +} + +// Reply classifies err (logging once) and sends the envelope on msg's reply subject. +func Reply(ctx context.Context, msg *nats.Msg, err error) { + if rErr := msg.Respond(Marshal(ctx, err)); rErr != nil { + slog.ErrorContext(ctx, "error reply failed", "error", rErr, "subject", msg.Subject) + } +} + +// ReplyQuiet sends the envelope WITHOUT logging the failure (see MarshalQuiet). 
+func ReplyQuiet(msg *nats.Msg, err error) { + if rErr := msg.Respond(MarshalQuiet(err)); rErr != nil { + slog.Error("error reply failed", "error", rErr, "subject", msg.Subject) + } +} diff --git a/pkg/errcode/errnats/reply_test.go b/pkg/errcode/errnats/reply_test.go new file mode 100644 index 000000000..0c3359222 --- /dev/null +++ b/pkg/errcode/errnats/reply_test.go @@ -0,0 +1,161 @@ +package errnats + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "log/slog" + "strings" + "testing" + "time" + + natsserver "github.com/nats-io/nats-server/v2/server" + "github.com/nats-io/nats.go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/errcode" +) + +// startTestNATS spins up an in-memory NATS server bound to a random port and +// returns a connected *nats.Conn. Cleanup runs on t.Cleanup. +func startTestNATS(t *testing.T) *nats.Conn { + t.Helper() + opts := &natsserver.Options{Port: -1} + ns, err := natsserver.NewServer(opts) + require.NoError(t, err) + ns.Start() + require.True(t, ns.ReadyForConnections(5*time.Second), "nats server did not become ready") + t.Cleanup(ns.Shutdown) + + nc, err := nats.Connect(ns.ClientURL()) + require.NoError(t, err) + t.Cleanup(nc.Close) + return nc +} + +// captureCtx returns a ctx whose logger writes JSON to buf. 
+func captureCtx() (context.Context, *bytes.Buffer) { + var buf bytes.Buffer + l := slog.New(slog.NewJSONHandler(&buf, &slog.HandlerOptions{Level: slog.LevelInfo})) + return errcode.WithLogger(context.Background(), l), &buf +} + +func ctxQuiet() context.Context { + return errcode.WithLogger(context.Background(), slog.New(slog.NewJSONHandler(&bytes.Buffer{}, nil))) +} + +func TestMarshal_TypedError(t *testing.T) { + data := Marshal(ctxQuiet(), errcode.NotFound("room not found", errcode.WithReason(errcode.RoomNotMember))) + var got map[string]any + require.NoError(t, json.Unmarshal(data, &got)) + assert.Equal(t, "not_found", got["code"]) + assert.Equal(t, "not_room_member", got["reason"]) + assert.Equal(t, "room not found", got["error"]) +} + +func TestMarshal_UnknownCollapsesToInternal(t *testing.T) { + data := Marshal(ctxQuiet(), errors.New("mongo down")) + var got map[string]any + require.NoError(t, json.Unmarshal(data, &got)) + assert.Equal(t, "internal", got["code"]) + assert.Equal(t, "internal error", got["error"]) + assert.NotContains(t, got, "reason", "reason should be absent") +} + +func TestMarshalQuiet_DoesNotLogButStillCollapses(t *testing.T) { + var buf bytes.Buffer + old := slog.Default() + slog.SetDefault(slog.New(slog.NewJSONHandler(&buf, nil))) + defer slog.SetDefault(old) + + data := MarshalQuiet(errors.New("mongo down")) + var got map[string]any + require.NoError(t, json.Unmarshal(data, &got)) + assert.Equal(t, "internal", got["code"]) + assert.Equal(t, "internal error", got["error"]) + assert.Empty(t, buf.String(), "MarshalQuiet must not log") +} + +// requestAndCaptureReply opens a subscriber on subj that runs handler on each +// inbound msg, then publishes a request and returns the reply bytes. 
+func requestAndCaptureReply(t *testing.T, nc *nats.Conn, subj string, handler func(m *nats.Msg)) []byte { + t.Helper() + sub, err := nc.Subscribe(subj, handler) + require.NoError(t, err) + defer func() { _ = sub.Unsubscribe() }() + reply, err := nc.Request(subj, []byte(`{}`), 2*time.Second) + require.NoError(t, err) + return reply.Data +} + +func TestReply_RespondsWithEnvelopeAndLogsOnce(t *testing.T) { + ctx, buf := captureCtx() + nc := startTestNATS(t) + + data := requestAndCaptureReply(t, nc, "test.reply.fb", func(m *nats.Msg) { + Reply(ctx, m, errcode.Forbidden("not allowed", errcode.WithReason(errcode.RoomNotMember))) + }) + + var got map[string]any + require.NoError(t, json.Unmarshal(data, &got)) + assert.Equal(t, "forbidden", got["code"]) + assert.Equal(t, "not_room_member", got["reason"]) + assert.Equal(t, "not allowed", got["error"]) + + // Exactly one Classify log line. + lines := strings.Split(strings.TrimSpace(buf.String()), "\n") + require.Len(t, lines, 1, "want exactly one log line, got: %s", buf.String()) + var line map[string]any + require.NoError(t, json.Unmarshal([]byte(lines[0]), &line)) + assert.Equal(t, "request failed", line["msg"]) +} + +func TestReply_LogsAtErrorLevelOnInternal(t *testing.T) { + ctx, buf := captureCtx() + nc := startTestNATS(t) + + _ = requestAndCaptureReply(t, nc, "test.reply.internal", func(m *nats.Msg) { + Reply(ctx, m, errcode.Internal("boom")) + }) + + var line map[string]any + require.NoError(t, json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &line)) + assert.Equal(t, "ERROR", line["level"], "internal must log at ERROR") +} + +func TestReply_UnknownErrorCollapsesToInternal(t *testing.T) { + ctx, buf := captureCtx() + nc := startTestNATS(t) + + data := requestAndCaptureReply(t, nc, "test.reply.unknown", func(m *nats.Msg) { + Reply(ctx, m, errors.New("mongo down 10.0.0.5")) + }) + + var got map[string]any + require.NoError(t, json.Unmarshal(data, &got)) + assert.Equal(t, "internal", got["code"]) + 
assert.Equal(t, "internal error", got["error"]) + assert.NotContains(t, string(data), "mongo", "raw cause must NOT appear on the wire") + assert.Contains(t, buf.String(), "mongo down", "raw cause must appear in the SERVER log") +} + +func TestReplyQuiet_RespondsButEmitsNoClassifyLine(t *testing.T) { + nc := startTestNATS(t) + + var buf bytes.Buffer + old := slog.Default() + slog.SetDefault(slog.New(slog.NewJSONHandler(&buf, nil))) + defer slog.SetDefault(old) + + data := requestAndCaptureReply(t, nc, "test.reply.quiet", func(m *nats.Msg) { + ReplyQuiet(m, errcode.Unavailable("service busy")) + }) + + var got map[string]any + require.NoError(t, json.Unmarshal(data, &got)) + assert.Equal(t, "unavailable", got["code"]) + assert.Equal(t, "service busy", got["error"]) + assert.NotContains(t, buf.String(), "request failed", "ReplyQuiet must not emit a Classify log line") +} diff --git a/pkg/errcode/error.go b/pkg/errcode/error.go new file mode 100644 index 000000000..50c126930 --- /dev/null +++ b/pkg/errcode/error.go @@ -0,0 +1,21 @@ +package errcode + +// Error is the canonical client-facing error; marshals to {error, code, reason?, metadata?}. +// cause is unexported so encoding/json cannot leak it; reachable only via Unwrap. See doc.go. +type Error struct { + Code Code `json:"code"` + Reason Reason `json:"reason,omitempty"` + Message string `json:"error"` + Metadata map[string]string `json:"metadata,omitempty"` + cause error +} + +// Error returns ONLY the user-safe message, never the cause. +func (e *Error) Error() string { return e.Message } + +// Unwrap exposes the wrapped cause for errors.Is/As and server-side logging. +// JSON marshalling does not call Unwrap, so the cause never reaches clients. +func (e *Error) Unwrap() error { return e.cause } + +// HTTPStatus returns the HTTP status for this error's category. 
+func (e *Error) HTTPStatus() int { return e.Code.HTTPStatus() } diff --git a/pkg/errcode/error_test.go b/pkg/errcode/error_test.go new file mode 100644 index 000000000..f3bc98cff --- /dev/null +++ b/pkg/errcode/error_test.go @@ -0,0 +1,57 @@ +package errcode + +import ( + "encoding/json" + "errors" + "strings" + "testing" +) + +func TestError_Error_ReturnsMessageOnly(t *testing.T) { + e := &Error{Code: CodeBadRequest, Message: "name is required", cause: errors.New("secret db detail")} + if e.Error() != "name is required" { + t.Fatalf("Error() = %q, want safe message only", e.Error()) + } +} + +func TestError_Unwrap(t *testing.T) { + root := errors.New("root") + e := &Error{Code: CodeInternal, Message: "internal error", cause: root} + if !errors.Is(e, root) { + t.Fatal("errors.Is should reach the wrapped cause via Unwrap") + } +} + +func TestError_MarshalJSON_NeverLeaksCause(t *testing.T) { + e := &Error{ + Code: CodeBadRequest, + Reason: "max_room_size_reached", + Message: "room is full", + Metadata: map[string]string{"limit": "500"}, + cause: errors.New("mongo: connection refused at 10.0.0.5"), + } + b, err := json.Marshal(e) + if err != nil { + t.Fatal(err) + } + want := `{"code":"bad_request","reason":"max_room_size_reached","error":"room is full","metadata":{"limit":"500"}}` + if string(b) != want { + t.Fatalf("marshal = %s, want %s", b, want) + } + if strings.Contains(string(b), "mongo") { + t.Fatal("cause leaked into JSON") + } +} + +func TestError_MarshalJSON_OmitsEmptyOptionalFields(t *testing.T) { + b, _ := json.Marshal(&Error{Code: CodeNotFound, Message: "not found"}) + if want := `{"code":"not_found","error":"not found"}`; string(b) != want { + t.Fatalf("marshal = %s, want %s", b, want) + } +} + +func TestError_HTTPStatus(t *testing.T) { + if (&Error{Code: CodeNotFound}).HTTPStatus() != 404 { + t.Fatal("HTTPStatus should delegate to Code.HTTPStatus") + } +} diff --git a/pkg/errcode/errtest/assert.go b/pkg/errcode/errtest/assert.go new file mode 100644 
index 000000000..14775b7ca --- /dev/null +++ b/pkg/errcode/errtest/assert.go @@ -0,0 +1,44 @@ +// Package errtest provides assertions for errcode wire envelopes in tests. +package errtest + +import ( + "testing" + + "github.com/hmchangw/chat/pkg/errcode" +) + +// Decode parses an error envelope from a reply payload, failing the test if it +// is not one. testing.TB so helpers compose with sub-test/bench/mock callers. +func Decode(t testing.TB, data []byte) *errcode.Error { + t.Helper() + e, ok := errcode.Parse(data) + if !ok { + t.Fatalf("payload is not an error envelope: %s", data) + return nil // unreachable on real *testing.T; lets recording mocks return cleanly + } + return e +} + +// AssertCode fails unless data is an error envelope with the given code. +func AssertCode(t testing.TB, data []byte, want errcode.Code) { + t.Helper() + e := Decode(t, data) + if e == nil { + return + } + if got := e.Code; got != want { + t.Fatalf("code = %q, want %q (payload %s)", got, want, data) + } +} + +// AssertReason fails unless data is an error envelope with the given reason. +func AssertReason(t testing.TB, data []byte, want errcode.Reason) { + t.Helper() + e := Decode(t, data) + if e == nil { + return + } + if got := e.Reason; got != want { + t.Fatalf("reason = %q, want %q (payload %s)", got, want, data) + } +} diff --git a/pkg/errcode/errtest/assert_test.go b/pkg/errcode/errtest/assert_test.go new file mode 100644 index 000000000..45c7f0afb --- /dev/null +++ b/pkg/errcode/errtest/assert_test.go @@ -0,0 +1,84 @@ +package errtest + +import ( + "encoding/json" + "fmt" + "testing" + + "github.com/hmchangw/chat/pkg/errcode" +) + +func TestAssertEnvelope(t *testing.T) { + data, _ := json.Marshal(errcode.NotFound("room not found", errcode.WithReason(errcode.RoomNotMember))) + AssertCode(t, data, errcode.CodeNotFound) + AssertReason(t, data, errcode.RoomNotMember) +} + +// recordingT implements the subset of testing.TB that the errtest helpers use +// (Helper, Fatalf). 
Embedding testing.T fills out the rest of the interface +// at compile time; we override Fatalf to capture instead of abort so the +// outer test can assert on the recorded message. +type recordingT struct { + testing.T + failed bool + msg string +} + +func (r *recordingT) Helper() {} + +func (r *recordingT) Fatalf(format string, args ...any) { + r.failed = true + r.msg = fmt.Sprintf(format, args...) + // Do NOT terminate; just record. The helper returns early via the + // nil-check guard in assert.go. +} + +func TestDecode_FailsOnNonEnvelope(t *testing.T) { + rt := &recordingT{} + got := Decode(rt, []byte(`{"not":"an_envelope"}`)) + if !rt.failed { + t.Fatal("Decode must call Fatalf on a non-envelope payload") + } + if got != nil { + t.Fatalf("Decode must return nil on Fatalf path, got %+v", got) + } + if rt.msg == "" { + t.Fatal("Fatalf message should describe the failure") + } +} + +func TestAssertCode_FailsOnMismatch(t *testing.T) { + data, _ := json.Marshal(errcode.NotFound("x")) + rt := &recordingT{} + AssertCode(rt, data, errcode.CodeConflict) + if !rt.failed { + t.Fatal("AssertCode must call Fatalf when the envelope code does not match") + } +} + +func TestAssertReason_FailsOnMismatch(t *testing.T) { + data, _ := json.Marshal(errcode.Forbidden("x", errcode.WithReason(errcode.RoomNotMember))) + rt := &recordingT{} + AssertReason(rt, data, errcode.RoomNotOwner) + if !rt.failed { + t.Fatal("AssertReason must call Fatalf when the envelope reason does not match") + } +} + +func TestAssertCode_PassesSilentlyOnMatch(t *testing.T) { + data, _ := json.Marshal(errcode.NotFound("x")) + rt := &recordingT{} + AssertCode(rt, data, errcode.CodeNotFound) + if rt.failed { + t.Fatalf("AssertCode must not fail on matching code; recorded: %s", rt.msg) + } +} + +func TestAssertReason_PassesSilentlyOnMatch(t *testing.T) { + data, _ := json.Marshal(errcode.Forbidden("x", errcode.WithReason(errcode.RoomNotOwner))) + rt := &recordingT{} + AssertReason(rt, data, 
errcode.RoomNotOwner) + if rt.failed { + t.Fatalf("AssertReason must not fail on matching reason; recorded: %s", rt.msg) + } +} diff --git a/pkg/errcode/logctx.go b/pkg/errcode/logctx.go new file mode 100644 index 000000000..aaff20f87 --- /dev/null +++ b/pkg/errcode/logctx.go @@ -0,0 +1,28 @@ +package errcode + +import ( + "context" + "log/slog" +) + +type loggerCtxKey struct{} + +// WithLogger stores an explicit *slog.Logger in ctx (mainly for tests). +func WithLogger(ctx context.Context, l *slog.Logger) context.Context { + return context.WithValue(ctx, loggerCtxKey{}, l) +} + +// WithLogValues returns ctx with a logger enriched by key/value pairs. Call +// once at handler entry; Classify's log line carries them. SERVER-ONLY — never +// serialized into the client envelope. +func WithLogValues(ctx context.Context, args ...any) context.Context { + return WithLogger(ctx, loggerFrom(ctx).With(args...)) +} + +// loggerFrom returns the ctx logger, or slog.Default(). +func loggerFrom(ctx context.Context) *slog.Logger { + if l, ok := ctx.Value(loggerCtxKey{}).(*slog.Logger); ok && l != nil { + return l + } + return slog.Default() +} diff --git a/pkg/errcode/logctx_test.go b/pkg/errcode/logctx_test.go new file mode 100644 index 000000000..bcc5b9e76 --- /dev/null +++ b/pkg/errcode/logctx_test.go @@ -0,0 +1,31 @@ +package errcode + +import ( + "bytes" + "context" + "encoding/json" + "log/slog" + "testing" +) + +func TestWithLogValues_AccumulatesAttrs(t *testing.T) { + var buf bytes.Buffer + ctx := WithLogger(context.Background(), slog.New(slog.NewJSONHandler(&buf, nil))) + ctx = WithLogValues(ctx, "account", "alice") + ctx = WithLogValues(ctx, "roomID", "r1") + loggerFrom(ctx).Info("hello") + + var line map[string]any + if err := json.Unmarshal(buf.Bytes(), &line); err != nil { + t.Fatal(err) + } + if line["account"] != "alice" || line["roomID"] != "r1" { + t.Fatalf("attrs not accumulated: %v", line) + } +} + +func TestLoggerFrom_DefaultsWhenAbsent(t *testing.T) { + if 
loggerFrom(context.Background()) == nil { + t.Fatal("loggerFrom must never return nil") + } +} diff --git a/pkg/errcode/match.go b/pkg/errcode/match.go new file mode 100644 index 000000000..179192487 --- /dev/null +++ b/pkg/errcode/match.go @@ -0,0 +1,15 @@ +package errcode + +import "errors" + +// ReasonOf returns the Reason of the first *Error in err's chain, or "". +func ReasonOf(err error) Reason { + var e *Error + if errors.As(err, &e) { + return e.Reason + } + return "" +} + +// HasReason reports whether err's chain carries an *Error with reason r. +func HasReason(err error, r Reason) bool { return ReasonOf(err) == r } diff --git a/pkg/errcode/match_test.go b/pkg/errcode/match_test.go new file mode 100644 index 000000000..160c122f7 --- /dev/null +++ b/pkg/errcode/match_test.go @@ -0,0 +1,26 @@ +package errcode + +import ( + "errors" + "fmt" + "testing" +) + +func TestReasonOf(t *testing.T) { + err := fmt.Errorf("ctx: %w", NotFound("x", WithReason(RoomNotMember))) + if ReasonOf(err) != RoomNotMember { + t.Fatalf("ReasonOf = %q", ReasonOf(err)) + } + if ReasonOf(errors.New("plain")) != "" { + t.Fatal("non-errcode error must yield empty reason") + } +} + +func TestHasReason(t *testing.T) { + if !HasReason(NotFound("x", WithReason(RoomNotMember)), RoomNotMember) { + t.Fatal("HasReason should match") + } + if HasReason(NotFound("x"), RoomNotMember) { + t.Fatal("HasReason must not match an absent reason") + } +} diff --git a/pkg/errcode/options.go b/pkg/errcode/options.go new file mode 100644 index 000000000..8c2f0d676 --- /dev/null +++ b/pkg/errcode/options.go @@ -0,0 +1,71 @@ +package errcode + +import "errors" + +// Option configures an *Error during construction. +type Option func(*Error) + +// New builds an *Error; prefer the named constructors. Panics on a +// non-canonical Code or empty message — both are programmer errors. 
+func New(code Code, message string, opts ...Option) *Error { + if !code.Valid() { + panic("errcode: New called with non-canonical Code " + string(code) + + " — use one of the named constructors (NotFound, Forbidden, ...) " + + "or pass a Code* constant") + } + if message == "" { + panic("errcode: empty message — every constructor requires user-safe text") + } + e := &Error{Code: code, Message: message} + for _, opt := range opts { + opt(e) + } + return e +} + +// Named constructors are the entire constructor API: one per category. No *f +// variants — they would swallow trailing Option args; pass fmt.Sprintf as msg. +func BadRequest(msg string, opts ...Option) *Error { return New(CodeBadRequest, msg, opts...) } +func Unauthenticated(msg string, opts ...Option) *Error { + return New(CodeUnauthenticated, msg, opts...) +} +func Forbidden(msg string, opts ...Option) *Error { return New(CodeForbidden, msg, opts...) } +func NotFound(msg string, opts ...Option) *Error { return New(CodeNotFound, msg, opts...) } +func Conflict(msg string, opts ...Option) *Error { return New(CodeConflict, msg, opts...) } +func TooManyRequests(msg string, opts ...Option) *Error { + return New(CodeTooManyRequests, msg, opts...) +} +func Unavailable(msg string, opts ...Option) *Error { return New(CodeUnavailable, msg, opts...) } +func Internal(msg string, opts ...Option) *Error { return New(CodeInternal, msg, opts...) } + +// WithReason attaches the specific machine code the frontend switches on. +func WithReason(r Reason) Option { return func(e *Error) { e.Reason = r } } + +// WithMetadata attaches CLIENT-VISIBLE key/value metadata to the wire envelope +// (use WithLogValues for server-internal detail). Panics on odd len(kv). 
+func WithMetadata(kv ...string) Option { + return func(e *Error) { + if len(kv)%2 != 0 { + panic("errcode: WithMetadata requires an even number of args (key/value pairs)") + } + if e.Metadata == nil { + e.Metadata = make(map[string]string, len(kv)/2) + } + for i := 0; i < len(kv); i += 2 { + e.Metadata[kv[i]] = kv[i+1] + } + } +} + +// WithCause attaches a raw infra/third-party error for server-side logging. +// PANICS if err already carries an *errcode.Error (one-per-chain invariant). See doc.go. +func WithCause(err error) Option { + return func(e *Error) { + var nested *Error + if errors.As(err, &nested) { + panic("errcode: WithCause must not wrap another *errcode.Error; " + + `propagate it with "return err" or fmt.Errorf("...: %w", err) instead`) + } + e.cause = err + } +} diff --git a/pkg/errcode/options_test.go b/pkg/errcode/options_test.go new file mode 100644 index 000000000..a7c14b0b8 --- /dev/null +++ b/pkg/errcode/options_test.go @@ -0,0 +1,121 @@ +package errcode + +import ( + "errors" + "fmt" + "testing" +) + +func TestNamedConstructors(t *testing.T) { + if e := BadRequest("name is required"); e.Code != CodeBadRequest || e.Message != "name is required" { + t.Fatalf("BadRequest: %+v", e) + } + if e := NotFound("gone"); e.Code != CodeNotFound { + t.Fatal("NotFound") + } + for _, e := range []*Error{ + Unauthenticated("x"), Forbidden("x"), Conflict("x"), + TooManyRequests("x"), Unavailable("x"), Internal("x"), + } { + if e.Message != "x" { + t.Fatalf("constructor message: %+v", e) + } + } + // Spot-check that TooManyRequests sets the 429 category specifically + // (cheap pin against future copy-paste regressions). 
+ if e := TooManyRequests("rate limited"); e.Code != CodeTooManyRequests { + t.Fatalf("TooManyRequests Code = %q, want %q", e.Code, CodeTooManyRequests) + } +} + +func TestConstructorDoesNotFormat_LiteralPercentIsSafe(t *testing.T) { + if got := BadRequest("100% full").Message; got != "100% full" { + t.Fatalf("constructor must not format: %q", got) + } +} + +func TestFormattingPlusOptionUsesSprintfAtCallSite(t *testing.T) { + // The supported pattern for dynamic text + a reason: caller formats, options stay first-class. + e := Conflict(fmt.Sprintf("room %s is full", "r1"), WithReason("max_room_size_reached")) + if e.Message != "room r1 is full" || e.Reason != "max_room_size_reached" { + t.Fatalf("got %+v", e) + } +} + +func TestWithReason(t *testing.T) { + e := BadRequest("room full", WithReason("max_room_size_reached")) + if e.Reason != "max_room_size_reached" { + t.Fatalf("reason = %q", e.Reason) + } +} + +func TestWithMetadata_Pairs(t *testing.T) { + e := Conflict("dm exists", WithMetadata("roomId", "r1", "kind", "dm")) + if e.Metadata["roomId"] != "r1" || e.Metadata["kind"] != "dm" { + t.Fatalf("meta = %v", e.Metadata) + } +} + +func TestWithMetadata_OddArgsPanics(t *testing.T) { + defer func() { + if recover() == nil { + t.Fatal("odd WithMetadata args must panic") + } + }() + BadRequest("x", WithMetadata("lonely")) +} + +func TestWithCause_RawError(t *testing.T) { + root := errors.New("mongo down") + if e := Internal("internal error", WithCause(root)); !errors.Is(e, root) { + t.Fatal("cause not attached") + } +} + +func TestWithCause_PanicsOnNestedErrcode(t *testing.T) { + inner := NotFound("room not found") + defer func() { + if recover() == nil { + t.Fatal("WithCause(errcode.Error) must panic — invariant: one *Error per chain") + } + }() + Internal("x", WithCause(inner)) +} + +func TestWithCause_PanicsOnWrappedNestedErrcode(t *testing.T) { + inner := NotFound("room not found") + wrapped := fmt.Errorf("ctx: %w", inner) + defer func() { + if recover() == nil { 
+ t.Fatal("WithCause must detect *Error even when wrapped") + } + }() + Internal("x", WithCause(wrapped)) +} + +func TestNew_PanicsOnUnknownCategory(t *testing.T) { + defer func() { + if recover() == nil { + t.Fatal("New must panic on a non-canonical Code") + } + }() + New(Code("made_up"), "msg") +} + +func TestNew_PanicsOnEmptyMessage(t *testing.T) { + defer func() { + if recover() == nil { + t.Fatal("New must panic on an empty message") + } + }() + New(CodeNotFound, "") +} + +func TestNamedConstructor_PanicsOnEmptyMessage(t *testing.T) { + defer func() { + if recover() == nil { + t.Fatal("named constructors must inherit the empty-message panic") + } + }() + NotFound("") +} diff --git a/pkg/errcode/parse.go b/pkg/errcode/parse.go new file mode 100644 index 000000000..e7efa9e3b --- /dev/null +++ b/pkg/errcode/parse.go @@ -0,0 +1,18 @@ +package errcode + +import "encoding/json" + +// Parse decodes a reply payload into an *Error iff it is an error envelope +// (non-empty "error" field). Returns (nil, false) for success payloads or garbage. +// +// Parse does NOT validate Code against the closed set — a malformed/foreign +// payload may yield a non-canonical Code. Callers that re-emit a remote +// envelope MUST check Code.Valid() before passing to New (which panics). 
+func Parse(data []byte) (*Error, bool) { + var e Error + //nolint:nilerr // a malformed payload is simply "not an error envelope"; the unmarshal error is intentionally not surfaced + if err := json.Unmarshal(data, &e); err != nil || e.Message == "" { + return nil, false + } + return &e, true +} diff --git a/pkg/errcode/parse_test.go b/pkg/errcode/parse_test.go new file mode 100644 index 000000000..2922e2189 --- /dev/null +++ b/pkg/errcode/parse_test.go @@ -0,0 +1,22 @@ +package errcode + +import "testing" + +func TestParse_ErrorEnvelope(t *testing.T) { + e, ok := Parse([]byte(`{"code":"forbidden","reason":"not_room_member","error":"only room members can perform this action"}`)) + if !ok || e.Code != CodeForbidden || e.Reason != "not_room_member" { + t.Fatalf("parse failed: %+v ok=%v", e, ok) + } +} + +func TestParse_NonErrorJSON(t *testing.T) { + if _, ok := Parse([]byte(`{"roomId":"r1","status":"accepted"}`)); ok { + t.Fatal("payload without non-empty error must not parse as error") + } +} + +func TestParse_Malformed(t *testing.T) { + if _, ok := Parse([]byte(`not json`)); ok { + t.Fatal("malformed must not parse") + } +} diff --git a/pkg/errcode/permanent.go b/pkg/errcode/permanent.go new file mode 100644 index 000000000..f6193faa2 --- /dev/null +++ b/pkg/errcode/permanent.go @@ -0,0 +1,41 @@ +package errcode + +import "errors" + +// ErrPermanent is the sentinel callers match via errors.Is to detect a +// non-retryable job failure. Wrap with Permanent to mark one. +var ErrPermanent = errors.New("permanent") + +// PermanentError marks an *Error as non-retryable: JetStream consumers Ack +// (drop) rather than Nak. Permanence is INDEPENDENT of category — an Internal +// can be permanent; a retryable infra error stays unwrapped. +type PermanentError struct{ ec *Error } + +// Permanent wraps an *Error as a non-retryable failure. Panics on nil — a +// caller with no classified error to wrap is a programmer bug. 
+func Permanent(ec *Error) *PermanentError { + if ec == nil { + panic("errcode.Permanent: nil *Error") + } + return &PermanentError{ec: ec} +} + +// Error returns the wrapped *Error's message. +func (p *PermanentError) Error() string { return p.ec.Error() } + +// Unwrap exposes the wrapped *Error (and, transitively, its WithCause cause). +func (p *PermanentError) Unwrap() error { return p.ec } + +// Is matches the ErrPermanent sentinel so callers branch on permanence without +// importing the concrete type. +func (p *PermanentError) Is(target error) bool { return target == ErrPermanent } + +// IsPermanent reports whether err's chain carries a *PermanentError, returning +// the wrapped *Error. Returns (nil, false) for any non-permanent error. +func IsPermanent(err error) (*Error, bool) { + var p *PermanentError + if errors.As(err, &p) { + return p.ec, true + } + return nil, false +} diff --git a/pkg/errcode/permanent_test.go b/pkg/errcode/permanent_test.go new file mode 100644 index 000000000..41ae16866 --- /dev/null +++ b/pkg/errcode/permanent_test.go @@ -0,0 +1,63 @@ +package errcode + +import ( + "errors" + "fmt" + "testing" +) + +func TestPermanent_PanicsOnNil(t *testing.T) { + defer func() { + if recover() == nil { + t.Fatal("Permanent(nil) must panic") + } + }() + Permanent(nil) +} + +func TestPermanent_UnwrapReachesErrcode(t *testing.T) { + inner := NotFound("room not found", WithReason("room_not_found")) + p := Permanent(inner) + var got *Error + if !errors.As(p, &got) { + t.Fatal("errors.As must reach the wrapped *Error") + } + if got.Code != CodeNotFound || got.Reason != "room_not_found" { + t.Fatalf("wrapped *Error lost: %+v", got) + } +} + +func TestPermanent_IsMatchesSentinel(t *testing.T) { + p := Permanent(Internal("boom")) + if !errors.Is(p, ErrPermanent) { + t.Fatal("errors.Is(p, ErrPermanent) must hold") + } + wrapped := fmt.Errorf("publish: %w", p) + if !errors.Is(wrapped, ErrPermanent) { + t.Fatal("errors.Is must traverse the wrap") + } +} + 
+func TestIsPermanent_DetectsWrapper(t *testing.T) { + inner := Forbidden("denied") + p := Permanent(inner) + ec, ok := IsPermanent(p) + if !ok { + t.Fatal("IsPermanent must return true on wrapped") + } + if ec.Code != CodeForbidden { + t.Fatalf("wrapped *Error lost: %+v", ec) + } +} + +func TestIsPermanent_FalseOnPlainErrcode(t *testing.T) { + if _, ok := IsPermanent(Internal("boom")); ok { + t.Fatal("plain *Error is not permanent") + } + if _, ok := IsPermanent(errors.New("raw")); ok { + t.Fatal("raw error is not permanent") + } + if _, ok := IsPermanent(nil); ok { + t.Fatal("nil is not permanent") + } +} diff --git a/pkg/errcode/reason.go b/pkg/errcode/reason.go new file mode 100644 index 000000000..45adf07b4 --- /dev/null +++ b/pkg/errcode/reason.go @@ -0,0 +1,5 @@ +package errcode + +// Reason is the wire `reason` field: an open set of domain-specific machine +// codes the frontend switches on. Concrete reasons live in codes_.go. +type Reason string diff --git a/pkg/errcode/reason_test.go b/pkg/errcode/reason_test.go new file mode 100644 index 000000000..5c246136b --- /dev/null +++ b/pkg/errcode/reason_test.go @@ -0,0 +1,10 @@ +package errcode + +import "testing" + +func TestReason_IsString(t *testing.T) { + var r Reason = "max_room_size_reached" + if string(r) != "max_room_size_reached" { + t.Fatal("Reason must be a string-backed type") + } +} diff --git a/pkg/idgen/idgen.go b/pkg/idgen/idgen.go index 9d502148e..dd0dd8d32 100644 --- a/pkg/idgen/idgen.go +++ b/pkg/idgen/idgen.go @@ -157,6 +157,24 @@ func GenerateRequestID() string { return u.String() } +// ResolveRequestID enforces the repo-wide "mint everywhere" policy on inbound +// X-Request-ID values: if inbound is a valid hyphenated UUID, it passes through +// unchanged; otherwise a fresh UUIDv7 is minted. 
replaced is true ONLY when +// inbound was non-empty-and-invalid (i.e., a malformed client value was +// swapped) — empty inbound returns (fresh, false) because "missing" is the +// benign common case, not a client bug. Callers should emit a Warn on +// replaced=true so a buggy client stays traceable. +// +// This is the transport-agnostic primitive. NATS callers wrap it in +// natsutil.StampRequestID, which also handles ctx-stamping and the warn log; +// HTTP callers (Gin middleware) call it directly with c.GetHeader(...). +func ResolveRequestID(inbound string) (id string, replaced bool) { + if IsValidUUID(inbound) { + return inbound, false + } + return GenerateRequestID(), inbound != "" +} + // IsValidUUID reports whether s is a well-formed hyphenated UUID of any version // (case-insensitive). Used to validate inbound X-Request-ID headers — we don't // care which UUID scheme the caller used (v4 or v7), only that the shape is diff --git a/pkg/idgen/idgen_test.go b/pkg/idgen/idgen_test.go index b48dd40dc..42edfd37a 100644 --- a/pkg/idgen/idgen_test.go +++ b/pkg/idgen/idgen_test.go @@ -310,3 +310,49 @@ func TestIsValidUUID_AcceptsGenerateRequestIDOutput(t *testing.T) { assert.True(t, idgen.IsValidUUID(idgen.GenerateRequestID())) } } + +func TestResolveRequestID(t *testing.T) { + cases := []struct { + name string + inbound string + wantID string // "" means "any minted UUID, just not the inbound" + wantReplaced bool + }{ + { + name: "valid_uuid_passes_through", + inbound: "01970a4f-8c2d-7c9a-abcd-e0123456789f", + wantID: "01970a4f-8c2d-7c9a-abcd-e0123456789f", + wantReplaced: false, + }, + { + name: "empty_mints_fresh_not_replaced", + inbound: "", + wantID: "", + wantReplaced: false, // empty inbound is "missing", not "replaced" + }, + { + name: "malformed_mints_fresh_and_reports_replaced", + inbound: "not-a-uuid", + wantID: "", + wantReplaced: true, + }, + { + name: "wrong_length_mints_fresh_and_reports_replaced", + inbound: "01970a4f-8c2d-7c9a-abcd", + wantID: "", 
+ wantReplaced: true, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + id, replaced := idgen.ResolveRequestID(tc.inbound) + assert.Equal(t, tc.wantReplaced, replaced) + if tc.wantID != "" { + assert.Equal(t, tc.wantID, id) + } else { + assert.True(t, idgen.IsValidUUID(id), "minted id must be a valid UUID, got %q", id) + assert.NotEqual(t, tc.inbound, id) + } + }) + } +} diff --git a/pkg/model/error.go b/pkg/model/error.go deleted file mode 100644 index 2e1e44ab4..000000000 --- a/pkg/model/error.go +++ /dev/null @@ -1,7 +0,0 @@ -package model - -type ErrorResponse struct { - Error string `json:"error"` - Code string `json:"code,omitempty"` - RoomID string `json:"roomId,omitempty"` -} diff --git a/pkg/model/event.go b/pkg/model/event.go index 9793bd1fb..c483bf2af 100644 --- a/pkg/model/event.go +++ b/pkg/model/event.go @@ -329,6 +329,10 @@ type AsyncJobResult struct { Status string `json:"status"` RoomID string `json:"roomId,omitempty"` Error string `json:"error,omitempty"` + // Code and Reason mirror the errcode envelope; typed as string so pkg/model + // does not import pkg/errcode. + Code string `json:"code,omitempty"` + Reason string `json:"reason,omitempty"` Timestamp int64 `json:"timestamp"` } @@ -367,3 +371,7 @@ type CreateRoomReply struct { // CreateRoomReplyAccepted means validated + queued; persistence happens later in room-worker. const CreateRoomReplyAccepted = "accepted" + +// CreateRoomStatusExists indicates the requested DM already existed; RoomID is +// the existing room. Clients treat it as success and open that room. +const CreateRoomStatusExists = "exists" diff --git a/pkg/model/member.go b/pkg/model/member.go index fb5084acc..d8e788d67 100644 --- a/pkg/model/member.go +++ b/pkg/model/member.go @@ -158,7 +158,7 @@ type SyncCreateDMRequest struct { OtherAccount string `json:"otherAccount" bson:"otherAccount"` } -// SyncCreateDMReply is the success reply; errors flow via natsutil.ReplyError instead. 
+// SyncCreateDMReply is the success reply; errors flow via errnats.Reply (pkg/errcode envelope) instead. type SyncCreateDMReply struct { Success bool `json:"success" bson:"success"` Subscription Subscription `json:"subscription" bson:"subscription"` diff --git a/pkg/model/model_test.go b/pkg/model/model_test.go index 956c3a013..0e1c898d0 100644 --- a/pkg/model/model_test.go +++ b/pkg/model/model_test.go @@ -2072,17 +2072,10 @@ func TestCreateRoomRequestRoundtrip(t *testing.T) { assert.Equal(t, int64(1740000000000), dst.Timestamp) } -func TestErrorResponseRoomIDOmitempty(t *testing.T) { - er := model.ErrorResponse{Error: "internal"} - body, err := json.Marshal(er) - require.NoError(t, err) - assert.NotContains(t, string(body), "roomId") - - er2 := model.ErrorResponse{Error: "dm already exists", RoomID: "r1"} - body2, err := json.Marshal(er2) - require.NoError(t, err) - assert.Contains(t, string(body2), `"roomId":"r1"`) -} +// TestErrorResponseRoomIDOmitempty was removed: model.ErrorResponse was deleted +// alongside the rest of the legacy error machinery (see pkg/errcode for the +// canonical client-facing error type). The DM-exists path now returns a success +// reply (model.CreateRoomReply{Status: CreateRoomStatusExists, RoomID}). func TestAsyncJobResultShape(t *testing.T) { r := model.AsyncJobResult{ @@ -2102,10 +2095,24 @@ func TestAsyncJobResultShape(t *testing.T) { assert.NotContains(t, string(data), `"job"`) assert.NotContains(t, string(data), `"success"`) - r2 := model.AsyncJobResult{Operation: model.AsyncJobOpRoomMemberAdd, Status: "error", Error: "failed"} + // Success case must omit the error-only fields. 
+ assert.NotContains(t, string(data), `"code"`) + assert.NotContains(t, string(data), `"reason"`) + + r2 := model.AsyncJobResult{ + Operation: model.AsyncJobOpRoomMemberAdd, + Status: "error", + Error: "not subscribed", + Code: "forbidden", + Reason: "not_subscribed", + } raw2, err := json.Marshal(r2) require.NoError(t, err) assert.NotContains(t, string(raw2), `"roomId"`) + var dst2 model.AsyncJobResult + require.NoError(t, json.Unmarshal(raw2, &dst2)) + assert.Equal(t, "forbidden", dst2.Code) + assert.Equal(t, "not_subscribed", dst2.Reason) } func TestAsyncJobResultOpConstants(t *testing.T) { @@ -2128,23 +2135,8 @@ func TestAddMembersRequestNoRequestIDField(t *testing.T) { assert.NotContains(t, string(body), "requestId") } -func TestErrorResponseJSON(t *testing.T) { - t.Run("without code, omitempty hides the field", func(t *testing.T) { - src := model.ErrorResponse{Error: "boom"} - data, err := json.Marshal(src) - require.NoError(t, err) - assert.JSONEq(t, `{"error":"boom"}`, string(data)) - roundTrip(t, &src, &model.ErrorResponse{}) - }) - - t.Run("with code, both fields present", func(t *testing.T) { - src := model.ErrorResponse{Error: "blocked", Code: "large_room_post_restricted"} - data, err := json.Marshal(src) - require.NoError(t, err) - assert.JSONEq(t, `{"error":"blocked","code":"large_room_post_restricted"}`, string(data)) - roundTrip(t, &src, &model.ErrorResponse{}) - }) -} +// TestErrorResponseJSON was removed alongside model.ErrorResponse. The wire +// envelope is now owned by pkg/errcode (see pkg/errcode/error_test.go). 
func TestReadReceiptRequestJSON(t *testing.T) { r := model.ReadReceiptRequest{MessageID: "m1"} diff --git a/pkg/natsrouter/README.md b/pkg/natsrouter/README.md index 7888f09ef..d2df321ef 100644 --- a/pkg/natsrouter/README.md +++ b/pkg/natsrouter/README.md @@ -68,7 +68,7 @@ Under the unbounded default, callers that hit a timeout receive a generic `{"err ```go if errors.Is(err, context.DeadlineExceeded) { - return nil, natsrouter.ErrUnavailable("request timed out") + return nil, errcode.Unavailable("request timed out") } ``` @@ -225,10 +225,15 @@ func (c *Context) Abort() // Check if the chain was aborted. func (c *Context) IsAborted() bool -// Reply helpers. +// Reply helpers. For errors prefer returning a typed *errcode.Error from +// the handler — the router calls errnats.Reply automatically. Middleware +// replies directly via errnats.Reply(c, c.Msg, err). func (c *Context) ReplyJSON(v any) -func (c *Context) ReplyError(msg string) -func (c *Context) ReplyRouteError(e *RouteError) +// ReplyError and ReplyRouteError were removed; use errnats.Reply instead. + +// WithLogValues enriches the ctx logger so the centralized errcode.Classify +// log line carries the given attrs. Cycle-safe (derives from the inner ctx). +func (c *Context) WithLogValues(args ...any) // The raw NATS message (for advanced use cases). c.Msg *nats.Msg @@ -263,40 +268,32 @@ type HandlerFunc func(c *Context) type Middleware = HandlerFunc ``` -### RouteError +### Error replies — owned by `pkg/errcode` + +Client-facing errors live in `pkg/errcode` (not in this package). natsrouter is the transport: when a handler returns any error, the router invokes `errnats.Reply(ctx, msg, err)`, which calls `errcode.Classify` and writes the JSON envelope. The full developer guide is `docs/error-handling.md`; the wire-side reference is `docs/client-api.md` §6. + +Quick reference for handler authors: ```go -// User-facing error with optional machine-readable code. 
-type RouteError struct { - Message string `json:"error"` - Code string `json:"code,omitempty"` +// Typed client-facing errors — named constructor per category. +return nil, errcode.BadRequest("name is required") +return nil, errcode.NotFound("room not found") +return nil, errcode.Forbidden("only owners can update roles") +return nil, errcode.Conflict("room is at maximum capacity", + errcode.WithReason(errcode.RoomMaxSizeReached)) + +// Dynamic message — format at the call site (no *f variants on purpose). +return nil, errcode.BadRequest(fmt.Sprintf("batch size %d exceeds limit %d", n, max)) + +// Infra / DB / third-party — DON'T classify manually; bubble up and let +// Classify collapse to internal at the boundary (real cause logged once, +// never sent to the client). +if err := h.store.Find(ctx, id); err != nil { + return nil, fmt.Errorf("loading room: %w", err) // → client sees "internal error" } - -// Constructors. -func Err(message string) *RouteError -func Errf(format string, args ...any) *RouteError -func ErrWithCode(code, message string) *RouteError - -// Convenience constructors with standard codes. -func ErrBadRequest(message string) *RouteError // code: "bad_request" -func ErrNotFound(message string) *RouteError // code: "not_found" -func ErrForbidden(message string) *RouteError // code: "forbidden" -func ErrConflict(message string) *RouteError // code: "conflict" -func ErrInternal(message string) *RouteError // code: "internal" -func ErrUnavailable(message string) *RouteError // code: "unavailable" - -// Standard error code constants. -const ( - CodeBadRequest = "bad_request" - CodeNotFound = "not_found" - CodeForbidden = "forbidden" - CodeConflict = "conflict" - CodeInternal = "internal" - CodeUnavailable = "unavailable" // emitted by admission control -) ``` -`ErrUnavailable` is the structured reply emitted automatically by the router when the admission semaphore is saturated. 
Application code can also emit it explicitly to signal a recoverable, retry-worthy condition (e.g. mapping `context.DeadlineExceeded` from a downstream call — see `HandlerTimeout` doc). +`errcode.Unavailable("service busy")` is also what the router emits automatically when the admission semaphore is saturated. Application code can emit it explicitly to signal a recoverable condition (e.g. mapping `context.DeadlineExceeded` from a downstream call — see `HandlerTimeout`). ### Built-in Middleware @@ -329,7 +326,7 @@ func Logging() HandlerFunc // bound code will run past the deadline. Recommended pattern when a // downstream call returns context.DeadlineExceeded: // if errors.Is(err, context.DeadlineExceeded) { -// return nil, natsrouter.ErrUnavailable("request timed out") +// return nil, errcode.Unavailable("request timed out") // } func HandlerTimeout(d time.Duration) HandlerFunc ``` @@ -385,7 +382,7 @@ func (s *Service) GetRoom(c *natsrouter.Context, req GetRoomReq) (*Room, error) return nil, fmt.Errorf("finding room: %w", err) } if room == nil { - return nil, natsrouter.ErrNotFound("room not found") + return nil, errcode.NotFound("room not found") } return room, nil } @@ -472,11 +469,11 @@ func (s *Service) CreateRoom(c *natsrouter.Context, req CreateReq) (*Room, error ```go v, ok := c.Get("user") if !ok { - return nil, natsrouter.ErrForbidden("authentication required") + return nil, errcode.Forbidden("authentication required") } user, ok := v.(User) if !ok { - return nil, natsrouter.ErrInternal("user value has unexpected type") + return nil, errcode.Internal("user value has unexpected type") } ``` @@ -561,7 +558,7 @@ func (s *Service) GetRoom(c *natsrouter.Context, req GetReq) (*Room, error) { } if room == nil { // User-facing error — client sees: {"error":"room not found","code":"not_found"} - return nil, natsrouter.ErrNotFound("room not found") + return nil, errcode.NotFound("room not found") } return room, nil } @@ -570,7 +567,7 @@ func (s *Service) GetRoom(c 
*natsrouter.Context, req GetReq) (*Room, error) { RouteErrors can be wrapped and still detected: ```go -return nil, fmt.Errorf("access check: %w", natsrouter.ErrForbidden("denied")) +return nil, fmt.Errorf("access check: %w", errcode.Forbidden("denied")) // Client still receives: {"error":"denied","code":"forbidden"} ``` @@ -580,7 +577,7 @@ When `HandlerTimeout` (or any other context source) cancels a request mid-flight ```go if errors.Is(err, context.DeadlineExceeded) { - return nil, natsrouter.ErrUnavailable("request timed out") + return nil, errcode.Unavailable("request timed out") } ``` diff --git a/pkg/natsrouter/context.go b/pkg/natsrouter/context.go index 42a95a228..a2a58daa8 100644 --- a/pkg/natsrouter/context.go +++ b/pkg/natsrouter/context.go @@ -7,6 +7,7 @@ import ( "github.com/nats-io/nats.go" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/natsutil" ) @@ -62,6 +63,10 @@ func releaseContext(c *Context) { c.chain.handlers = nil c.chain.index = 0 chainPool.Put(c.chain) + // Nil out so Next/Abort/IsAborted panic loudly if a post-handler + // goroutine calls them — otherwise it would silently read the next + // request's chain state from the pool. + c.chain = nil // c itself is left to GC. External ctx consumers may still hold it; // every field they can observe is stable from the moment of construction // (Msg, Params, keys); the underlying ctx may have been swapped by @@ -84,8 +89,16 @@ func (c *Context) Done() <-chan struct{} { return c.ctx.Done() } func (c *Context) Err() error { return c.ctx.Err() } func (c *Context) Value(key any) any { return c.ctx.Value(key) } +// Chain methods are handler-internal. Calling them from a post-handler +// goroutine panics — chainState is pooled and would otherwise silently read +// the next request's state. 
+const chainAfterReleasePanic = "natsrouter: chain method called after handler chain ended; pass values out via c.Value/c.Get before returning" + // Next executes the next handler in the chain. func (c *Context) Next() { + if c.chain == nil { + panic(chainAfterReleasePanic) + } c.chain.index++ for c.chain.index < len(c.chain.handlers) { c.chain.handlers[c.chain.index](c) @@ -95,11 +108,17 @@ func (c *Context) Next() { // Abort stops the middleware chain. func (c *Context) Abort() { + if c.chain == nil { + panic(chainAfterReleasePanic) + } c.chain.index = len(c.chain.handlers) } // IsAborted returns true if the chain was aborted. func (c *Context) IsAborted() bool { + if c.chain == nil { + panic(chainAfterReleasePanic) + } return c.chain.index >= len(c.chain.handlers) } @@ -149,6 +168,12 @@ func (c *Context) SetContext(ctx context.Context) { c.ctx = ctx } +// WithLogValues enriches the ctx logger with key/value pairs for the errcode +// log line. Derives from c.ctx (avoids the SetContext Value-delegation cycle). +func (c *Context) WithLogValues(args ...any) { + c.SetContext(errcode.WithLogValues(c.ctx, args...)) +} + // Param returns a named parameter from the subject. Shortcut for c.Params.Get(key). func (c *Context) Param(key string) string { return c.Params.Get(key) @@ -188,14 +213,3 @@ func (c *Context) GetHeader(key string) string { func (c *Context) ReplyJSON(v any) { natsutil.ReplyJSON(c.Msg, v) } - -// ReplyError sends an error response to the client. -func (c *Context) ReplyError(msg string) { - natsutil.ReplyError(c.Msg, msg) -} - -// ReplyRouteError sends a structured error response with an optional code. -// Use this from middleware when you need machine-readable error codes. 
-func (c *Context) ReplyRouteError(e *RouteError) { - natsutil.ReplyJSON(c.Msg, e) -} diff --git a/pkg/natsrouter/context_test.go b/pkg/natsrouter/context_test.go index fb7907653..bb36c006a 100644 --- a/pkg/natsrouter/context_test.go +++ b/pkg/natsrouter/context_test.go @@ -1,15 +1,38 @@ package natsrouter import ( + "bytes" "context" + "errors" + "log/slog" "strconv" + "strings" "sync" "testing" "github.com/nats-io/nats.go" "github.com/stretchr/testify/assert" + + "github.com/hmchangw/chat/pkg/errcode" ) +// TestContext_WithLogValues_NoCycleAndEnriches verifies the seam derives from +// the inner ctx (no Value-delegation cycle) and that the attached attrs reach +// the centralized Classify log line. +func TestContext_WithLogValues_NoCycleAndEnriches(t *testing.T) { + var buf bytes.Buffer + c := NewContext(map[string]string{}) + c.SetContext(errcode.WithLogger(c.ctx, slog.New(slog.NewJSONHandler(&buf, nil)))) + + c.WithLogValues("account", "alice") // must not hang (no ctx cycle) + _ = c.Value("anything") // a lookup must terminate (would loop on a cycle) + + errcode.Classify(c, errors.New("boom")) + if !strings.Contains(buf.String(), "alice") { + t.Fatalf("log values not applied: %s", buf.String()) + } +} + // TestContext_ConcurrentKeysAccess_NoRace proves that Set and Get are safe to // call concurrently. Without a mutex, Go's map detector panics on concurrent // writes and the race detector flags concurrent read/write. @@ -160,3 +183,16 @@ func TestContext_GetHeader(t *testing.T) { "exact case match must succeed") }) } + +// Use-after-release safety: chainState is pooled, so a post-handler goroutine +// calling Next/Abort/IsAborted on a released *Context would silently read the +// next request's chain state. The nil-out + nil-check converts the silent +// corruption into a loud panic. 
+func TestContext_ChainMethodsPanicAfterRelease(t *testing.T) { + c := acquireContext(context.Background(), nil, Params{}, []HandlerFunc{func(*Context) {}}) + releaseContext(c) + + assert.PanicsWithValue(t, chainAfterReleasePanic, func() { c.Next() }) + assert.PanicsWithValue(t, chainAfterReleasePanic, func() { c.Abort() }) + assert.PanicsWithValue(t, chainAfterReleasePanic, func() { c.IsAborted() }) +} diff --git a/pkg/natsrouter/doc.go b/pkg/natsrouter/doc.go index 188a21bd8..d6f560447 100644 --- a/pkg/natsrouter/doc.go +++ b/pkg/natsrouter/doc.go @@ -15,7 +15,7 @@ // context.DeadlineExceeded to ErrUnavailable explicitly: // // if errors.Is(err, context.DeadlineExceeded) { -// return nil, natsrouter.ErrUnavailable("request timed out") +// return nil, errcode.Unavailable("request timed out") // } // // Without that mapping there is no structured retry signal in the diff --git a/pkg/natsrouter/errors.go b/pkg/natsrouter/errors.go deleted file mode 100644 index 91918ddaa..000000000 --- a/pkg/natsrouter/errors.go +++ /dev/null @@ -1,89 +0,0 @@ -package natsrouter - -import "fmt" - -// RouteError is an error that produces a user-facing response. -// When a handler returns a RouteError, the router sends it as the reply -// instead of the generic "internal error". Use this for expected error -// conditions that the client should see (not found, forbidden, validation, etc.). -// -// Any other error returned by a handler is treated as an internal error — -// it is logged and the client receives "internal error". 
-// -// Example: -// -// func (s *Service) GetRoom(ctx context.Context, p Params, req GetRoomReq) (*Room, error) { -// room, err := s.store.Find(ctx, req.ID) -// if err != nil { -// return nil, fmt.Errorf("finding room: %w", err) // → "internal error" to client -// } -// if room == nil { -// return nil, natsrouter.Errorf("room %s not found", req.ID) // → sent to client as-is -// } -// return room, nil -// } -type RouteError struct { - Message string `json:"error"` - Code string `json:"code,omitempty"` -} - -// Error implements the error interface. -func (e *RouteError) Error() string { - if e.Code != "" { - return fmt.Sprintf("%s: %s", e.Code, e.Message) - } - return e.Message -} - -// Err creates a RouteError with the given message. -// The client receives: {"error": "message"} -func Err(message string) *RouteError { - return &RouteError{Message: message} -} - -// Errf creates a RouteError with a formatted message. -// The client receives: {"error": "formatted message"} -func Errf(format string, args ...any) *RouteError { - return &RouteError{Message: fmt.Sprintf(format, args...)} -} - -// ErrWithCode creates a RouteError with a machine-readable code and message. -// The client receives: {"error": "message", "code": "code"} -// -// Common codes: "not_found", "forbidden", "bad_request", "conflict" -func ErrWithCode(code, message string) *RouteError { - return &RouteError{Message: message, Code: code} -} - -// Standard error codes. -const ( - CodeBadRequest = "bad_request" - CodeNotFound = "not_found" - CodeForbidden = "forbidden" - CodeConflict = "conflict" - CodeInternal = "internal" - // CodeUnavailable signals the service is temporarily over capacity and - // the caller should retry. Used by the router's admission control when - // the per-pod handler concurrency cap is reached. - CodeUnavailable = "unavailable" -) - -// ErrBadRequest creates a user-facing bad request error. 
-func ErrBadRequest(message string) *RouteError { return ErrWithCode(CodeBadRequest, message) } - -// ErrNotFound creates a user-facing not found error. -func ErrNotFound(message string) *RouteError { return ErrWithCode(CodeNotFound, message) } - -// ErrForbidden creates a user-facing forbidden error. -func ErrForbidden(message string) *RouteError { return ErrWithCode(CodeForbidden, message) } - -// ErrConflict creates a user-facing conflict error. -func ErrConflict(message string) *RouteError { return ErrWithCode(CodeConflict, message) } - -// ErrInternal creates a user-facing internal error. -func ErrInternal(message string) *RouteError { return ErrWithCode(CodeInternal, message) } - -// ErrUnavailable creates a user-facing service-busy error. Returned by the -// router's admission control when the per-pod handler concurrency cap is -// reached. Callers should retry with backoff. -func ErrUnavailable(message string) *RouteError { return ErrWithCode(CodeUnavailable, message) } diff --git a/pkg/natsrouter/errors_test.go b/pkg/natsrouter/errors_test.go deleted file mode 100644 index 130818c74..000000000 --- a/pkg/natsrouter/errors_test.go +++ /dev/null @@ -1,18 +0,0 @@ -package natsrouter - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestErrUnavailable_HasCodeAndMessage(t *testing.T) { - err := ErrUnavailable("service busy") - assert.Equal(t, "unavailable", err.Code) - assert.Equal(t, "service busy", err.Message) -} - -func TestCodeUnavailable_UsedByErrUnavailable(t *testing.T) { - err := ErrUnavailable("any message") - assert.Equal(t, CodeUnavailable, err.Code) -} diff --git a/pkg/natsrouter/example_test.go b/pkg/natsrouter/example_test.go index d48d76443..bb95727ab 100644 --- a/pkg/natsrouter/example_test.go +++ b/pkg/natsrouter/example_test.go @@ -8,6 +8,8 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errnats" 
"github.com/hmchangw/chat/pkg/natsrouter" ) @@ -89,8 +91,8 @@ func Example_errorHandling() { func(c *natsrouter.Context, req GreetRequest) (*Room, error) { room := findRoom(c.Param("roomID")) if room == nil { - // User-facing error — client receives: {"error":"room not found","code":"not_found"} - return nil, natsrouter.ErrWithCode("not_found", "room not found") + // User-facing error — client receives: {"code":"not_found","error":"room not found"} + return nil, errcode.NotFound("room not found") } return room, nil // If findRoom returned a Go error (e.g. DB failure), return it as-is: @@ -127,10 +129,12 @@ func Example_customMiddleware() { nc, _ := otelnats.Connect(nats.DefaultURL) router := natsrouter.New(nc, "my-service") - // Custom middleware that rejects requests with empty payloads. + // Custom middleware that rejects requests with empty payloads. Middleware + // can't return an error like a handler, so it replies with a typed errcode + // envelope directly via errnats.Reply. requireBody := natsrouter.HandlerFunc(func(c *natsrouter.Context) { if len(c.Msg.Data) == 0 { - c.ReplyError("request body required") + errnats.Reply(c, c.Msg, errcode.BadRequest("request body required")) return } c.Next() diff --git a/pkg/natsrouter/integration_test.go b/pkg/natsrouter/integration_test.go index 61d910b38..26cfd05b9 100644 --- a/pkg/natsrouter/integration_test.go +++ b/pkg/natsrouter/integration_test.go @@ -16,6 +16,7 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/testutil" ) @@ -242,9 +243,9 @@ func TestIntegration_BusyReplyOnSaturation(t *testing.T) { data, _ := json.Marshal(echoReq{Seq: 2}) resp, err := nc.Request(context.Background(), "busy.2", data, 2*time.Second) require.NoError(t, err) - var re RouteError - require.NoError(t, json.Unmarshal(resp.Data, &re)) - assert.Equal(t, CodeUnavailable, re.Code, "expected busy reply once slot is held") + ee, gotEnvelope := 
errcode.Parse(resp.Data) + require.True(t, gotEnvelope, "expected error envelope once slot is held") + assert.Equal(t, errcode.CodeUnavailable, ee.Code, "expected busy reply once slot is held") // Release the gate; first request must complete normally. close(gate) diff --git a/pkg/natsrouter/middleware.go b/pkg/natsrouter/middleware.go index 2eb077e81..db86c550d 100644 --- a/pkg/natsrouter/middleware.go +++ b/pkg/natsrouter/middleware.go @@ -5,7 +5,10 @@ import ( "log/slog" "time" - "github.com/hmchangw/chat/pkg/idgen" + "github.com/nats-io/nats.go" + + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errnats" "github.com/hmchangw/chat/pkg/natsutil" ) @@ -16,18 +19,24 @@ type Middleware = HandlerFunc // requestIDKey is the context key used to store the request ID. const requestIDKey = "requestID" -// RequestID returns middleware that extracts X-Request-ID (or mints via idgen) and stores it on both the natsrouter keys map and the underlying ctx. +// RequestID extracts X-Request-ID (or mints via idgen), stores it on the +// natsrouter keys map AND the underlying ctx, AND enriches the ctx logger so +// every Classify line on this request automatically carries request_id — +// handlers don't need to re-pass it. 
func RequestID() HandlerFunc { return func(c *Context) { - reqID := "" - if c.Msg != nil && c.Msg.Header != nil { - reqID = c.Msg.Header.Get(natsutil.RequestIDHeader) - } - if !idgen.IsValidUUID(reqID) { - reqID = idgen.GenerateRequestID() + var ( + headers nats.Header + subj string + ) + if c.Msg != nil { + headers = c.Msg.Header + subj = c.Msg.Subject } + ctx, reqID := natsutil.StampRequestID(c.ctx, headers, subj) c.Set(requestIDKey, reqID) - c.SetContext(natsutil.WithRequestID(c.ctx, reqID)) + c.SetContext(ctx) + c.WithLogValues("request_id", reqID) c.Next() } } @@ -39,7 +48,7 @@ func requestAttrs(c *Context) []any { attrs = append(attrs, "subject", c.Msg.Subject) } if id, ok := c.Get(requestIDKey); ok { - attrs = append(attrs, "requestID", id) + attrs = append(attrs, "request_id", id) } return attrs } @@ -51,7 +60,8 @@ func Recovery() HandlerFunc { if r := recover(); r != nil { attrs := append(requestAttrs(c), "panic", r) slog.Error("panic recovered", attrs...) - c.ReplyError("internal error") + // Already logged above; ReplyQuiet avoids a redundant Classify line. + errnats.ReplyQuiet(c.Msg, errcode.Internal("internal error")) c.Abort() } }() @@ -82,12 +92,13 @@ func Logging() HandlerFunc { // // Reply mapping — when a context-aware downstream call returns // context.DeadlineExceeded and the handler returns -// `fmt.Errorf("...: %w", err)`, the router's replyErr path falls through -// to `"internal error"` (no RouteError match). Recommended pattern: in -// the handler, map the deadline-expired sentinel explicitly, e.g. +// `fmt.Errorf("...: %w", err)`, the router's replyErr path collapses +// to `{"code":"internal","error":"internal error"}` (no typed errcode +// match). Recommended pattern: in the handler, map the deadline-expired +// sentinel explicitly, e.g. 
// // if errors.Is(err, context.DeadlineExceeded) { -// return nil, natsrouter.ErrUnavailable("request timed out") +// return nil, errcode.Unavailable("request timed out") // } // // so the caller sees a structured "unavailable" code instead of a diff --git a/pkg/natsrouter/params.go b/pkg/natsrouter/params.go index 18f95706e..40aa63f12 100644 --- a/pkg/natsrouter/params.go +++ b/pkg/natsrouter/params.go @@ -3,6 +3,8 @@ package natsrouter import ( "fmt" "strings" + + "github.com/hmchangw/chat/pkg/errcode" ) // Params holds named tokens extracted from a NATS subject at request time. @@ -38,7 +40,7 @@ func (p Params) MustGet(key string) string { func (p Params) Require(key string) (string, error) { v, ok := p.values[key] if !ok || v == "" { - return "", ErrBadRequest("missing required param: " + key) + return "", errcode.BadRequest("missing required param: " + key) } return v, nil } diff --git a/pkg/natsrouter/register.go b/pkg/natsrouter/register.go index 772c24f7e..de3fb225d 100644 --- a/pkg/natsrouter/register.go +++ b/pkg/natsrouter/register.go @@ -2,8 +2,10 @@ package natsrouter import ( "encoding/json" - "errors" "log/slog" + + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errnats" ) // Register subscribes a typed handler to a subject pattern. @@ -17,7 +19,10 @@ func Register[Req, Resp any]( handler := HandlerFunc(func(c *Context) { var req Req if err := json.Unmarshal(c.Msg.Data, &req); err != nil { - c.ReplyError("invalid request payload") + // Cause preserves the parse-error chain for the Classify server log + // without echoing it to the client (errcode.Error.cause is unexported, + // never JSON-serialized). The user-facing message stays generic. + replyErr(c, errcode.BadRequest("invalid request payload", errcode.WithCause(err))) return } @@ -73,12 +78,7 @@ func RegisterVoid[Req any]( r.addRoute(pattern, []HandlerFunc{handler}) } +// replyErr classifies err and sends the errcode envelope on the reply subject. 
func replyErr(c *Context, err error) { - var routeErr *RouteError - if errors.As(err, &routeErr) { - c.ReplyJSON(routeErr) - return - } - slog.Error("handler error", "error", err, "subject", c.Msg.Subject) - c.ReplyError("internal error") + errnats.Reply(c, c.Msg, err) } diff --git a/pkg/natsrouter/router.go b/pkg/natsrouter/router.go index c29dd2c6f..3596b46bd 100644 --- a/pkg/natsrouter/router.go +++ b/pkg/natsrouter/router.go @@ -13,7 +13,8 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" - "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errnats" ) // Router manages NATS subscriptions with pattern-based routing and middleware. @@ -119,7 +120,8 @@ func (r *Router) replyBusy(msg *nats.Msg) { "subject", msg.Subject) return } - natsutil.ReplyJSON(msg, ErrUnavailable("service busy")) + // Admission rejection is operational, not a request failure; ReplyQuiet skips Classify. + errnats.ReplyQuiet(msg, errcode.Unavailable("service busy")) } // admit attempts to acquire an admission slot. The returned release @@ -201,7 +203,8 @@ func (r *Router) addRoute(pattern string, handlers []HandlerFunc) { "panic", rec, "stack", string(debug.Stack())) if m.Msg.Reply != "" { - natsutil.ReplyError(m.Msg, "internal error") + // Already logged via the Warn above; ReplyQuiet avoids a second line. 
+ errnats.ReplyQuiet(m.Msg, errcode.Internal("internal error")) } } }() diff --git a/pkg/natsrouter/router_test.go b/pkg/natsrouter/router_test.go index 225cfe32b..d221b6e52 100644 --- a/pkg/natsrouter/router_test.go +++ b/pkg/natsrouter/router_test.go @@ -14,8 +14,8 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/idgen" - "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsutil" ) @@ -93,9 +93,9 @@ func TestRegister_InvalidJSON(t *testing.T) { resp, err := nc.Request(context.Background(), "test.123", []byte("not json"), 2*time.Second) require.NoError(t, err) - var errResp model.ErrorResponse + var errResp errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &errResp)) - assert.Equal(t, "invalid request payload", errResp.Error) + assert.Equal(t, "invalid request payload", errResp.Message) } func TestRegister_HandlerError(t *testing.T) { @@ -111,9 +111,9 @@ func TestRegister_HandlerError(t *testing.T) { resp, err := nc.Request(context.Background(), "test.123", data, 2*time.Second) require.NoError(t, err) - var errResp model.ErrorResponse + var errResp errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &errResp)) - assert.Equal(t, "internal error", errResp.Error) + assert.Equal(t, "internal error", errResp.Message) } func TestRegisterNoBody_Success(t *testing.T) { @@ -215,9 +215,9 @@ func TestRecovery_CatchesPanic(t *testing.T) { resp, err := nc.Request(context.Background(), "test.123", data, 2*time.Second) require.NoError(t, err) - var errResp model.ErrorResponse + var errResp errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &errResp)) - assert.Equal(t, "internal error", errResp.Error) + assert.Equal(t, "internal error", errResp.Message) } func TestRegister_NoParams(t *testing.T) { @@ -244,17 +244,17 @@ func TestRegister_RouteError(t *testing.T) { Register(r, "test.{id}", func(c *Context, req testReq) (*testResp, error) { - 
return nil, ErrWithCode("not_found", "thing not found") + return nil, errcode.NotFound("thing not found") }) data, _ := json.Marshal(testReq{Name: "test"}) resp, err := nc.Request(context.Background(), "test.123", data, 2*time.Second) require.NoError(t, err) - var result RouteError + var result errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &result)) assert.Equal(t, "thing not found", result.Message) - assert.Equal(t, "not_found", result.Code) + assert.Equal(t, "not_found", string(result.Code)) } func TestRegister_RouteErrorSimple(t *testing.T) { @@ -263,17 +263,18 @@ func TestRegister_RouteErrorSimple(t *testing.T) { Register(r, "test.{id}", func(c *Context, req testReq) (*testResp, error) { - return nil, Errf("user %s not allowed", "alice") + return nil, errcode.BadRequest(fmt.Sprintf("user %s not allowed", "alice")) }) data, _ := json.Marshal(testReq{Name: "test"}) resp, err := nc.Request(context.Background(), "test.123", data, 2*time.Second) require.NoError(t, err) - var result RouteError + var result errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &result)) assert.Equal(t, "user alice not allowed", result.Message) - assert.Equal(t, "", result.Code) + // Err/Errf now map to bad_request (code is always present in the new envelope). 
+ assert.Equal(t, "bad_request", string(result.Code)) } func TestRegister_InternalErrorNotExposed(t *testing.T) { @@ -289,9 +290,9 @@ func TestRegister_InternalErrorNotExposed(t *testing.T) { resp, err := nc.Request(context.Background(), "test.123", data, 2*time.Second) require.NoError(t, err) - var errResp model.ErrorResponse + var errResp errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &errResp)) - assert.Equal(t, "internal error", errResp.Error) + assert.Equal(t, "internal error", errResp.Message) assert.NotContains(t, string(resp.Data), "database") } @@ -333,32 +334,33 @@ func TestRegisterVoid_NoReply(t *testing.T) { require.Error(t, err) } -func TestRouteError_Error(t *testing.T) { - e := ErrWithCode("not_found", "room not found") - assert.Equal(t, "not_found: room not found", e.Error()) +func TestErrcodeError_Error(t *testing.T) { + // errcode.Error.Error() returns the user-safe message only (no "code: " prefix). + e := errcode.NotFound("room not found") + assert.Equal(t, "room not found", e.Error()) - e2 := Err("simple error") + e2 := errcode.BadRequest("simple error") assert.Equal(t, "simple error", e2.Error()) } -func TestRouteError_WrappedInFmtErrorf(t *testing.T) { +func TestErrcodeError_WrappedInFmtErrorf(t *testing.T) { nc := startTestNATS(t) r := New(nc, "test-service") - // RouteError wrapped with fmt.Errorf should still be detected via errors.As + // errcode error wrapped with fmt.Errorf should still be detected via errors.As Register(r, "test.{id}", func(c *Context, req testReq) (*testResp, error) { - return nil, fmt.Errorf("context: %w", ErrWithCode("forbidden", "not allowed")) + return nil, fmt.Errorf("context: %w", errcode.Forbidden("not allowed")) }) data, _ := json.Marshal(testReq{Name: "test"}) resp, err := nc.Request(context.Background(), "test.123", data, 2*time.Second) require.NoError(t, err) - var result RouteError + var result errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &result)) assert.Equal(t, "not allowed", 
result.Message) - assert.Equal(t, "forbidden", result.Code) + assert.Equal(t, "forbidden", string(result.Code)) } func TestContext_SetGet(t *testing.T) { @@ -460,9 +462,9 @@ func TestRegisterNoBody_HandlerError(t *testing.T) { resp, err := nc.Request(context.Background(), "test.123", nil, 2*time.Second) require.NoError(t, err) - var errResp model.ErrorResponse + var errResp errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &errResp)) - assert.Equal(t, "internal error", errResp.Error) + assert.Equal(t, "internal error", errResp.Message) } func TestRegisterNoBody_RouteError(t *testing.T) { @@ -471,16 +473,16 @@ func TestRegisterNoBody_RouteError(t *testing.T) { RegisterNoBody(r, "test.{id}", func(c *Context) (*testResp, error) { - return nil, ErrNotFound("item not found") + return nil, errcode.NotFound("item not found") }) resp, err := nc.Request(context.Background(), "test.123", nil, 2*time.Second) require.NoError(t, err) - var result RouteError + var result errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &result)) assert.Equal(t, "item not found", result.Message) - assert.Equal(t, "not_found", result.Code) + assert.Equal(t, "not_found", string(result.Code)) } func TestLogging_LogsRequest(t *testing.T) { @@ -502,67 +504,27 @@ func TestLogging_LogsRequest(t *testing.T) { assert.Equal(t, "ok", result.Greeting) } -func TestReplyRouteError(t *testing.T) { - nc := startTestNATS(t) - r := New(nc, "test-service") - - r.Use(func(c *Context) { - c.ReplyRouteError(ErrForbidden("access denied")) - c.Abort() - }) - - Register(r, "test.{id}", - func(c *Context, req testReq) (*testResp, error) { - t.Fatal("handler should not be called") - return nil, nil - }) - - data, _ := json.Marshal(testReq{Name: "test"}) - resp, err := nc.Request(context.Background(), "test.123", data, 2*time.Second) - require.NoError(t, err) - - var result RouteError - require.NoError(t, json.Unmarshal(resp.Data, &result)) - assert.Equal(t, "access denied", result.Message) - 
assert.Equal(t, "forbidden", result.Code) -} - -func TestErrConstants(t *testing.T) { - e := ErrBadRequest("invalid input") - assert.Equal(t, "bad_request", e.Code) - assert.Equal(t, "invalid input", e.Message) - - e = ErrNotFound("not here") - assert.Equal(t, "not_found", e.Code) - - e = ErrForbidden("nope") - assert.Equal(t, "forbidden", e.Code) - - e = ErrConflict("already exists") - assert.Equal(t, "conflict", e.Code) - - e = ErrInternal("service unavailable") - assert.Equal(t, "internal", e.Code) - assert.Equal(t, "service unavailable", e.Message) -} +// TestReplyRouteError and TestErrConstants were removed when the natsrouter +// shim (Err*/RouteError/ReplyRouteError) was deleted — they tested the shim +// itself. errcode constructors are covered by pkg/errcode/options_test.go. -func TestRegister_ErrInternal(t *testing.T) { +func TestRegister_TypedInternalError(t *testing.T) { nc := startTestNATS(t) r := New(nc, "test-service") Register(r, "test.{id}", func(c *Context, req testReq) (*testResp, error) { - return nil, ErrInternal("failed to load data") + return nil, errcode.Internal("failed to load data") }) data, _ := json.Marshal(testReq{Name: "test"}) resp, err := nc.Request(context.Background(), "test.123", data, 2*time.Second) require.NoError(t, err) - var result RouteError + var result errcode.Error require.NoError(t, json.Unmarshal(resp.Data, &result)) assert.Equal(t, "failed to load data", result.Message) - assert.Equal(t, "internal", result.Code) + assert.Equal(t, "internal", string(result.Code)) } func TestContext_SetContext_Propagates(t *testing.T) { diff --git a/pkg/natsutil/reply.go b/pkg/natsutil/reply.go index eb83e79df..298ef1c89 100644 --- a/pkg/natsutil/reply.go +++ b/pkg/natsutil/reply.go @@ -1,3 +1,8 @@ +// Package natsutil holds the few JSON/reply helpers the chat services share +// for NATS request/reply. 
Client-facing errors flow through pkg/errcode +// (errnats.Reply / errhttp.Write) — this package is success-reply mechanics +// only; the legacy MarshalError/MarshalErrorWithCode/ReplyError/TryParseError +// helpers were deleted alongside model.ErrorResponse. package natsutil import ( @@ -5,8 +10,6 @@ import ( "log/slog" "github.com/nats-io/nats.go" - - "github.com/hmchangw/chat/pkg/model" ) // MarshalResponse encodes a value as JSON for NATS responses. @@ -14,44 +17,20 @@ func MarshalResponse(v any) ([]byte, error) { return json.Marshal(v) } -// MarshalError encodes an error message as a JSON ErrorResponse. -func MarshalError(errMsg string) []byte { - data, _ := json.Marshal(model.ErrorResponse{Error: errMsg}) - return data -} - -// MarshalErrorWithCode encodes an error message and machine-readable code -// as a JSON ErrorResponse. The code is omitted from the wire payload when -// empty (omitempty on the Code field). -func MarshalErrorWithCode(errMsg, code string) []byte { - data, _ := json.Marshal(model.ErrorResponse{Error: errMsg, Code: code}) - return data -} - -// ReplyJSON sends a JSON-encoded response to a NATS message. +// ReplyJSON sends a JSON-encoded success response on msg's reply subject. +// On a marshal failure (an unmarshalable v — typically a programmer error), +// responds with a generic internal-error errcode envelope so the caller is +// not left hanging. func ReplyJSON(msg *nats.Msg, v any) { data, err := MarshalResponse(v) if err != nil { - ReplyError(msg, "marshal error: "+err.Error()) + slog.Error("marshal response failed", "error", err, "subject", msg.Subject) + if rErr := msg.Respond([]byte(`{"code":"internal","error":"internal error"}`)); rErr != nil { + slog.Error("reply failed", "error", rErr, "subject", msg.Subject) + } return } if err := msg.Respond(data); err != nil { - slog.Error("reply failed", "error", err) - } -} - -// ReplyError sends a JSON-encoded error response to a NATS message. 
-func ReplyError(msg *nats.Msg, errMsg string) { - if err := msg.Respond(MarshalError(errMsg)); err != nil { - slog.Error("error reply failed", "error", err) - } -} - -// TryParseError returns the ErrorResponse iff data decodes cleanly with a non-empty Error. -func TryParseError(data []byte) (model.ErrorResponse, bool) { - var r model.ErrorResponse - if err := json.Unmarshal(data, &r); err != nil || r.Error == "" { - return model.ErrorResponse{}, false + slog.Error("reply failed", "error", err, "subject", msg.Subject) } - return r, true } diff --git a/pkg/natsutil/reply_test.go b/pkg/natsutil/reply_test.go index 561453182..b77f3ebc9 100644 --- a/pkg/natsutil/reply_test.go +++ b/pkg/natsutil/reply_test.go @@ -2,9 +2,12 @@ package natsutil_test import ( "encoding/json" + "strings" "testing" + "time" - "github.com/stretchr/testify/assert" + natsserver "github.com/nats-io/nats-server/v2/server" + "github.com/nats-io/nats.go" "github.com/stretchr/testify/require" "github.com/hmchangw/chat/pkg/model" @@ -26,64 +29,59 @@ func TestMarshalResponse(t *testing.T) { } } -func TestMarshalError(t *testing.T) { - data := natsutil.MarshalError("something went wrong") - var got model.ErrorResponse - if err := json.Unmarshal(data, &got); err != nil { - t.Fatalf("unmarshal: %v", err) - } - if got.Error != "something went wrong" { - t.Errorf("got %q", got.Error) - } +func startTestNATS(t *testing.T) *nats.Conn { + t.Helper() + opts := &natsserver.Options{Port: -1} + ns, err := natsserver.NewServer(opts) + require.NoError(t, err) + ns.Start() + require.True(t, ns.ReadyForConnections(5*time.Second)) + t.Cleanup(ns.Shutdown) + nc, err := nats.Connect(ns.ClientURL()) + require.NoError(t, err) + t.Cleanup(nc.Close) + return nc } -func TestTryParseError(t *testing.T) { - t.Run("error body returns parsed response and true", func(t *testing.T) { - data := natsutil.MarshalError("boom") - resp, ok := natsutil.TryParseError(data) - if !ok { - t.Fatal("expected ok=true for error body") - } - if 
resp.Error != "boom" { - t.Errorf("got %q, want %q", resp.Error, "boom") - } +// ReplyJSON's marshal-failure branch writes a fixed internal-error envelope. +// Pass an unmarshalable value (channels can't be JSON-encoded) and assert the +// fallback envelope reaches the requester rather than leaving them hanging. +func TestReplyJSON_MarshalFailure(t *testing.T) { + nc := startTestNATS(t) + const subj = "test.replyjson.failure" + sub, err := nc.Subscribe(subj, func(m *nats.Msg) { + natsutil.ReplyJSON(m, make(chan int)) }) + require.NoError(t, err) + defer func() { _ = sub.Unsubscribe() }() - t.Run("success body with no error field returns false", func(t *testing.T) { - data, err := json.Marshal(model.ListRoomMembersResponse{Members: nil}) - if err != nil { - t.Fatalf("marshal: %v", err) - } - if _, ok := natsutil.TryParseError(data); ok { - t.Fatal("expected ok=false for success body") - } - }) - - t.Run("empty object returns false", func(t *testing.T) { - if _, ok := natsutil.TryParseError([]byte(`{}`)); ok { - t.Fatal("expected ok=false for {}") - } - }) + reply, err := nc.Request(subj, []byte(`{}`), 2*time.Second) + require.NoError(t, err) + body := string(reply.Data) + if !strings.Contains(body, `"code":"internal"`) || !strings.Contains(body, `"error":"internal error"`) { + t.Fatalf("expected fallback internal-error envelope, got: %s", body) + } +} - t.Run("malformed json returns false", func(t *testing.T) { - if _, ok := natsutil.TryParseError([]byte(`{not json`)); ok { - t.Fatal("expected ok=false for malformed json") - } +func TestReplyJSON_HappyPath(t *testing.T) { + nc := startTestNATS(t) + const subj = "test.replyjson.happy" + sub, err := nc.Subscribe(subj, func(m *nats.Msg) { + natsutil.ReplyJSON(m, model.Room{ID: "r1", Name: "general"}) }) + require.NoError(t, err) + defer func() { _ = sub.Unsubscribe() }() - t.Run("error field with empty string returns false", func(t *testing.T) { - // Guards against rogue callers sending {"error":""}; we treat them as 
success bodies. - if _, ok := natsutil.TryParseError([]byte(`{"error":""}`)); ok { - t.Fatal("expected ok=false for empty error string") - } - }) + reply, err := nc.Request(subj, []byte(`{}`), 2*time.Second) + require.NoError(t, err) + var got model.Room + require.NoError(t, json.Unmarshal(reply.Data, &got)) + if got.ID != "r1" { + t.Fatalf("got %+v", got) + } } -func TestMarshalErrorWithCode(t *testing.T) { - data := natsutil.MarshalErrorWithCode("only owners can post in this room", "large_room_post_restricted") - - var got model.ErrorResponse - require.NoError(t, json.Unmarshal(data, &got)) - assert.Equal(t, "only owners can post in this room", got.Error) - assert.Equal(t, "large_room_post_restricted", got.Code) -} +// TestMarshalError / TestTryParseError / TestMarshalErrorWithCode were removed +// when the legacy ErrorResponse helpers were deleted. Client-facing error +// envelope marshalling is covered by pkg/errcode/errnats/reply_test.go; +// envelope parsing is covered by pkg/errcode/parse_test.go. diff --git a/pkg/natsutil/request_id.go b/pkg/natsutil/request_id.go index e01a12b39..7071c4a62 100644 --- a/pkg/natsutil/request_id.go +++ b/pkg/natsutil/request_id.go @@ -1,4 +1,8 @@ -// request_id.go: helpers to propagate X-Request-ID between context.Context and nats.Header. Missing IDs degrade to a log gap, not a correctness failure. +// request_id.go: helpers to propagate X-Request-ID between context.Context and nats.Header. +// Two entry-point helpers per docs/error-handling.md §3a: +// - StampRequestID — mint-on-missing (default; safe for paths where the ID is logging-only). +// - RequireRequestID — reject-on-missing (for paths that derive JetStream Nats-Msg-Id +// or document IDs from the request ID, where server-side minting would break client-retry dedup). 
package natsutil import ( @@ -6,6 +10,9 @@ import ( "log/slog" "github.com/nats-io/nats.go" + + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/idgen" ) // RequestIDHeader is the canonical NATS/HTTP header for the request correlation ID. @@ -29,18 +36,6 @@ func RequestIDFromContext(ctx context.Context) string { return id } -// ContextWithRequestIDFromHeaders returns ctx augmented with X-Request-ID from headers, or ctx unchanged if absent. -func ContextWithRequestIDFromHeaders(ctx context.Context, headers nats.Header) context.Context { - if headers == nil { - return ctx - } - id := headers.Get(RequestIDHeader) - if id == "" { - return ctx - } - return WithRequestID(ctx, id) -} - // HeaderForContext returns a nats.Header carrying X-Request-ID from ctx, or nil if ctx has no request ID. func HeaderForContext(ctx context.Context) nats.Header { id := RequestIDFromContext(ctx) @@ -59,6 +54,57 @@ func NewMsg(ctx context.Context, subj string, data []byte) *nats.Msg { } } +// StampRequestID is the single boundary helper every NATS entry point should +// use. It: +// 1. Resolves the inbound X-Request-ID via idgen.ResolveRequestID (mint when +// missing/malformed per the repo-wide policy in docs/error-handling.md), +// 2. Stamps the resolved id onto ctx via WithRequestID, +// 3. Emits a single Warn line when a malformed inbound value was replaced +// (silent on missing — that's the benign common case), +// 4. Returns the new ctx and the id so the caller can also enrich its slog +// values (c.WithLogValues for natsrouter, errcode.WithLogValues for raw +// QueueSubscribe handlers). +// +// subject is logged alongside the warn for trace context; pass "" if not +// applicable (e.g., JetStream consume loops that prefer msg.Subject() at the +// call site). 
+func StampRequestID(ctx context.Context, headers nats.Header, subject string) (context.Context, string) { + var inbound string + if headers != nil { + inbound = headers.Get(RequestIDHeader) + } + id, replaced := idgen.ResolveRequestID(inbound) + ctx = WithRequestID(ctx, id) + if replaced { + slog.WarnContext(ctx, "minted request_id (inbound invalid)", "inbound", inbound, "subject", subject) + } + return ctx, id +} + +// RequireRequestID is the strict variant of StampRequestID. Use it on entry +// points whose downstream pipeline derives JetStream Nats-Msg-Id components +// or deterministic document IDs from the request ID (room-service handlers, +// room-worker.natsServerCreateDM) — silently minting a fresh UUID server-side +// would break client-retry deduplication on those paths. Missing or malformed +// inbound headers return an errcode.BadRequest; the ctx is returned unchanged +// so the caller can still use it for logging the failure. +// +// See docs/error-handling.md §3a for the rationale and the list of paths that +// must use this instead of StampRequestID. +func RequireRequestID(ctx context.Context, headers nats.Header, subject string) (context.Context, string, error) { + var inbound string + if headers != nil { + inbound = headers.Get(RequestIDHeader) + } + if !idgen.IsValidUUID(inbound) { + return ctx, "", errcode.BadRequest( + "X-Request-ID header is required (must be a valid hyphenated UUID per docs/error-handling.md §3a)", + errcode.WithReason(errcode.RequestIDRequired), + ) + } + return WithRequestID(ctx, inbound), inbound, nil +} + // OutboxDedupID composes a JetStream Nats-Msg-Id as base+":"+destSiteID. base // is the X-Request-ID from ctx; falls back to payloadSeed when ctx carries no // request ID, with a warn log so partial-deployment cases are observable. 
diff --git a/pkg/natsutil/request_id_test.go b/pkg/natsutil/request_id_test.go index da186125e..001610a6f 100644 --- a/pkg/natsutil/request_id_test.go +++ b/pkg/natsutil/request_id_test.go @@ -2,11 +2,14 @@ package natsutil_test import ( "context" + "errors" "testing" "github.com/nats-io/nats.go" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/natsutil" ) @@ -32,27 +35,6 @@ func TestRequestIDFromContext_MissingReturnsEmpty(t *testing.T) { assert.Equal(t, "", natsutil.RequestIDFromContext(context.Background())) } -func TestContextWithRequestIDFromHeaders_HeaderPresent(t *testing.T) { - h := nats.Header{} - h.Set(natsutil.RequestIDHeader, "req-from-header") - ctx := natsutil.ContextWithRequestIDFromHeaders(context.Background(), h) - assert.Equal(t, "req-from-header", natsutil.RequestIDFromContext(ctx)) -} - -func TestContextWithRequestIDFromHeaders_NilHeaderIsNoOp(t *testing.T) { - parent := context.Background() - ctx := natsutil.ContextWithRequestIDFromHeaders(parent, nil) - assert.True(t, ctx == parent) - assert.Equal(t, "", natsutil.RequestIDFromContext(ctx)) -} - -func TestContextWithRequestIDFromHeaders_EmptyHeaderValueIsNoOp(t *testing.T) { - parent := context.Background() - ctx := natsutil.ContextWithRequestIDFromHeaders(parent, nats.Header{}) - assert.True(t, ctx == parent) - assert.Equal(t, "", natsutil.RequestIDFromContext(ctx)) -} - func TestHeaderForContext_WithID(t *testing.T) { ctx := natsutil.WithRequestID(context.Background(), "req-xyz") h := natsutil.HeaderForContext(ctx) @@ -65,11 +47,12 @@ func TestHeaderForContext_WithoutIDReturnsNil(t *testing.T) { assert.Nil(t, h, "no request ID in ctx must return a nil header (not an empty one)") } -func TestHeaderForContext_ReversibleViaContextFromHeaders(t *testing.T) { - original := natsutil.WithRequestID(context.Background(), "round-trip-id") +func TestHeaderForContext_RoundTripViaStampRequestID(t *testing.T) 
{ + original := natsutil.WithRequestID(context.Background(), "01970a4f-8c2d-7c9a-abcd-e0123456789f") h := natsutil.HeaderForContext(original) - recovered := natsutil.ContextWithRequestIDFromHeaders(context.Background(), h) - assert.Equal(t, "round-trip-id", natsutil.RequestIDFromContext(recovered)) + recovered, id := natsutil.StampRequestID(context.Background(), h, "") + assert.Equal(t, "01970a4f-8c2d-7c9a-abcd-e0123456789f", id) + assert.Equal(t, "01970a4f-8c2d-7c9a-abcd-e0123456789f", natsutil.RequestIDFromContext(recovered)) } func TestRequestIDHeader_Constant(t *testing.T) { @@ -88,3 +71,88 @@ func TestNewMsg_NoIDLeavesHeaderNil(t *testing.T) { msg := natsutil.NewMsg(context.Background(), "chat.foo.bar", []byte("payload")) assert.Nil(t, msg.Header) } + +func TestStampRequestID(t *testing.T) { + const validUUID = "01970a4f-8c2d-7c9a-abcd-e0123456789f" + cases := []struct { + name string + headers nats.Header + wantID string // "" means "any minted UUID" + }{ + { + name: "valid_uuid_passes_through", + headers: nats.Header{natsutil.RequestIDHeader: []string{validUUID}}, + wantID: validUUID, + }, + { + name: "nil_headers_mints_fresh", + headers: nil, + }, + { + name: "empty_headers_mints_fresh", + headers: nats.Header{}, + }, + { + name: "empty_value_mints_fresh", + headers: nats.Header{natsutil.RequestIDHeader: []string{""}}, + }, + { + name: "malformed_value_mints_fresh", + headers: nats.Header{natsutil.RequestIDHeader: []string{"not-a-uuid"}}, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + ctx, id := natsutil.StampRequestID(context.Background(), tc.headers, "chat.test.subject") + assert.NotEmpty(t, id) + assert.Equal(t, id, natsutil.RequestIDFromContext(ctx), + "id must be stamped on returned ctx") + if tc.wantID != "" { + assert.Equal(t, tc.wantID, id) + } else { + assert.Len(t, id, 36, "minted id must be a 36-char hyphenated UUID") + } + }) + } +} + +// RequireRequestID is the strict variant used on entry points whose downstream 
+// pipeline derives JetStream Nats-Msg-Id / Mongo dedup keys from the request +// ID. Silently minting at the server would break client-retry deduplication +// for those paths, so missing/malformed inbound headers must produce a typed +// BadRequest instead. +func TestRequireRequestID(t *testing.T) { + const validUUID = "01970a4f-8c2d-7c9a-abcd-e0123456789f" + + t.Run("valid_uuid_passes_through_and_stamps_ctx", func(t *testing.T) { + h := nats.Header{natsutil.RequestIDHeader: []string{validUUID}} + ctx, id, err := natsutil.RequireRequestID(context.Background(), h, "chat.test.subject") + require.NoError(t, err) + assert.Equal(t, validUUID, id) + assert.Equal(t, validUUID, natsutil.RequestIDFromContext(ctx)) + }) + + cases := []struct { + name string + headers nats.Header + }{ + {name: "nil_headers_rejects", headers: nil}, + {name: "empty_headers_rejects", headers: nats.Header{}}, + {name: "empty_value_rejects", headers: nats.Header{natsutil.RequestIDHeader: []string{""}}}, + {name: "malformed_value_rejects", headers: nats.Header{natsutil.RequestIDHeader: []string{"not-a-uuid"}}}, + {name: "wrong_length_rejects", headers: nats.Header{natsutil.RequestIDHeader: []string{"01970a4f8c2d7c9aabcde0123456789f"}}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + ctx, id, err := natsutil.RequireRequestID(context.Background(), tc.headers, "chat.test.subject") + require.Error(t, err) + assert.Empty(t, id, "no id should be returned on reject") + assert.Empty(t, natsutil.RequestIDFromContext(ctx), + "ctx must not carry a minted id on reject (callers may still want to log against the inbound ctx)") + + var ec *errcode.Error + require.True(t, errors.As(err, &ec), "must return a typed *errcode.Error so errnats.Reply maps it to BadRequest") + assert.Equal(t, errcode.CodeBadRequest, ec.Code) + }) + } +} diff --git a/pkg/roomcrypto/integration_test.go b/pkg/roomcrypto/integration_test.go index 3c2888ac0..fd6d20afa 100644 --- a/pkg/roomcrypto/integration_test.go 
+++ b/pkg/roomcrypto/integration_test.go @@ -30,10 +30,22 @@ type decryptPayload struct { Message *EncryptedMessage `json:"message"` } +// skipOnVFS skips the test when DOCKER_STORAGE_DRIVER=vfs. VFS has no +// copy-on-write so pulling node:20-alpine + running npm install eats minutes +// of disk thrash — exceeds default timeouts. Matches the pkg/roomkeysender +// pattern. Unset env var keeps the test running on real overlay2/btrfs CI. +func skipOnVFS(t *testing.T) { + t.Helper() + if os.Getenv("DOCKER_STORAGE_DRIVER") == "vfs" { + t.Skip("skipping node container test: VFS storage driver is too slow (unset DOCKER_STORAGE_DRIVER or set to overlay2/btrfs to enable)") + } +} + // setupNodeContainer starts a node:20-alpine container, copies decrypt.ts into it, // and installs tsx. The container is terminated via t.Cleanup. func setupNodeContainer(t *testing.T) testcontainers.Container { t.Helper() + skipOnVFS(t) ctx := context.Background() container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ diff --git a/pkg/testutil/cassandra.go b/pkg/testutil/cassandra.go index 3cd10a1d8..8996745a6 100644 --- a/pkg/testutil/cassandra.go +++ b/pkg/testutil/cassandra.go @@ -14,9 +14,15 @@ import ( "github.com/gocql/gocql" "github.com/testcontainers/testcontainers-go" "github.com/testcontainers/testcontainers-go/wait" + + "github.com/hmchangw/chat/pkg/testutil/testimages" ) -const cassandraImage = "cassandra:5" +// Pinned via pkg/testutil/testimages so every integration test (and the +// docker-local compose stack) tracks a single Cassandra tag. See testimages +// doc for why this diverges from prod (cassandra:5 OOMs the testcontainers-go +// default heap on standard CI runners). 
+const cassandraImage = testimages.Cassandra var ( cassOnce sync.Once diff --git a/room-service/handler.go b/room-service/handler.go index 24f78a181..349893f8e 100644 --- a/room-service/handler.go +++ b/room-service/handler.go @@ -8,18 +8,20 @@ import ( "fmt" "log/slog" "slices" + "strconv" "strings" "time" "unicode/utf8" "github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" - "github.com/nats-io/nats.go" "go.mongodb.org/mongo-driver/v2/mongo" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/trace" "golang.org/x/sync/errgroup" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errnats" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsutil" @@ -62,9 +64,23 @@ func NewHandler(store RoomStore, keyStore RoomKeyStore, memberListClient MemberL } } -// wrappedCtx returns m.Context() augmented with X-Request-ID from the inbound msg header; entry ctx for every nats* handler. -func wrappedCtx(m otelnats.Msg) context.Context { - return natsutil.ContextWithRequestIDFromHeaders(m.Context(), m.Msg.Header) +// wrappedCtx validates the inbound X-Request-ID via natsutil.RequireRequestID +// (strict mode) and returns m.Context() seeded with the id for the centralized +// errcode.Classify log line. Missing/malformed headers return an +// errcode.BadRequest that the caller must reply to via errnats.Reply. +// +// Strict mode is required here — not the mint-on-missing default — because +// room-service handlers fan out to room-worker, whose JetStream publishes +// derive Nats-Msg-Id / message IDs from this request ID (OutboxDedupID, +// messageDedupSeed, idgen.MessageIDFromRequestID). A silently-minted server- +// side ID would break dedup across client retries. See docs/error-handling.md +// §3a. 
+func wrappedCtx(m otelnats.Msg) (context.Context, error) { + ctx, id, err := natsutil.RequireRequestID(m.Context(), m.Msg.Header, m.Msg.Subject) + if err != nil { + return m.Context(), err + } + return errcode.WithLogValues(ctx, "request_id", id), nil } // RegisterCRUD registers NATS request/reply handlers for room CRUD with queue group. @@ -113,34 +129,18 @@ func (h *Handler) RegisterCRUD(nc *otelnats.Conn) error { } func (h *Handler) natsCreateRoom(m otelnats.Msg) { - ctx := wrappedCtx(m) - resp, err := h.handleCreateRoom(ctx, m.Msg.Subject, m.Msg.Data) + ctx, err := wrappedCtx(m) if err != nil { - var dmExists *dmExistsError - if errors.As(err, &dmExists) { - h.replyDMExists(m.Msg, dmExists.RoomID()) - return - } - slog.Error("create-room failed", "error", err, "subject", m.Msg.Subject) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } - if err := m.Msg.Respond(resp); err != nil { - slog.Error("failed to respond to create-room", "error", err) - } -} - -func (h *Handler) replyDMExists(msg *nats.Msg, existingRoomID string) { - body, err := json.Marshal(model.ErrorResponse{ - Error: "dm already exists", - RoomID: existingRoomID, - }) + resp, err := h.handleCreateRoom(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - natsutil.ReplyError(msg, "internal error") + errnats.Reply(ctx, m.Msg, err) return } - if err := msg.Respond(body); err != nil { - slog.Error("failed to respond DM exists", "error", err) + if err := m.Msg.Respond(resp); err != nil { + slog.Error("failed to respond to create-room", "error", err) } } @@ -150,17 +150,9 @@ func (h *Handler) handleCreateRoom(ctx context.Context, subj string, data []byte return nil, fmt.Errorf("invalid create-room subject: %s", subj) } - requestID := natsutil.RequestIDFromContext(ctx) - if requestID == "" { - return nil, errMissingRequestID - } - if !idgen.IsValidUUID(requestID) { - return nil, errInvalidRequestID - } - var req model.CreateRoomRequest if err := json.Unmarshal(data, 
&req); err != nil { - return nil, fmt.Errorf("invalid request: %w", err) + return nil, errcode.BadRequest("invalid request") } roomType, err := classifyAndValidate(&req, requesterAccount) @@ -171,7 +163,7 @@ func (h *Handler) handleCreateRoom(ctx context.Context, subj string, data []byte requester, err := h.store.GetUser(ctx, requesterAccount) if err != nil { if errors.Is(err, ErrUserNotFound) { - return nil, errUserNotFound + return nil, errcode.NotFound("user not found", errcode.WithReason(errcode.RoomUserNotFound)) } return nil, fmt.Errorf("get requester: %w", err) } @@ -243,7 +235,7 @@ func (h *Handler) handleCreateRoomDMOrBotDM(ctx context.Context, req *model.Crea other, err := h.store.GetUser(ctx, otherAccount) if err != nil { if errors.Is(err, ErrUserNotFound) { - return nil, errUserNotFound + return nil, errcode.NotFound("user not found", errcode.WithReason(errcode.RoomUserNotFound)) } return nil, fmt.Errorf("get counterpart: %w", err) } @@ -263,7 +255,13 @@ func (h *Handler) handleCreateRoomDMOrBotDM(ctx context.Context, req *model.Crea // the deterministic "open-or-create" contract for DMs. existing, err := h.store.FindDMSubscription(ctx, requester.Account, other.Account) if err == nil && existing != nil { - return nil, newDMExistsError(existing.RoomID) + // DM already exists: this is a success ("open-or-create"), not an error. + // Return the existing room ID so the client opens it. RoomType is left + // empty on this branch, matching the prior error-reply behaviour. + return json.Marshal(model.CreateRoomReply{ + Status: model.CreateRoomStatusExists, + RoomID: existing.RoomID, + }) } if err != nil && !errors.Is(err, model.ErrSubscriptionNotFound) { return nil, fmt.Errorf("dm dedup check: %w", err) @@ -322,7 +320,11 @@ func (h *Handler) handleCreateRoomChannel(ctx context.Context, req *model.Create // N members, not N+1. 
totalMembers := 1 + newCount if totalMembers > h.maxRoomSize { - return nil, fmt.Errorf("exceeds maximum capacity (%d): would create %d members", h.maxRoomSize, totalMembers) + return nil, errcode.Conflict( + fmt.Sprintf("exceeds maximum capacity (%d): would create %d members", h.maxRoomSize, totalMembers), + errcode.WithReason(errcode.RoomMaxSizeReached), + errcode.WithMetadata("maxRoomSize", strconv.Itoa(h.maxRoomSize), "attempted", strconv.Itoa(totalMembers)), + ) } // Preserve req.Users / req.Orgs as the literal client request for sys-message payloads. @@ -383,11 +385,14 @@ func (h *Handler) publishCreateRoom(ctx context.Context, req *model.CreateRoomRe // NatsHandleRemoveMember handles remove-member authorization requests. func (h *Handler) NatsHandleRemoveMember(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleRemoveMember(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - slog.Error("remove member failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := m.Msg.Respond(resp); err != nil { @@ -396,22 +401,28 @@ func (h *Handler) NatsHandleRemoveMember(m otelnats.Msg) { } func (h *Handler) natsListMembers(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleListMembers(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - slog.Error("list members failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } natsutil.ReplyJSON(m.Msg, resp) } func (h *Handler) natsListOrgMembers(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleListOrgMembers(ctx, m.Msg.Subject) if err != nil { - slog.Error("list org members failed", "error", err) - 
natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } natsutil.ReplyJSON(m.Msg, resp) @@ -424,8 +435,8 @@ func (h *Handler) handleListOrgMembers(ctx context.Context, subj string) (model. } members, err := h.store.ListOrgMembers(ctx, orgID) if err != nil { - if errors.Is(err, errInvalidOrg) { - return model.ListOrgMembersResponse{}, errInvalidOrg + if errcode.HasReason(err, errcode.RoomInvalidOrg) { + return model.ListOrgMembersResponse{}, errcode.BadRequest("invalid org", errcode.WithReason(errcode.RoomInvalidOrg)) } return model.ListOrgMembersResponse{}, fmt.Errorf("get org members: %w", err) } @@ -449,7 +460,7 @@ func (h *Handler) handleListMembers(ctx context.Context, subj string, data []byt var req model.ListRoomMembersRequest if len(data) > 0 { if err := json.Unmarshal(data, &req); err != nil { - return model.ListRoomMembersResponse{}, fmt.Errorf("invalid request: %w", err) + return model.ListRoomMembersResponse{}, errcode.BadRequest("invalid request") } } if req.Limit != nil && *req.Limit <= 0 { @@ -467,11 +478,14 @@ func (h *Handler) handleListMembers(ctx context.Context, subj string, data []byt } func (h *Handler) natsGetRoomKey(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleGetRoomKey(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - slog.Error("get room key failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := m.Msg.Respond(resp); err != nil { @@ -499,7 +513,7 @@ func (h *Handler) handleGetRoomKey(ctx context.Context, subj string, data []byte var req model.RoomKeyGetRequest if len(data) > 0 { if err := json.Unmarshal(data, &req); err != nil { - return nil, fmt.Errorf("invalid request: %w", err) + return nil, errcode.BadRequest("invalid request") } } @@ -542,7 +556,7 @@ func (h *Handler) handleRemoveMember(ctx context.Context, subj string, data 
[]by var req model.RemoveMemberRequest if err := json.Unmarshal(data, &req); err != nil { - return nil, fmt.Errorf("invalid request: %w", err) + return nil, errcode.BadRequest("invalid request") } if req.RoomID != "" && req.RoomID != roomID { @@ -557,7 +571,9 @@ func (h *Handler) handleRemoveMember(ctx context.Context, subj string, data []by return nil, fmt.Errorf("get room: %w", err) } if room.Type != model.RoomTypeChannel { - return nil, fmt.Errorf("%w, got %s", errRemoveChannelOnly, room.Type) + // Preserve sentinel identity (errors.Is matches via %w unwrap) while + // carrying the actual room type for client-side context. + return nil, fmt.Errorf("%w (got %s)", errRemoveChannelOnly, room.Type) } // Carry room type to room-worker to avoid a redundant GetRoom round-trip there. req.RoomType = room.Type @@ -582,7 +598,7 @@ func (h *Handler) handleRemoveMember(ctx context.Context, subj string, data []by return nil, fmt.Errorf("get requester subscription: %w", err) } if !hasRole(requesterSub.Roles, model.RoleOwner) { - return nil, fmt.Errorf("only owners can remove members") + return nil, errOnlyOwnersCanRemove } } counts, err := h.store.CountMembersAndOwners(ctx, roomID) @@ -602,7 +618,7 @@ func (h *Handler) handleRemoveMember(ctx context.Context, subj string, data []by return nil, fmt.Errorf("get requester subscription: %w", err) } if !hasRole(sub.Roles, model.RoleOwner) { - return nil, fmt.Errorf("only owners can remove members") + return nil, errOnlyOwnersCanRemove } } @@ -622,11 +638,14 @@ func (h *Handler) handleRemoveMember(ctx context.Context, subj string, data []by } func (h *Handler) natsUpdateRole(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleUpdateRole(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - slog.Error("update role failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := 
m.Msg.Respond(resp); err != nil { @@ -641,10 +660,10 @@ func (h *Handler) handleUpdateRole(ctx context.Context, subj string, data []byte } var req model.UpdateRoleRequest if err := json.Unmarshal(data, &req); err != nil { - return nil, fmt.Errorf("invalid request: %w", err) + return nil, errcode.BadRequest("invalid request") } if req.RoomID != "" && req.RoomID != roomID { - return nil, fmt.Errorf("invalid request: room ID mismatch") + return nil, errRoomIDMismatch } req.RoomID = roomID if req.NewRole != model.RoleOwner && req.NewRole != model.RoleMember { @@ -706,11 +725,14 @@ func (h *Handler) handleUpdateRole(ctx context.Context, subj string, data []byte } func (h *Handler) natsAddMembers(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleAddMembers(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - slog.Error("add-members failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := m.Msg.Respond(resp); err != nil { @@ -725,10 +747,15 @@ func (h *Handler) handleAddMembers(ctx context.Context, subj string, data []byte return nil, fmt.Errorf("invalid add-members subject: %s", subj) } - // 2. Verify requester is in room + // 2. Verify requester is in room. Distinguish "not a member" (typed + // forbidden — the user genuinely can't add members) from an infra failure + // (Mongo timeout etc. — must NOT collapse to a 403 user-error). sub, err := h.store.GetSubscription(ctx, requester, roomID) if err != nil { - return nil, fmt.Errorf("requester not in room: %w", err) + if errors.Is(err, model.ErrSubscriptionNotFound) { + return nil, errNotRoomMember + } + return nil, fmt.Errorf("check requester room membership: %w", err) } // 3. 
Get room and guard on type @@ -737,19 +764,19 @@ func (h *Handler) handleAddMembers(ctx context.Context, subj string, data []byte return nil, fmt.Errorf("get room: %w", err) } if room.Type != model.RoomTypeChannel { - return nil, fmt.Errorf("cannot add members to a non-channel room") + return nil, errAddMembersChannelOnly } if room.Restricted && !hasRole(sub.Roles, model.RoleOwner) { - return nil, fmt.Errorf("only owners can add members to a restricted room") + return nil, errOnlyOwnersCanAddToRes } // 4. Unmarshal request var req model.AddMembersRequest if err := json.Unmarshal(data, &req); err != nil { - return nil, fmt.Errorf("invalid request: %w", err) + return nil, errcode.BadRequest("invalid request") } if req.RoomID != "" && req.RoomID != roomID { - return nil, fmt.Errorf("invalid request: room ID mismatch") + return nil, errRoomIDMismatch } // Reject direct bots up front — mirrors classifyAndValidate in @@ -791,7 +818,13 @@ func (h *Handler) handleAddMembers(ctx context.Context, subj string, data []byte // ReconcileUserCount after each membership change) instead of issuing a // separate CountSubscriptions query. if room.UserCount+newCount > h.maxRoomSize { - return nil, fmt.Errorf("room is at maximum capacity (%d): cannot add %d members to room with %d existing", h.maxRoomSize, newCount, room.UserCount) + return nil, errcode.Conflict( + fmt.Sprintf("room is at maximum capacity (%d): cannot add %d members to room with %d existing", h.maxRoomSize, newCount, room.UserCount), + errcode.WithReason(errcode.RoomMaxSizeReached), + errcode.WithMetadata("maxRoomSize", strconv.Itoa(h.maxRoomSize), + "currentUserCount", strconv.Itoa(room.UserCount), + "attempted", strconv.Itoa(room.UserCount+newCount)), + ) } // 9. Normalize and publish — Users and Orgs ship as merged-but-unresolved. 
@@ -814,10 +847,10 @@ func (h *Handler) handleAddMembers(ctx context.Context, subj string, data []byte return json.Marshal(map[string]string{"status": "accepted"}) } -// validateAccountsExist wraps errUserNotFound with the first phantom account -// (via fmt.Errorf("user %q: %w", …)) when any account has no matching user -// document; errors.Is(err, errUserNotFound) holds. Without this gate a typo'd -// account is silently dropped and the async job reports success. +// validateAccountsExist returns a RoomUserNotFound-reason errcode naming the +// first phantom account when any account has no matching user document. +// errcode.HasReason(err, errcode.RoomUserNotFound) holds. Without this gate a +// typo'd account is silently dropped and the async job reports success. func (h *Handler) validateAccountsExist(ctx context.Context, accounts []string) error { if len(accounts) == 0 { return nil @@ -835,15 +868,15 @@ func (h *Handler) validateAccountsExist(ctx context.Context, accounts []string) } for _, a := range accounts { if _, ok := have[a]; !ok { - return fmt.Errorf("user %q: %w", a, errUserNotFound) + return errcode.NotFound(fmt.Sprintf("user %q not found", a), errcode.WithReason(errcode.RoomUserNotFound)) } } return nil } -// validateOrgIDs wraps errInvalidOrg with the first phantom orgID (via -// fmt.Errorf("org %q: %w", …)) when any orgID has zero backing users -// (no user with sectId==orgID or deptId==orgID); errors.Is(err, errInvalidOrg) +// validateOrgIDs returns a RoomInvalidOrg-reason errcode naming the first +// phantom orgID when any orgID has zero backing users (no user with +// sectId==orgID or deptId==orgID). errcode.HasReason(err, errcode.RoomInvalidOrg) // holds. No-op when orgIDs is empty. 
func (h *Handler) validateOrgIDs(ctx context.Context, orgIDs []string) error { if len(orgIDs) == 0 { @@ -862,7 +895,7 @@ func (h *Handler) validateOrgIDs(ctx context.Context, orgIDs []string) error { } for _, id := range orgIDs { if _, ok := have[id]; !ok { - return fmt.Errorf("org %q: %w", id, errInvalidOrg) + return errcode.BadRequest(fmt.Sprintf("invalid org %q", id), errcode.WithReason(errcode.RoomInvalidOrg)) } } return nil @@ -894,15 +927,15 @@ func (h *Handler) expandChannelRefs(ctx context.Context, requester string, refs // Per-ref deadline so a slow same-site Mongo query or unresponsive // remote site cannot stall the create/add request indefinitely; a - // timeout here surfaces to the caller as channelExpandTimeoutError - // with site+roomId so the requester can see which channel stalled. + // timeout here surfaces to the caller as an Unavailable errcode with + // site+roomId so the requester can see which channel stalled. refCtx, cancel := h.contextWithMemberListTimeout(ctx) if ref.SiteID == h.siteID { if _, subErr := h.store.GetSubscription(refCtx, requester, ref.RoomID); subErr != nil { cancel() if errors.Is(subErr, context.DeadlineExceeded) { - return nil, nil, newChannelExpandTimeoutError(ref.SiteID, ref.RoomID) + return nil, nil, errcode.Unavailable(fmt.Sprintf("timeout listing members of channel %s@%s", ref.RoomID, ref.SiteID)) } if errors.Is(subErr, model.ErrSubscriptionNotFound) { return nil, nil, errNotRoomMember @@ -913,7 +946,7 @@ func (h *Handler) expandChannelRefs(ctx context.Context, requester string, refs cancel() if err != nil { if errors.Is(err, context.DeadlineExceeded) { - return nil, nil, newChannelExpandTimeoutError(ref.SiteID, ref.RoomID) + return nil, nil, errcode.Unavailable(fmt.Sprintf("timeout listing members of channel %s@%s", ref.RoomID, ref.SiteID)) } return nil, nil, fmt.Errorf("local list-members %s: %w", ref.RoomID, err) } @@ -922,7 +955,7 @@ func (h *Handler) expandChannelRefs(ctx context.Context, requester string, refs 
cancel() if err != nil { if errors.Is(err, context.DeadlineExceeded) { - return nil, nil, newChannelExpandTimeoutError(ref.SiteID, ref.RoomID) + return nil, nil, errcode.Unavailable(fmt.Sprintf("timeout listing members of channel %s@%s", ref.RoomID, ref.SiteID)) } // Pass the sentinel through unwrapped so same-site and cross-site "not a member" // produce identical behavior — errors.Is(err, errNotRoomMember) matches both. @@ -964,11 +997,14 @@ func (h *Handler) expandChannelRefs(ctx context.Context, requester string, refs } func (h *Handler) natsRoomsInfoBatch(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleRoomsInfoBatch(ctx, m.Msg.Data) if err != nil { - slog.Error("rooms info batch failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := m.Msg.Respond(resp); err != nil { @@ -980,13 +1016,13 @@ func (h *Handler) handleRoomsInfoBatch(ctx context.Context, data []byte) ([]byte start := time.Now() var req model.RoomsInfoBatchRequest if err := json.Unmarshal(data, &req); err != nil { - return nil, fmt.Errorf("invalid request: %w", err) + return nil, errcode.BadRequest("invalid request") } if len(req.RoomIDs) == 0 { - return nil, fmt.Errorf("roomIds must not be empty") + return nil, errcode.BadRequest("roomIds must not be empty") } if len(req.RoomIDs) > h.maxBatchSize { - return nil, fmt.Errorf("batch size %d exceeds limit %d", len(req.RoomIDs), h.maxBatchSize) + return nil, errcode.BadRequest(fmt.Sprintf("batch size %d exceeds limit %d", len(req.RoomIDs), h.maxBatchSize)) } if span := trace.SpanFromContext(ctx); span.IsRecording() { @@ -1099,11 +1135,14 @@ func chunkedGetKeys(ctx context.Context, ks RoomKeyStore, ids []string) (map[str } func (h *Handler) natsMessageRead(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return 
+ } resp, err := h.handleMessageRead(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - slog.Error("message read failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := m.Msg.Respond(resp); err != nil { @@ -1217,11 +1256,14 @@ func (h *Handler) handleMessageRead(ctx context.Context, subj string, _ []byte) } func (h *Handler) natsMessageReadReceipt(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleMessageReadReceipt(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - slog.Error("message read-receipt failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := m.Msg.Respond(resp); err != nil { @@ -1237,10 +1279,10 @@ func (h *Handler) handleMessageReadReceipt(ctx context.Context, subj string, dat var req model.ReadReceiptRequest if err := json.Unmarshal(data, &req); err != nil { - return nil, fmt.Errorf("invalid request: %w", err) + return nil, errcode.BadRequest("invalid request") } if req.MessageID == "" { - return nil, fmt.Errorf("invalid request: messageId is required") + return nil, errcode.BadRequest("invalid request: messageId is required") } if span := trace.SpanFromContext(ctx); span.IsRecording() { @@ -1310,11 +1352,14 @@ func (h *Handler) handleMessageReadReceipt(ctx context.Context, subj string, dat } func (h *Handler) natsMessageThreadRead(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleMessageThreadRead(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - slog.Error("message thread-read failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := m.Msg.Respond(resp); err != nil { @@ -1330,7 +1375,7 @@ func (h *Handler) handleMessageThreadRead(ctx context.Context, 
subj string, data var req model.MessageThreadReadRequest if err := json.Unmarshal(data, &req); err != nil { - return nil, fmt.Errorf("unmarshal thread-read request: %w", err) + return nil, errcode.BadRequest("invalid request") } if strings.TrimSpace(req.ThreadID) == "" { return nil, errInvalidThreadID @@ -1441,11 +1486,14 @@ func (h *Handler) handleMessageThreadRead(ctx context.Context, subj string, data // bytes — encryption/decryption is performed by broadcast-worker and clients, // which read keys from Valkey directly. func (h *Handler) NatsHandleEnsureRoomKey(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleEnsureRoomKey(ctx, m.Msg.Data) if err != nil { - slog.Error("ensure room key failed", "error", err) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := m.Msg.Respond(resp); err != nil { @@ -1455,14 +1503,20 @@ func (h *Handler) NatsHandleEnsureRoomKey(m otelnats.Msg) { func (h *Handler) handleEnsureRoomKey(ctx context.Context, data []byte) ([]byte, error) { if h.keyStore == nil { - return nil, fmt.Errorf("ensure room key: key store not configured") + // Local Valkey disabled — surfaces to peer sites as a transient outage + // (symmetric with the timeout-class failures in :808/:819/:828). + return nil, errcode.Unavailable("room key store not configured") } var req model.RoomKeyEnsureRequest if err := json.Unmarshal(data, &req); err != nil { - return nil, fmt.Errorf("ensure room key: decode request: %w", err) + // Per doc.go and pkg/errcode logging contract: json.SyntaxError / + // UnmarshalTypeError strings embed the offending substring and field + // shape from an unauthenticated payload — never WithCause(err) here. + // Same shape as message-gatekeeper:173. 
+ return nil, errcode.BadRequest("invalid ensure-room-key request") } if req.RoomID == "" { - return nil, fmt.Errorf("ensure room key: roomId is required") + return nil, errcode.BadRequest("roomId is required") } existing, err := h.keyStore.Get(ctx, req.RoomID) @@ -1491,11 +1545,14 @@ func (h *Handler) handleEnsureRoomKey(ctx context.Context, data []byte) ([]byte, } func (h *Handler) natsMuteToggle(m otelnats.Msg) { - ctx := wrappedCtx(m) + ctx, err := wrappedCtx(m) + if err != nil { + errnats.Reply(ctx, m.Msg, err) + return + } resp, err := h.handleMuteToggle(ctx, m.Msg.Subject, m.Msg.Data) if err != nil { - slog.Error("mute toggle failed", "error", err, "subject", m.Msg.Subject) - natsutil.ReplyError(m.Msg, sanitizeError(err)) + errnats.Reply(ctx, m.Msg, err) return } if err := m.Msg.Respond(resp); err != nil { diff --git a/room-service/handler_test.go b/room-service/handler_test.go index d32c3d958..b22240d67 100644 --- a/room-service/handler_test.go +++ b/room-service/handler_test.go @@ -17,6 +17,7 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/mock/gomock" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsutil" @@ -94,12 +95,7 @@ func TestHandler_UpdateRole_NonOwnerRejected(t *testing.T) { subj := subject.MemberRoleUpdate("bob", "r1", "site-a") _, err := h.handleUpdateRole(context.Background(), subj, data) - if err == nil { - t.Fatal("expected error for non-owner role update") - } - if err.Error() != "only owners can update roles" { - t.Errorf("unexpected error: %v", err) - } + require.ErrorIs(t, err, errOnlyOwners) } func TestHandler_UpdateRole_DMRejected(t *testing.T) { @@ -119,12 +115,7 @@ func TestHandler_UpdateRole_DMRejected(t *testing.T) { subj := subject.MemberRoleUpdate("alice", "r1", "site-a") _, err := h.handleUpdateRole(context.Background(), subj, data) - if err == nil { - t.Fatal("expected error for DM room role update") - } - if 
err.Error() != "role update is only allowed in channel rooms" { - t.Errorf("unexpected error: %v", err) - } + require.ErrorIs(t, err, errRoomTypeGuard) } func TestHandler_UpdateRole_InvalidRole(t *testing.T) { @@ -140,12 +131,7 @@ func TestHandler_UpdateRole_InvalidRole(t *testing.T) { subj := subject.MemberRoleUpdate("alice", "r1", "site-a") _, err := h.handleUpdateRole(context.Background(), subj, data) - if err == nil { - t.Fatal("expected error for invalid role") - } - if err.Error() != "invalid role: must be owner or member" { - t.Errorf("unexpected error: %v", err) - } + require.ErrorIs(t, err, errInvalidRole) } func TestHandler_UpdateRole_AlreadyHasRole(t *testing.T) { @@ -174,12 +160,7 @@ func TestHandler_UpdateRole_AlreadyHasRole(t *testing.T) { subj := subject.MemberRoleUpdate("alice", "r1", "site-a") _, err := h.handleUpdateRole(context.Background(), subj, data) - if err == nil { - t.Fatal("expected error for duplicate role") - } - if err.Error() != "user is already an owner" { - t.Errorf("unexpected error: %v", err) - } + require.ErrorIs(t, err, errAlreadyOwner) } // Bug 5: an org-only subscriber must not be promotable to owner. 
@@ -293,12 +274,7 @@ func TestHandler_UpdateRole_DemoteNonOwner(t *testing.T) { subj := subject.MemberRoleUpdate("alice", "r1", "site-a") _, err := h.handleUpdateRole(context.Background(), subj, data) - if err == nil { - t.Fatal("expected error for demoting non-owner") - } - if err.Error() != "user is not an owner" { - t.Errorf("unexpected error: %v", err) - } + require.ErrorIs(t, err, errNotOwner) } func TestHandler_UpdateRole_LastOwnerCannotDemote(t *testing.T) { @@ -330,12 +306,7 @@ func TestHandler_UpdateRole_LastOwnerCannotDemote(t *testing.T) { subj := subject.MemberRoleUpdate("alice", "r1", "site-a") _, err := h.handleUpdateRole(context.Background(), subj, data) - if err == nil { - t.Fatal("expected error for last owner demotion") - } - if err.Error() != "cannot demote the last owner" { - t.Errorf("unexpected error: %v", err) - } + require.ErrorIs(t, err, errCannotDemoteLast) } // --- Error-path tests --- @@ -383,7 +354,7 @@ func TestHandler_UpdateRole_RoomIDMismatch(t *testing.T) { if err == nil { t.Fatal("expected error for RoomID mismatch") } - if err.Error() != "invalid request: room ID mismatch" { + if !errors.Is(err, errRoomIDMismatch) { t.Errorf("unexpected error: %v", err) } } @@ -1091,8 +1062,9 @@ func expectAliceOwnerOfR1(store *MockRoomStore) { } // errStoreFailure is a sentinel used in store-error branch tests. Distinct -// from the validators' errInvalidOrg/errUserNotFound so the test can verify -// that the store error wraps cleanly without being masked by the sentinel. +// from the validators' RoomInvalidOrg/RoomUserNotFound reasons so the test can +// verify that the store error wraps cleanly without being masked by the +// reason-keyed identity check. 
var errStoreFailure = errors.New("store boom") // TestHandler_AddMembers_PhantomValidation covers the gate that converts the @@ -1108,6 +1080,7 @@ func TestHandler_AddMembers_PhantomValidation(t *testing.T) { req model.AddMembersRequest setupMocks func(store *MockRoomStore) wantErr bool + wantReason errcode.Reason wantErrSentinel error wantPublish bool }{ @@ -1117,7 +1090,7 @@ func TestHandler_AddMembers_PhantomValidation(t *testing.T) { setupMocks: func(store *MockRoomStore) { store.EXPECT().FindExistingOrgIDs(gomock.Any(), []string{"org-nope"}).Return(nil, nil) }, - wantErr: true, wantErrSentinel: errInvalidOrg, wantPublish: false, + wantErr: true, wantReason: errcode.RoomInvalidOrg, wantPublish: false, }, { name: "partially invalid org rejected", @@ -1126,7 +1099,7 @@ func TestHandler_AddMembers_PhantomValidation(t *testing.T) { store.EXPECT().FindExistingOrgIDs(gomock.Any(), gomock.InAnyOrder([]string{"good-org", "bad-org"})). Return([]string{"good-org"}, nil) }, - wantErr: true, wantErrSentinel: errInvalidOrg, wantPublish: false, + wantErr: true, wantReason: errcode.RoomInvalidOrg, wantPublish: false, }, { name: "no orgs skips org validation", @@ -1145,7 +1118,7 @@ func TestHandler_AddMembers_PhantomValidation(t *testing.T) { store.EXPECT().FindExistingAccounts(gomock.Any(), gomock.InAnyOrder([]string{"bob", "ghost"})). 
Return([]string{"bob"}, nil) }, - wantErr: true, wantErrSentinel: errUserNotFound, wantPublish: false, + wantErr: true, wantReason: errcode.RoomUserNotFound, wantPublish: false, }, { name: "no users skips user validation", @@ -1183,7 +1156,7 @@ func TestHandler_AddMembers_PhantomValidation(t *testing.T) { store.EXPECT().FindExistingOrgIDs(gomock.Any(), []string{"org-nope"}).Return(nil, nil) store.EXPECT().FindExistingAccounts(gomock.Any(), []string{"ghost"}).Return(nil, nil) }, - wantErr: true, wantErrSentinel: errInvalidOrg, wantPublish: false, + wantErr: true, wantReason: errcode.RoomInvalidOrg, wantPublish: false, }, } @@ -1205,6 +1178,9 @@ func TestHandler_AddMembers_PhantomValidation(t *testing.T) { _, err := h.handleAddMembers(context.Background(), subject.MemberAdd("alice", "r1", "site-a"), body) if tc.wantErr { require.Error(t, err) + if tc.wantReason != "" { + assert.True(t, errcode.HasReason(err, tc.wantReason), "want reason %v, got %v", tc.wantReason, err) + } if tc.wantErrSentinel != nil { assert.True(t, errors.Is(err, tc.wantErrSentinel), "want %v, got %v", tc.wantErrSentinel, err) } @@ -1226,6 +1202,7 @@ func TestHandler_CreateRoomChannel_PhantomValidation(t *testing.T) { req model.CreateRoomRequest setupMocks func(store *MockRoomStore) wantErr bool + wantReason errcode.Reason wantErrSentinel error wantPublish bool }{ @@ -1235,7 +1212,7 @@ func TestHandler_CreateRoomChannel_PhantomValidation(t *testing.T) { setupMocks: func(store *MockRoomStore) { store.EXPECT().FindExistingOrgIDs(gomock.Any(), []string{"org-nope"}).Return(nil, nil) }, - wantErr: true, wantErrSentinel: errInvalidOrg, wantPublish: false, + wantErr: true, wantReason: errcode.RoomInvalidOrg, wantPublish: false, }, { name: "phantom user rejected", @@ -1244,7 +1221,7 @@ func TestHandler_CreateRoomChannel_PhantomValidation(t *testing.T) { store.EXPECT().FindExistingAccounts(gomock.Any(), gomock.InAnyOrder([]string{"bob", "ghost"})). 
Return([]string{"bob"}, nil) }, - wantErr: true, wantErrSentinel: errUserNotFound, wantPublish: false, + wantErr: true, wantReason: errcode.RoomUserNotFound, wantPublish: false, }, { name: "FindExistingOrgIDs store error propagates", @@ -1269,7 +1246,7 @@ func TestHandler_CreateRoomChannel_PhantomValidation(t *testing.T) { store.EXPECT().FindExistingOrgIDs(gomock.Any(), []string{"org-nope"}).Return(nil, nil) store.EXPECT().FindExistingAccounts(gomock.Any(), []string{"ghost"}).Return(nil, nil) }, - wantErr: true, wantErrSentinel: errInvalidOrg, wantPublish: false, + wantErr: true, wantReason: errcode.RoomInvalidOrg, wantPublish: false, }, } @@ -1291,6 +1268,9 @@ func TestHandler_CreateRoomChannel_PhantomValidation(t *testing.T) { _, err := h.handleCreateRoom(ctxWithReqID(), createRoomSubj("alice", "site-a"), body) if tc.wantErr { require.Error(t, err) + if tc.wantReason != "" { + assert.True(t, errcode.HasReason(err, tc.wantReason), "want reason %v, got %v", tc.wantReason, err) + } if tc.wantErrSentinel != nil { assert.True(t, errors.Is(err, tc.wantErrSentinel), "want %v, got %v", tc.wantErrSentinel, err) } @@ -1449,14 +1429,12 @@ func TestHandler_AddMembers_ChannelExpansion(t *testing.T) { _, _, err := h.expandChannelRefs(context.Background(), "alice", []model.ChannelRef{ch}) require.Error(t, err) - var te *channelExpandTimeoutError - require.ErrorAs(t, err, &te) - assert.Equal(t, "site-a", te.SiteID) - assert.Equal(t, "ch-slow", te.RoomID) - // User-facing message includes site+roomId. - assert.Equal(t, "timeout listing members of channel ch-slow@site-a", te.Error()) - // sanitizeError surfaces it verbatim, NOT "internal error". - assert.Equal(t, te.Error(), sanitizeError(err)) + var ee *errcode.Error + require.ErrorAs(t, err, &ee) + // Channel-expand timeouts surface as Unavailable with site+roomId so the + // requester sees which channel stalled, NOT a collapsed "internal error". 
+ assert.Equal(t, errcode.CodeUnavailable, ee.Code) + assert.Equal(t, "timeout listing members of channel ch-slow@site-a", ee.Message) }) t.Run("cross-site member.list deadline-exceeded yields typed timeout error", func(t *testing.T) { @@ -1472,11 +1450,10 @@ func TestHandler_AddMembers_ChannelExpansion(t *testing.T) { _, _, err := h.expandChannelRefs(context.Background(), "alice", []model.ChannelRef{ch}) require.Error(t, err) - var te *channelExpandTimeoutError - require.ErrorAs(t, err, &te) - assert.Equal(t, "site-b", te.SiteID) - assert.Equal(t, "ch-remote", te.RoomID) - assert.Equal(t, "timeout listing members of channel ch-remote@site-b", sanitizeError(err)) + var ee *errcode.Error + require.ErrorAs(t, err, &ee) + assert.Equal(t, errcode.CodeUnavailable, ee.Code) + assert.Equal(t, "timeout listing members of channel ch-remote@site-b", ee.Message) }) t.Run("same-site ListRoomMembers error", func(t *testing.T) { @@ -1822,6 +1799,7 @@ func TestHandler_ListOrgMembers(t *testing.T) { type want struct { errContains string errIs error + wantReason errcode.Reason members []model.OrgMember } tests := []struct { @@ -1845,12 +1823,12 @@ func TestHandler_ListOrgMembers(t *testing.T) { want: want{errContains: "invalid org-members subject"}, }, { - name: "empty org returns errInvalidOrg", + name: "empty org returns RoomInvalidOrg-reason errcode", subject: subj, setupMock: func(s *MockRoomStore) { - s.EXPECT().ListOrgMembers(gomock.Any(), orgID).Return(nil, errInvalidOrg) + s.EXPECT().ListOrgMembers(gomock.Any(), orgID).Return(nil, errcode.BadRequest(fmt.Sprintf("list org members for %q", orgID), errcode.WithReason(errcode.RoomInvalidOrg))) }, - want: want{errIs: errInvalidOrg}, + want: want{wantReason: errcode.RoomInvalidOrg}, }, { name: "store error is wrapped", @@ -1882,6 +1860,11 @@ func TestHandler_ListOrgMembers(t *testing.T) { assert.True(t, errors.Is(err, tc.want.errIs), "error chain should contain %v, got %v", tc.want.errIs, err) return } + if tc.want.wantReason != 
"" { + require.Error(t, err) + assert.True(t, errcode.HasReason(err, tc.want.wantReason), "want reason %v, got %v", tc.want.wantReason, err) + return + } require.NoError(t, err) assert.Equal(t, tc.want.members, resp.Members) }) @@ -2183,33 +2166,57 @@ func TestHandler_handleUpdateRole_PropagatesRequestID(t *testing.T) { assert.Equal(t, "req-room-svc-test", capturedHeader.Get(natsutil.RequestIDHeader)) } -func TestWrappedCtx_PropagatesXRequestIDFromHeaderToContext(t *testing.T) { +func TestWrappedCtx_PropagatesValidUUIDFromHeader(t *testing.T) { + const inbound = "01970a4f-8c2d-7c9a-abcd-e0123456789f" + rawMsg := &nats.Msg{ + Subject: "chat.room.test", + Data: []byte("ignored"), + Header: nats.Header{natsutil.RequestIDHeader: []string{inbound}}, + } + m := otelnats.Msg{Msg: rawMsg, Ctx: context.Background()} + + got, err := wrappedCtx(m) + + require.NoError(t, err) + assert.Equal(t, inbound, natsutil.RequestIDFromContext(got), + "valid inbound UUID must pass through unchanged") +} + +// room-service handlers feed dedup-critical paths in room-worker +// (OutboxDedupID, messageDedupSeed, idgen.MessageIDFromRequestID) where a +// server-side mint would break client-retry dedup. wrappedCtx therefore uses +// the strict natsutil.RequireRequestID and surfaces an errcode.BadRequest when +// the inbound header is missing or malformed. 
+func TestWrappedCtx_MalformedHeaderRejects(t *testing.T) { rawMsg := &nats.Msg{ Subject: "chat.room.test", Data: []byte("ignored"), - Header: nats.Header{natsutil.RequestIDHeader: []string{"req-from-inbound-header"}}, + Header: nats.Header{natsutil.RequestIDHeader: []string{"not-a-uuid"}}, } m := otelnats.Msg{Msg: rawMsg, Ctx: context.Background()} - got := wrappedCtx(m) + _, err := wrappedCtx(m) - assert.Equal(t, "req-from-inbound-header", natsutil.RequestIDFromContext(got), - "wrappedCtx must extract X-Request-ID from m.Msg.Header into the returned context") + require.Error(t, err) + var ec *errcode.Error + require.True(t, errors.As(err, &ec)) + assert.Equal(t, errcode.CodeBadRequest, ec.Code) } -func TestWrappedCtx_NoHeaderReturnsCtxUnchanged(t *testing.T) { +func TestWrappedCtx_NoHeaderRejects(t *testing.T) { rawMsg := &nats.Msg{ Subject: "chat.room.test", Data: []byte("ignored"), Header: nats.Header{}, } - parent := context.Background() - m := otelnats.Msg{Msg: rawMsg, Ctx: parent} + m := otelnats.Msg{Msg: rawMsg, Ctx: context.Background()} - got := wrappedCtx(m) + _, err := wrappedCtx(m) - assert.Empty(t, natsutil.RequestIDFromContext(got), - "missing inbound header → empty request ID on returned ctx") + require.Error(t, err) + var ec *errcode.Error + require.True(t, errors.As(err, &ec)) + assert.Equal(t, errcode.CodeBadRequest, ec.Code) } // --- Phase 5c: handleCreateRoom (3-arg) tests --- @@ -2250,16 +2257,10 @@ func TestHandleCreateRoom_InvalidSubject(t *testing.T) { assert.Contains(t, err.Error(), "invalid create-room subject") } -func TestHandleCreateRoom_MissingRequestID(t *testing.T) { - ctrl := gomock.NewController(t) - store := NewMockRoomStore(ctrl) - h := &Handler{store: store, siteID: "site-a", maxRoomSize: 1000} - - body, _ := json.Marshal(model.CreateRoomRequest{Users: []string{"bob"}}) - _, err := h.handleCreateRoom(context.Background(), createRoomSubj("alice", "site-a"), body) - require.Error(t, err) - assert.True(t, errors.Is(err, 
errMissingRequestID)) -} +// Boundary-level reject behavior is tested via wrappedCtx (above); the +// helper itself is unit-tested in pkg/natsutil.RequireRequestID. The +// dedup-critical paths fanned out from room-service make server-side minting +// unsafe — see docs/error-handling.md §3a. func TestHandleCreateRoom_EmptyPayload(t *testing.T) { ctrl := gomock.NewController(t) @@ -2282,7 +2283,7 @@ func TestHandleCreateRoom_RequesterNotFound(t *testing.T) { body, _ := json.Marshal(model.CreateRoomRequest{Users: []string{"bob"}}) _, err := h.handleCreateRoom(ctxWithReqID(), createRoomSubj("alice", "site-a"), body) require.Error(t, err) - assert.True(t, errors.Is(err, errUserNotFound)) + assert.True(t, errcode.HasReason(err, errcode.RoomUserNotFound), "want RoomUserNotFound, got %v", err) } func TestHandleCreateRoom_RequesterMissingNameFields(t *testing.T) { @@ -2349,11 +2350,13 @@ func TestHandleCreateRoom_DM_AlreadyExists(t *testing.T) { h := &Handler{store: store, siteID: "site-a", maxRoomSize: 1000} body, _ := json.Marshal(model.CreateRoomRequest{Users: []string{"bob"}}) - _, err := h.handleCreateRoom(ctxWithReqID(), createRoomSubj("alice", "site-a"), body) - require.Error(t, err) - var dmErr *dmExistsError - require.True(t, errors.As(err, &dmErr)) - assert.Equal(t, "existing-dm-room", dmErr.RoomID()) + resp, err := h.handleCreateRoom(ctxWithReqID(), createRoomSubj("alice", "site-a"), body) + require.NoError(t, err) + + var reply model.CreateRoomReply + require.NoError(t, json.Unmarshal(resp, &reply)) + assert.Equal(t, model.CreateRoomStatusExists, reply.Status) + assert.Equal(t, "existing-dm-room", reply.RoomID) } func TestHandleCreateRoom_BotDM_HappyPath(t *testing.T) { @@ -2445,8 +2448,8 @@ func TestHandleCreateRoom_BotDM_Disabled(t *testing.T) { } // New: existing botDM where the bot was later disabled MUST still return the -// existing roomId via dmExistsError, not errBotNotAvailable. This is the -// idempotent open-or-create contract. 
+// existing roomId via the success "exists" reply, not errBotNotAvailable. This +// is the idempotent open-or-create contract. func TestHandleCreateRoom_BotDM_DisabledButExisting(t *testing.T) { ctrl := gomock.NewController(t) store := NewMockRoomStore(ctrl) @@ -2458,11 +2461,13 @@ func TestHandleCreateRoom_BotDM_DisabledButExisting(t *testing.T) { h := &Handler{store: store, siteID: "site-a", maxRoomSize: 1000} body, _ := json.Marshal(model.CreateRoomRequest{Users: []string{"helper.bot"}}) - _, err := h.handleCreateRoom(ctxWithReqID(), createRoomSubj("alice", "site-a"), body) - require.Error(t, err) - var de *dmExistsError - require.ErrorAs(t, err, &de) - assert.Equal(t, "existing-bot-dm", de.RoomID()) + resp, err := h.handleCreateRoom(ctxWithReqID(), createRoomSubj("alice", "site-a"), body) + require.NoError(t, err) + + var reply model.CreateRoomReply + require.NoError(t, json.Unmarshal(resp, &reply)) + assert.Equal(t, model.CreateRoomStatusExists, reply.Status) + assert.Equal(t, "existing-bot-dm", reply.RoomID) } func TestHandleCreateRoom_Channel_HappyPath(t *testing.T) { @@ -2666,22 +2671,20 @@ func TestHandleCreateRoom_BotDM_PUnderscoreWebhookBot(t *testing.T) { // --- Phase 5c: natsCreateRoom adapter tests --- func TestNatsCreateRoom_DMExistsReply(t *testing.T) { - // Verify the ErrorResponse shape that replyDMExists marshals is correct. - // We can't hook *nats.Msg.Respond in unit tests without a NATS server, so we - // verify the JSON shape by marshaling the same struct directly. - body, err := json.Marshal(model.ErrorResponse{Error: "dm already exists", RoomID: "existing-dm"}) + // DM-exists is now a SUCCESS reply: {status:"exists", roomId:…}, not an error. 
+ body, err := json.Marshal(model.CreateRoomReply{Status: model.CreateRoomStatusExists, RoomID: "existing-dm"}) require.NoError(t, err) - var errResp model.ErrorResponse - require.NoError(t, json.Unmarshal(body, &errResp)) - assert.Equal(t, "dm already exists", errResp.Error) - assert.Equal(t, "existing-dm", errResp.RoomID) + var reply model.CreateRoomReply + require.NoError(t, json.Unmarshal(body, &reply)) + assert.Equal(t, model.CreateRoomStatusExists, reply.Status) + assert.Equal(t, "existing-dm", reply.RoomID) assert.Contains(t, string(body), `"roomId":"existing-dm"`) } -func TestNatsCreateRoom_DMExistsError_FlowTriggered(t *testing.T) { - // Verify that handleCreateRoom returns a dmExistsError when FindDMSubscription - // returns an existing subscription — this is what natsCreateRoom routes to replyDMExists. +func TestNatsCreateRoom_DMExistsSuccess_FlowTriggered(t *testing.T) { + // Verify handleCreateRoom returns a SUCCESS "exists" reply (not an error) + // when FindDMSubscription returns an existing subscription. 
ctrl := gomock.NewController(t) store := NewMockRoomStore(ctrl) store.EXPECT().GetUser(gomock.Any(), "alice").Return(aliceUser(), nil) @@ -2692,20 +2695,21 @@ func TestNatsCreateRoom_DMExistsError_FlowTriggered(t *testing.T) { h := &Handler{store: store, siteID: "site-a", maxRoomSize: 1000} reqBody, _ := json.Marshal(model.CreateRoomRequest{Users: []string{"bob"}}) - _, err := h.handleCreateRoom( + resp, err := h.handleCreateRoom( natsutil.WithRequestID(context.Background(), idgen.GenerateRequestID()), createRoomSubj("alice", "site-a"), reqBody, ) - require.Error(t, err) - var dmErr *dmExistsError - require.True(t, errors.As(err, &dmErr), "natsCreateRoom must receive *dmExistsError to route to replyDMExists") - assert.Equal(t, "existing-dm", dmErr.RoomID()) + require.NoError(t, err) + + var reply model.CreateRoomReply + require.NoError(t, json.Unmarshal(resp, &reply)) + assert.Equal(t, model.CreateRoomStatusExists, reply.Status) + assert.Equal(t, "existing-dm", reply.RoomID) } func TestNatsCreateRoom_GenericErrorReply(t *testing.T) { - // Verify sanitizeError is called for generic errors by testing the - // error handling path of natsCreateRoom via its handler function. + // A bare DB error collapses to internal at the reply boundary (Classify). ctrl := gomock.NewController(t) store := NewMockRoomStore(ctrl) store.EXPECT().GetUser(gomock.Any(), "alice").Return(nil, fmt.Errorf("mongo connection refused")) @@ -2718,8 +2722,9 @@ func TestNatsCreateRoom_GenericErrorReply(t *testing.T) { body, ) require.Error(t, err) - // sanitizeError on an unwrapped db error returns "internal error" - assert.Equal(t, "internal error", sanitizeError(err)) + // Not a typed *errcode.Error — Classify will collapse it to internal. 
+ var ee *errcode.Error + assert.False(t, errors.As(err, &ee), "bare DB error must not be a typed errcode") } // --- message.read tests --- @@ -4056,7 +4061,7 @@ func TestHandler_natsGetRoomKey(t *testing.T) { type want struct { replyJSON string // expected JSON of the success reply (empty when err) - errSubstr string // expected substring in sanitizeError(err) (empty when ok) + errSubstr string // expected substring in err.Error() (empty when ok) } cases := []struct { @@ -4122,7 +4127,7 @@ func TestHandler_natsGetRoomKey(t *testing.T) { Return(&model.Subscription{}, nil) ks.EXPECT().Get(gomock.Any(), roomID).Return(nil, errors.New("valkey down")) }, - want: want{errSubstr: "internal error"}, + want: want{errSubstr: "get room key:"}, }, { name: "store error on explicit version", @@ -4132,7 +4137,7 @@ func TestHandler_natsGetRoomKey(t *testing.T) { Return(&model.Subscription{}, nil) ks.EXPECT().GetByVersion(gomock.Any(), roomID, 5).Return(nil, errors.New("valkey down")) }, - want: want{errSubstr: "internal error"}, + want: want{errSubstr: "get room key:"}, }, { name: "malformed body", @@ -4156,7 +4161,7 @@ func TestHandler_natsGetRoomKey(t *testing.T) { resp, err := h.handleGetRoomKey(t.Context(), subj, tc.body) if tc.want.errSubstr != "" { require.Error(t, err) - require.Contains(t, sanitizeError(err), tc.want.errSubstr) + require.Contains(t, err.Error(), tc.want.errSubstr) return } require.NoError(t, err) diff --git a/room-service/helper.go b/room-service/helper.go index 6ca3cd3e4..a8a93205a 100644 --- a/room-service/helper.go +++ b/room-service/helper.go @@ -2,69 +2,75 @@ package main import ( "context" - "errors" - "fmt" "regexp" - "strings" "time" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" ) -// Sentinel errors for user-facing validation failures. 
+// Sentinel errors for user-facing validation failures, typed as *errcode.Error +// so they classify directly at the reply boundary (errnats.Reply) without a +// per-message allowlist. +// +// These are package-level singletons SHARED across all goroutines. Callers +// MUST NOT mutate (today's Options return fresh *Error values so mutation is +// not a concern, but a future Option that wrote in place would silently alias +// state across callers). Use errors.Is for identity, errcode.HasReason for +// reason matching, and construct fresh *Error values via the named +// constructors when a caller needs a wrapped message or extra metadata. var ( - errInvalidRole = errors.New("invalid role: must be owner or member") - errOnlyOwners = errors.New("only owners can update roles") - errAlreadyOwner = errors.New("user is already an owner") - errNotOwner = errors.New("user is not an owner") - errCannotDemoteLast = errors.New("cannot demote the last owner") - errRoomTypeGuard = errors.New("role update is only allowed in channel rooms") - errTargetNotMember = errors.New("target user is not a member of this room") + errInvalidRole = errcode.BadRequest("invalid role: must be owner or member") + errOnlyOwners = errcode.Forbidden("only owners can update roles", errcode.WithReason(errcode.RoomNotOwner)) + errOnlyOwnersCanRemove = errcode.Forbidden("only owners can remove members", errcode.WithReason(errcode.RoomNotOwner)) + errOnlyOwnersCanAddToRes = errcode.Forbidden("only owners can add members to a restricted room", errcode.WithReason(errcode.RoomNotOwner)) + errAlreadyOwner = errcode.Conflict("user is already an owner", errcode.WithReason(errcode.RoomAlreadyOwner)) + errNotOwner = errcode.Forbidden("user is not an owner", errcode.WithReason(errcode.RoomNotOwner)) + errCannotDemoteLast = errcode.Conflict("cannot demote the last owner", errcode.WithReason(errcode.RoomCannotDemoteLastOwner)) + errRoomTypeGuard = errcode.BadRequest("role update is only allowed in channel rooms", 
errcode.WithReason(errcode.RoomNonChannelOperation)) + errAddMembersChannelOnly = errcode.BadRequest("cannot add members to a non-channel room", errcode.WithReason(errcode.RoomNonChannelOperation)) + errTargetNotMember = errcode.BadRequest("target user is not a member of this room", errcode.WithReason(errcode.RoomTargetNotMember)) // Used by both list-members (requester subscription check) and add-member // channel-source expansion. Both contexts mean "the requester is not a // member of the room they are asking about". - errNotRoomMember = errors.New("only room members can list members") - errInvalidOrg = errors.New("invalid org") - errInvalidThreadID = errors.New("threadId is required") - errThreadSubNotFound = errors.New("thread subscription not found") + errNotRoomMember = errcode.Forbidden("only room members can perform this action", errcode.WithReason(errcode.RoomNotMember)) + errInvalidThreadID = errcode.BadRequest("threadId is required") + errThreadSubNotFound = errcode.NotFound("thread subscription not found") // Only subscribers with an individual membership source can hold the owner // role. Remove-member's dual-membership path relies on this invariant: // stripping the owner role during an individual-leave is only sound when // the role can only be held alongside an individual entry. - errPromoteRequiresIndividual = errors.New("only individual members can be promoted to owner") + errPromoteRequiresIndividual = errcode.BadRequest("only individual members can be promoted to owner", errcode.WithReason(errcode.RoomPromoteRequiresIndividual)) // Sentinels for create-room validation. 
- errEmptyCreateRequest = errors.New("request must include at least one of users, orgs, channels, or name") - errSelfDM = errors.New("cannot create a DM with yourself") - errBotInChannel = errors.New("bots cannot be added to a channel") - errBotNotAvailable = errors.New("bot not available") - errInvalidUserData = errors.New("user is missing required name fields") - errMissingRequestID = errors.New("missing X-Request-ID header") - errInvalidRequestID = errors.New("invalid X-Request-ID format") - errChannelNameRequired = errors.New("channel name is required") - errChannelNameTooLong = errors.New("channel name must be at most 100 characters") - errUserNotFound = errors.New("user not found") - - errMessageNotFound = errors.New("message not found") - errMessageRoomMismatch = errors.New("message does not belong to this room") - errNotMessageSender = errors.New("only the message sender can view read receipts") + errEmptyCreateRequest = errcode.BadRequest("request must include at least one of users, orgs, channels, or name") + errSelfDM = errcode.BadRequest("cannot create a DM with yourself", errcode.WithReason(errcode.RoomSelfDM)) + errBotInChannel = errcode.BadRequest("bots cannot be added to a channel", errcode.WithReason(errcode.RoomBotInChannel)) + errBotNotAvailable = errcode.NotFound("bot not available", errcode.WithReason(errcode.RoomBotNotAvailable)) + errInvalidUserData = errcode.BadRequest("user is missing required name fields") + errChannelNameRequired = errcode.BadRequest("channel name is required") + errChannelNameTooLong = errcode.BadRequest("channel name must be at most 100 characters") + + errMessageNotFound = errcode.NotFound("message not found") + errMessageRoomMismatch = errcode.BadRequest("message does not belong to this room") + errNotMessageSender = errcode.Forbidden("only the message sender can view read receipts") // Sentinels for remove-member validation (surfaced to the client verbatim). 
- errRemoveTargetAmbiguous = errors.New("exactly one of account or orgId must be set") - errCannotRemoveLastMember = errors.New("cannot remove the last member of the room") - errLastOwnerCannotLeave = errors.New("last owner cannot leave the room") - errOrgMemberCannotLeaveSolo = errors.New("org members cannot leave individually") - errRoomIDMismatch = errors.New("room ID mismatch") - errRemoveChannelOnly = errors.New("remove-member only supported on channel rooms") + errRemoveTargetAmbiguous = errcode.BadRequest("exactly one of account or orgId must be set") + errCannotRemoveLastMember = errcode.Conflict("cannot remove the last member of the room", errcode.WithReason(errcode.RoomLastMemberCannotRemove)) + errLastOwnerCannotLeave = errcode.Conflict("last owner cannot leave the room", errcode.WithReason(errcode.RoomLastOwnerCannotLeave)) + errOrgMemberCannotLeaveSolo = errcode.Forbidden("org members cannot leave individually") + errRoomIDMismatch = errcode.BadRequest("room ID mismatch") + errRemoveChannelOnly = errcode.BadRequest("remove-member only supported on channel rooms", errcode.WithReason(errcode.RoomNonChannelOperation)) // Sentinels for list-members pagination validation. - errListLimitInvalid = errors.New("limit must be > 0") - errListOffsetInvalid = errors.New("offset must be >= 0") + errListLimitInvalid = errcode.BadRequest("limit must be > 0") + errListOffsetInvalid = errcode.BadRequest("offset must be >= 0") // errRoomKeyAbsent is returned when the requested key version is not held // by the key store (either the current key is missing or the historical // version has aged out of the grace window). 
- errRoomKeyAbsent = errors.New("room key not available") + errRoomKeyAbsent = errcode.NotFound("room key not available") ) var botPattern = regexp.MustCompile(`\.bot$|^p_`) @@ -135,27 +141,6 @@ func determineRoomType(req *model.CreateRoomRequest) model.RoomType { return model.RoomTypeChannel } -// channelExpandTimeoutError reports which (site, room) the channel-expansion -// step failed to read within the per-ref deadline. The sync reply surfaces it -// so the requester can see exactly which channel source stalled. -type channelExpandTimeoutError struct { - SiteID string - RoomID string -} - -func newChannelExpandTimeoutError(siteID, roomID string) *channelExpandTimeoutError { - return &channelExpandTimeoutError{SiteID: siteID, RoomID: roomID} -} - -func (e *channelExpandTimeoutError) Error() string { - return fmt.Sprintf("timeout listing members of channel %s@%s", e.RoomID, e.SiteID) -} - -func (e *channelExpandTimeoutError) Is(target error) bool { - _, ok := target.(*channelExpandTimeoutError) - return ok -} - // contextWithMemberListTimeout returns a derived context bounded by the // configured per-ref member-list timeout. When the configured timeout is // non-positive, the parent ctx is returned unchanged with a no-op cancel. @@ -166,23 +151,6 @@ func (h *Handler) contextWithMemberListTimeout(ctx context.Context) (context.Con return context.WithTimeout(ctx, h.memberListTimeout) } -// Compile-time check that channelExpandTimeoutError satisfies error. -var _ error = (*channelExpandTimeoutError)(nil) - -// dmExistsError carries the existing DM/botDM room ID for the "dm already exists" reply. 
-type dmExistsError struct{ existingRoomID string } - -func newDMExistsError(roomID string) *dmExistsError { - return &dmExistsError{existingRoomID: roomID} -} - -func (e *dmExistsError) Error() string { return "dm already exists" } -func (e *dmExistsError) RoomID() string { return e.existingRoomID } -func (e *dmExistsError) Is(target error) bool { - _, ok := target.(*dmExistsError) - return ok -} - // stripAccount returns slice with all occurrences of account removed (order preserved). func stripAccount(slice []string, account string) []string { out := make([]string, 0, len(slice)) @@ -193,64 +161,3 @@ func stripAccount(slice []string, account string) []string { } return out } - -// sanitizeError returns a user-safe error message for known error sentinels and approved patterns. -func sanitizeError(err error) string { - // Typed timeout error: surface the underlying message (site+roomId) directly, - // stripping any "expand channels: %w" or other wrapper context. - var ct *channelExpandTimeoutError - if errors.As(err, &ct) { - return ct.Error() - } - switch { - case errors.Is(err, errNotRoomMember): - // Always return the sentinel message, even when wrapped (e.g. by - // add-member's "expand channels: %w"), so callers get a clean - // user-safe message without the wrapping context. 
- return errNotRoomMember.Error() - case errors.Is(err, errInvalidRole), - errors.Is(err, errOnlyOwners), - errors.Is(err, errAlreadyOwner), - errors.Is(err, errNotOwner), - errors.Is(err, errCannotDemoteLast), - errors.Is(err, errRoomTypeGuard), - errors.Is(err, errTargetNotMember), - errors.Is(err, errInvalidOrg), - errors.Is(err, errPromoteRequiresIndividual), - errors.Is(err, errEmptyCreateRequest), - errors.Is(err, errSelfDM), - errors.Is(err, errBotInChannel), - errors.Is(err, errBotNotAvailable), - errors.Is(err, errInvalidUserData), - errors.Is(err, errMissingRequestID), - errors.Is(err, errInvalidRequestID), - errors.Is(err, errChannelNameRequired), - errors.Is(err, errChannelNameTooLong), - errors.Is(err, errUserNotFound), - errors.Is(err, errMessageNotFound), - errors.Is(err, errMessageRoomMismatch), - errors.Is(err, errNotMessageSender), - errors.Is(err, errInvalidThreadID), - errors.Is(err, errThreadSubNotFound), - errors.Is(err, errRemoveTargetAmbiguous), - errors.Is(err, errCannotRemoveLastMember), - errors.Is(err, errLastOwnerCannotLeave), - errors.Is(err, errOrgMemberCannotLeaveSolo), - errors.Is(err, errRoomIDMismatch), - errors.Is(err, errRemoveChannelOnly), - errors.Is(err, errListLimitInvalid), - errors.Is(err, errListOffsetInvalid), - errors.Is(err, errRoomKeyAbsent), - errors.Is(err, &dmExistsError{}), - errors.Is(err, &channelExpandTimeoutError{}): - return err.Error() - default: - msg := err.Error() - for _, safe := range []string{"only owners can", "cannot add members", "room is at maximum capacity", "exceeds maximum capacity", "requester not in room", "invalid request", "remote member.list:", "invalid mute-toggle subject"} { - if strings.Contains(msg, safe) { - return msg - } - } - return "internal error" - } -} diff --git a/room-service/helper_test.go b/room-service/helper_test.go index b2bd5d625..2307db717 100644 --- a/room-service/helper_test.go +++ b/room-service/helper_test.go @@ -1,13 +1,11 @@ package main import ( - "errors" - 
"fmt" "testing" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" ) @@ -35,51 +33,6 @@ func TestHasRole(t *testing.T) { } } -func TestSanitizeError(t *testing.T) { - tests := []struct { - name string - err error - want string - }{ - {"sentinel: invalid role", errInvalidRole, "invalid role: must be owner or member"}, - {"sentinel: only owners", errOnlyOwners, "only owners can update roles"}, - {"sentinel: cannot demote", errCannotDemoteLast, "cannot demote the last owner"}, - {"sentinel: already owner", errAlreadyOwner, "user is already an owner"}, - {"sentinel: not owner", errNotOwner, "user is not an owner"}, - {"sentinel: room type", errRoomTypeGuard, "role update is only allowed in channel rooms"}, - {"sentinel: target not member", errTargetNotMember, "target user is not a member of this room"}, - {"sentinel: not room member", errNotRoomMember, "only room members can list members"}, - {"sentinel: invalid org", errInvalidOrg, "invalid org"}, - {"sentinel: promote requires individual", errPromoteRequiresIndividual, "only individual members can be promoted to owner"}, - {"sentinel: remove target ambiguous", errRemoveTargetAmbiguous, "exactly one of account or orgId must be set"}, - {"sentinel: cannot remove last member", errCannotRemoveLastMember, "cannot remove the last member of the room"}, - {"sentinel: last owner cannot leave", errLastOwnerCannotLeave, "last owner cannot leave the room"}, - {"sentinel: org member cannot leave solo", errOrgMemberCannotLeaveSolo, "org members cannot leave individually"}, - {"sentinel: room ID mismatch", errRoomIDMismatch, "room ID mismatch"}, - {"wrapped remove-channel-only passes through", fmt.Errorf("%w, got %s", errRemoveChannelOnly, "dm"), "remove-member only supported on channel rooms, got dm"}, - {"sentinel: list limit invalid", errListLimitInvalid, "limit must be > 0"}, - {"sentinel: list offset invalid", 
errListOffsetInvalid, "offset must be >= 0"}, - {"wrapped sentinel passes through", fmt.Errorf("get room: %w", errRoomTypeGuard), "get room: role update is only allowed in channel rooms"}, - {"safe owner message", errors.New("only owners can add members"), "only owners can add members"}, - {"safe cannot add", errors.New("cannot add members to a DM room"), "cannot add members to a DM room"}, - {"safe capacity", errors.New("room is at maximum capacity (1000)"), "room is at maximum capacity (1000)"}, - {"safe exceeds capacity", errors.New("exceeds maximum capacity (1000): would create 1001 members"), "exceeds maximum capacity (1000): would create 1001 members"}, - {"safe requester", errors.New("requester not in room: not found"), "requester not in room: not found"}, - {"safe invalid", errors.New("invalid request: bad json"), "invalid request: bad json"}, - {"passes through invalid mute-toggle subject", fmt.Errorf("invalid mute-toggle subject: chat.user.alice.foo"), "invalid mute-toggle subject: chat.user.alice.foo"}, - {"internal db error", fmt.Errorf("mongo timeout"), "internal error"}, - {"generic error", fmt.Errorf("unexpected failure"), "internal error"}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := sanitizeError(tt.err) - if got != tt.want { - t.Errorf("sanitizeError(%v) = %q, want %q", tt.err, got, tt.want) - } - }) - } -} - func TestIsBot(t *testing.T) { tests := []struct { name string @@ -121,29 +74,55 @@ func TestDedup_Empty(t *testing.T) { assert.Nil(t, got) } -func TestSanitizeError_NotRoomMember_WhenWrapped(t *testing.T) { - // Guards the errors.Is whitelist — wrapping (e.g. by add-member's - // "expand channels: %w") must not lose the user-safe message. 
- wrapped := fmt.Errorf("expand channels: %w", errNotRoomMember) - assert.Equal(t, "only room members can list members", sanitizeError(wrapped)) -} - -func TestSanitizeError_RemoteMemberListPrefix(t *testing.T) { - remote := errors.New("remote member.list: only room members can list members") - assert.Equal(t, "remote member.list: only room members can list members", sanitizeError(remote)) -} - -func TestSanitizeError_RemoteMemberListWithContext(t *testing.T) { - // Error from cross-site RPC includes site context; preserve user-safe message. - remote := errors.New("expand channels: remote member.list: room not found") - msg := sanitizeError(remote) - assert.Contains(t, msg, "remote member.list:") - assert.Contains(t, msg, "room not found") -} - -func TestSanitizeError_TransportFailureStillOpaque(t *testing.T) { - // Generic transport failure from the client — no user-safe substring — must still be "internal error". - assert.Equal(t, "internal error", sanitizeError(errors.New("member.list request to site-eu: nats: timeout"))) +// TestSentinelCodesAndReasons verifies each migrated sentinel carries the +// category (and where applicable the reason) from the plan's mapping table. +// This replaces the deleted sanitizeError suite. 
+func TestSentinelCodesAndReasons(t *testing.T) { + cases := []struct { + name string + err *errcode.Error + code errcode.Code + reason errcode.Reason + }{ + {"invalid role", errInvalidRole, errcode.CodeBadRequest, ""}, + {"only owners", errOnlyOwners, errcode.CodeForbidden, errcode.RoomNotOwner}, + {"only owners can remove", errOnlyOwnersCanRemove, errcode.CodeForbidden, errcode.RoomNotOwner}, + {"only owners can add to restricted", errOnlyOwnersCanAddToRes, errcode.CodeForbidden, errcode.RoomNotOwner}, + {"already owner", errAlreadyOwner, errcode.CodeConflict, errcode.RoomAlreadyOwner}, + {"not owner", errNotOwner, errcode.CodeForbidden, errcode.RoomNotOwner}, + {"cannot demote last", errCannotDemoteLast, errcode.CodeConflict, errcode.RoomCannotDemoteLastOwner}, + {"room type guard", errRoomTypeGuard, errcode.CodeBadRequest, errcode.RoomNonChannelOperation}, + {"add members channel only", errAddMembersChannelOnly, errcode.CodeBadRequest, errcode.RoomNonChannelOperation}, + {"target not member", errTargetNotMember, errcode.CodeBadRequest, errcode.RoomTargetNotMember}, + {"not room member", errNotRoomMember, errcode.CodeForbidden, errcode.RoomNotMember}, + {"invalid thread id", errInvalidThreadID, errcode.CodeBadRequest, ""}, + {"thread sub not found", errThreadSubNotFound, errcode.CodeNotFound, ""}, + {"promote requires individual", errPromoteRequiresIndividual, errcode.CodeBadRequest, errcode.RoomPromoteRequiresIndividual}, + {"empty create request", errEmptyCreateRequest, errcode.CodeBadRequest, ""}, + {"self dm", errSelfDM, errcode.CodeBadRequest, errcode.RoomSelfDM}, + {"bot in channel", errBotInChannel, errcode.CodeBadRequest, errcode.RoomBotInChannel}, + {"bot not available", errBotNotAvailable, errcode.CodeNotFound, errcode.RoomBotNotAvailable}, + {"invalid user data", errInvalidUserData, errcode.CodeBadRequest, ""}, + {"channel name required", errChannelNameRequired, errcode.CodeBadRequest, ""}, + {"channel name too long", errChannelNameTooLong, 
errcode.CodeBadRequest, ""}, + {"message not found", errMessageNotFound, errcode.CodeNotFound, ""}, + {"message room mismatch", errMessageRoomMismatch, errcode.CodeBadRequest, ""}, + {"not message sender", errNotMessageSender, errcode.CodeForbidden, ""}, + {"remove target ambiguous", errRemoveTargetAmbiguous, errcode.CodeBadRequest, ""}, + {"cannot remove last member", errCannotRemoveLastMember, errcode.CodeConflict, errcode.RoomLastMemberCannotRemove}, + {"last owner cannot leave", errLastOwnerCannotLeave, errcode.CodeConflict, errcode.RoomLastOwnerCannotLeave}, + {"org member cannot leave solo", errOrgMemberCannotLeaveSolo, errcode.CodeForbidden, ""}, + {"room id mismatch", errRoomIDMismatch, errcode.CodeBadRequest, ""}, + {"remove channel only", errRemoveChannelOnly, errcode.CodeBadRequest, errcode.RoomNonChannelOperation}, + {"list limit invalid", errListLimitInvalid, errcode.CodeBadRequest, ""}, + {"list offset invalid", errListOffsetInvalid, errcode.CodeBadRequest, ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.code, tc.err.Code) + assert.Equal(t, tc.reason, tc.err.Reason) + }) + } } func TestNewSentinelErrorsExist(t *testing.T) { @@ -152,25 +131,8 @@ func TestNewSentinelErrorsExist(t *testing.T) { assert.Equal(t, "bots cannot be added to a channel", errBotInChannel.Error()) assert.Equal(t, "bot not available", errBotNotAvailable.Error()) assert.Equal(t, "user is missing required name fields", errInvalidUserData.Error()) - assert.Equal(t, "missing X-Request-ID header", errMissingRequestID.Error()) - assert.Equal(t, "invalid X-Request-ID format", errInvalidRequestID.Error()) assert.Equal(t, "channel name is required", errChannelNameRequired.Error()) assert.Equal(t, "channel name must be at most 100 characters", errChannelNameTooLong.Error()) - assert.Equal(t, "user not found", errUserNotFound.Error()) -} - -func TestDMExistsErrorWrapsCorrectly(t *testing.T) { - e := newDMExistsError("r_existing") - 
assert.Equal(t, "dm already exists", e.Error()) - assert.Equal(t, "r_existing", e.RoomID()) - - var sentinel *dmExistsError - assert.True(t, errors.Is(e, sentinel)) - - wrapped := fmt.Errorf("validation failed: %w", e) - var target *dmExistsError - require.True(t, errors.As(wrapped, &target)) - assert.Equal(t, "r_existing", target.RoomID()) } func TestStripAccount(t *testing.T) { @@ -194,30 +156,6 @@ func TestStripAccount(t *testing.T) { } } -func TestSanitizeErrorPassesThroughCreateRoomSentinels(t *testing.T) { - cases := []error{ - errEmptyCreateRequest, - errSelfDM, - errBotInChannel, - errBotNotAvailable, - errInvalidUserData, - errMissingRequestID, - errUserNotFound, - newDMExistsError("r_existing"), - fmt.Errorf("validation: %w", errSelfDM), - } - for _, e := range cases { - t.Run(e.Error(), func(t *testing.T) { - assert.Equal(t, e.Error(), sanitizeError(e)) - }) - } -} - -func TestSanitizeErrorCollapsesUnknown(t *testing.T) { - got := sanitizeError(errors.New("mongo: connection refused: tcp 127.0.0.1:27017")) - assert.Equal(t, "internal error", got) -} - func TestDetermineRoomType(t *testing.T) { tests := []struct { name string diff --git a/room-service/integration_test.go b/room-service/integration_test.go index 3bc6d94b3..372413884 100644 --- a/room-service/integration_test.go +++ b/room-service/integration_test.go @@ -20,6 +20,7 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsutil" @@ -794,17 +795,17 @@ func TestMongoStore_ListOrgMembers_Integration(t *testing.T) { assert.ElementsMatch(t, []string{"alice", "bob"}, accounts) }) - t.Run("empty org returns errInvalidOrg", func(t *testing.T) { + t.Run("empty org returns RoomInvalidOrg reason", func(t *testing.T) { db := setupMongo(t) store := NewMongoStore(db) insertUser(t, db, model.User{ID: "u-alice", Account: "alice", 
SectID: "sect-eng"}) _, err := store.ListOrgMembers(ctx, "sect-nope") require.Error(t, err) - assert.True(t, errors.Is(err, errInvalidOrg), "want errInvalidOrg in chain, got %v", err) + assert.True(t, errcode.HasReason(err, errcode.RoomInvalidOrg), "want RoomInvalidOrg in chain, got %v", err) }) - t.Run("returns errInvalidOrg when neither sectId nor deptId matches", func(t *testing.T) { + t.Run("returns RoomInvalidOrg reason when neither sectId nor deptId matches", func(t *testing.T) { // Users carry both sectId and deptId, but neither field equals the // queried orgID — guards against an accidental match on the wrong // branch of the $or (e.g. a future query rewrite that collapses to @@ -816,7 +817,7 @@ func TestMongoStore_ListOrgMembers_Integration(t *testing.T) { _, err := store.ListOrgMembers(ctx, "sect-nope") require.Error(t, err) - assert.True(t, errors.Is(err, errInvalidOrg), "want errInvalidOrg in chain, got %v", err) + assert.True(t, errcode.HasReason(err, errcode.RoomInvalidOrg), "want RoomInvalidOrg in chain, got %v", err) }) t.Run("returns expected OrgMember shape", func(t *testing.T) { @@ -1217,7 +1218,10 @@ func TestAddMembers_TwoSiteEndToEnd(t *testing.T) { require.NoError(t, err) t.Cleanup(func() { _ = otelNCb.Drain() }) - ctx := context.Background() + // Test bypasses wrappedCtx by calling handleAddMembers directly, so we stamp + // a request_id here — the cross-site memberlist client forwards it to site-B, + // whose handler now enforces RequireRequestID (strict). + ctx := natsutil.WithRequestID(context.Background(), idgen.GenerateRequestID()) // Site-A: target room; requester subscribed; user document needed for ResolveAccounts. 
mustInsertRoom(t, dbA, &model.Room{ID: "target", Type: model.RoomTypeChannel, SiteID: "site-a"}) @@ -1330,13 +1334,12 @@ func TestAddMembers_CrossSiteTimeout(t *testing.T) { _, err = handler.handleAddMembers(ctx, subject.MemberAdd("alice", "target", "site-a"), data) require.Error(t, err) - // Cross-site member.list deadline → typed channelExpandTimeoutError naming - // the offending site+roomId. sanitizeError surfaces the message verbatim. - var te *channelExpandTimeoutError - require.ErrorAs(t, err, &te, "expected channelExpandTimeoutError, got %v", err) - assert.Equal(t, "site-b", te.SiteID) - assert.Equal(t, "source", te.RoomID) - assert.Equal(t, "timeout listing members of channel source@site-b", sanitizeError(err)) + // Cross-site member.list deadline → Unavailable errcode naming the offending + // site+roomId so the requester can see which channel source stalled. + var ee *errcode.Error + require.ErrorAs(t, err, &ee, "expected *errcode.Error, got %v", err) + assert.Equal(t, errcode.CodeUnavailable, ee.Code) + assert.Equal(t, "timeout listing members of channel source@site-b", ee.Message) } func TestRoomsInfoBatchRPC(t *testing.T) { @@ -1381,7 +1384,11 @@ func TestRoomsInfoBatchRPC(t *testing.T) { data, err := json.Marshal(req) require.NoError(t, err) - msg, err := nc.Request(subject.RoomsInfoBatch("site-a"), data, 3*time.Second) + ctxReq, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + reqMsg := natsutil.NewMsg(natsutil.WithRequestID(ctxReq, idgen.GenerateRequestID()), + subject.RoomsInfoBatch("site-a"), data) + msg, err := nc.RequestMsgWithContext(ctxReq, reqMsg) require.NoError(t, err) var resp model.RoomsInfoBatchResponse @@ -1526,10 +1533,9 @@ func TestIntegration_HandleGetRoomKey(t *testing.T) { { body, _ := json.Marshal(model.RoomKeyGetRequest{}) _, err := h.handleGetRoomKey(ctx, subject.RoomKeyGet("bob", roomID, "site-A"), body) - require.Error(t, err) - // sanitizeError(err) should contain the "only room members" text, - // 
confirming errNotRoomMember was returned and surfaced for clients. - assert.Contains(t, sanitizeError(err), "only room members") + // errNotRoomMember (a typed *errcode.Error) is returned and surfaced for + // clients via errnats.Reply; assert on identity, not the message text. + require.ErrorIs(t, err, errNotRoomMember) } } @@ -1651,13 +1657,13 @@ func TestCreateRoomDMAlreadyExists(t *testing.T) { body, err := json.Marshal(model.CreateRoomRequest{Users: []string{"bob"}}) require.NoError(t, err) - _, herr := h.handleCreateRoom(ctx, subject.RoomCreate("alice", "site-A"), body) - require.Error(t, herr) + resp, herr := h.handleCreateRoom(ctx, subject.RoomCreate("alice", "site-A"), body) + require.NoError(t, herr) - var dmErr *dmExistsError - require.True(t, errors.As(herr, &dmErr), "expected dmExistsError, got %T: %v", herr, herr) - assert.Equal(t, "dm already exists", dmErr.Error()) - assert.Equal(t, roomID, dmErr.RoomID()) + var reply model.CreateRoomReply + require.NoError(t, json.Unmarshal(resp, &reply)) + assert.Equal(t, model.CreateRoomStatusExists, reply.Status) + assert.Equal(t, roomID, reply.RoomID) } func TestMongoStore_UpdateSubscriptionRead_Integration(t *testing.T) { diff --git a/room-service/memberlist_client.go b/room-service/memberlist_client.go index 033787cd8..304dac0b2 100644 --- a/room-service/memberlist_client.go +++ b/room-service/memberlist_client.go @@ -6,10 +6,12 @@ import ( "context" "encoding/json" "fmt" + "log/slog" "time" "github.com/nats-io/nats.go" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/subject" @@ -52,25 +54,49 @@ func (c *natsMemberListClient) ListMembers(ctx context.Context, requester string reqCtx, cancel := context.WithTimeout(ctx, c.timeout) defer cancel() - out := &nats.Msg{ - Subject: subject.MemberList(requester, ch.RoomID, ch.SiteID), - Data: body, - Header: nats.Header{}, - } + // natsutil.NewMsg forwards the 
X-Request-ID from ctx; the remote + // room-service.handleListMembers uses RequireRequestID (strict) and would + // reject a header-less call with bad_request. + out := natsutil.NewMsg(reqCtx, subject.MemberList(requester, ch.RoomID, ch.SiteID), body) reply, err := c.nc.RequestMsgWithContext(reqCtx, out) if err != nil { return nil, fmt.Errorf("member.list request to %s: %w", ch.SiteID, err) } - if errResp, ok := natsutil.TryParseError(reply.Data); ok { - // Map the remote sentinel string back onto the local sentinel so callers + if ee, ok := errcode.Parse(reply.Data); ok { + // Map the remote not-member reason back onto the local sentinel so callers // can use errors.Is(err, errNotRoomMember) uniformly regardless of which - // site the source channel lives on. Other remote errors are passed - // through via the "remote member.list:" prefix sanitizeError whitelists. - if errResp.Error == errNotRoomMember.Error() { + // site the source channel lives on. Other remote errors are reconstructed + // as a typed *errcode.Error preserving the remote code/message/reason. + // + // Mixed-version rollout: a legacy remote that replies without a "code" + // still parses (only "error" is required) but yields Code=="" and no + // reason, so the not-member remap simply does not fire until both sides + // are upgraded — an acceptable degradation, not a bug. Tasks 20.5/20.16: + // errcode.New now panics on a non-canonical Code OR empty Message, so a + // legacy/non-canonical envelope falls back to errcode.Internal here and + // emits a single warn so SREs can spot legacy peers. 
+ if ee.Reason == errcode.RoomNotMember { return nil, errNotRoomMember } - return nil, fmt.Errorf("remote member.list: %s", errResp.Error) + if !ee.Code.Valid() || ee.Message == "" { + slog.WarnContext(ctx, "legacy peer emitted non-canonical errcode", + "code", string(ee.Code), "message", ee.Message, "site", ch.SiteID) + msg := ee.Message + if msg == "" { + msg = "remote site returned an error" + } + return nil, errcode.Internal(msg) + } + opts := []errcode.Option{errcode.WithReason(ee.Reason)} + if len(ee.Metadata) > 0 { + kv := make([]string, 0, 2*len(ee.Metadata)) + for k, v := range ee.Metadata { + kv = append(kv, k, v) + } + opts = append(opts, errcode.WithMetadata(kv...)) + } + return nil, errcode.New(ee.Code, ee.Message, opts...) } var resp model.ListRoomMembersResponse diff --git a/room-service/memberlist_client_test.go b/room-service/memberlist_client_test.go index 413231787..94ed4f76c 100644 --- a/room-service/memberlist_client_test.go +++ b/room-service/memberlist_client_test.go @@ -12,8 +12,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/subject" ) @@ -64,10 +64,10 @@ func TestNATSMemberListClient_RemoteError(t *testing.T) { ch := model.ChannelRef{RoomID: "room-eng", SiteID: "site-us"} requester := "alice" - // Generic remote error (not the "not a member" sentinel mapping) passes through - // verbatim behind the "remote member.list:" prefix whitelisted by sanitizeError. + // Generic remote error (not the not-member reason) is reconstructed as a + // typed *errcode.Error preserving the remote code and message. 
sub, err := nc.Subscribe(subject.MemberList(requester, ch.RoomID, ch.SiteID), func(m *nats.Msg) { - data := natsutil.MarshalError("room not found") + data, _ := json.Marshal(errcode.NotFound("room not found")) _ = m.Respond(data) }) require.NoError(t, err) @@ -75,8 +75,10 @@ func TestNATSMemberListClient_RemoteError(t *testing.T) { _, err = client.ListMembers(context.Background(), requester, ch, 0) require.Error(t, err) - assert.Contains(t, err.Error(), "remote member.list:") assert.Contains(t, err.Error(), "room not found") + var ee *errcode.Error + require.ErrorAs(t, err, &ee) + assert.Equal(t, errcode.CodeNotFound, ee.Code) assert.False(t, errors.Is(err, errNotRoomMember), "generic remote errors must not masquerade as the sentinel") } @@ -87,11 +89,12 @@ func TestNATSMemberListClient_RemoteNotMember_MapsToSentinel(t *testing.T) { ch := model.ChannelRef{RoomID: "room-eng", SiteID: "site-us"} requester := "alice" - // Remote site returns errNotRoomMember's exact message — the client must map - // it back onto the local errNotRoomMember sentinel so cross-site and - // same-site "not a member" behave uniformly under errors.Is. + // Remote site replies with an errcode envelope carrying reason + // not_room_member — the client must map it back onto the local + // errNotRoomMember sentinel so cross-site and same-site "not a member" + // behave uniformly under errors.Is. sub, err := nc.Subscribe(subject.MemberList(requester, ch.RoomID, ch.SiteID), func(m *nats.Msg) { - data := natsutil.MarshalError(errNotRoomMember.Error()) + data := []byte(`{"code":"forbidden","reason":"not_room_member","error":"only room members can perform this action"}`) _ = m.Respond(data) }) require.NoError(t, err) diff --git a/room-service/store.go b/room-service/store.go index 4262ed349..70fdea7b6 100644 --- a/room-service/store.go +++ b/room-service/store.go @@ -62,8 +62,9 @@ type RoomStore interface { // display fields are left zero. 
ListRoomMembers(ctx context.Context, roomID string, limit, offset *int, enrich bool) ([]model.RoomMember, error) // ListOrgMembers returns all users whose sectId OR deptId equals orgID, - // projected as OrgMember rows sorted by account ascending. Returns - // errInvalidOrg when no users match (treated as "orgId is not valid"). + // projected as OrgMember rows sorted by account ascending. Returns a + // RoomInvalidOrg-reason errcode when no users match (treated as "orgId is + // not valid"). ListOrgMembers(ctx context.Context, orgID string) ([]model.OrgMember, error) // FindExistingOrgIDs returns the subset of orgIDs that match at least // one user via sectId or deptId. Used by handleAddMembers and diff --git a/room-service/store_mongo.go b/room-service/store_mongo.go index eca16f899..1e830a084 100644 --- a/room-service/store_mongo.go +++ b/room-service/store_mongo.go @@ -11,6 +11,7 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo/options" "github.com/hmchangw/chat/pkg/displayfmt" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/pipelines" ) @@ -709,8 +710,8 @@ func (s *MongoStore) FindDMSubscription(ctx context.Context, account, targetName // GetUserWithMembership): an org added by a dept-only match stores // member.id = deptId in room_members, so the expansion RPC must look up // users by deptId too. Both (sectId, account) and (deptId, account) indexes -// exist (see ensureIndexes) so the $or stays index-backed. Returns -// errInvalidOrg when neither branch matches any users. +// exist (see ensureIndexes) so the $or stays index-backed. Returns a +// RoomInvalidOrg-reason errcode when neither branch matches any users. func (s *MongoStore) ListOrgMembers(ctx context.Context, orgID string) ([]model.OrgMember, error) { opts := options.Find(). SetSort(bson.D{{Key: "account", Value: 1}}). @@ -735,7 +736,7 @@ func (s *MongoStore) ListOrgMembers(ctx context.Context, orgID string) ([]model. 
return nil, fmt.Errorf("decode users for org %q: %w", orgID, err) } if len(members) == 0 { - return nil, fmt.Errorf("list org members for %q: %w", orgID, errInvalidOrg) + return nil, errcode.BadRequest(fmt.Sprintf("list org members for %q", orgID), errcode.WithReason(errcode.RoomInvalidOrg)) } return members, nil } diff --git a/room-worker/handler.go b/room-worker/handler.go index b65ccf0b4..d0f84b95a 100644 --- a/room-worker/handler.go +++ b/room-worker/handler.go @@ -12,11 +12,14 @@ import ( "time" "github.com/Marz32onE/instrumentation-go/otel-nats/otelnats" + "github.com/nats-io/nats.go" "github.com/nats-io/nats.go/jetstream" "go.mongodb.org/mongo-driver/v2/mongo" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errnats" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsutil" @@ -27,7 +30,10 @@ import ( ) // errPermanent marks non-retryable errors (caller Acks instead of Nak). -var errPermanent = errors.New("permanent") +// Aliased onto the consolidated errcode.ErrPermanent sentinel so the existing +// errors.Is(err, errPermanent) call sites (handler + ~18 test sites) keep +// working without churn. +var errPermanent = errcode.ErrPermanent // errRoomKeyAbsent fires when keyStore.Get returns (nil, nil) — Valkey responded but the room // has no current key. Distinct from transient Valkey errors so operators can alert separately. 
@@ -81,8 +87,8 @@ func messageDedupSeed(ctx context.Context, handler, roomID, payloadSeed string) if seed := natsutil.RequestIDFromContext(ctx); seed != "" { return seed } - slog.Warn("missing X-Request-ID; falling back to payload-derived seed", - "handler", handler, "roomID", roomID) + slog.WarnContext(ctx, "missing X-Request-ID; falling back to payload-derived seed", + "handler", handler, "room_id", roomID) return payloadSeed } @@ -92,7 +98,7 @@ func historySharedSincePtr(history model.HistoryConfig, timestamp int64, roomID return nil } if timestamp <= 0 { - slog.Error("restricted history with missing timestamp, emitting nil", "roomID", roomID, "mode", history.Mode) + slog.Error("restricted history with missing timestamp, emitting nil", "room_id", roomID, "mode", history.Mode) return nil } return &timestamp @@ -112,62 +118,29 @@ func (h *Handler) publishAsyncJobResult(ctx context.Context, requesterAccount, o Timestamp: time.Now().UTC().UnixMilli(), } if jobErr != nil { - result.Status = model.AsyncJobStatusError - result.Error = sanitizeAsyncJobError(jobErr) - slog.Error("async room job failed", "error", jobErr, "operation", operation, "requestID", requestID, "roomID", roomID) + // Enrich the ctx so fillAsyncError's single Classify log line carries these + // fields at a category-aware level — no separate (ERROR-forced) log here. 
+ ctx = errcode.WithLogValues(ctx, "request_id", requestID, "operation", operation, "room_id", roomID) + h.fillAsyncError(ctx, &result, jobErr) } data, _ := json.Marshal(result) if err := h.publish(ctx, subject.UserResponse(requesterAccount, requestID), data, ""); err != nil { - slog.Warn("publish async job result failed", "error", err, "requestID", requestID) + slog.WarnContext(ctx, "publish async job result failed", "error", err, "request_id", requestID) } } -// permanentError pairs a user-safe message with the errPermanent sentinel so -// HandleJetStreamMsg can Ack the JetStream message AND publishAsyncJobResult -// can render a clean per-cause string without depending on suffix matching of -// the wrapped Error() output. An optional cause allows errors.Is(err, cause) checks. -type permanentError struct { - msg string - cause error // optional; allows errors.Is(err, cause) matching -} - -func newPermanent(format string, args ...any) error { - return &permanentError{msg: fmt.Sprintf(format, args...)} -} - -// newPermanentAbsent returns a permanent error that also satisfies errors.Is(err, errRoomKeyAbsent). -func newPermanentAbsent(format string, args ...any) error { - return &permanentError{msg: fmt.Sprintf(format, args...), cause: errRoomKeyAbsent} -} - -func (e *permanentError) Error() string { return e.msg } -func (e *permanentError) Unwrap() error { return e.cause } -func (e *permanentError) Is(target error) bool { - if target == errPermanent { - return true - } - _, ok := target.(*permanentError) - return ok -} - -// sanitizeAsyncJobError surfaces permanent errors verbatim and collapses everything else. -func sanitizeAsyncJobError(err error) string { - if err == nil { - return "" - } - var pe *permanentError - if errors.As(err, &pe) { - return pe.msg - } - if errors.Is(err, errPermanent) { - // Legacy %w-wrapped errPermanent: trim the trailing ": permanent" suffix. 
- msg := err.Error() - if idx := strings.LastIndex(msg, ": "+errPermanent.Error()); idx >= 0 { - msg = msg[:idx] - } - return msg - } - return "operation failed" +// permanent wraps an *errcode.Error as a non-retryable job failure. Thin local +// alias for errcode.Permanent so call sites stay short — the marker type and +// sentinel-Is shim now live in pkg/errcode (Task 20.15). +func permanent(ec *errcode.Error) error { return errcode.Permanent(ec) } + +// fillAsyncError classifies jobErr once and populates the result's error +// envelope fields. The Ack/Nak decision is INDEPENDENT of this — it stays keyed +// on the explicit errcode.Permanent marker (see HandleJetStreamMsg). +func (h *Handler) fillAsyncError(ctx context.Context, result *model.AsyncJobResult, jobErr error) { + e := errcode.Classify(ctx, jobErr) + result.Status = model.AsyncJobStatusError + result.Error, result.Code, result.Reason = e.Message, string(e.Code), string(e.Reason) } // reconcileRoomOnDuplicateKey verifies the existing room is structurally compatible with the want spec; one source of truth for both create paths. @@ -184,8 +157,11 @@ func (h *Handler) reconcileRoomOnDuplicateKey(ctx context.Context, want *model.R return nil, fmt.Errorf("fetch existing room on duplicate-key: %w", err) } if existing.Type != want.Type || existing.SiteID != want.SiteID { - return nil, newPermanent("room ID collision (existing type=%s site=%s; want %s/%s)", - existing.Type, existing.SiteID, want.Type, want.SiteID) + // Conflict mirrors the sync-DM path's errRoomIDCollision; Classify then + // logs at INFO instead of ERROR — this IS an expected data condition + // (concurrent create with mismatched type), not a server fault. 
+ return nil, permanent(errcode.Conflict(fmt.Sprintf("room ID collision (existing type=%s site=%s; want %s/%s)", + existing.Type, existing.SiteID, want.Type, want.SiteID))) } return existing, nil } @@ -203,33 +179,42 @@ func (h *Handler) HandleJetStreamMsg(ctx context.Context, msg jetstream.Msg) { case strings.HasSuffix(subj, ".create"): err = h.processCreateRoom(ctx, msg.Data()) default: - slog.Warn("unknown member operation", "subject", subj) + slog.WarnContext(ctx, "unknown member operation", "subject", subj) } if err != nil { - slog.Error("process message failed", "error", err, "subject", subj) + // NB: do NOT slog.Error here — fillAsyncError → errcode.Classify already + // logs the failure exactly once at a category-aware level (internal/ + // unavailable → ERROR, expected client errors → INFO). An extra ERROR + // line here would double-log every failure and force ERROR on client- + // category permanent errors (e.g. NotFound for missing user), defeating + // the category-aware level the migration shipped. + // // Permanent failures must Ack so JetStream stops redelivering. The async-job // error event has already been published to the requester via the per-handler - // defer in processCreateRoom / processAddMembers / processRemove*. - if errors.Is(err, errPermanent) { + // defer in processCreateRoom / processAddMembers / processRemove*. Permanence + // is explicit (the errcode.Permanent marker), never inferred from the errcode + // category — many permanent errors classify to internal and would otherwise + // be Nak'd forever. 
+ if _, ok := errcode.IsPermanent(err); ok { if ackErr := msg.Ack(); ackErr != nil { - slog.Error("failed to ack permanent-error message", "error", ackErr) + slog.ErrorContext(ctx, "failed to ack permanent-error message", "error", ackErr) } return } if nakErr := msg.Nak(); nakErr != nil { - slog.Error("failed to nak message", "error", nakErr) + slog.ErrorContext(ctx, "failed to nak message", "error", nakErr) } return } if err := msg.Ack(); err != nil { - slog.Error("failed to ack message", "error", err) + slog.ErrorContext(ctx, "failed to ack message", "error", err) } } func (h *Handler) processRoleUpdate(ctx context.Context, data []byte) error { var req model.UpdateRoleRequest if err := json.Unmarshal(data, &req); err != nil { - return fmt.Errorf("unmarshal role update request: %w", err) + return permanent(errcode.BadRequest("unmarshal role update request")) } if req.Timestamp <= 0 { req.Timestamp = time.Now().UTC().UnixMilli() @@ -250,7 +235,7 @@ func (h *Handler) processRoleUpdate(ctx context.Context, data []byte) error { return fmt.Errorf("remove owner role: %w", err) } default: - return fmt.Errorf("unsupported role: %s", req.NewRole) + return permanent(errcode.BadRequest(fmt.Sprintf("unsupported role: %s", req.NewRole))) } // Re-read subscription to get the updated roles for the event @@ -303,15 +288,33 @@ func (h *Handler) processRoleUpdate(ctx context.Context, data []byte) error { return nil } -func (h *Handler) processRemoveMember(ctx context.Context, data []byte) error { +func (h *Handler) processRemoveMember(ctx context.Context, data []byte) (err error) { + // Subhandlers (processRemoveOrg, processRemoveIndividual) own their own + // async-result publish; dispatched=true tells our defer to skip publishing + // on the happy path. Pre-dispatch failures (unmarshal, type-guard, key-get) + // publish from here using the generic remove operation. 
+ var ( + requesterAccount string + roomID string + dispatched bool + ) + defer func() { + if dispatched { + return + } + h.publishAsyncJobResult(ctx, requesterAccount, model.AsyncJobOpRoomMemberRemove, roomID, err) + }() + var req model.RemoveMemberRequest - if err := json.Unmarshal(data, &req); err != nil { - return fmt.Errorf("unmarshal RemoveMemberRequest: %w", err) + if err = json.Unmarshal(data, &req); err != nil { + return permanent(errcode.BadRequest("unmarshal RemoveMemberRequest")) } + requesterAccount = req.Requester + roomID = req.RoomID // Pre-upgrade senders omit RoomType; treat zero value as channel since room-service validated it. if req.RoomType != "" && req.RoomType != model.RoomTypeChannel { - return newPermanent("remove-member only valid on channel rooms, got %s", req.RoomType) + return permanent(errcode.BadRequest(fmt.Sprintf("remove-member only valid on channel rooms, got %s", req.RoomType))) } // Removed-user-read window: between this canonical event being published and the Mongo // delete below, broadcast-worker may still address the removed user with the old key. 
@@ -322,6 +325,7 @@ func (h *Handler) processRemoveMember(ctx context.Context, data []byte) error { return fmt.Errorf("get room key: %w", err) } + dispatched = true if req.OrgID != "" { return h.processRemoveOrg(ctx, &req, currentPair) } @@ -433,7 +437,7 @@ func (h *Handler) processRemoveIndividual(ctx context.Context, req *model.Remove } subEvtData, _ := json.Marshal(subEvt) if err := h.publish(ctx, subject.SubscriptionUpdate(req.Account), subEvtData, ""); err != nil { - slog.Error("subscription update publish failed", "error", err, "account", req.Account) + slog.ErrorContext(ctx, "subscription update publish failed", "error", err, "account", req.Account) } // Member change event @@ -450,7 +454,7 @@ func (h *Handler) processRemoveIndividual(ctx context.Context, req *model.Remove } memberEvtData, _ := json.Marshal(memberEvt) if err := h.publish(ctx, subject.MemberEvent(req.RoomID), memberEvtData, ""); err != nil { - slog.Error("member event publish failed", "error", err, "roomID", req.RoomID) + slog.ErrorContext(ctx, "member event publish failed", "error", err, "room_id", req.RoomID) } // Wrapper Type collapses to member_removed even for self-leave so @@ -465,7 +469,7 @@ func (h *Handler) processRemoveIndividual(ctx context.Context, req *model.Remove inboxData, _ := json.Marshal(inboxOutbox) inboxSeed := fmt.Sprintf("%s:%s:%d", req.RoomID, req.Account, req.Timestamp) if err := h.publish(ctx, subject.InboxMemberRemoved(h.siteID), inboxData, natsutil.OutboxDedupID(ctx, h.siteID, inboxSeed)); err != nil { - slog.Error("local inbox member_removed publish failed", "error", err, "roomID", req.RoomID) + slog.ErrorContext(ctx, "local inbox member_removed publish failed", "error", err, "room_id", req.RoomID) } // Sys-msg sender: leaving user for self-leave, requester for forced removal. 
@@ -474,7 +478,7 @@ func (h *Handler) processRemoveIndividual(ctx context.Context, req *model.Remove requester, err = h.store.GetUser(ctx, req.Requester) if err != nil { if errors.Is(err, ErrUserNotFound) { - return newPermanent("requester %s not found (room %s)", req.Requester, req.RoomID) + return permanent(errcode.NotFound(fmt.Sprintf("requester %s not found (room %s)", req.Requester, req.RoomID), errcode.WithReason(errcode.RoomUserNotFound))) } return fmt.Errorf("get requester: %w", err) } @@ -578,9 +582,9 @@ func (h *Handler) processRemoveOrg(ctx context.Context, req *model.RemoveMemberR name, tcName = sectName, sectTCName } if name == "" && tcName == "" { - slog.Warn("org-remove: no name resolved from any member; falling back to orgID", - "requestID", natsutil.RequestIDFromContext(ctx), - "roomID", req.RoomID, "orgID", req.OrgID) + slog.WarnContext(ctx, "org-remove: no name resolved from any member; falling back to orgID", + "request_id", natsutil.RequestIDFromContext(ctx), + "room_id", req.RoomID, "orgID", req.OrgID) } // Skip members who still have an individual row OR are still reachable @@ -644,7 +648,7 @@ func (h *Handler) processRemoveOrg(ctx context.Context, req *model.RemoveMemberR } subEvtData, _ := json.Marshal(subEvt) if err := h.publish(ctx, subject.SubscriptionUpdate(m.Account), subEvtData, ""); err != nil { - slog.Error("subscription update publish failed", "error", err, "account", m.Account) + slog.ErrorContext(ctx, "subscription update publish failed", "error", err, "account", m.Account) } } @@ -660,7 +664,7 @@ func (h *Handler) processRemoveOrg(ctx context.Context, req *model.RemoveMemberR } memberEvtData, _ := json.Marshal(memberEvt) if err := h.publish(ctx, subject.MemberEvent(req.RoomID), memberEvtData, ""); err != nil { - slog.Error("member event publish failed", "error", err, "roomID", req.RoomID) + slog.ErrorContext(ctx, "member event publish failed", "error", err, "room_id", req.RoomID) } inboxOutbox := model.OutboxEvent{ @@ -673,7 
+677,7 @@ func (h *Handler) processRemoveOrg(ctx context.Context, req *model.RemoveMemberR inboxData, _ := json.Marshal(inboxOutbox) inboxSeed := fmt.Sprintf("%s:%s:%d", req.RoomID, req.OrgID, req.Timestamp) if err := h.publish(ctx, subject.InboxMemberRemoved(h.siteID), inboxData, natsutil.OutboxDedupID(ctx, h.siteID, inboxSeed)); err != nil { - slog.Error("local inbox member_removed publish failed", "error", err, "roomID", req.RoomID) + slog.ErrorContext(ctx, "local inbox member_removed publish failed", "error", err, "room_id", req.RoomID) } } @@ -681,7 +685,7 @@ func (h *Handler) processRemoveOrg(ctx context.Context, req *model.RemoveMemberR requester, err := h.store.GetUser(ctx, req.Requester) if err != nil { if errors.Is(err, ErrUserNotFound) { - return newPermanent("requester %s not found (room %s)", req.Requester, req.RoomID) + return permanent(errcode.NotFound(fmt.Sprintf("requester %s not found (room %s)", req.Requester, req.RoomID), errcode.WithReason(errcode.RoomUserNotFound))) } return fmt.Errorf("get requester: %w", err) } @@ -748,24 +752,24 @@ func (h *Handler) processRemoveOrg(ctx context.Context, req *model.RemoveMemberR } func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error) { + // Defer must cover early failures; populate requesterAccount/roomID once available. 
+ var ( + requesterAccount string + roomID string + ) + defer func() { + h.publishAsyncJobResult(ctx, requesterAccount, model.AsyncJobOpRoomMemberAdd, roomID, err) + }() + var req model.AddMembersRequest if err = json.Unmarshal(data, &req); err != nil { - return fmt.Errorf("unmarshal add members request: %w", err) - } - requestID := natsutil.RequestIDFromContext(ctx) - if requestID == "" { - return newPermanent("missing X-Request-ID") - } - if !idgen.IsValidUUID(requestID) { - return newPermanent("invalid X-Request-ID: must be a hyphenated UUID") + return permanent(errcode.BadRequest("unmarshal add members request")) } + requesterAccount = req.RequesterAccount + roomID = req.RoomID if req.Timestamp <= 0 { req.Timestamp = time.Now().UTC().UnixMilli() } - // Now req is populated; defer the result publish covers all subsequent return paths. - defer func() { - h.publishAsyncJobResult(ctx, req.RequesterAccount, model.AsyncJobOpRoomMemberAdd, req.RoomID, err) - }() room, err := h.store.GetRoom(ctx, req.RoomID) if err != nil { @@ -773,7 +777,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error } // Defensive channel-only guard. if room.Type != model.RoomTypeChannel { - return newPermanent("add-member only valid on channel rooms, got %s", room.Type) + return permanent(errcode.BadRequest(fmt.Sprintf("add-member only valid on channel rooms, got %s", room.Type))) } // Resolve candidates and per-candidate flags (has-sub / has-individual-row). 
@@ -846,7 +850,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error } for _, acc := range lookupAccounts { if _, ok := userMap[acc]; !ok { - return newPermanent("user %s not found in room.member.add (room %s)", acc, req.RoomID) + return permanent(errcode.NotFound(fmt.Sprintf("user %s not found in room.member.add (room %s)", acc, req.RoomID), errcode.WithReason(errcode.RoomUserNotFound))) } } } @@ -854,7 +858,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error requester, err := h.store.GetUser(ctx, req.RequesterAccount) if err != nil { if errors.Is(err, ErrUserNotFound) { - return newPermanent("requester %s not found (room %s)", req.RequesterAccount, req.RoomID) + return permanent(errcode.NotFound(fmt.Sprintf("requester %s not found (room %s)", req.RequesterAccount, req.RoomID), errcode.WithReason(errcode.RoomUserNotFound))) } return fmt.Errorf("get requester: %w", err) } @@ -965,7 +969,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error } for _, acc := range backfillAccounts { if _, ok := found[acc]; !ok { - return newPermanent("backfill user %s not found in room.member.add (room %s)", acc, req.RoomID) + return permanent(errcode.NotFound(fmt.Sprintf("backfill user %s not found in room.member.add (room %s)", acc, req.RoomID), errcode.WithReason(errcode.RoomUserNotFound))) } } for i := range backfillUsers { @@ -1006,7 +1010,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error } subEvtData, _ := json.Marshal(subEvt) if err := h.publish(ctx, subject.SubscriptionUpdate(sub.User.Account), subEvtData, ""); err != nil { - slog.Error("subscription update publish failed", "error", err, "account", sub.User.Account) + slog.ErrorContext(ctx, "subscription update publish failed", "error", err, "account", sub.User.Account) } } @@ -1053,10 +1057,10 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error } memberAddData, _ 
:= json.Marshal(memberAddEvt) if err := h.publish(ctx, subject.RoomMemberEvent(req.RoomID), memberAddData, ""); err != nil { - slog.Error("member add event publish failed", + slog.ErrorContext(ctx, "member add event publish failed", "error", err, - "roomID", req.RoomID, - "requestID", natsutil.RequestIDFromContext(ctx), + "room_id", req.RoomID, + "request_id", natsutil.RequestIDFromContext(ctx), ) } @@ -1071,10 +1075,10 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error inboxData, _ := json.Marshal(inboxOutbox) inboxSeed := fmt.Sprintf("%s:%s:%d", req.RoomID, req.RequesterAccount, req.Timestamp) if err := h.publish(ctx, subject.InboxMemberAdded(room.SiteID), inboxData, natsutil.OutboxDedupID(ctx, room.SiteID, inboxSeed)); err != nil { - slog.Error("local inbox member_added publish failed", + slog.ErrorContext(ctx, "local inbox member_added publish failed", "error", err, - "roomID", req.RoomID, - "requestID", natsutil.RequestIDFromContext(ctx), + "room_id", req.RoomID, + "request_id", natsutil.RequestIDFromContext(ctx), ) } } @@ -1220,16 +1224,12 @@ func (h *Handler) processCreateRoom(ctx context.Context, data []byte) (err error }() requestID := natsutil.RequestIDFromContext(ctx) - if requestID == "" { - return newPermanent("missing X-Request-ID") - } - if !idgen.IsValidUUID(requestID) { - return newPermanent("invalid X-Request-ID: must be a hyphenated UUID") - } var req model.CreateRoomRequest if err := json.Unmarshal(data, &req); err != nil { - return newPermanent("unmarshal create-room: %s", err.Error()) + // Never interpolate err.Error() — json.SyntaxError embeds the offending + // payload substring from an unauthenticated entry-point (see doc.go). 
+ return permanent(errcode.BadRequest("unmarshal create-room")) } requesterAccount = req.RequesterAccount roomID = req.RoomID @@ -1242,13 +1242,13 @@ func (h *Handler) processCreateRoom(ctx context.Context, data []byte) (err error } if pair == nil { roomkeymetrics.KeyAbsentErrors.Add(ctx, 1) - return newPermanentAbsent("room key absent for %s", req.RoomID) + return permanent(errcode.Internal("room key absent", errcode.WithCause(errRoomKeyAbsent))) } requester, err := h.store.GetUser(ctx, req.RequesterAccount) if err != nil { if errors.Is(err, ErrUserNotFound) { - return newPermanent("requester not found") + return permanent(errcode.NotFound("requester not found", errcode.WithReason(errcode.RoomUserNotFound))) } return fmt.Errorf("get requester: %w", err) } @@ -1274,9 +1274,9 @@ func (h *Handler) processCreateRoom(ctx context.Context, data []byte) (err error if err != nil { if errors.Is(err, ErrUserNotFound) { if roomType == model.RoomTypeBotDM { - return newPermanent("bot user not found") + return permanent(errcode.NotFound("bot user not found", errcode.WithReason(errcode.RoomBotNotAvailable))) } - return newPermanent("counterpart not found") + return permanent(errcode.NotFound("counterpart not found", errcode.WithReason(errcode.RoomUserNotFound))) } return fmt.Errorf("get counterpart: %w", err) } @@ -1317,7 +1317,9 @@ func (h *Handler) processCreateRoom(ctx context.Context, data []byte) (err error case model.RoomTypeChannel: return h.processCreateRoomChannel(ctx, &req, room, requester, pair, requestID, acceptedAt, now) default: - return newPermanent("unknown room type %q", roomType) + // Client-provided value — BadRequest is the right category (Classify + // then logs at INFO, not ERROR). 
+ return permanent(errcode.BadRequest(fmt.Sprintf("unknown room type %q", roomType))) } } @@ -1357,7 +1359,7 @@ func (h *Handler) processCreateRoomChannel(ctx context.Context, req *model.Creat } for _, account := range accounts { if _, ok := userSet[account]; !ok { - return newPermanent("user %s not found", account) + return permanent(errcode.NotFound(fmt.Sprintf("user %s not found", account), errcode.WithReason(errcode.RoomUserNotFound))) } } @@ -1432,11 +1434,11 @@ func (h *Handler) finishCreateRoom(ctx context.Context, req *model.CreateRoomReq } data, err := json.Marshal(evt) if err != nil { - slog.Error("marshal subscription.update failed", "error", err, "account", sub.User.Account) + slog.ErrorContext(ctx, "marshal subscription.update failed", "error", err, "account", sub.User.Account) continue } if err := h.publish(ctx, subject.SubscriptionUpdate(sub.User.Account), data, ""); err != nil { - slog.Error("publish subscription.update failed", "error", err, "account", sub.User.Account) + slog.ErrorContext(ctx, "publish subscription.update failed", "error", err, "account", sub.User.Account) } } @@ -1474,7 +1476,7 @@ func (h *Handler) finishCreateRoom(ctx context.Context, req *model.CreateRoomReq outboxData, _ := json.Marshal(outbox) payloadSeed := fmt.Sprintf("%s:%s:%d", room.ID, requester.Account, req.Timestamp) if err := h.publish(ctx, subject.InboxMemberAdded(room.SiteID), outboxData, natsutil.OutboxDedupID(ctx, room.SiteID, payloadSeed)); err != nil { - slog.Error("local inbox member_added publish failed", "error", err, "roomID", room.ID, "requestID", requestID) + slog.ErrorContext(ctx, "local inbox member_added publish failed", "error", err, "room_id", room.ID, "request_id", requestID) } // Task 37: outbox per remote site @@ -1594,44 +1596,29 @@ func (h *Handler) publishCanonical(ctx context.Context, msg *model.Message, site // Sync DM endpoint handlers (chat.server.request.room.{siteID}.create.dm). 
var ( - errMissingRequestID = errors.New("missing X-Request-ID header") - errInvalidRequestID = errors.New("invalid X-Request-ID header") - errInvalidSyncDMRequest = errors.New("invalid sync DM request") - errUserLookupFailed = errors.New("user lookup failed") - errCrossSiteRequester = errors.New("requester is not on this site") - errRoomIDCollision = errors.New("room ID collision (existing room metadata mismatch)") + errInvalidSyncDMRequest = errcode.BadRequest("invalid sync DM request") + // errUserLookupFailed stays a raw error so Classify collapses it to internal + // (the requester learns the room couldn't be created, not who is missing). + errUserLookupFailed = errors.New("user lookup failed") + errCrossSiteRequester = errcode.BadRequest("requester is not on this site") + // errRoomIDCollision is an unrecoverable structural collision: permanent so + // the JetStream-driven create paths Ack, conflict so the client sees 409. + errRoomIDCollision = permanent(errcode.Conflict("room id collision (existing room metadata mismatch)")) ) -// sanitizeSyncDMError surfaces sentinel messages; masks anything else as "internal error". -func sanitizeSyncDMError(err error) string { - if err == nil { - return "" - } - switch { - case errors.Is(err, errMissingRequestID), - errors.Is(err, errInvalidRequestID), - errors.Is(err, errInvalidSyncDMRequest), - errors.Is(err, errUserLookupFailed), - errors.Is(err, errCrossSiteRequester): - return err.Error() - default: - return "internal error" - } -} - // handleSyncCreateDM creates a DM, self-DM, or botDM room and returns the requester's subscription. +// Errors flow through the centralized errcode.Classify path (the legacy +// sanitizeSyncDMError helper was retired by the errcode migration). 
func (h *Handler) handleSyncCreateDM(ctx context.Context, data []byte) (*model.SyncCreateDMReply, error) { requestID := natsutil.RequestIDFromContext(ctx) - if requestID == "" { - return nil, errMissingRequestID - } - if !idgen.IsValidUUID(requestID) { - return nil, errInvalidRequestID - } var req model.SyncCreateDMRequest if err := json.Unmarshal(data, &req); err != nil { - return nil, errInvalidSyncDMRequest + // Single %w on the errcode sentinel preserves errors.Is identity; + // the json.Unmarshal error text is folded in as %v so it surfaces in + // Classify's server-side log line without adding a second errcode to + // the chain (the semgrep no-multi-%w rule trips on two %w verbs). + return nil, fmt.Errorf("%w: %v", errInvalidSyncDMRequest, err) } if err := validateSyncCreateDMShape(&req); err != nil { return nil, err @@ -1716,10 +1703,10 @@ func (h *Handler) handleSyncCreateDM(ctx context.Context, data []byte) (*model.S // Permanent errors from reconcile mean an unrecoverable collision; the // sync-DM caller surfaces errRoomIDCollision verbatim, so map any // permanent error onto that sentinel and keep the rich detail in the log. - if errors.Is(reconcileErr, errPermanent) { - slog.Error("sync DM: room ID collision", - "roomID", room.ID, - "requestID", requestID, + if _, ok := errcode.IsPermanent(reconcileErr); ok { + slog.ErrorContext(ctx, "sync DM: room ID collision", + "room_id", room.ID, + "request_id", requestID, "error", reconcileErr) return nil, errRoomIDCollision } @@ -1818,13 +1805,13 @@ func (h *Handler) publishSubscriptionUpdates(ctx context.Context, subs []*model. 
} data, err := json.Marshal(evt) if err != nil { - slog.Error("sync DM: marshal subscription.update failed", - "error", err, "account", sub.User.Account, "requestID", requestID) + slog.ErrorContext(ctx, "sync DM: marshal subscription.update failed", + "error", err, "account", sub.User.Account, "request_id", requestID) continue } if err := h.publish(ctx, subject.SubscriptionUpdate(sub.User.Account), data, ""); err != nil { - slog.Error("sync DM: publish subscription.update failed", - "error", err, "account", sub.User.Account, "requestID", requestID) + slog.ErrorContext(ctx, "sync DM: publish subscription.update failed", + "error", err, "account", sub.User.Account, "request_id", requestID) } } } @@ -1872,15 +1859,26 @@ func (h *Handler) publishSyncDMOutbox(ctx context.Context, room *model.Room, req ) } +// requireDedupRequestID is the strict X-Request-ID gate used by sync entry +// points (natsServerCreateDM) whose downstream pipeline derives JetStream +// Nats-Msg-Id and message-ID dedup keys from the request ID. Silently minting +// would break client-retry dedup; see docs/error-handling.md §3a. Thin wrapper +// over natsutil.RequireRequestID so the test sits in the same package. +func requireDedupRequestID(ctx context.Context, headers nats.Header, subject string) (context.Context, string, error) { + return natsutil.RequireRequestID(ctx, headers, subject) +} + // natsServerCreateDM is the NATS entry point for chat.server.request.room.{siteID}.create.dm. 
func (h *Handler) natsServerCreateDM(m otelnats.Msg) { - ctx := natsutil.ContextWithRequestIDFromHeaders(m.Context(), m.Msg.Header) + ctx, id, err := requireDedupRequestID(m.Context(), m.Msg.Header, m.Msg.Subject) + if err != nil { + errnats.Reply(errcode.WithLogValues(m.Context(), "subject", m.Msg.Subject), m.Msg, err) + return + } + ctx = errcode.WithLogValues(ctx, "request_id", id, "subject", m.Msg.Subject) reply, err := h.handleSyncCreateDM(ctx, m.Msg.Data) if err != nil { - slog.Error("sync DM: handler failed", - "error", err, "subject", m.Msg.Subject, - "requestID", natsutil.RequestIDFromContext(ctx)) - natsutil.ReplyError(m.Msg, sanitizeSyncDMError(err)) + errnats.Reply(ctx, m.Msg, err) return } natsutil.ReplyJSON(m.Msg, reply) @@ -1905,7 +1903,7 @@ func (h *Handler) fanOutRoomKeyToSurvivors(ctx context.Context, roomID string, p func (h *Handler) buildAndFanOutRoomKey(ctx context.Context, roomID string, pair *roomkeystore.VersionedKeyPair, users []model.User) error { if pair == nil { roomkeymetrics.KeyAbsentErrors.Add(ctx, 1) - return newPermanentAbsent("room key absent for %s", roomID) + return permanent(errcode.Internal("room key absent", errcode.WithCause(errRoomKeyAbsent))) } // PublicKey omitted: server-side only, read from Valkey by broadcast-worker. 
evt := model.RoomKeyEvent{ @@ -1964,7 +1962,7 @@ func (h *Handler) fanOutKey(ctx context.Context, roomID string, accounts []strin wg.Done() }() if err := h.keySender.SendData(acct, data); err != nil { - slog.Error("send room key", "error", err, "account", acct, "roomId", roomID) + slog.ErrorContext(ctx, "send room key", "error", err, "account", acct, "roomId", roomID) roomkeymetrics.FanoutErrors.Add(ctx, 1, metric.WithAttributes(attribute.String("roomId", roomID))) } }(account) diff --git a/room-worker/handler_test.go b/room-worker/handler_test.go index 6f3270433..6e2e184f3 100644 --- a/room-worker/handler_test.go +++ b/room-worker/handler_test.go @@ -13,11 +13,15 @@ import ( "testing" "time" + "github.com/nats-io/nats.go" + "github.com/nats-io/nats.go/jetstream" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.mongodb.org/mongo-driver/v2/mongo" "go.uber.org/mock/gomock" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errnats" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsutil" @@ -1684,7 +1688,9 @@ func TestHandler_processAddMembers_PublishesFailureEventOnError(t *testing.T) { assert.Equal(t, testRequestID, result.RequestID) assert.Equal(t, model.AsyncJobOpRoomMemberAdd, result.Operation) assert.Equal(t, "error", result.Status, "failure event must have Status=error") - assert.Equal(t, "operation failed", result.Error, "failure event must carry sanitized error message") + // Raw infra error collapses to internal — the cause never leaks to the client. 
+ assert.Equal(t, "internal error", result.Error, "failure event must carry sanitized error message") + assert.Equal(t, string(errcode.CodeInternal), result.Code) assert.Greater(t, result.Timestamp, int64(0)) } @@ -1710,7 +1716,8 @@ func TestHandler_publishAsyncJobResult_PopulatesErrorOnFailure(t *testing.T) { assert.Equal(t, testRequestID, result.RequestID) assert.Equal(t, model.AsyncJobOpRoomMemberAdd, result.Operation) assert.Equal(t, "error", result.Status) - assert.Equal(t, "operation failed", result.Error) + assert.Equal(t, "internal error", result.Error) + assert.Equal(t, string(errcode.CodeInternal), result.Code) assert.Equal(t, "r1", result.RoomID) } @@ -1791,21 +1798,32 @@ func setupAddMembersHappyPath(t *testing.T, mockStore *MockSubscriptionStore, ac mockStore.EXPECT().ReconcileMemberCounts(gomock.Any(), "r1").Return(nil) } -// Task 12: missing X-Request-ID must return a permanent error immediately. -func TestProcessAddMembers_RequiresRequestID(t *testing.T) { +// The legacy TestProcessAddMembers_RequiresRequestID test pinned the old +// reject-on-missing behavior. Under the repo-wide "mint everywhere" policy +// (docs/error-handling.md), missing/malformed X-Request-ID is no longer a +// rejectable condition — the boundary (main.go JetStream consume loop / +// natsrouter.RequestID middleware) mints a fresh UUIDv7 via +// natsutil.StampRequestID before the handler sees the ctx. + +func TestProcessAddMembers_MalformedJSON_IsPermanent(t *testing.T) { + // Regression for the Nak-forever bug: a malformed payload must be Acked + // (Permanent), not Naked. Pre-fix the bare fmt.Errorf made JetStream + // redeliver the same corrupt JSON until MaxDeliver. 
h, _, _ := newAddMembersTestHandler(t) - body, err := json.Marshal(model.AddMembersRequest{ - RoomID: "r1", Users: []string{"bob"}, - RequesterID: "u_alice", RequesterAccount: "alice", - Timestamp: time.Now().UnixMilli(), - }) - require.NoError(t, err) + err := h.processAddMembers(context.Background(), []byte(`{not json`)) + require.Error(t, err) + assert.ErrorIs(t, err, errPermanent, "unmarshal failure must be Permanent so JetStream Acks") + assert.NotContains(t, err.Error(), "not json", "must not echo raw payload bytes into the user-facing message") +} - // ctx has no request ID - err = h.processAddMembers(context.Background(), body) +func TestProcessRemoveMember_MalformedJSON_IsPermanent(t *testing.T) { + // Unmarshal failure short-circuits before any store call — a bare Handler + // is enough. + h := &Handler{publish: func(context.Context, string, []byte, string) error { return nil }} + err := h.processRemoveMember(context.Background(), []byte(`{not json`)) require.Error(t, err) - assert.Contains(t, err.Error(), "missing X-Request-ID") - assert.ErrorIs(t, err, errPermanent) + assert.ErrorIs(t, err, errPermanent, "unmarshal failure must be Permanent so JetStream Acks") + assert.NotContains(t, err.Error(), "not json") } // Task 14: subscription must carry Name == room.Name and RoomType == channel. 
@@ -2120,18 +2138,9 @@ func makeCreateRoomBody(t *testing.T, req *model.CreateRoomRequest) []byte { // ---- Task 32: skeleton tests ---- -func TestProcessCreateRoom_RequiresRequestID(t *testing.T) { - h, _, _ := newCreateRoomTestHandler(t) - body := makeCreateRoomBody(t, &model.CreateRoomRequest{ - RoomID: "room1", RequesterAccount: "alice", Timestamp: time.Now().UnixMilli(), - Users: []string{"bob"}, - }) - - err := h.processCreateRoom(context.Background(), body) - require.Error(t, err) - assert.Contains(t, err.Error(), "missing X-Request-ID") - assert.ErrorIs(t, err, errPermanent) -} +// TestProcessCreateRoom_RequiresRequestID retired: see comment above the +// add-members equivalent. Missing X-Request-ID is now minted at the boundary +// rather than rejected at the handler. // ---- Task 33: DM branch tests ---- @@ -2596,7 +2605,7 @@ func TestProcessCreateRoom_Channel_EmitsAsyncJobOk(t *testing.T) { assert.Equal(t, model.AsyncJobOpRoomCreate, result.Operation) } -// ---- Permanent-error coverage for HandleJetStreamMsg Ack path + new permanentError type ---- +// ---- Permanent-error coverage for HandleJetStreamMsg Ack path + errcode.Permanent marker ---- func TestProcessCreateRoom_RoomIDCollisionMismatchType_ReturnsPermanent(t *testing.T) { h, mockStore, getPublished := newCreateRoomTestHandler(t) @@ -2627,35 +2636,52 @@ func TestProcessCreateRoom_RoomIDCollisionMismatchType_ReturnsPermanent(t *testi err := h.processCreateRoom(ctx, body) require.Error(t, err) assert.ErrorIs(t, err, errPermanent) + var pe *errcode.PermanentError + assert.True(t, errors.As(err, &pe), "collision must be an explicit permanent error") assert.Contains(t, err.Error(), "room ID collision") // Async-job error event must be published (defer fires before return). + // Collision classifies to conflict (mirrors the sync-DM errRoomIDCollision + // path); permanence is explicit, not category-inferred. 
responses := userResponseFor(getPublished(), "alice") require.NotEmpty(t, responses, "permanent error must publish async-job error event") var result model.AsyncJobResult require.NoError(t, json.Unmarshal(responses[0].data, &result)) assert.Equal(t, model.AsyncJobStatusError, result.Status) - assert.Contains(t, result.Error, "room ID collision") - // Sanitized error must NOT contain the trailing ": permanent" suffix. - assert.NotContains(t, result.Error, ": permanent") + assert.Equal(t, string(errcode.CodeConflict), result.Code) } -func TestSanitizeAsyncJobError_PermanentErrorTypeReturnsCleanMessage(t *testing.T) { - err := newPermanent("counterpart not found") - got := sanitizeAsyncJobError(err) - assert.Equal(t, "counterpart not found", got) +func TestFillAsyncError_PermanentForbiddenWithReason(t *testing.T) { + h := &Handler{} + var result model.AsyncJobResult + jobErr := permanent(errcode.Forbidden("only room members can act", errcode.WithReason(errcode.RoomNotMember))) + h.fillAsyncError(context.Background(), &result, jobErr) + assert.Equal(t, model.AsyncJobStatusError, result.Status) + assert.Equal(t, "only room members can act", result.Error) + assert.Equal(t, string(errcode.CodeForbidden), result.Code) + assert.Equal(t, string(errcode.RoomNotMember), result.Reason) } -func TestSanitizeAsyncJobError_LegacyWrappedSentinelStillTrimmed(t *testing.T) { - err := fmt.Errorf("legacy reason: %w", errPermanent) - got := sanitizeAsyncJobError(err) - assert.Equal(t, "legacy reason", got) +func TestFillAsyncError_RawInfraCollapsesToInternal(t *testing.T) { + h := &Handler{} + var result model.AsyncJobResult + jobErr := fmt.Errorf("transient store error: %w", errors.New("connection reset")) + h.fillAsyncError(context.Background(), &result, jobErr) + assert.Equal(t, model.AsyncJobStatusError, result.Status) + assert.Equal(t, "internal error", result.Error) + assert.Equal(t, string(errcode.CodeInternal), result.Code) + assert.Empty(t, result.Reason) } -func 
TestSanitizeAsyncJobError_NonPermanentCollapsed(t *testing.T) { - err := fmt.Errorf("transient store error: %w", errors.New("connection reset")) - got := sanitizeAsyncJobError(err) - assert.Equal(t, "operation failed", got) +func TestFillAsyncError_PermanentInternalCollision(t *testing.T) { + h := &Handler{} + var result model.AsyncJobResult + jobErr := permanent(errcode.Internal("room ID collision (existing type=channel)")) + h.fillAsyncError(context.Background(), &result, jobErr) + assert.Equal(t, string(errcode.CodeInternal), result.Code) + // Permanence is explicit; an internal-category permanent error still Acks. + var pe *errcode.PermanentError + assert.True(t, errors.As(jobErr, &pe)) } // newRequestCtx returns a context carrying a syntactically-valid X-Request-ID. @@ -2703,39 +2729,49 @@ func marshalReq(t *testing.T, v any) []byte { return data } -func TestSanitizeSyncDMError(t *testing.T) { +// assertSyncDMInternal asserts err marshals (via errnats) to an internal-code +// envelope with the generic "internal error" message and no leaked cause. +func assertSyncDMInternal(t *testing.T, err error) { + t.Helper() + data := errnats.Marshal(context.Background(), err) + e, ok := errcode.Parse(data) + require.True(t, ok, "must marshal to an error envelope: %s", data) + assert.Equal(t, errcode.CodeInternal, e.Code) + assert.Equal(t, "internal error", e.Message) +} + +// TestSyncDMErrorEnvelope asserts the wire envelope errnats produces for each +// sync-DM failure mode (replacing the deleted sanitizeSyncDMError). Validation +// sentinels surface as bad_request with their message; lookup/infra/collision +// detail never leaks — user-lookup and unknown errors collapse to internal, +// the collision becomes a conflict with a generic message. 
+func TestSyncDMErrorEnvelope(t *testing.T) { cases := []struct { - name string - in error - want string + name string + in error + wantCode errcode.Code + wantMsg string }{ - {"nil returns empty", nil, ""}, - {"missing request ID surfaced", errMissingRequestID, "missing X-Request-ID header"}, - {"invalid request ID surfaced", errInvalidRequestID, "invalid X-Request-ID header"}, - {"invalid sync DM request surfaced", errInvalidSyncDMRequest, "invalid sync DM request"}, - {"user lookup failed surfaced", errUserLookupFailed, "user lookup failed"}, - {"cross-site requester surfaced", errCrossSiteRequester, "requester is not on this site"}, - {"room ID collision masked as internal", errRoomIDCollision, "internal error"}, - {"unknown error masked as internal", errors.New("mongo: connection refused"), "internal error"}, + {"invalid sync DM request", errInvalidSyncDMRequest, errcode.CodeBadRequest, "invalid sync DM request"}, + {"cross-site requester", errCrossSiteRequester, errcode.CodeBadRequest, "requester is not on this site"}, + {"user lookup failed collapses to internal", errUserLookupFailed, errcode.CodeInternal, "internal error"}, + {"room ID collision is a conflict", errRoomIDCollision, errcode.CodeConflict, "room id collision (existing room metadata mismatch)"}, + {"unknown error collapses to internal", errors.New("mongo: connection refused"), errcode.CodeInternal, "internal error"}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { - assert.Equal(t, tc.want, sanitizeSyncDMError(tc.in)) + data := errnats.Marshal(context.Background(), tc.in) + e, ok := errcode.Parse(data) + require.True(t, ok, "must marshal to an error envelope: %s", data) + assert.Equal(t, tc.wantCode, e.Code) + assert.Equal(t, tc.wantMsg, e.Message) + assert.NotContains(t, string(data), "mongo", "infra cause must never leak") }) } } -func TestHandleSyncCreateDM_MissingRequestID(t *testing.T) { - h := &Handler{siteID: "site-a"} - req := model.SyncCreateDMRequest{ - RoomType: 
model.RoomTypeDM, - RequesterAccount: "alice", - OtherAccount: "bob", - } - data := marshalReq(t, req) - _, err := h.handleSyncCreateDM(context.Background(), data) - assert.ErrorIs(t, err, errMissingRequestID) -} +// TestHandleSyncCreateDM_MissingRequestID retired — see the comment above +// TestProcessAddMembers_RequiresRequestID. func TestHandleSyncCreateDM_InvalidJSON(t *testing.T) { h := &Handler{siteID: "site-a"} @@ -3088,9 +3124,9 @@ func TestHandleSyncCreateDM_ReturnsCanonicalPersistedSub(t *testing.T) { assert.Equal(t, "canonical-sub", reply.Subscription.ID) } -// Transient store errors on GetUser must NOT be sanitized as errUserLookupFailed (which -// signals "user does not exist"); they should propagate as wrapped errors and surface -// as "internal error" via sanitizeSyncDMError. +// Transient store errors on GetUser must NOT be tagged as errUserLookupFailed (which +// signals "user does not exist"); they propagate as wrapped errors and surface +// as "internal error" in the errnats envelope. func TestHandleSyncCreateDM_GetUserTransientError_Internal(t *testing.T) { h, store, _ := newSyncDMTestHandler(t) @@ -3102,7 +3138,7 @@ func TestHandleSyncCreateDM_GetUserTransientError_Internal(t *testing.T) { require.Error(t, err) assert.NotErrorIs(t, err, errUserLookupFailed, "transient error must not be tagged as user-not-found") - assert.Equal(t, "internal error", sanitizeSyncDMError(err)) + assertSyncDMInternal(t, err) } func TestHandleSyncCreateDM_PublishesSubscriptionUpdateForBothUsers(t *testing.T) { @@ -3225,7 +3261,7 @@ func TestHandleSyncCreateDM_OutboxPublishFails_FailsRequest(t *testing.T) { data := marshalReq(t, req) _, err := h.handleSyncCreateDM(newRequestCtx(), data) require.Error(t, err) - assert.Equal(t, "internal error", sanitizeSyncDMError(err)) + assertSyncDMInternal(t, err) } // BulkCreateSubscriptions returning a transient error must surface as "internal error". 
@@ -3243,7 +3279,7 @@ func TestHandleSyncCreateDM_BulkCreateSubsTransientError(t *testing.T) { data := marshalReq(t, req) _, err := h.handleSyncCreateDM(newRequestCtx(), data) require.Error(t, err) - assert.Equal(t, "internal error", sanitizeSyncDMError(err)) + assertSyncDMInternal(t, err) } // On a CreateRoom dup-key with matching existing room (idempotent re-delivery), @@ -3571,6 +3607,11 @@ func TestProcessCreateRoom_PermanentErrorWhenKeyMissing(t *testing.T) { require.Error(t, err) assert.True(t, errors.Is(err, errPermanent), "missing key must be permanent") assert.True(t, errors.Is(err, errRoomKeyAbsent), "missing key must satisfy errRoomKeyAbsent sentinel") + // The SAME error value must yield the *errcode.Error (internal) for the reply + // envelope AND still satisfy errors.Is(errRoomKeyAbsent) for the alert path. + var ee *errcode.Error + require.True(t, errors.As(err, &ee), "absent-key error must carry an *errcode.Error") + assert.Equal(t, errcode.CodeInternal, ee.Code) } // ---- Task 11: fan-out current key to newly-added channel members ---- @@ -4595,42 +4636,39 @@ func TestHandler_ProcessAddMembers_HasOrgRoomMembersError_FailsClosed(t *testing assert.Contains(t, err.Error(), "check existing org room members") } -// X-Request-ID must be a hyphenated UUID; non-UUIDs leak into reply subjects. -func TestHandler_ProcessAddMembers_InvalidRequestID_ReturnsPermanent(t *testing.T) { - ctrl := gomock.NewController(t) - store := NewMockSubscriptionStore(ctrl) - // No store mocks — validation must short-circuit before any store call. 
- - h := &Handler{store: store, siteID: "site-a", publish: func(_ context.Context, _ string, _ []byte, _ string) error { return nil }, keyStore: testKeyStore, keySender: testKeySender} - req := model.AddMembersRequest{ - RoomID: "r1", RequesterID: "u_a", RequesterAccount: "alice", - Users: []string{"u1"}, Timestamp: 1, - } - data, _ := json.Marshal(req) - ctx := natsutil.WithRequestID(context.Background(), "not-a-uuid") - - err := h.processAddMembers(ctx, data) - require.Error(t, err) - assert.ErrorIs(t, err, errPermanent) - assert.Contains(t, err.Error(), "invalid X-Request-ID") -} - -func TestProcessCreateRoom_InvalidRequestID_ReturnsPermanent(t *testing.T) { - h, mockStore, _ := newCreateRoomTestHandler(t) - _ = mockStore // store mocks intentionally unset — must short-circuit before any call - ctx := natsutil.WithRequestID(context.Background(), "not-a-uuid") - - body := makeCreateRoomBody(t, &model.CreateRoomRequest{ - RoomID: "room-1", - RequesterAccount: "alice", - Users: []string{"bob"}, - Timestamp: time.Now().UnixMilli(), +// natsServerCreateDM calls this helper to validate the inbound X-Request-ID +// before any downstream dedup-key derivation runs. Missing/malformed → +// BadRequest (no server-side mint). The asymmetric policy vs the JetStream +// consume loop (which does not use this gate and still mints defensively) lives in +// docs/error-handling.md §3a.
+func TestRequireDedupRequestID(t *testing.T) { + const validUUID = "01970a4f-8c2d-7c9a-abcd-e0123456789f" + + t.Run("valid_passes", func(t *testing.T) { + h := nats.Header{natsutil.RequestIDHeader: []string{validUUID}} + ctx, id, err := requireDedupRequestID(context.Background(), h, "chat.test.subject") + require.NoError(t, err) + assert.Equal(t, validUUID, id) + assert.Equal(t, validUUID, natsutil.RequestIDFromContext(ctx)) }) - err := h.processCreateRoom(ctx, body) - require.Error(t, err) - assert.ErrorIs(t, err, errPermanent) - assert.Contains(t, err.Error(), "invalid X-Request-ID") + cases := []struct { + name string + headers nats.Header + }{ + {name: "nil_rejects", headers: nil}, + {name: "empty_rejects", headers: nats.Header{}}, + {name: "malformed_rejects", headers: nats.Header{natsutil.RequestIDHeader: []string{"not-a-uuid"}}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + _, _, err := requireDedupRequestID(context.Background(), tc.headers, "chat.test.subject") + require.Error(t, err) + var ec *errcode.Error + require.True(t, errors.As(err, &ec)) + assert.Equal(t, errcode.CodeBadRequest, ec.Code) + }) + } } // TestHandler_RotateAndFanOut_ErrNoCurrentKey_UsesPredictedVersion pins the @@ -4827,3 +4865,76 @@ func TestHandleSyncCreateDM_DEKFailure_AbortsBeforeCreate(t *testing.T) { require.Error(t, err) assert.Len(t, prov.calls, 1, "EnsureDEK should have been attempted once") } + +// ---- HandleJetStreamMsg Ack/Nak + async-consumer panic recovery ---- + +// fakeJSMsg is a minimal jetstream.Msg stub recording Ack/Nak calls so tests +// can assert the consumer's permanence-driven decision without a NATS server. 
+type fakeJSMsg struct { + subject string + data []byte + headers nats.Header + acked bool + naked bool + ackErr error + nakErr error +} + +func (m *fakeJSMsg) Metadata() (*jetstream.MsgMetadata, error) { return nil, nil } +func (m *fakeJSMsg) Data() []byte { return m.data } +func (m *fakeJSMsg) Headers() nats.Header { return m.headers } +func (m *fakeJSMsg) Subject() string { return m.subject } +func (m *fakeJSMsg) Reply() string { return "" } +func (m *fakeJSMsg) Ack() error { m.acked = true; return m.ackErr } +func (m *fakeJSMsg) DoubleAck(context.Context) error { m.acked = true; return m.ackErr } +func (m *fakeJSMsg) Nak() error { m.naked = true; return m.nakErr } +func (m *fakeJSMsg) NakWithDelay(time.Duration) error { m.naked = true; return m.nakErr } +func (m *fakeJSMsg) InProgress() error { return nil } +func (m *fakeJSMsg) Term() error { return nil } +func (m *fakeJSMsg) TermWithReason(string) error { return nil } + +// processRoleUpdate's "unsupported role" path was historically Nak'd forever +// (a bare fmt.Errorf). It is now an explicit permanent error so the consumer +// Acks and JetStream stops redelivering. +func TestHandleJetStreamMsg_UnsupportedRole_Acks(t *testing.T) { + h, _, _ := newCreateRoomTestHandler(t) + body := marshalReq(t, model.UpdateRoleRequest{ + Account: "alice", RoomID: "room-1", NewRole: model.Role("bogus"), + }) + msg := &fakeJSMsg{subject: "chat.room.room-1.member.role-update", data: body} + h.HandleJetStreamMsg(context.Background(), msg) + assert.True(t, msg.acked, "unsupported role must Ack (permanent), not Nak forever") + assert.False(t, msg.naked) +} + +// A transient store error on a role update must Nak (retryable), confirming the +// Ack/Nak decision is keyed on the explicit permanent marker, not the category. +func TestHandleJetStreamMsg_TransientRoleUpdate_Naks(t *testing.T) { + h, mockStore, _ := newCreateRoomTestHandler(t) + mockStore.EXPECT().AddRole(gomock.Any(), "alice", "room-1", model.RoleOwner). 
+ Return(errors.New("mongo: connection reset")) + body := marshalReq(t, model.UpdateRoleRequest{ + Account: "alice", RoomID: "room-1", NewRole: model.RoleOwner, + }) + msg := &fakeJSMsg{subject: "chat.room.room-1.member.role-update", data: body} + h.HandleJetStreamMsg(context.Background(), msg) + assert.True(t, msg.naked, "transient infra error must Nak for redelivery") + assert.False(t, msg.acked) +} + +// panicProcessor panics on every message — stands in for a WithCause/WithMetadata +// misuse that would otherwise crash the async consumer goroutine. +type panicProcessor struct{} + +func (panicProcessor) HandleJetStreamMsg(context.Context, jetstream.Msg) { + panic("boom: errcode option misuse") +} + +func TestRunJobWithRecovery_PanicAcksAndDoesNotCrash(t *testing.T) { + msg := &fakeJSMsg{subject: "chat.room.room-1.create"} + assert.NotPanics(t, func() { + runJobWithRecovery(context.Background(), panicProcessor{}, msg) + }, "a panicking handler must be recovered, not crash the worker") + assert.True(t, msg.acked, "panic must Ack (poison-pill drop), not Nak — a deterministic panic would otherwise loop on redelivery") + assert.False(t, msg.naked) +} diff --git a/room-worker/main.go b/room-worker/main.go index a1d694f7e..4c6ee2f99 100644 --- a/room-worker/main.go +++ b/room-worker/main.go @@ -14,6 +14,7 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/oteljetstream" "github.com/hmchangw/chat/pkg/atrest" + "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/mongoutil" "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/otelutil" @@ -177,12 +178,15 @@ func main() { sem <- struct{}{} wg.Add(1) go func() { + // The job runs through runJobWithRecovery, whose recover() fires before + // this slot release, so a panicking handler (e.g. a WithCause/WithMetadata + // misuse) is Acked and dropped instead of crashing the worker — the async + // path runs outside natsrouter's recovery middleware.
defer func() { <-sem wg.Done() }() - handlerCtx := natsutil.ContextWithRequestIDFromHeaders(msgCtx, msg.Headers()) - handler.HandleJetStreamMsg(handlerCtx, msg) + runJobWithRecovery(msgCtx, handler, msg) }() } }() @@ -223,6 +227,48 @@ func main() { shutdown.Wait(ctx, 25*time.Second, hooks...) } +// jobProcessor is the slice of the handler that the consumer goroutine drives; +// narrowing it to an interface lets runJobWithRecovery be unit-tested with a +// panicking stub (no NATS connection required). +type jobProcessor interface { + HandleJetStreamMsg(ctx context.Context, msg jetstream.Msg) +} + +// runJobWithRecovery processes one async job and contains any panic so the +// worker survives. A panic ACKS the message (poison-pill drop) rather than +// Naking — a deterministic panic (e.g. odd-arg WithMetadata, WithCause on an +// *errcode.Error) would otherwise loop on redelivery until MaxDeliver and +// hammer the worker through every backoff. This mirrors natsrouter.Recovery, +// which Acks-on-panic with an Internal reply. +func runJobWithRecovery(msgCtx context.Context, handler jobProcessor, msg jetstream.Msg) { + defer func() { + if r := recover(); r != nil { + slog.Error("panic in async job handler — dropping (Ack)", "panic", r, "subject", msg.Subject()) + if ackErr := msg.Ack(); ackErr != nil { + slog.Error("failed to ack after panic", "error", ackErr) + } + } + }() + // Defensive mint: room-service rejects missing/malformed X-Request-ID at + // publish time (RequireRequestID), so by the time a message lands on the + // ROOMS stream the header should always be a valid UUID. If we end up + // minting here, that's an upstream contract violation worth an Error log — + // downstream OutboxDedupID / message-ID generation will derive dedup keys + // from the fresh mint, breaking client-retry dedup. See + // docs/error-handling.md §3a. 
+ inbound := "" + if h := msg.Headers(); h != nil { + inbound = h.Get(natsutil.RequestIDHeader) + } + id, replaced := idgen.ResolveRequestID(inbound) + if replaced || inbound == "" { + slog.Error("ROOMS stream message missing or invalid X-Request-ID — minting defensively; upstream contract broken", + "inbound", inbound, "subject", msg.Subject()) + } + handlerCtx := natsutil.WithRequestID(msgCtx, id) + handler.HandleJetStreamMsg(handlerCtx, msg) +} + // buildConsumerConfig returns the durable consumer config for // room-worker. Centralized so it is unit-testable without NATS. func buildConsumerConfig(s stream.ConsumerSettings) jetstream.ConsumerConfig { diff --git a/search-service/handler.go b/search-service/handler.go index f3eb597b2..dea133f22 100644 --- a/search-service/handler.go +++ b/search-service/handler.go @@ -3,12 +3,15 @@ package main import ( "context" "errors" + "fmt" "log/slog" "strings" "time" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/subject" ) @@ -74,12 +77,14 @@ func (h *handler) searchMessages(c *natsrouter.Context, req model.SearchMessages if rerr != nil { return nil, rerr } + c.WithLogValues("request_id", natsutil.RequestIDFromContext(c), "account", account) if err := h.normalizePagination(&req.Size, &req.Offset); err != nil { return nil, err } + req.Query = strings.TrimSpace(req.Query) if req.Query == "" { - return nil, natsrouter.ErrBadRequest("query is required") + return nil, errcode.BadRequest("query is required") } ctx, cancel := h.withRequestTimeout(c) @@ -98,22 +103,19 @@ func (h *handler) searchMessages(c *natsrouter.Context, req model.SearchMessages // so no handler-level pre-classification is needed. 
body, err := buildMessageQuery(req, account, restricted, h.cfg.RecentWindow, h.cfg.UserRoomIndex) if err != nil { - slog.Error("build message query failed", "account", account, "error", err) - return nil, natsrouter.ErrInternal("unable to build search query") + return nil, fmt.Errorf("building search query: %w", err) } observeESDone := observeES() raw, err := h.store.Search(ctx, MessageIndexPattern, body) observeESDone() if err != nil { - slog.Error("message search backend failed", "account", account, "error", err) - return nil, natsrouter.ErrInternal("search backend unavailable") + return nil, fmt.Errorf("message search backend: %w", err) } hits, total, err := parseMessagesResponse(raw) if err != nil { - slog.Error("parse messages response failed", "account", account, "error", err) - return nil, natsrouter.ErrInternal("unexpected search response") + return nil, fmt.Errorf("parsing search response: %w", err) } messages := make([]model.SearchMessage, 0, len(hits)) @@ -130,6 +132,7 @@ func (h *handler) searchRooms(c *natsrouter.Context, req model.SearchRoomsReques if rerr != nil { return nil, rerr } + c.WithLogValues("request_id", natsutil.RequestIDFromContext(c), "account", account) if err := h.normalizePagination(&req.Size, &req.Offset); err != nil { return nil, err @@ -137,7 +140,7 @@ func (h *handler) searchRooms(c *natsrouter.Context, req model.SearchRoomsReques query := strings.TrimSpace(req.Query) if query == "" { - return nil, natsrouter.ErrBadRequest("query is required") + return nil, errcode.BadRequest("query is required") } req.Query = query @@ -146,35 +149,32 @@ func (h *handler) searchRooms(c *natsrouter.Context, req model.SearchRoomsReques body, err := buildRoomQuery(req, account) if err != nil { - // RouteError (invalid roomType) passes through; + // A typed errcode error (invalid roomType) passes through; // anything else (marshal failure — unreachable) gets sanitized. 
- var rerr *natsrouter.RouteError - if errors.As(err, &rerr) { + var ee *errcode.Error + if errors.As(err, &ee) { return nil, err } - slog.Error("build subscription query failed", "account", account, "error", err) - return nil, natsrouter.ErrInternal("unable to build search query") + return nil, fmt.Errorf("building search query: %w", err) } observeESDone := observeES() raw, err := h.store.Search(ctx, []string{h.cfg.SpotlightReadPattern}, body) observeESDone() if err != nil { - slog.Error("subscription search backend failed", "account", account, "error", err) - return nil, natsrouter.ErrInternal("search backend unavailable") + return nil, fmt.Errorf("subscription search backend: %w", err) } rooms, err := parseRooms(raw) if err != nil { - slog.Error("parse spotlight rooms failed", "account", account, "error", err) - return nil, natsrouter.ErrInternal("unexpected search response") + return nil, fmt.Errorf("parsing spotlight rooms: %w", err) } return &model.SearchRoomsResponse{Rooms: rooms}, nil } // loadRestricted implements the 2-tier Valkey → ES read. Cache errors // alone never fail the request — log-and-fall-through. Only when both -// cache AND ES prefetch fail do we surface ErrInternal. +// cache AND ES prefetch fail do we collapse to errcode.Internal at the boundary. func (h *handler) loadRestricted(ctx context.Context, account string) (map[string]int64, error) { cached, hit, cerr := h.cache.GetRestricted(ctx, account) if cerr != nil { @@ -185,12 +185,14 @@ func (h *handler) loadRestricted(ctx context.Context, account string) (map[strin } doc, _, err := h.store.GetUserRoomDoc(ctx, account) if err != nil { - // Always log the store error, even if the cache GET also failed - // — it's the actionable signal when both fail. Include cache_err - // so operators can correlate, but don't let the cache warning - // mask the ES root cause. 
- slog.Error("user-room doc fetch failed", "account", account, "error", err, "cache_err", cerr) - return nil, natsrouter.ErrInternal("unable to resolve room access") + // Classify (via errnats.Reply at the handler boundary) logs the wrapped + // chain exactly once at ERROR; do not slog.Error here or every failure + // double-logs. cache_err is the only detail we'd add — fold it into the + // wrap so it survives in the centralized cause field. + if cerr != nil { + return nil, fmt.Errorf("resolving room access (cache_err=%v): %w", cerr, err) + } + return nil, fmt.Errorf("resolving room access: %w", err) } restricted := doc.RestrictedRooms @@ -219,6 +221,7 @@ func (h *handler) searchApps(c *natsrouter.Context, req model.SearchAppsRequest) if rerr != nil { return nil, rerr } + c.WithLogValues("request_id", natsutil.RequestIDFromContext(c), "account", account) if err := h.normalizePagination(&req.Size, &req.Offset); err != nil { return nil, err @@ -226,7 +229,7 @@ func (h *handler) searchApps(c *natsrouter.Context, req model.SearchAppsRequest) query := strings.TrimSpace(req.Query) if query == "" { - return nil, natsrouter.ErrBadRequest("query is required") + return nil, errcode.BadRequest("query is required") } ctx, cancel := h.withRequestTimeout(c) @@ -234,8 +237,7 @@ func (h *handler) searchApps(c *natsrouter.Context, req model.SearchAppsRequest) apps, err := h.mongo.SearchAppsByName(ctx, query, account, req.AssistantEnabled, req.Offset, req.Size) if err != nil { - slog.Error("app search backend failed", "account", account, "error", err) - return nil, natsrouter.ErrInternal("search backend unavailable") + return nil, fmt.Errorf("app search backend: %w", err) } if apps == nil { @@ -255,13 +257,14 @@ func (h *handler) searchUsers(c *natsrouter.Context, req model.SearchUsersReques if rerr != nil { return nil, rerr } + c.WithLogValues("request_id", natsutil.RequestIDFromContext(c), "account", account) query := strings.TrimSpace(req.Query) if query == "" { - return nil, 
natsrouter.ErrBadRequest("query is required") + return nil, errcode.BadRequest("query is required") } if req.Offset < 0 || req.Limit < 0 { - return nil, natsrouter.ErrBadRequest("offset and limit must be non-negative") + return nil, errcode.BadRequest("offset and limit must be non-negative") } limit := req.Limit if limit == 0 { @@ -276,8 +279,7 @@ func (h *handler) searchUsers(c *natsrouter.Context, req model.SearchUsersReques users, err := h.users.SearchUsers(ctx, query, req.Offset, limit) if err != nil { - slog.Error("user search backend failed", "account", account, "error", err) - return nil, natsrouter.ErrInternal("user search backend unavailable") + return nil, fmt.Errorf("user search backend: %w", err) } if users == nil { @@ -289,10 +291,10 @@ func (h *handler) searchUsers(c *natsrouter.Context, req model.SearchUsersReques // normalizePagination validates and clamps size/offset in place. size=0 // falls back to DocCounts; size>MaxDocCounts is capped. Negative size // or offset is a client bug, not a defaultable value, so it returns -// ErrBadRequest. +// errcode.BadRequest. 
func (h *handler) normalizePagination(size, offset *int) error { if *size < 0 || *offset < 0 { - return natsrouter.ErrBadRequest("size and offset must be non-negative") + return errcode.BadRequest("size and offset must be non-negative") } if *size == 0 { *size = h.cfg.DocCounts diff --git a/search-service/handler_test.go b/search-service/handler_test.go index 8b655bf44..52d3585f6 100644 --- a/search-service/handler_test.go +++ b/search-service/handler_test.go @@ -10,6 +10,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsrouter" ) @@ -151,9 +152,8 @@ func TestHandler_SearchMessages_CacheAndESFailReturnInternal(t *testing.T) { _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{Query: "hi"}) require.Error(t, err) - var rerr *natsrouter.RouteError - require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeInternal, rerr.Code) + classified := errcode.Classify(context.Background(), err) + assert.Equal(t, errcode.CodeInternal, classified.Code) } func TestHandler_SearchMessages_ESSearchError(t *testing.T) { @@ -164,27 +164,26 @@ func TestHandler_SearchMessages_ESSearchError(t *testing.T) { h := newTestHandler(store, nil, nil, cache) _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{Query: "hi"}) require.Error(t, err) - var rerr *natsrouter.RouteError - require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeInternal, rerr.Code) + classified := errcode.Classify(context.Background(), err) + assert.Equal(t, errcode.CodeInternal, classified.Code) } func TestHandler_SearchMessages_EmptyQuery(t *testing.T) { h := newTestHandler(&fakeStore{}, nil, nil, newFakeCache()) _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error 
require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) } func TestHandler_SearchMessages_NegativeSizeRejected(t *testing.T) { h := newTestHandler(&fakeStore{}, nil, nil, newFakeCache()) _, err := h.searchMessages(ctxWithAccount("alice"), model.SearchMessagesRequest{Query: "x", Size: -1}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) } func TestHandler_SearchMessages_SizeClamped(t *testing.T) { @@ -277,27 +276,27 @@ func TestHandler_SearchRooms_EmptyQueryRejected(t *testing.T) { h := newTestHandler(&fakeStore{}, &fakeMongo{}, nil, newFakeCache()) _, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) } func TestHandler_SearchRooms_WhitespaceQueryRejected(t *testing.T) { h := newTestHandler(&fakeStore{}, &fakeMongo{}, nil, newFakeCache()) _, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{Query: " "}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) } func TestHandler_SearchRooms_RoomTypeAppRejected(t *testing.T) { h := newTestHandler(&fakeStore{}, &fakeMongo{}, nil, newFakeCache()) _, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{Query: "x", RoomType: "app"}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, 
rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) assert.Contains(t, rerr.Message, "invalid roomType") } @@ -305,9 +304,9 @@ func TestHandler_SearchRooms_UnknownRoomTypeRejected(t *testing.T) { h := newTestHandler(&fakeStore{}, &fakeMongo{}, nil, newFakeCache()) _, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{Query: "x", RoomType: "zzz"}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) } func TestHandler_SearchRooms_ESErrorSanitized(t *testing.T) { @@ -315,10 +314,9 @@ func TestHandler_SearchRooms_ESErrorSanitized(t *testing.T) { h := newTestHandler(store, &fakeMongo{}, nil, newFakeCache()) _, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{Query: "general"}) require.Error(t, err) - var rerr *natsrouter.RouteError - require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeInternal, rerr.Code) - assert.NotContains(t, rerr.Message, "es failed") + classified := errcode.Classify(context.Background(), err) + assert.Equal(t, errcode.CodeInternal, classified.Code) + assert.NotContains(t, classified.Message, "es failed") } func TestHandler_SearchRooms_EmptyESResult(t *testing.T) { @@ -354,9 +352,9 @@ func TestHandler_SearchRooms_NegativeSizeRejected(t *testing.T) { h := newTestHandler(&fakeStore{}, &fakeMongo{}, nil, newFakeCache()) _, err := h.searchRooms(ctxWithAccount("alice"), model.SearchRoomsRequest{Query: "x", Size: -1}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) } func TestHandler_SearchRooms_UsesSpotlightIndex(t *testing.T) { @@ -430,9 +428,9 @@ func TestHandler_SearchApps_EmptyQueryRejected(t *testing.T) { _, err := 
h.searchApps(ctxWithAccount("alice"), model.SearchAppsRequest{Query: ""}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) assert.Len(t, mongo.searchAppsCalls, 0, "validation must short-circuit before backend call") } @@ -443,9 +441,9 @@ func TestHandler_SearchApps_WhitespaceQueryRejected(t *testing.T) { _, err := h.searchApps(ctxWithAccount("alice"), model.SearchAppsRequest{Query: " \t "}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) } func TestHandler_SearchApps_BackendErrorSanitized(t *testing.T) { @@ -454,10 +452,9 @@ func TestHandler_SearchApps_BackendErrorSanitized(t *testing.T) { _, err := h.searchApps(ctxWithAccount("alice"), model.SearchAppsRequest{Query: "weather"}) require.Error(t, err) - var rerr *natsrouter.RouteError - require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeInternal, rerr.Code, "raw store error must not leak; sanitize to ErrInternal") - assert.NotContains(t, rerr.Message, "mongo down", "internal error text must not surface to client") + classified := errcode.Classify(context.Background(), err) + assert.Equal(t, errcode.CodeInternal, classified.Code, "raw store error must not leak; sanitize to internal") + assert.NotContains(t, classified.Message, "mongo down", "internal error text must not surface to client") } func TestHandler_SearchApps_EmptyResultsReturnsEmptySlice(t *testing.T) { @@ -494,9 +491,9 @@ func TestHandler_SearchApps_NegativeSizeRejected(t *testing.T) { Size: -1, }) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + 
assert.Equal(t, errcode.CodeBadRequest, rerr.Code) } func TestHandler_SearchMessages_ScopedPartitioning(t *testing.T) { @@ -582,9 +579,9 @@ func TestHandler_SearchUsers_NegativePaginationRejected(t *testing.T) { _, err := h.searchUsers(ctxWithAccount("alice"), tt.req) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) assert.Empty(t, fu.calls, "backend must not be called on invalid pagination") }) } @@ -596,9 +593,9 @@ func TestHandler_SearchUsers_EmptyQueryRejected(t *testing.T) { _, err := h.searchUsers(ctxWithAccount("alice"), model.SearchUsersRequest{Query: ""}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) assert.Len(t, fu.calls, 0, "validation must short-circuit before backend call") } @@ -609,9 +606,9 @@ func TestHandler_SearchUsers_WhitespaceQueryRejected(t *testing.T) { _, err := h.searchUsers(ctxWithAccount("alice"), model.SearchUsersRequest{Query: " \t "}) require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) assert.Len(t, fu.calls, 0) } @@ -622,10 +619,9 @@ func TestHandler_SearchUsers_BackendErrorSanitized(t *testing.T) { _, err := h.searchUsers(ctxWithAccount("alice"), model.SearchUsersRequest{Query: "alice"}) require.Error(t, err) - var rerr *natsrouter.RouteError - require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeInternal, rerr.Code, "raw backend error must not leak") - assert.NotContains(t, rerr.Message, "third-party down", "internal text must not surface to client") + classified := 
errcode.Classify(context.Background(), err) + assert.Equal(t, errcode.CodeInternal, classified.Code, "raw backend error must not leak") + assert.NotContains(t, classified.Message, "third-party down", "internal text must not surface to client") } func TestHandler_SearchUsers_EmptyResultsReturnsEmptySlice(t *testing.T) { diff --git a/search-service/integration_apps_test.go b/search-service/integration_apps_test.go index d58bbff59..7fca030bb 100644 --- a/search-service/integration_apps_test.go +++ b/search-service/integration_apps_test.go @@ -16,8 +16,9 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errtest" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsrouter" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" ) @@ -105,8 +106,5 @@ func TestIntegration_SearchApps_EmptyQueryReturnsBadRequest(t *testing.T) { msg, err := f.clientNATS.Request(subject.SearchApps("alice", testSiteID), reqBytes, 5*time.Second) require.NoError(t, err) - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) + errtest.AssertCode(t, msg.Data, errcode.CodeBadRequest) } diff --git a/search-service/integration_messages_test.go b/search-service/integration_messages_test.go index 56c459a83..ab51e4a87 100644 --- a/search-service/integration_messages_test.go +++ b/search-service/integration_messages_test.go @@ -17,8 +17,9 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errtest" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsrouter" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" ) @@ -95,7 +96,5 @@ func 
TestIntegration_SearchMessages_V2_EmptyQueryReturnsBadRequest(t *testing.T) msg, err := f.clientNATS.Request(subject.SearchMessages("alice", testSiteID), reqBytes, 5*time.Second) require.NoError(t, err) - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) + errtest.AssertCode(t, msg.Data, errcode.CodeBadRequest) } diff --git a/search-service/integration_rooms_test.go b/search-service/integration_rooms_test.go index 7de34aa00..7636b10a6 100644 --- a/search-service/integration_rooms_test.go +++ b/search-service/integration_rooms_test.go @@ -17,8 +17,9 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errtest" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsrouter" "github.com/hmchangw/chat/pkg/searchengine" "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/testutil" @@ -190,10 +191,7 @@ func TestIntegration_SearchRooms_EmptyQueryReturnsBadRequest(t *testing.T) { msg, err := f.clientNATS.Request(subject.SearchRooms("alice", testSiteID), reqBytes, 5*time.Second) require.NoError(t, err) - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) + errtest.AssertCode(t, msg.Data, errcode.CodeBadRequest) } func TestIntegration_SearchRooms_RoomTypeAppReturnsBadRequest(t *testing.T) { @@ -205,9 +203,7 @@ func TestIntegration_SearchRooms_RoomTypeAppReturnsBadRequest(t *testing.T) { msg, err := f.clientNATS.Request(subject.SearchRooms("alice", testSiteID), reqBytes, 5*time.Second) require.NoError(t, err) - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeBadRequest, 
envelope.Code) - assert.Contains(t, envelope.Error, "invalid roomType") + envelope := errtest.Decode(t, msg.Data) + assert.Equal(t, errcode.CodeBadRequest, envelope.Code) + assert.Contains(t, envelope.Message, "invalid roomType") } diff --git a/search-service/integration_users_test.go b/search-service/integration_users_test.go index e0fba7784..bec7cf4b6 100644 --- a/search-service/integration_users_test.go +++ b/search-service/integration_users_test.go @@ -15,8 +15,9 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/errcode/errtest" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsrouter" "github.com/hmchangw/chat/pkg/restyutil" "github.com/hmchangw/chat/pkg/subject" ) @@ -78,10 +79,7 @@ func TestIntegration_SearchUsers_EmptyQueryReturnsBadRequest(t *testing.T) { msg, err := f.clientNATS.Request(subject.SearchUsers("alice", testSiteID), reqBytes, 5*time.Second) require.NoError(t, err) - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeBadRequest, envelope.Code) + errtest.AssertCode(t, msg.Data, errcode.CodeBadRequest) } func TestIntegration_SearchUsers_ThirdPartyErrorReturnsInternal(t *testing.T) { @@ -95,11 +93,9 @@ func TestIntegration_SearchUsers_ThirdPartyErrorReturnsInternal(t *testing.T) { msg, err := f.clientNATS.Request(subject.SearchUsers("alice", testSiteID), reqBytes, 5*time.Second) require.NoError(t, err) - var envelope model.ErrorResponse - require.NoError(t, json.Unmarshal(msg.Data, &envelope)) - require.NotEmpty(t, envelope.Error) - assert.Equal(t, natsrouter.CodeInternal, envelope.Code, + envelope := errtest.Decode(t, msg.Data) + assert.Equal(t, errcode.CodeInternal, envelope.Code, "non-2xx from third-party must surface as internal error, not raw status") // Raw third-party details must not leak to the 
caller. - assert.NotContains(t, envelope.Error, "503", "status code from third-party must not leak") + assert.NotContains(t, envelope.Message, "503", "status code from third-party must not leak") } diff --git a/search-service/metrics.go b/search-service/metrics.go index 7f12ae6b7..1828a4253 100644 --- a/search-service/metrics.go +++ b/search-service/metrics.go @@ -9,7 +9,7 @@ import ( "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/client_golang/prometheus/promhttp" - "github.com/hmchangw/chat/pkg/natsrouter" + "github.com/hmchangw/chat/pkg/errcode" ) // All collectors register with the default Prometheus registry via @@ -34,7 +34,7 @@ var ( ) // Per-kind handles for the request-path metrics. The `status` label on -// requests_total is resolved lazily (5 values × 2 kinds = 10 perms would +// requests_total is resolved lazily (9 values × 4 kinds = 36 perms would // clutter here); the duration handles are fully bound. const ( metricKindMessages = "messages" @@ -90,18 +90,39 @@ func durFor(kind string) prometheus.Observer { } // statusLabel maps a handler's returned error onto the requests_total -// `status` label. nil → "ok"; non-internal RouteError → its Code -// (bad_request, not_found, forbidden, conflict) so operators can -// distinguish 4xx-equivalents; everything else → "internal". +// `status` label. nil → "ok"; a non-empty *errcode.Error in the chain → its +// Code (one of the 8 canonical Codes below); everything else → "internal". +// +// The label set is pinned to keep Prometheus cardinality bounded — at most +// 9 × len(kinds) series. A non-canonical Code (e.g. a future Code constant +// added without updating this allowlist, or a foreign envelope on a federation +// path) collapses to "internal" rather than minting a fresh time series. 
func statusLabel(err error) string { if err == nil { return "ok" } - var rerr *natsrouter.RouteError - if errors.As(err, &rerr) && rerr.Code != "" && rerr.Code != natsrouter.CodeInternal { - return rerr.Code + var ee *errcode.Error + if errors.As(err, &ee) && ee.Code != "" { + if _, ok := allowedStatusLabels[string(ee.Code)]; ok { + return string(ee.Code) + } } - return natsrouter.CodeInternal + return string(errcode.CodeInternal) +} + +// allowedStatusLabels pins the cardinality of the requests_total status label +// to the 8 canonical errcode Codes + "ok". Any label outside this set +// collapses to "internal" via statusLabel. +var allowedStatusLabels = map[string]struct{}{ + "ok": {}, + string(errcode.CodeBadRequest): {}, + string(errcode.CodeUnauthenticated): {}, + string(errcode.CodeForbidden): {}, + string(errcode.CodeNotFound): {}, + string(errcode.CodeConflict): {}, + string(errcode.CodeTooManyRequests): {}, + string(errcode.CodeUnavailable): {}, + string(errcode.CodeInternal): {}, } func metricsHandler() http.Handler { return promhttp.Handler() } diff --git a/search-service/metrics_test.go b/search-service/metrics_test.go new file mode 100644 index 000000000..b3fb9c4ce --- /dev/null +++ b/search-service/metrics_test.go @@ -0,0 +1,66 @@ +package main + +import ( + "errors" + "fmt" + "testing" + + "github.com/hmchangw/chat/pkg/errcode" +) + +func TestStatusLabel_OkOnNil(t *testing.T) { + if got := statusLabel(nil); got != "ok" { + t.Fatalf("nil err → status = %q, want %q", got, "ok") + } +} + +func TestStatusLabel_CanonicalErrcodePassesThrough(t *testing.T) { + cases := []struct { + err error + want string + }{ + {errcode.BadRequest("x"), "bad_request"}, + {errcode.Unauthenticated("x"), "unauthenticated"}, + {errcode.Forbidden("x"), "forbidden"}, + {errcode.NotFound("x"), "not_found"}, + {errcode.Conflict("x"), "conflict"}, + {errcode.TooManyRequests("x"), "too_many_requests"}, + {errcode.Unavailable("x"), "unavailable"}, + {errcode.Internal("x"), 
"internal"}, + } + for _, tc := range cases { + if got := statusLabel(tc.err); got != tc.want { + t.Errorf("statusLabel(%v) = %q, want %q", tc.err, got, tc.want) + } + } +} + +// Wrapped *errcode.Error (the actual production shape from handler.go where +// callers fmt.Errorf("ctx: %w", errcodeErr) before returning) must traverse +// the chain via errors.As and still pin the right label. +func TestStatusLabel_WrappedErrcodePassesThrough(t *testing.T) { + wrapped := fmt.Errorf("handler load: %w", errcode.BadRequest("missing field")) + if got := statusLabel(wrapped); got != "bad_request" { + t.Fatalf("wrapped errcode → %q, want bad_request", got) + } +} + +func TestStatusLabel_NonCanonicalCodeCollapsesToInternal(t *testing.T) { + // Synthetic *errcode.Error with a non-canonical Code (e.g. a federation peer + // shipped a foreign envelope). Must not mint a new Prometheus series — the + // allowedStatusLabels guard collapses it to "internal". + bad := &errcode.Error{Code: errcode.Code("made_up_category"), Message: "x"} + if got := statusLabel(bad); got != "internal" { + t.Fatalf("non-canonical Code → status = %q, want %q", got, "internal") + } +} + +func TestStatusLabel_RawErrorCollapsesToInternal(t *testing.T) { + if got := statusLabel(errors.New("mongo down")); got != "internal" { + t.Fatalf("raw err → status = %q, want %q", got, "internal") + } + wrapped := fmt.Errorf("ctx: %w", errors.New("mongo down")) + if got := statusLabel(wrapped); got != "internal" { + t.Fatalf("wrapped raw err → status = %q, want %q", got, "internal") + } +} diff --git a/search-service/query_rooms.go b/search-service/query_rooms.go index 50ce3f60a..4d3702f68 100644 --- a/search-service/query_rooms.go +++ b/search-service/query_rooms.go @@ -4,8 +4,8 @@ import ( "encoding/json" "fmt" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsrouter" ) // roomType filter values accepted on SearchRoomsRequest.RoomType. 
@@ -17,9 +17,9 @@ const ( ) // buildRoomQuery composes the ES `_search` body for a subscription -// search against the spotlight index. It returns a *natsrouter.RouteError -// (user-facing) on invalid/unsupported roomType values and a plain error on -// marshalling failures. +// search against the spotlight index. It returns a user-facing *errcode.Error +// on invalid/unsupported roomType values and a plain error on marshalling +// failures. func buildRoomQuery(req model.SearchRoomsRequest, account string) (json.RawMessage, error) { roomTypeFilter, rerr := roomTypeFilterClause(req.RoomType) if rerr != nil { @@ -69,9 +69,9 @@ func buildRoomQuery(req model.SearchRoomsRequest, account string) (json.RawMessa // filter on `roomType`. The filter values match the strings written to the // spotlight index by search-sync-worker (the model.RoomType values // themselves). Returns (nil, nil) for "" and "all" which need no extra -// filter; returns ErrBadRequest for "app" (MVP-unsupported) and any unknown +// filter; returns errcode.BadRequest for "app" (MVP-unsupported) and any unknown // value. 
-func roomTypeFilterClause(roomType string) (map[string]any, *natsrouter.RouteError) { +func roomTypeFilterClause(roomType string) (map[string]any, *errcode.Error) { switch roomType { case "", roomTypeAll: return nil, nil @@ -80,8 +80,8 @@ func roomTypeFilterClause(roomType string) (map[string]any, *natsrouter.RouteErr case roomTypeDM: return map[string]any{"term": map[string]any{"roomType": string(model.RoomTypeDM)}}, nil case roomTypeApp: - return nil, natsrouter.ErrBadRequest("invalid roomType: app is not supported") + return nil, errcode.BadRequest("invalid roomType: app is not supported") default: - return nil, natsrouter.ErrBadRequest(fmt.Sprintf("invalid roomType: %s", roomType)) + return nil, errcode.BadRequest(fmt.Sprintf("invalid roomType: %s", roomType)) } } diff --git a/search-service/query_rooms_test.go b/search-service/query_rooms_test.go index a3e709c5a..10ed15a23 100644 --- a/search-service/query_rooms_test.go +++ b/search-service/query_rooms_test.go @@ -7,8 +7,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsrouter" ) // subscriptionFilters extracts the filter array from the ES bool query. 
@@ -69,9 +69,9 @@ func TestBuildSubscriptionQuery_RoomTypeAppRejected(t *testing.T) { _, err := buildRoomQuery(req, "alice") require.Error(t, err) - var rerr *natsrouter.RouteError - require.True(t, errors.As(err, &rerr), "expected RouteError") - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + var rerr *errcode.Error + require.True(t, errors.As(err, &rerr), "expected *errcode.Error") + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) assert.Contains(t, rerr.Message, "invalid roomType") } @@ -80,9 +80,9 @@ func TestBuildSubscriptionQuery_UnknownRoomTypeRejected(t *testing.T) { _, err := buildRoomQuery(req, "alice") require.Error(t, err) - var rerr *natsrouter.RouteError + var rerr *errcode.Error require.True(t, errors.As(err, &rerr)) - assert.Equal(t, natsrouter.CodeBadRequest, rerr.Code) + assert.Equal(t, errcode.CodeBadRequest, rerr.Code) assert.Contains(t, rerr.Message, "invalid roomType") }