Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
d81af2e
docs(errcode): design spec, implementation plan, error-handling guide…
claude Jun 2, 2026
0a09722
feat(errcode): core package, NATS/HTTP adapters, test helper, reason …
claude Jun 2, 2026
a6ae551
refactor(model): retire ErrorResponse and align event/member shapes f…
claude Jun 2, 2026
3547923
refactor(natsrouter,natsutil): emit errcode envelopes; RequestID midd…
claude Jun 2, 2026
56eeec2
refactor(room-service,room-worker): adopt errcode envelopes; attach r…
claude Jun 2, 2026
50d76df
refactor(message-gatekeeper,message-worker,inbox-worker): errcode env…
claude Jun 2, 2026
01261a2
refactor(history-service,search-service): adopt errcode envelopes + p…
claude Jun 2, 2026
e017f4e
refactor(auth,mock-user,broadcast,notification): adopt errcode envelopes
claude Jun 2, 2026
cc02054
feat(frontend): consume errcode envelopes; reason-keyed error UX; rel…
claude Jun 2, 2026
dbef413
chore(testutil,roomcrypto): centralize cassandra image pin; add skipO…
claude Jun 2, 2026
a5b0e02
refactor(request_id): mint-everywhere policy with strict reject on de…
claude Jun 2, 2026
ae66fab
fix(room-service): propagate request_id to cross-site list-members
claude Jun 2, 2026
26552e5
fix(room-service): errcode-migration rebase fixups
claude Jun 2, 2026
aa72bec
docs(errcode): worker-logging spec; add request_id to message-worker …
claude Jun 2, 2026
ab6f0de
review(branch): multi-lens review report + apply curated polish
claude Jun 2, 2026
bcf0f30
review(pkg/errcode): header + executive summary
claude Jun 3, 2026
c84c903
review(pkg/errcode): code quality
claude Jun 3, 2026
febd1d8
review(pkg/errcode): api design
claude Jun 3, 2026
bbf677d
review(pkg/errcode): test coverage
claude Jun 3, 2026
1a77cba
review(pkg/errcode): maintainability
claude Jun 3, 2026
8f192ff
review(pkg/errcode): consumer ergonomics
claude Jun 3, 2026
5700e03
review(pkg/errcode): performance
claude Jun 3, 2026
d20546f
review(pkg/errcode): prioritized action list
claude Jun 3, 2026
de18ea0
fix(history-service): migrate pin.go to pkg/errcode after merge
claude Jun 3, 2026
36b5a9f
chore(toolchain,pin): bump Go to 1.25.11 and polish pin handler obser…
claude Jun 3, 2026
24e428d
sast(semgrep): exclude pkg/atrest from no-multi-wrap-errcode rule
claude Jun 3, 2026
c599866
docs: backfill subject-naming + CLAUDE.md for recent PRs
claude Jun 3, 2026
67bbf51
Revert "docs: backfill subject-naming + CLAUDE.md for recent PRs"
claude Jun 3, 2026
d20f14b
docs: remove session-scoped review reports per CLAUDE.md
claude Jun 3, 2026
bf9dc60
ci: retrigger after docker hub registry timeout
claude Jun 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions .semgrep/errcode.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
rules:
- id: errcode-no-reason-literal-outside-catalog
languages: [go]
severity: ERROR
message: >
Declare Reason codes as typed constants in pkg/errcode/codes_<service>.go,
not inline. Use an existing errcode.Reason constant.
paths:
exclude:
- "**/pkg/errcode/codes_*.go"
- "**/*_test.go"
patterns:
- pattern: errcode.Reason("...")

- id: errcode-withcause-must-not-wrap-errcode
languages: [go]
severity: ERROR
message: >
WithCause must wrap a raw error, never another errcode error. Propagate a
typed error with `return err` or `fmt.Errorf("...: %w", err)`.
patterns:
- pattern: errcode.WithCause(errcode.$F(...))

- id: errcode-no-multi-wrap-errcode
languages: [go]
severity: ERROR
message: >
Multiple %w verbs can place two errcode errors in one chain, defeating
the "one *Error per chain" invariant (Classify picks the first). Use a
single %w only.
paths:
exclude:
# pkg/errcode/** is the implementation + docs of this very contract;
# doc.go intentionally includes a "forbidden" example in a comment that
# the regex-based pattern matches even though it's only documentation.
- "**/pkg/errcode/**"
# pkg/atrest/** does not import errcode; its sentinels (ErrAuthFailed,
# ErrPayloadMalformed) are plain errors.New strings. The multi-%w idiom
# there joins a sentinel with the underlying crypto/json error so
# errors.Is works for both — there is no errcode error in the chain.
- "**/pkg/atrest/**"
- "**/*_test.go"
patterns:
- pattern-regex: 'fmt\.Errorf\([^)]*%w[^)]*%w'

- id: errcode-prefer-named-constructor
languages: [go]
severity: WARNING
message: >
Prefer the named constructor (errcode.NotFound(msg)) over
errcode.New(errcode.CodeX, msg) for a literal category. Reserve New for
a category chosen at runtime.
paths:
exclude:
- "**/pkg/errcode/**"
- "**/*_test.go"
patterns:
- pattern: errcode.New(errcode.$CODE, ...)
- metavariable-regex:
metavariable: $CODE
regex: '^Code[A-Z].*'
26 changes: 23 additions & 3 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,9 @@ All commands are wrapped in the root Makefile. Always use `make` targets — nev
- Always wrap with context: `fmt.Errorf("short description: %w", err)` — describe what the current function was doing, not what failed underneath
- Never return bare `err` or `fmt.Errorf("error: %w", err)`
- Never ignore errors silently — comment if intentionally discarded
- Use `model.ErrorResponse` via `natsutil.ReplyError` for all NATS reply errors
- Use `pkg/errcode` for ALL client-facing errors; reply via `errnats.Reply` (NATS) / `errhttp.Write` (Gin). Construct with the named constructors (`errcode.NotFound`, `errcode.Forbidden`, …), attach a domain `reason` from `codes_<service>.go` where the frontend must distinguish cases, and return raw `fmt.Errorf("…: %w", err)` for infra failures (they collapse to `internal` at the boundary). Full guide: `docs/error-handling.md`. Wire-side reference for clients: `docs/client-api.md` §6.
- Never compare errors by string — use `errors.Is` and `errors.As`
- Never expose raw internal errors to clients — sanitize errors at service boundaries, return user-safe messages
- Never expose raw internal errors to clients — the unexported `errcode.Error.cause` is never serialized; `Classify` logs it once server-side. Never wrap raw message bodies/tokens into a cause.

### Interfaces & Dependency Injection
- Define interfaces in the consumer, not the implementer
Expand Down Expand Up @@ -222,11 +222,31 @@ All commands are wrapped in the root Makefile. Always use `make` targets — nev
- Use `iter.Stop()` + `wg.Wait()` + `nc.Drain()` for graceful shutdown — see "JetStream Consumer Pattern" and "Graceful Shutdown" sections
- All NATS payloads are JSON — use `encoding/json` with typed structs from `pkg/model`, never `map[string]interface{}`
- Use NATS request/reply for synchronous operations; `nc.QueueSubscribe` with service name as queue group
- Use `natsutil.ReplyJSON` for success responses, `natsutil.ReplyError` for errors
- Use `natsutil.ReplyJSON` for success responses; for errors return a typed `*errcode.Error` from the handler and let `errnats.Reply` / `errhttp.Write` marshal the envelope (see `docs/error-handling.md`).
- Define all stream configs in `pkg/stream/stream.go` with name pattern `<STREAM>_<siteID>`
- Use durable consumers named after the service
- Stream creation is gated by `BOOTSTRAP_STREAMS` (see below); when enabled, use `js.CreateOrUpdateStream` (it's idempotent) via the service's `bootstrapStreams` helper, never inline

### Error Handling at the NATS/HTTP Boundary
`pkg/errcode` has a broad surface, but **day-to-day handler code touches almost none of it.** Use this tiering — if you reach past Tier 1, you should know why.

- **Tier 1 — every handler (this is 90% of usage).** Return a typed error built from a named constructor, optionally tagged with a `reason`. You do NOT call the adapter, classify, or log — the plumbing does:
- `return errcode.NotFound("room not found")` — pick the constructor whose name matches the HTTP/wire category (`BadRequest`, `NotFound`, `Forbidden`, `Conflict`, `Internal`, …).
- `return errcode.Forbidden("only owners can do this", errcode.WithReason(errcode.RoomNotOwner))` — add `WithReason` **only** when the frontend must branch on the case. Prefer a package-level sentinel (e.g. room-service `helper.go`) over reconstructing the same error at multiple sites, so `errors.Is` matches.
- For an infra failure, `return fmt.Errorf("get subscription: %w", err)` — a raw wrapped error collapses to `internal` at the boundary; do NOT dress it up as an errcode.
- **Tier 2 — one line per handler, written once and copied.** The adapter that turns the returned error into the wire envelope. You pick exactly one, determined by your transport, never both:
- NATS raw handler: `errnats.Reply(ctx, m.Msg, err)`.
- `pkg/natsrouter` handler: returned automatically by the router — you write nothing.
- Gin handler: `errhttp.Write(ctx, c, err)`.
- **Tier 3 — specialist, you'll know when.** Don't use these in ordinary request/reply handlers:
- `errcode.Permanent` / `IsPermanent` — JetStream **workers only**, to Ack-poison vs Nak-retry.
- `errcode.Parse` — **cross-site consumers** decoding a remote envelope (e.g. `memberlist_client.go`).
- `errnats.Marshal` / `MarshalQuiet` / `ReplyQuiet` — outbox/already-logged paths; the plain `Reply` already classifies-and-logs once, so `Quiet` exists only to avoid a double-log.
- `errcode.Classify`, `WithLogger`, `WithLogValues` — boundary/observability plumbing; handlers get request-id logging for free from the router middleware.
- **Never log AND return.** `Reply`/`Write` run `Classify`, which logs once at a category-aware level. A `slog.Error(...)` before returning the same error double-logs.
- **`WithCause` wraps an infra error, never another `*errcode.Error`** (one-errcode-per-chain; it panics otherwise, and semgrep guards it). Never put a raw token/body/subject in a cause or message — it reaches the server log.
- Full guide: `docs/error-handling.md`. Wire reference for clients: `docs/client-api.md` §6.

### Event Timestamps
- Every NATS event struct in `pkg/model` must include a `Timestamp int64 \`json:"timestamp" bson:"timestamp"\`` field
- Set the timestamp at the publish site using `time.Now().UTC().UnixMilli()`
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ GOSEC_FLAGS := -quiet -severity medium -confidence medium -tests=false \
# semgrep: fail on medium+ (WARNING/ERROR; INFO is informational/low).
SEMGREP_FLAGS := --error --severity=WARNING --severity=ERROR --metrics=off \
--exclude=tools --exclude=chat-frontend --exclude=testdata \
--exclude=docs --config=p/golang --config=p/security-audit
--exclude=docs --config=p/golang --config=p/security-audit \
--config=.semgrep/errcode.yml

# Makefile for the distributed multi-site chat system.

Expand Down
46 changes: 33 additions & 13 deletions auth-service/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (
"github.com/nats-io/jwt/v2"
"github.com/nats-io/nkeys"

"github.com/hmchangw/chat/pkg/errcode"
"github.com/hmchangw/chat/pkg/errcode/errhttp"
pkgoidc "github.com/hmchangw/chat/pkg/oidc"
)

Expand Down Expand Up @@ -74,38 +76,51 @@ func (h *AuthHandler) HandleAuth(c *gin.Context) {
return
}

ctx := errcode.WithLogValues(c.Request.Context(), "request_id", c.GetString("request_id"))

var req authRequest
if err := c.ShouldBindJSON(&req); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "ssoToken and natsPublicKey are required"})
errhttp.Write(ctx, c, errcode.BadRequest("ssoToken and natsPublicKey are required",
errcode.WithReason(errcode.AuthMissingFields)))
return
}

if !nkeys.IsValidPublicUserKey(req.NATSPublicKey) {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid natsPublicKey format"})
errhttp.Write(ctx, c, errcode.BadRequest("invalid natsPublicKey format",
errcode.WithReason(errcode.AuthInvalidNKey)))
return
}

claims, err := h.validator.Validate(c.Request.Context(), req.SSOToken)
claims, err := h.validator.Validate(ctx, req.SSOToken)
if err != nil {
if errors.Is(err, pkgoidc.ErrTokenExpired) {
slog.Warn("sso token expired", "error", err)
c.JSON(http.StatusUnauthorized, gin.H{"error": "SSO token has expired, please re-login"})
errhttp.Write(ctx, c, errcode.Unauthenticated("SSO token has expired, please re-login",
errcode.WithReason(errcode.AuthTokenExpired)))
return
}
slog.Error("oidc validation failed", "error", err)
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid SSO token"})
// Non-expiry failures surface as "invalid SSO token"; attach the raw
// cause so the server log carries the actual reason.
errhttp.Write(ctx, c, errcode.Unauthenticated("invalid SSO token",
errcode.WithReason(errcode.AuthInvalidToken),
errcode.WithCause(err)))
return
}

account := claims.PreferredUsername
if account == "" {
account = claims.Name
}
if account == "" {
// Blank account would mint a JWT with chat.user..> permissions — refuse.
errhttp.Write(ctx, c, errcode.Unauthenticated("token missing account claim",
errcode.WithReason(errcode.AuthInvalidToken)))
return
}
ctx = errcode.WithLogValues(ctx, "account", account)

natsJWT, err := h.signNATSJWT(req.NATSPublicKey, account)
if err != nil {
slog.Error("nats jwt signing failed", "error", err, "account", account)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate NATS token"})
errhttp.Write(ctx, c, fmt.Errorf("generating NATS token: %w", err))
return
}

Expand All @@ -131,21 +146,26 @@ func (h *AuthHandler) HandleAuth(c *gin.Context) {
// handleDevAuth handles auth in dev mode: accepts account name directly
// without OIDC validation, for use during local development only.
func (h *AuthHandler) handleDevAuth(c *gin.Context) {
ctx := errcode.WithLogValues(c.Request.Context(), "request_id", c.GetString("request_id"))

var req devAuthRequest
if err := c.ShouldBindJSON(&req); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "account and natsPublicKey are required"})
errhttp.Write(ctx, c, errcode.BadRequest("account and natsPublicKey are required",
errcode.WithReason(errcode.AuthMissingFields)))
return
}

if !nkeys.IsValidPublicUserKey(req.NATSPublicKey) {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid natsPublicKey format"})
errhttp.Write(ctx, c, errcode.BadRequest("invalid natsPublicKey format",
errcode.WithReason(errcode.AuthInvalidNKey)))
return
}

ctx = errcode.WithLogValues(ctx, "account", req.Account)

natsJWT, err := h.signNATSJWT(req.NATSPublicKey, req.Account)
if err != nil {
slog.Error("nats jwt signing failed", "error", err, "account", req.Account)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate NATS token"})
errhttp.Write(ctx, c, fmt.Errorf("generating NATS token: %w", err))
return
}

Expand Down
42 changes: 37 additions & 5 deletions auth-service/handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/hmchangw/chat/pkg/errcode"
"github.com/hmchangw/chat/pkg/errcode/errtest"
pkgoidc "github.com/hmchangw/chat/pkg/oidc"
)

Expand Down Expand Up @@ -146,7 +148,8 @@ func TestHandleAuth_ExpiredToken(t *testing.T) {
router.ServeHTTP(w, req)

assert.Equal(t, http.StatusUnauthorized, w.Code)
assert.Contains(t, w.Body.String(), "expired")
errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeUnauthenticated)
errtest.AssertReason(t, w.Body.Bytes(), errcode.AuthTokenExpired)
}

func TestHandleAuth_InvalidToken(t *testing.T) {
Expand All @@ -163,7 +166,8 @@ func TestHandleAuth_InvalidToken(t *testing.T) {
router.ServeHTTP(w, req)

assert.Equal(t, http.StatusUnauthorized, w.Code)
assert.Contains(t, w.Body.String(), "invalid SSO token")
errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeUnauthenticated)
errtest.AssertReason(t, w.Body.Bytes(), errcode.AuthInvalidToken)
}

func TestHandleAuth_InvalidNKey(t *testing.T) {
Expand All @@ -179,7 +183,7 @@ func TestHandleAuth_InvalidNKey(t *testing.T) {
router.ServeHTTP(w, req)

assert.Equal(t, http.StatusBadRequest, w.Code)
assert.Contains(t, w.Body.String(), "invalid natsPublicKey format")
errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeBadRequest)
}

func TestHandleAuth_MissingFields(t *testing.T) {
Expand All @@ -204,6 +208,7 @@ func TestHandleAuth_MissingFields(t *testing.T) {
req.Header.Set("Content-Type", "application/json")
router.ServeHTTP(w, req)
assert.Equal(t, http.StatusBadRequest, w.Code)
errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeBadRequest)
})
}
}
Expand Down Expand Up @@ -286,7 +291,7 @@ func TestHandleAuth_DevMode_MissingAccount(t *testing.T) {
router.ServeHTTP(w, req)

assert.Equal(t, http.StatusBadRequest, w.Code)
assert.Contains(t, w.Body.String(), "account")
errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeBadRequest)
}

func TestHandleAuth_DevMode_InvalidNKey(t *testing.T) {
Expand All @@ -302,7 +307,34 @@ func TestHandleAuth_DevMode_InvalidNKey(t *testing.T) {
router.ServeHTTP(w, req)

assert.Equal(t, http.StatusBadRequest, w.Code)
assert.Contains(t, w.Body.String(), "invalid natsPublicKey")
errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeBadRequest)
}

func TestHandleAuth_DevMode_TokenGenerationFailure(t *testing.T) {
// Force signNATSJWT (uc.Encode) to fail by supplying a non-account
// signing key. A user key pair cannot sign a NATS user JWT, so Encode
// returns an error, exercising the 500 internal-error path. The real
// cause is logged via Classify and must NOT appear in the response body.
userKP, err := nkeys.CreateUser()
require.NoError(t, err, "create user key")

handler := NewAuthHandler(nil, userKP, 2*time.Hour, true)
router := setupRouter(t, handler)

userPub := mustUserNKey(t)
body := `{"account":"alice","natsPublicKey":"` + userPub + `"}`
w := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodPost, "/auth", strings.NewReader(body))
req.Header.Set("Content-Type", "application/json")
router.ServeHTTP(w, req)

require.Equal(t, http.StatusInternalServerError, w.Code)
errtest.AssertCode(t, w.Body.Bytes(), errcode.CodeInternal)

var env errcode.Error
require.NoError(t, json.Unmarshal(w.Body.Bytes(), &env))
assert.Equal(t, "internal error", env.Message)
assert.NotContains(t, w.Body.String(), "generating NATS token")
}

func TestHandleHealth(t *testing.T) {
Expand Down
14 changes: 9 additions & 5 deletions auth-service/middleware.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,20 @@ import (
"github.com/hmchangw/chat/pkg/natsutil"
)

// requestIDMiddleware extracts X-Request-ID (or mints via idgen) and stores it on Gin keys, c.Request.Context() via natsutil, and the response header.
// requestIDMiddleware funnels HTTP X-Request-ID through idgen.ResolveRequestID
// (the same primitive the NATS path uses via natsutil.StampRequestID) so the
// mint-vs-pass-through policy has a single owner. Missing → silent mint;
// malformed → mint + Warn preserving the inbound value for traceability.
func requestIDMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
id := c.GetHeader(natsutil.RequestIDHeader)
if !idgen.IsValidUUID(id) {
id = idgen.GenerateRequestID()
}
inbound := c.GetHeader(natsutil.RequestIDHeader)
id, replaced := idgen.ResolveRequestID(inbound)
c.Set("request_id", id)
c.Request = c.Request.WithContext(natsutil.WithRequestID(c.Request.Context(), id))
c.Header(natsutil.RequestIDHeader, id)
if replaced {
slog.WarnContext(c.Request.Context(), "minted request_id (inbound invalid)", "inbound", inbound, "path", c.Request.URL.Path)
}
c.Next()
}
}
Expand Down
6 changes: 3 additions & 3 deletions broadcast-worker/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ func (h *Handler) handleCreated(ctx context.Context, evt *model.MessageEvent) er
case model.RoomTypeDM:
return h.publishDMEvents(ctx, meta, clientMsg, resolved.Accounts)
default:
slog.Warn("unknown room type, skipping fan-out", "type", meta.Type, "roomID", meta.ID)
slog.Warn("unknown room type, skipping fan-out", "type", meta.Type, "room_id", meta.ID)
return nil
}
}
Expand Down Expand Up @@ -251,14 +251,14 @@ func (h *Handler) publishMutation(ctx context.Context, room *model.Room, roomEvt
"type", roomEvtType,
"account", account,
"messageID", messageID,
"roomID", room.ID,
"room_id", room.ID,
)
}
}
return nil

default:
slog.Warn("unknown room type, skipping mutation fan-out", "type", room.Type, "roomID", room.ID)
slog.Warn("unknown room type, skipping mutation fan-out", "type", room.Type, "room_id", room.ID)
return nil
}
}
Expand Down
2 changes: 1 addition & 1 deletion broadcast-worker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ func main() {
<-sem
wg.Done()
}()
handlerCtx := natsutil.ContextWithRequestIDFromHeaders(msgCtx, msg.Headers())
handlerCtx, _ := natsutil.StampRequestID(msgCtx, msg.Headers(), msg.Subject())
if err := handler.HandleMessage(handlerCtx, msg.Data()); err != nil {
slog.Error("handle message failed", "error", err, "request_id", natsutil.RequestIDFromContext(handlerCtx))
if err := msg.Nak(); err != nil {
Expand Down
Loading
Loading