From 8f5a699fb50d7ea3825aa8a93ca9dee7af3b96af Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 15:37:39 +0000 Subject: [PATCH 01/14] feat: real-time thread reply fan-out + reply-count badge pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds end-to-end thread reply support across the chat pipeline: - broadcast-worker: fan out thread reply create/edit/delete events to thread followers (replyAccounts) and @-mentions; fan out tcount badge via handleThreadTCountUpdated. DM/BotDM replies go to all human members. - message-worker: persist thread replies with IF NOT EXISTS LWT + MapScanCAS idempotency; publish EventThreadReplyAdded with tcount from countAndSetParentTcount (COUNT-based, crash-safe, idempotent on redelivery). - history-service: edit/delete events carry ThreadParentMessageID + TShow; delete path recomputes tcount via countAndSetParentTcount (COUNT → blind SET). - search-sync-worker: skip EventThreadReplyAdded events (no searchable doc). - room-service: UpdateSubscriptionThreadRead made atomic (returns new array + alert in a single MongoDB pipeline update). - pkg/model: EventThreadReplyAdded, NewTCount, ThreadMetadataUpdatedEvent. - pkg/subject: badge events travel on MsgCanonicalCreated (.created subject). - docs: tcount COUNT-based implementation plan + client-api updates. tcount approach: replaces CAS increment/decrement with a full partition scan of thread_messages_by_thread (COUNT non-deleted rows → blind SET on parent rows). Eliminates 2PC crash window — any JetStream redelivery re-COUNTs and re-SETs, converging to the correct value. Known cost: O(N) scan per event. Follow-up PR will replace with a Cassandra COUNTER table + reconciliation job (see docs/superpowers/plans/2026-06-04-tcount-count-based.md §"Known Trade-offs and Future Work"). 
https://claude.ai/code/session_013Vs7CusvrZFrRKJaSoFtCi --- .gitignore | 1 + broadcast-worker/handler.go | 609 +++++++-- broadcast-worker/handler_test.go | 692 +++++++++- broadcast-worker/integration_test.go | 103 +- broadcast-worker/main.go | 6 +- broadcast-worker/mock_store_test.go | 19 +- broadcast-worker/store.go | 1 + broadcast-worker/store_mongo.go | 43 +- docs/client-api.md | 74 +- ...-05-28-broadcast-worker-thread-handling.md | 1156 +++++++++++++++++ .../plans/2026-06-04-tcount-count-based.md | 630 +++++++++ ...broadcast-worker-thread-handling-design.md | 215 +++ docs/thread-reply-notifications.md | 65 + history-service/internal/cassrepo/write.go | 162 +-- .../cassrepo/write_integration_test.go | 64 +- .../internal/publisher/publisher.go | 6 +- .../internal/service/integration_test.go | 69 + history-service/internal/service/messages.go | 98 +- .../internal/service/messages_test.go | 481 ++++++- .../internal/service/mocks/mock_repository.go | 14 +- history-service/internal/service/service.go | 8 +- inbox-worker/handler.go | 58 +- inbox-worker/handler_test.go | 171 +-- message-worker/handler.go | 40 +- message-worker/handler_test.go | 252 +++- message-worker/integration_test.go | 231 +++- message-worker/mock_store_test.go | 7 +- message-worker/store.go | 2 +- message-worker/store_cassandra.go | 217 ++-- message-worker/store_cassandra_test.go | 112 -- pkg/model/event.go | 57 +- pkg/model/model_test.go | 71 + room-service/handler.go | 14 +- room-service/handler_test.go | 40 +- room-service/integration_test.go | 43 +- room-service/mock_store_test.go | 14 +- room-service/store.go | 5 +- room-service/store_mongo.go | 54 +- room-worker/handler.go | 259 ++-- room-worker/handler_test.go | 360 +---- room-worker/integration_test.go | 2 +- search-sync-worker/messages.go | 6 + search-sync-worker/messages_test.go | 33 + 43 files changed, 5221 insertions(+), 1343 deletions(-) create mode 100644 docs/superpowers/plans/2026-05-28-broadcast-worker-thread-handling.md create 
mode 100644 docs/superpowers/plans/2026-06-04-tcount-count-based.md create mode 100644 docs/superpowers/specs/2026-05-28-broadcast-worker-thread-handling-design.md create mode 100644 docs/thread-reply-notifications.md delete mode 100644 message-worker/store_cassandra_test.go diff --git a/.gitignore b/.gitignore index 106713f6d..30b04ccf0 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,4 @@ chat-frontend/junit.xml tmp/ .air.*.toml /loadgen +.claude/worktrees/ diff --git a/broadcast-worker/handler.go b/broadcast-worker/handler.go index 5c0595871..2c0592ad6 100644 --- a/broadcast-worker/handler.go +++ b/broadcast-worker/handler.go @@ -8,8 +8,11 @@ import ( "log/slog" "time" + "golang.org/x/sync/errgroup" + "github.com/hmchangw/chat/pkg/mention" "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/roomcrypto" "github.com/hmchangw/chat/pkg/roomkeystore" "github.com/hmchangw/chat/pkg/roommetacache" @@ -73,15 +76,32 @@ func (h *Handler) HandleMessage(ctx context.Context, data []byte) error { return h.handleUnpinned(ctx, &evt) case model.EventReacted: return h.handleReacted(ctx, &evt) + case model.EventThreadReplyAdded: + return h.handleThreadTCountUpdated(ctx, &evt) default: - slog.Warn("unknown message event type, skipping", "event", evt.Event, "messageID", evt.Message.ID) + slog.WarnContext(ctx, "unknown message event type, skipping", + "event", evt.Event, + "messageID", evt.Message.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) return nil } } +// shouldUseThreadFanOut reports whether a message should be routed through the +// thread fan-out path (thread subscribers + @-mentions) rather than the room +// broadcast path. True when the message is a thread reply hidden from the main +// channel (TShow=false). 
+func shouldUseThreadFanOut(msg *model.Message) bool { + return msg.ThreadParentMessageID != "" && !msg.TShow +} + func (h *Handler) handleCreated(ctx context.Context, evt *model.MessageEvent) error { msg := evt.Message + if shouldUseThreadFanOut(&msg) { + return h.handleThreadCreated(ctx, evt) + } + // One user-store round-trip covers both mention enrichment and sender // enrichment: parse mentions, dedupe with the sender, fetch once, then // hand the resulting map to ResolveFromParsed (skips a second parse) and @@ -90,12 +110,11 @@ func (h *Handler) handleCreated(ctx context.Context, evt *model.MessageEvent) er lookupAccounts := dedupedAccounts(msg.UserAccount, parsed.Accounts) users, lookupErr := h.userStore.FindUsersByAccounts(ctx, lookupAccounts) if lookupErr != nil { - slog.Warn("user lookup failed, falling back to account", "error", lookupErr) - } - userByAccount := make(map[string]model.User, len(users)) - for i := range users { - userByAccount[users[i].Account] = users[i] + slog.WarnContext(ctx, "user lookup failed, falling back to account", + "error", lookupErr, + "request_id", natsutil.RequestIDFromContext(ctx)) } + userByAccount := usersByAccount(users) resolved := mention.ResolveFromParsed(parsed, userByAccount) @@ -113,80 +132,101 @@ func (h *Handler) handleCreated(ctx context.Context, evt *model.MessageEvent) er } } - // Room-metadata sys messages publish typed RoomEvents instead of new_message - // so mention/sender/encryption fields they don't carry stay off the wire. 
- switch msg.Type { - case model.MessageTypeRoomRenamed: - return h.publishRoomRenamedEvent(ctx, meta, &msg) - case model.MessageTypeRoomRestricted: - return h.publishRoomRestrictedEvent(ctx, meta, &msg) - } - clientMsg := buildClientMessage(&msg, userByAccount) switch meta.Type { case model.RoomTypeChannel: - return h.publishChannelEvent(ctx, meta, clientMsg, resolved.MentionAll, resolved.Participants) - case model.RoomTypeDM: - return h.publishDMEvents(ctx, meta, clientMsg, resolved.Accounts) + return h.publishChannelEvent(ctx, meta, clientMsg, evt.Timestamp, resolved.MentionAll, resolved.Participants) + case model.RoomTypeDM, model.RoomTypeBotDM: + return h.publishDMEvents(ctx, meta, clientMsg, evt.Timestamp, resolved.Accounts) default: - slog.Warn("unknown room type, skipping fan-out", "type", meta.Type, "room_id", meta.ID) + slog.WarnContext(ctx, "unknown room type, skipping fan-out", + "type", meta.Type, + "room_id", meta.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) return nil } } -func (h *Handler) publishRoomRenamedEvent(ctx context.Context, meta roommetacache.Meta, msg *model.Message) error { - var sys model.RoomRenamedSysData - if len(msg.SysMsgData) > 0 { - if err := json.Unmarshal(msg.SysMsgData, &sys); err != nil { - return fmt.Errorf("unmarshal room_renamed sysMsgData for room %s: %w", msg.RoomID, err) - } - } - evt := model.RoomRenamedRoomEvent{ - Type: model.RoomEventRoomRenamed, - RoomID: meta.ID, - SiteID: meta.SiteID, - Timestamp: time.Now().UTC().UnixMilli(), - NewName: sys.NewName, - ByAccount: sys.ByAccount, - RenamedAt: msg.CreatedAt, - } - payload, err := json.Marshal(evt) +func (h *Handler) handleThreadCreated(ctx context.Context, evt *model.MessageEvent) error { + msg := evt.Message + parentMsgID := msg.ThreadParentMessageID + + parsed := mention.Parse(msg.Content) + + // Fetch room type first so DM/BotDM rooms skip the thread-subscription query + // entirely — their fan-out uses ListSubscriptions, not thread subscribers. 
+ meta, err := h.store.GetRoomMeta(ctx, msg.RoomID) if err != nil { - return fmt.Errorf("marshal room_renamed event for room %s: %w", msg.RoomID, err) - } - if err := h.pub.Publish(ctx, subject.RoomEvent(meta.ID), payload); err != nil { - return fmt.Errorf("publish room_renamed event for room %s: %w", msg.RoomID, err) + return fmt.Errorf("get room meta %s: %w", msg.RoomID, err) } - return nil -} -func (h *Handler) publishRoomRestrictedEvent(ctx context.Context, meta roommetacache.Meta, msg *model.Message) error { - var sys model.RoomRestrictedSysData - if len(msg.SysMsgData) > 0 { - if err := json.Unmarshal(msg.SysMsgData, &sys); err != nil { - return fmt.Errorf("unmarshal room_restricted sysMsgData for room %s: %w", msg.RoomID, err) + // Channel rooms: only thread subscribers and @-mentioned accounts receive the + // event. Fetch the subscriber list and build fanOut before any further work. + var fanOut []string + if meta.Type == model.RoomTypeChannel { + fanOut, err = h.channelThreadFanOut(ctx, parentMsgID, msg.UserAccount, evt.SiteID, parsed.Accounts) + if err != nil { + return fmt.Errorf("channel thread fan-out for parent %s: %w", parentMsgID, err) + } + if len(fanOut) == 0 { + slog.DebugContext(ctx, "no thread subscribers to notify for thread reply", + "parentMessageID", parentMsgID, + "request_id", natsutil.RequestIDFromContext(ctx)) + return nil } } - evt := model.RoomRestrictedRoomEvent{ - Type: model.RoomEventRoomRestricted, - RoomID: meta.ID, - SiteID: meta.SiteID, - Timestamp: time.Now().UTC().UnixMilli(), - Restricted: sys.Restricted, - ExternalAccess: sys.ExternalAccess, - OwnerAccount: sys.OwnerAccount, - ByAccount: sys.ByAccount, - ChangedAt: msg.CreatedAt, - } - payload, err := json.Marshal(evt) - if err != nil { - return fmt.Errorf("marshal room_restricted event for room %s: %w", msg.RoomID, err) + + lookupAccounts := dedupedAccounts(msg.UserAccount, parsed.Accounts) + users, lookupErr := h.userStore.FindUsersByAccounts(ctx, lookupAccounts) + if 
lookupErr != nil { + slog.WarnContext(ctx, "user lookup failed for thread reply, falling back to account", + "error", lookupErr, + "parentMessageID", parentMsgID, + "request_id", natsutil.RequestIDFromContext(ctx)) } - if err := h.pub.Publish(ctx, subject.RoomEvent(meta.ID), payload); err != nil { - return fmt.Errorf("publish room_restricted event for room %s: %w", msg.RoomID, err) + userByAccount := usersByAccount(users) + + resolved := mention.ResolveFromParsed(parsed, userByAccount) + + clientMsg := buildClientMessage(&msg, userByAccount) + + switch meta.Type { + case model.RoomTypeChannel: + // Do NOT call SetSubscriptionMentions here: TShow=false replies are invisible + // in the main channel, so a room-level mention badge would appear with no + // visible message to explain it. + roomEvt := buildRoomEvent(meta, clientMsg, evt.Timestamp) + roomEvt.MentionAll = resolved.MentionAll + if len(resolved.Participants) > 0 { + roomEvt.Mentions = resolved.Participants + } + if err := h.encryptRoomEvent(ctx, meta.ID, clientMsg, &roomEvt); err != nil { + return fmt.Errorf("encrypt thread created event for parent %s: %w", parentMsgID, err) + } + payload, err := json.Marshal(roomEvt) + if err != nil { + return fmt.Errorf("marshal thread created event for parent %s: %w", parentMsgID, err) + } + return h.publishToThreadAccounts(ctx, fanOut, payload, parentMsgID) + case model.RoomTypeDM, model.RoomTypeBotDM: + // DM thread replies are visible to all members, so @-mention badges are correct. 
+ if len(resolved.Accounts) > 0 { + if err := h.store.SetSubscriptionMentions(ctx, meta.ID, resolved.Accounts); err != nil { + return fmt.Errorf("set subscription mentions: %w", err) + } + } + if err := h.store.UpdateRoomLastMessage(ctx, msg.RoomID, msg.ID, msg.CreatedAt, resolved.MentionAll); err != nil { + return fmt.Errorf("update room last message %s: %w", msg.RoomID, err) + } + return h.publishDMEvents(ctx, meta, clientMsg, evt.Timestamp, resolved.Accounts) + default: + slog.WarnContext(ctx, "unknown room type, skipping thread fan-out", + "type", meta.Type, + "room_id", meta.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) + return nil } - return nil } func (h *Handler) handleUpdated(ctx context.Context, evt *model.MessageEvent) error { @@ -195,52 +235,236 @@ func (h *Handler) handleUpdated(ctx context.Context, evt *model.MessageEvent) er return fmt.Errorf("updated event missing EditedAt or UpdatedAt: %s", msg.ID) } + if shouldUseThreadFanOut(&msg) { + return h.handleThreadUpdated(ctx, evt) + } + room, err := h.store.GetRoom(ctx, msg.RoomID) if err != nil { return fmt.Errorf("fetch room %s: %w", msg.RoomID, err) } - edit := model.EditRoomEvent{ - Type: model.RoomEventMessageEdited, - RoomID: room.ID, - SiteID: room.SiteID, - Timestamp: time.Now().UTC().UnixMilli(), - MessageID: msg.ID, - NewContent: msg.Content, - EditedBy: msg.UserAccount, - EditedAt: *msg.EditedAt, - UpdatedAt: *msg.UpdatedAt, - } + edit := buildEditRoomEvent(room, evt) if room.Type == model.RoomTypeChannel && h.encrypt { if err := h.encryptEditedContent(ctx, room.ID, &edit); err != nil { - return err + return fmt.Errorf("encrypt edit content for room %s: %w", room.ID, err) } } return h.publishMutation(ctx, room, model.RoomEventMessageEdited, msg.ID, &edit) } +func (h *Handler) handleThreadUpdated(ctx context.Context, evt *model.MessageEvent) error { + msg := evt.Message + if msg.EditedAt == nil || msg.UpdatedAt == nil { + return fmt.Errorf("updated event missing EditedAt or UpdatedAt 
for thread reply %s", msg.ID) + } + parentMsgID := msg.ThreadParentMessageID + + // GetRoom (not GetRoomMeta) so the DM/BotDM branch has room.Accounts for + // fan-out. Fetched first so the routing decision is made before any + // thread-follower lookup. + room, err := h.store.GetRoom(ctx, msg.RoomID) + if err != nil { + return fmt.Errorf("get room %s: %w", msg.RoomID, err) + } + + edit := buildEditRoomEvent(room, evt) + + switch room.Type { + case model.RoomTypeChannel: + parsed := mention.Parse(msg.Content) + fanOut, err := h.channelThreadFanOut(ctx, parentMsgID, msg.UserAccount, evt.SiteID, parsed.Accounts) + if err != nil { + return fmt.Errorf("channel thread fan-out for thread update of parent %s: %w", parentMsgID, err) + } + if len(fanOut) == 0 { + slog.DebugContext(ctx, "no thread subscribers to notify for thread update", + "parentMessageID", parentMsgID, + "request_id", natsutil.RequestIDFromContext(ctx)) + return nil + } + if h.encrypt { + if err := h.encryptEditedContent(ctx, room.ID, &edit); err != nil { + return fmt.Errorf("encrypt thread updated event for parent %s: %w", parentMsgID, err) + } + } + payload, err := json.Marshal(&edit) + if err != nil { + return fmt.Errorf("marshal thread edit event for parent %s: %w", parentMsgID, err) + } + return h.publishToThreadAccounts(ctx, fanOut, payload, parentMsgID) + case model.RoomTypeDM, model.RoomTypeBotDM: + // DM thread replies are visible to every member, so edits fan out to + // all members (consistent with handleThreadCreated), not just thread + // subscribers. 
+ return h.publishMutation(ctx, room, model.RoomEventMessageEdited, msg.ID, &edit) + default: + slog.WarnContext(ctx, "unknown room type, skipping thread update fan-out", + "type", room.Type, + "room_id", room.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) + return nil + } +} + +func (h *Handler) handleThreadDeleted(ctx context.Context, evt *model.MessageEvent) error { + msg := evt.Message + parentMsgID := msg.ThreadParentMessageID + + if msg.UpdatedAt == nil { + return fmt.Errorf("missing UpdatedAt for thread message %s", msg.ID) + } + + // GetRoom first so the routing decision (thread followers vs all DM + // members) is made from the authoritative room type and Accounts. + room, err := h.store.GetRoom(ctx, msg.RoomID) + if err != nil { + return fmt.Errorf("get room %s: %w", msg.RoomID, err) + } + + del := buildDeleteRoomEvent(room, evt) + + switch room.Type { + case model.RoomTypeChannel: + // Parse @-mentions from the deleted message so that non-follower + // recipients who received the create event (via mention fan-out) also + // receive the delete. Only the channel path uses mentions; the DM path + // fans out to all members. + parsed := mention.Parse(msg.Content) + fanOut, err := h.channelThreadFanOut(ctx, parentMsgID, msg.UserAccount, evt.SiteID, parsed.Accounts) + if err != nil { + return fmt.Errorf("channel thread fan-out for thread delete of parent %s: %w", parentMsgID, err) + } + if len(fanOut) > 0 { + payload, err := json.Marshal(&del) + if err != nil { + return fmt.Errorf("marshal thread delete event for parent %s: %w", parentMsgID, err) + } + if err := h.publishToThreadAccounts(ctx, fanOut, payload, parentMsgID); err != nil { + return fmt.Errorf("publish thread delete event for parent %s: %w", parentMsgID, err) + } + } + case model.RoomTypeDM, model.RoomTypeBotDM: + // DM thread replies are visible to every member, so deletes fan out to + // all members (consistent with handleThreadCreated), not just thread + // subscribers. 
+ if err := h.publishMutation(ctx, room, model.RoomEventMessageDeleted, msg.ID, &del); err != nil { + return fmt.Errorf("publish thread delete mutation for room %s message %s: %w", room.ID, msg.ID, err) + } + default: + slog.WarnContext(ctx, "unknown room type, skipping thread delete fan-out", + "type", room.Type, + "room_id", room.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) + // No return: the badge update below is safe for all room types; + // publishThreadMetadata handles unknown types by logging and skipping. + } + + // Badge (tcount) update applies to all room types. + if evt.NewTCount != nil { + h.publishThreadBadge(ctx, room, *evt.NewTCount, parentMsgID, msg.ID, evt.Timestamp) + } + + return nil +} + +func (h *Handler) handleThreadTCountUpdated(ctx context.Context, evt *model.MessageEvent) error { + if evt.NewTCount == nil { + slog.WarnContext(ctx, "thread_reply_added event missing NewTCount, skipping", + "messageID", evt.Message.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) + return nil + } + if evt.Message.ThreadParentMessageID == "" { + slog.WarnContext(ctx, "thread_reply_added event missing ThreadParentMessageID, skipping", + "messageID", evt.Message.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) + return nil + } + room, err := h.store.GetRoom(ctx, evt.Message.RoomID) + if err != nil { + return fmt.Errorf("get room %s: %w", evt.Message.RoomID, err) + } + return h.publishThreadMetadata(ctx, room, *evt.NewTCount, evt.Message.ThreadParentMessageID, evt.Message.ID, model.ThreadActionReplyAdded, evt.Timestamp) +} + +func (h *Handler) publishThreadMetadata(ctx context.Context, room *model.Room, newTcount int, + parentMsgID, replyMsgID string, action model.ThreadAction, timestamp int64) error { + evt := model.ThreadMetadataUpdatedEvent{ + Type: model.RoomEventThreadMetadataUpdated, + RoomID: room.ID, + SiteID: room.SiteID, + ParentMessageID: parentMsgID, + ReplyMessageID: replyMsgID, + NewTCount: newTcount, + Action: action, + 
Timestamp: timestamp, + } + payload, err := json.Marshal(evt) + if err != nil { + return fmt.Errorf("marshal thread metadata event for room %s: %w", room.ID, err) + } + switch room.Type { + case model.RoomTypeChannel: + if err := h.pub.Publish(ctx, subject.RoomEvent(room.ID), payload); err != nil { + return fmt.Errorf("publish thread metadata for channel room %s: %w", room.ID, err) + } + case model.RoomTypeDM, model.RoomTypeBotDM: + for _, account := range room.Accounts { + if isBot(account) { + continue + } + if err := h.pub.Publish(ctx, subject.UserRoomEvent(account), payload); err != nil { + return fmt.Errorf("publish thread metadata to DM member %s in room %s: %w", account, room.ID, err) + } + } + default: + slog.WarnContext(ctx, "unknown room type for thread metadata, skipping", + "type", room.Type, + "room_id", room.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) + } + return nil +} + func (h *Handler) handleDeleted(ctx context.Context, evt *model.MessageEvent) error { msg := evt.Message if msg.UpdatedAt == nil { return fmt.Errorf("deleted event missing UpdatedAt: %s", msg.ID) } + if shouldUseThreadFanOut(&msg) { + return h.handleThreadDeleted(ctx, evt) + } + room, err := h.store.GetRoom(ctx, msg.RoomID) if err != nil { return fmt.Errorf("fetch room %s: %w", msg.RoomID, err) } - del := model.DeleteRoomEvent{ - Type: model.RoomEventMessageDeleted, - RoomID: room.ID, - SiteID: room.SiteID, - Timestamp: time.Now().UTC().UnixMilli(), - MessageID: msg.ID, - DeletedBy: msg.UserAccount, - DeletedAt: *msg.UpdatedAt, - UpdatedAt: *msg.UpdatedAt, + del := buildDeleteRoomEvent(room, evt) + if err := h.publishMutation(ctx, room, model.RoomEventMessageDeleted, msg.ID, &del); err != nil { + return fmt.Errorf("publish delete mutation for room %s message %s: %w", room.ID, msg.ID, err) + } + // TShow=true thread replies appear in the main room (handled by publishMutation + // above) but still count toward the thread's reply-count badge. 
Since + // handleThreadDeleted is bypassed for TShow=true, we publish the badge update here. + if msg.ThreadParentMessageID != "" && evt.NewTCount != nil { + h.publishThreadBadge(ctx, room, *evt.NewTCount, msg.ThreadParentMessageID, msg.ID, evt.Timestamp) + } + return nil +} + +// publishThreadBadge publishes a thread-metadata badge update for a deleted +// reply. Errors are logged but not returned: badge updates are best-effort and +// JetStream will redeliver the parent event on failure. +func (h *Handler) publishThreadBadge(ctx context.Context, room *model.Room, newTCount int, parentMsgID, replyMsgID string, timestamp int64) { + if err := h.publishThreadMetadata(ctx, room, newTCount, parentMsgID, replyMsgID, model.ThreadActionReplyDeleted, timestamp); err != nil { + slog.ErrorContext(ctx, "publish thread badge for deleted reply failed", + "error", err, + "parentMessageID", parentMsgID, + "request_id", natsutil.RequestIDFromContext(ctx)) } - return h.publishMutation(ctx, room, model.RoomEventMessageDeleted, msg.ID, &del) } func (h *Handler) handlePinned(ctx context.Context, evt *model.MessageEvent) error { @@ -258,7 +482,7 @@ func (h *Handler) handlePinned(ctx context.Context, evt *model.MessageEvent) err Type: model.RoomEventMessagePinned, RoomID: room.ID, SiteID: room.SiteID, - Timestamp: time.Now().UTC().UnixMilli(), + Timestamp: evt.Timestamp, MessageID: msg.ID, PinnedBy: msg.PinnedBy, PinnedAt: *msg.PinnedAt, @@ -281,7 +505,7 @@ func (h *Handler) handleUnpinned(ctx context.Context, evt *model.MessageEvent) e Type: model.RoomEventMessageUnpinned, RoomID: room.ID, SiteID: room.SiteID, - Timestamp: time.Now().UTC().UnixMilli(), + Timestamp: evt.Timestamp, MessageID: msg.ID, UnpinnedBy: msg.PinnedBy, UnpinnedAt: time.UnixMilli(evt.Timestamp).UTC(), @@ -353,27 +577,60 @@ func (h *Handler) publishMutation(ctx context.Context, room *model.Room, roomEvt continue } if err := h.pub.Publish(ctx, subject.UserRoomEvent(account), payload); err != nil { - 
slog.Error("publish DM mutation event failed", + slog.ErrorContext(ctx, "publish DM mutation event failed", "error", err, "type", roomEvtType, "account", account, "messageID", messageID, "room_id", room.ID, + "request_id", natsutil.RequestIDFromContext(ctx), ) } } return nil default: - slog.Warn("unknown room type, skipping mutation fan-out", "type", room.Type, "room_id", room.ID) + slog.WarnContext(ctx, "unknown room type, skipping mutation fan-out", + "type", room.Type, + "room_id", room.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) return nil } } +func buildEditRoomEvent(room *model.Room, evt *model.MessageEvent) model.EditRoomEvent { + msg := evt.Message + return model.EditRoomEvent{ + Type: model.RoomEventMessageEdited, + RoomID: room.ID, + SiteID: room.SiteID, + Timestamp: evt.Timestamp, + MessageID: msg.ID, + NewContent: msg.Content, + EditedBy: msg.UserAccount, + EditedAt: *msg.EditedAt, + UpdatedAt: *msg.UpdatedAt, + } +} + +func buildDeleteRoomEvent(room *model.Room, evt *model.MessageEvent) model.DeleteRoomEvent { + msg := evt.Message + return model.DeleteRoomEvent{ + Type: model.RoomEventMessageDeleted, + RoomID: room.ID, + SiteID: room.SiteID, + Timestamp: evt.Timestamp, + MessageID: msg.ID, + DeletedBy: msg.UserAccount, + DeletedAt: *msg.UpdatedAt, + UpdatedAt: *msg.UpdatedAt, + } +} + func (h *Handler) encryptEditedContent(ctx context.Context, roomID string, edited *model.EditRoomEvent) error { key, err := h.currentRoomKey(ctx, roomID) if err != nil { - return err + return fmt.Errorf("get encryption key for room %s: %w", roomID, err) } encrypted, err := h.encoder.Encode(roomID, edited.NewContent, key.KeyPair.PrivateKey, key.Version) if err != nil { @@ -401,48 +658,50 @@ func (h *Handler) currentRoomKey(ctx context.Context, roomID string) (*roomkeyst return key, nil } -func (h *Handler) publishChannelEvent(ctx context.Context, meta roommetacache.Meta, clientMsg *model.ClientMessage, mentionAll bool, mentions []model.Participant) error { - 
evt := buildRoomEvent(meta, clientMsg) +// encryptRoomEvent applies room encryption to evt if h.encrypt is true, +// replacing evt.Message with an EncryptedMessage envelope built from clientMsg. +func (h *Handler) encryptRoomEvent(ctx context.Context, roomID string, clientMsg *model.ClientMessage, evt *model.RoomEvent) error { + if !h.encrypt { + return nil + } + msgJSON, err := json.Marshal(clientMsg) + if err != nil { + return fmt.Errorf("marshal client message for room %s: %w", roomID, err) + } + key, err := h.currentRoomKey(ctx, roomID) + if err != nil { + return fmt.Errorf("get encryption key for room %s: %w", roomID, err) + } + encrypted, err := h.encoder.Encode(roomID, string(msgJSON), key.KeyPair.PrivateKey, key.Version) + if err != nil { + return fmt.Errorf("encrypt message for room %s: %w", roomID, err) + } + encJSON, err := json.Marshal(encrypted) + if err != nil { + return fmt.Errorf("marshal encrypted message for room %s: %w", roomID, err) + } + evt.EncryptedMessage = json.RawMessage(encJSON) + evt.Message = nil + return nil +} + +func (h *Handler) publishChannelEvent(ctx context.Context, meta roommetacache.Meta, clientMsg *model.ClientMessage, timestamp int64, mentionAll bool, mentions []model.Participant) error { + evt := buildRoomEvent(meta, clientMsg, timestamp) evt.MentionAll = mentionAll if len(mentions) > 0 { evt.Mentions = mentions } - - if h.encrypt { - msgJSON, err := json.Marshal(clientMsg) - if err != nil { - return fmt.Errorf("marshal client message: %w", err) - } - - key, err := h.currentRoomKey(ctx, meta.ID) - if err != nil { - return err - } - - encrypted, err := h.encoder.Encode(meta.ID, string(msgJSON), key.KeyPair.PrivateKey, key.Version) - if err != nil { - return fmt.Errorf("encrypt message for room %s: %w", meta.ID, err) - } - - encJSON, err := json.Marshal(encrypted) - if err != nil { - return fmt.Errorf("marshal encrypted message: %w", err) - } - - evt.EncryptedMessage = json.RawMessage(encJSON) - evt.Message = nil + if err := 
h.encryptRoomEvent(ctx, meta.ID, clientMsg, &evt); err != nil { + return fmt.Errorf("encrypt channel event for room %s: %w", meta.ID, err) } - // when h.encrypt is false, evt.Message is already set by buildRoomEvent - payload, err := json.Marshal(evt) if err != nil { return fmt.Errorf("marshal channel event: %w", err) } - return h.pub.Publish(ctx, subject.RoomEvent(meta.ID), payload) } -func (h *Handler) publishDMEvents(ctx context.Context, meta roommetacache.Meta, clientMsg *model.ClientMessage, mentionedAccounts []string) error { +func (h *Handler) publishDMEvents(ctx context.Context, meta roommetacache.Meta, clientMsg *model.ClientMessage, timestamp int64, mentionedAccounts []string) error { subs, err := h.store.ListSubscriptions(ctx, meta.ID) if err != nil { return fmt.Errorf("list subscriptions for DM room %s: %w", meta.ID, err) @@ -454,27 +713,43 @@ func (h *Handler) publishDMEvents(ctx context.Context, meta roommetacache.Meta, } for i := range subs { - _, hasMention := mentionSet[subs[i].User.Account] + account := subs[i].User.Account + // Skip bots: live UI events go to human clients only, consistent with + // publishMutation and publishThreadMetadata. Bots receive messages via + // their own server-side integration, not the websocket event channel. + if isBot(account) { + continue + } + _, hasMention := mentionSet[account] - evt := buildRoomEvent(meta, clientMsg) + evt := buildRoomEvent(meta, clientMsg, timestamp) evt.HasMention = hasMention payload, err := json.Marshal(evt) if err != nil { - return fmt.Errorf("marshal DM event for user %s: %w", subs[i].User.Account, err) + return fmt.Errorf("marshal DM event for user %s: %w", account, err) } - if err := h.pub.Publish(ctx, subject.UserRoomEvent(subs[i].User.Account), payload); err != nil { - slog.Error("publish DM event failed", "error", err, "account", subs[i].User.Account) + // Publish errors are intentionally swallowed here (log-and-continue). 
DM events + // (thread replies included) have no JetStream retry guarantee by design — this + // function is fire-and-forget, consistent with the DM fan-out in + // publishMutation. Channel thread events instead propagate errors + // via publishToThreadAccounts so JetStream can redeliver. + if err := h.pub.Publish(ctx, subject.UserRoomEvent(account), payload); err != nil { + slog.ErrorContext(ctx, "publish DM event failed", + "error", err, + "account", account, + "room_id", meta.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) } } return nil } -func buildRoomEvent(meta roommetacache.Meta, clientMsg *model.ClientMessage) model.RoomEvent { +func buildRoomEvent(meta roommetacache.Meta, clientMsg *model.ClientMessage, timestamp int64) model.RoomEvent { return model.RoomEvent{ Type: model.RoomEventNewMessage, RoomID: meta.ID, - Timestamp: time.Now().UTC().UnixMilli(), + Timestamp: timestamp, RoomName: meta.Name, RoomType: meta.Type, SiteID: meta.SiteID, @@ -502,3 +777,81 @@ func buildClientMessage(msg *model.Message, userMap map[string]model.User) *mode Sender: &sender, } } + +// publishToThreadAccounts publishes payload concurrently to every account in +// the list using an errgroup. On publish failure it logs and returns the error +// so the caller can propagate it to JetStream for redelivery — thread per-user +// events must have the same retry guarantee as room-channel events.
+func (h *Handler) publishToThreadAccounts(ctx context.Context, accounts []string, payload []byte, parentMsgID string) error { + if len(accounts) == 0 { + return nil + } + g, gctx := errgroup.WithContext(ctx) + for _, account := range accounts { + account := account + g.Go(func() error { + if err := h.pub.Publish(gctx, subject.UserRoomEvent(account), payload); err != nil { + slog.ErrorContext(gctx, "publish thread event failed", + "error", err, + "account", account, + "parentMessageID", parentMsgID, + "request_id", natsutil.RequestIDFromContext(gctx)) + return fmt.Errorf("publish thread event to %s for parent %s: %w", account, parentMsgID, err) + } + return nil + }) + } + return g.Wait() +} + +// threadFanOutAccounts builds the deduplicated fan-out recipient list for +// a thread event. senderAccount is always excluded. extraAccounts +// (e.g. @mentioned users from the message payload) are added after the +// follower pass. +func threadFanOutAccounts(senderAccount string, followers map[string]struct{}, extraAccounts []string) []string { + seen := map[string]struct{}{senderAccount: {}} + var fanOut []string + for acc := range followers { + if _, ok := seen[acc]; ok { + continue + } + if isBot(acc) { + continue + } + seen[acc] = struct{}{} + fanOut = append(fanOut, acc) + } + for _, acc := range extraAccounts { + if _, ok := seen[acc]; ok { + continue + } + if isBot(acc) { + continue + } + seen[acc] = struct{}{} + fanOut = append(fanOut, acc) + } + return fanOut +} + +// channelThreadFanOut resolves the deduplicated recipient list for a channel +// thread event: it fetches the parent message's thread followers and merges +// them with the @-mentioned accounts, excluding the sender. Shared by the +// channel branch of every thread handler (created/updated/deleted). 
+func (h *Handler) channelThreadFanOut(ctx context.Context, parentMsgID, sender, siteID string, mentions []string) ([]string, error) { + followers, err := h.store.GetThreadFollowers(ctx, parentMsgID, siteID) + if err != nil { + return nil, fmt.Errorf("get thread followers for parent %s: %w", parentMsgID, err) + } + return threadFanOutAccounts(sender, followers, mentions), nil +} + +// usersByAccount indexes a slice of users by their Account for O(1) lookup +// during mention resolution and client-message enrichment. +func usersByAccount(users []model.User) map[string]model.User { + byAccount := make(map[string]model.User, len(users)) + for i := range users { + byAccount[users[i].Account] = users[i] + } + return byAccount +} diff --git a/broadcast-worker/handler_test.go b/broadcast-worker/handler_test.go index 93ef0555b..d3415898e 100644 --- a/broadcast-worker/handler_test.go +++ b/broadcast-worker/handler_test.go @@ -5,6 +5,7 @@ import ( "encoding/json" "errors" "fmt" + "sync" "testing" "time" @@ -25,10 +26,13 @@ type publishRecord struct { } type mockPublisher struct { + mu sync.Mutex records []publishRecord } func (m *mockPublisher) Publish(_ context.Context, subj string, data []byte) error { + m.mu.Lock() + defer m.mu.Unlock() m.records = append(m.records, publishRecord{subject: subj, data: data}) return nil } @@ -71,8 +75,9 @@ func metaOf(r *model.Room) roommetacache.Meta { func makeMessageEvent(roomID, content string, msgTime time.Time) []byte { evt := model.MessageEvent{ - Event: model.EventCreated, - SiteID: "site-a", + Event: model.EventCreated, + SiteID: "site-a", + Timestamp: msgTime.UnixMilli(), Message: model.Message{ ID: "msg-1", RoomID: roomID, UserID: "user-1", UserAccount: "sender", Content: content, CreatedAt: msgTime, @@ -239,7 +244,7 @@ func TestHandler_HandleMessage_ChannelRoom(t *testing.T) { assert.Equal(t, "site-a", evt.SiteID) assert.Equal(t, 5, evt.UserCount) assert.Equal(t, "msg-1", evt.LastMsgID) - assert.Greater(t, evt.Timestamp, 
int64(0)) + assert.Equal(t, msgTime.UnixMilli(), evt.Timestamp) assert.Equal(t, tc.wantMentionAll, evt.MentionAll) assert.Equal(t, "msg-1", msg.ID) @@ -299,8 +304,9 @@ func TestHandler_HandleMessage_DMRoom(t *testing.T) { pub := &mockPublisher{} evt := model.MessageEvent{ - Event: model.EventCreated, - SiteID: "site-a", + Event: model.EventCreated, + SiteID: "site-a", + Timestamp: msgTime.UnixMilli(), Message: model.Message{ ID: "msg-1", RoomID: "dm-1", UserID: "alice-id", UserAccount: "alice", Content: tc.content, CreatedAt: msgTime, @@ -340,7 +346,7 @@ func TestHandler_HandleMessage_DMRoom(t *testing.T) { aliceEvt := evtBySubject[subject.UserRoomEvent("alice")] assert.Equal(t, model.RoomEventNewMessage, aliceEvt.Type) - assert.Greater(t, aliceEvt.Timestamp, int64(0)) + assert.Equal(t, msgTime.UnixMilli(), aliceEvt.Timestamp) require.NotNil(t, aliceEvt.Message, "DM events must carry Message payload") assert.Equal(t, "msg-1", aliceEvt.Message.ID) require.NotNil(t, aliceEvt.Message.Sender) @@ -350,7 +356,7 @@ func TestHandler_HandleMessage_DMRoom(t *testing.T) { bobEvt := evtBySubject[subject.UserRoomEvent("bob")] require.NotNil(t, bobEvt.Message) - assert.Greater(t, bobEvt.Timestamp, int64(0)) + assert.Equal(t, msgTime.UnixMilli(), bobEvt.Timestamp) assert.Equal(t, "msg-1", bobEvt.Message.ID) require.NotNil(t, bobEvt.Message.Sender) assert.Equal(t, tc.bobHasMention, bobEvt.HasMention) @@ -531,12 +537,15 @@ func TestHandler_HandleMessage_Errors(t *testing.T) { } type failingPublisher struct { + mu sync.Mutex callCount int failAfter int records []publishRecord } func (p *failingPublisher) Publish(_ context.Context, subj string, data []byte) error { + p.mu.Lock() + defer p.mu.Unlock() p.callCount++ if p.callCount > p.failAfter { return errors.New("publish failed") @@ -1493,3 +1502,672 @@ func TestHandleUnpinned_DMRoom_FansOutToBothMembers(t *testing.T) { assert.True(t, subjects[subject.UserRoomEvent("alice")]) assert.True(t, subjects[subject.UserRoomEvent("bob")]) } + 
+// --------------------------------------------------------------------------- +// Thread handler tests +// --------------------------------------------------------------------------- + +func TestThreadFanOutAccounts(t *testing.T) { + tests := []struct { + name string + sender string + followers map[string]struct{} + extraAccounts []string + want []string + }{ + { + name: "no followers no extras", + sender: "alice", + followers: map[string]struct{}{}, + extraAccounts: nil, + want: nil, + }, + { + name: "sender excluded from followers", + sender: "alice", + followers: map[string]struct{}{"alice": {}, "bob": {}}, + want: []string{"bob"}, + }, + { + name: "extra accounts merged deduped", + sender: "alice", + followers: map[string]struct{}{"bob": {}}, + extraAccounts: []string{"bob", "carol"}, + want: []string{"bob", "carol"}, + }, + { + name: "bot accounts skipped", + sender: "alice", + followers: map[string]struct{}{"helper.bot": {}, "bob": {}}, + extraAccounts: []string{"other.bot"}, + want: []string{"bob"}, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := threadFanOutAccounts(tc.sender, tc.followers, tc.extraAccounts) + assert.ElementsMatch(t, tc.want, got) + }) + } +} + +func TestHandleMessage_ThreadReplyAdded_DispatchesToHandleThreadTCountUpdated(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + tcount := 3 + room := &model.Room{ID: "r1", Type: model.RoomTypeChannel, SiteID: "site-a"} + store.EXPECT().GetRoom(gomock.Any(), "r1").Return(room, nil) + + evt := model.MessageEvent{ + Event: model.EventThreadReplyAdded, + SiteID: "site-a", + Timestamp: msgTime.UnixMilli(), + NewTCount: &tcount, + Message: model.Message{ + ID: "reply-1", + RoomID: "r1", + UserAccount: "alice", + ThreadParentMessageID: "parent-1", + CreatedAt: msgTime, + }, + } 
+ data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + + require.Len(t, pub.records, 1) + var tmEvt model.ThreadMetadataUpdatedEvent + require.NoError(t, json.Unmarshal(pub.records[0].data, &tmEvt)) + assert.Equal(t, model.RoomEventThreadMetadataUpdated, tmEvt.Type) + assert.Equal(t, "r1", tmEvt.RoomID) + assert.Equal(t, "site-a", tmEvt.SiteID) + assert.Equal(t, "parent-1", tmEvt.ParentMessageID) + assert.Equal(t, "reply-1", tmEvt.ReplyMessageID) + assert.Equal(t, 3, tmEvt.NewTCount) + assert.Equal(t, model.ThreadActionReplyAdded, tmEvt.Action) + assert.Equal(t, msgTime.UnixMilli(), tmEvt.Timestamp) +} + +func TestHandleThreadTCountUpdated_MissingNewTCount_Skips(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + // No store calls expected — event is silently dropped. 
+ + msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + evt := model.MessageEvent{ + Event: model.EventThreadReplyAdded, + SiteID: "site-a", + Timestamp: msgTime.UnixMilli(), + // NewTCount intentionally nil + Message: model.Message{ + ID: "reply-1", + RoomID: "r1", + UserAccount: "alice", + ThreadParentMessageID: "parent-1", + CreatedAt: msgTime, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + assert.Empty(t, pub.records) +} + +func TestHandleThreadTCountUpdated_MissingParentMessageID_Skips(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + tcount := 2 + evt := model.MessageEvent{ + Event: model.EventThreadReplyAdded, + SiteID: "site-a", + Timestamp: msgTime.UnixMilli(), + NewTCount: &tcount, + Message: model.Message{ + ID: "reply-1", + RoomID: "r1", + UserAccount: "alice", + // ThreadParentMessageID intentionally empty + CreatedAt: msgTime, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + assert.Empty(t, pub.records) +} + +func TestHandleThreadTCountUpdated_GetRoomError_ReturnsError(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + tcount := 2 + store.EXPECT().GetRoom(gomock.Any(), "r1").Return(nil, errors.New("db error")) + + evt := model.MessageEvent{ + Event: model.EventThreadReplyAdded, + SiteID: "site-a", + Timestamp: msgTime.UnixMilli(), + NewTCount: &tcount, + Message: model.Message{ + ID: "reply-1", + RoomID: "r1", + UserAccount: "alice", + 
ThreadParentMessageID: "parent-1", + CreatedAt: msgTime, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + err := h.HandleMessage(context.Background(), data) + require.Error(t, err) + assert.Contains(t, err.Error(), "get room") + assert.Empty(t, pub.records) +} + +func TestHandleThreadCreated_ChannelRoom_FansOutToFollowers(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + parentMsgID := "parent-1" + siteID := "site-a" + roomID := "r1" + + followers := map[string]struct{}{"bob": {}, "carol": {}} + store.EXPECT().GetRoomMeta(gomock.Any(), roomID).Return(metaOf(testChannelRoom), nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), parentMsgID, siteID).Return(followers, nil) + us.EXPECT().FindUsersByAccounts(gomock.Any(), []string{"alice"}).Return([]model.User{testUsers[0]}, nil) + + evt := model.MessageEvent{ + Event: model.EventCreated, + SiteID: siteID, + Timestamp: msgTime.UnixMilli(), + Message: model.Message{ + ID: "reply-1", + RoomID: roomID, + UserID: "u-alice", + UserAccount: "alice", + Content: "a thread reply", + CreatedAt: msgTime, + ThreadParentMessageID: parentMsgID, + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + + // bob and carol (followers), alice (sender) excluded + require.Len(t, pub.records, 2) + subjects := map[string]bool{} + for _, r := range pub.records { + subjects[r.subject] = true + var roomEvt model.RoomEvent + require.NoError(t, json.Unmarshal(r.data, &roomEvt)) + assert.Equal(t, model.RoomEventNewMessage, roomEvt.Type) + assert.Equal(t, msgTime.UnixMilli(), roomEvt.Timestamp) + } + assert.True(t, subjects[subject.UserRoomEvent("bob")]) + assert.True(t, 
subjects[subject.UserRoomEvent("carol")]) +} + +func TestHandleThreadCreated_ChannelRoom_NoFollowers_Skips(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + + store.EXPECT().GetRoomMeta(gomock.Any(), "r1").Return(metaOf(testChannelRoom), nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), "parent-1", "site-a").Return(map[string]struct{}{}, nil) + + evt := model.MessageEvent{ + Event: model.EventCreated, + SiteID: "site-a", + Timestamp: msgTime.UnixMilli(), + Message: model.Message{ + ID: "reply-1", + RoomID: "r1", + UserAccount: "alice", + Content: "hello", + CreatedAt: msgTime, + ThreadParentMessageID: "parent-1", + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + assert.Empty(t, pub.records, "no followers → nothing published") +} + +func TestHandleThreadCreated_DMRoom_FansOutToAllMembers(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 11, 0, 0, 0, time.UTC) + + store.EXPECT().GetRoomMeta(gomock.Any(), "dm-1").Return(metaOf(testDMRoom), nil) + store.EXPECT().UpdateRoomLastMessage(gomock.Any(), "dm-1", "reply-1", msgTime, false).Return(nil) + store.EXPECT().ListSubscriptions(gomock.Any(), "dm-1").Return(testDMSubs, nil) + us.EXPECT().FindUsersByAccounts(gomock.Any(), []string{"alice"}).Return([]model.User{testUsers[0]}, nil) + + evt := model.MessageEvent{ + Event: model.EventCreated, + SiteID: "site-a", + Timestamp: msgTime.UnixMilli(), + Message: model.Message{ + ID: "reply-1", + RoomID: "dm-1", + UserID: "u-alice", + UserAccount: "alice", + Content: "thread reply in DM", + 
CreatedAt: msgTime, + ThreadParentMessageID: "parent-dm", + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + + require.Len(t, pub.records, 2, "DM thread reply fans out to all members") + subjects := map[string]bool{} + for _, r := range pub.records { + subjects[r.subject] = true + } + assert.True(t, subjects[subject.UserRoomEvent("alice")]) + assert.True(t, subjects[subject.UserRoomEvent("bob")]) +} + +func TestHandleThreadCreated_DMRoom_WithMention_SetsMentions(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 11, 0, 0, 0, time.UTC) + + store.EXPECT().GetRoomMeta(gomock.Any(), "dm-1").Return(metaOf(testDMRoom), nil) + store.EXPECT().UpdateRoomLastMessage(gomock.Any(), "dm-1", "reply-1", msgTime, false).Return(nil) + store.EXPECT().SetSubscriptionMentions(gomock.Any(), "dm-1", []string{"bob"}).Return(nil) + store.EXPECT().ListSubscriptions(gomock.Any(), "dm-1").Return(testDMSubs, nil) + us.EXPECT().FindUsersByAccounts(gomock.Any(), []string{"alice", "bob"}).Return(testUsers, nil) + + evt := model.MessageEvent{ + Event: model.EventCreated, + SiteID: "site-a", + Timestamp: msgTime.UnixMilli(), + Message: model.Message{ + ID: "reply-1", + RoomID: "dm-1", + UserID: "u-alice", + UserAccount: "alice", + Content: "hey @bob", + CreatedAt: msgTime, + ThreadParentMessageID: "parent-dm", + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + require.Len(t, pub.records, 2) +} + +func TestHandleThreadUpdated_ChannelRoom_FansOutToFollowers(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := 
&mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + editedAt := msgTime.Add(time.Minute) + parentMsgID := "parent-1" + siteID := "site-a" + roomID := "r1" + + room := &model.Room{ID: roomID, Type: model.RoomTypeChannel, SiteID: siteID} + followers := map[string]struct{}{"bob": {}, "carol": {}} + store.EXPECT().GetRoom(gomock.Any(), roomID).Return(room, nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), parentMsgID, siteID).Return(followers, nil) + + evt := model.MessageEvent{ + Event: model.EventUpdated, + SiteID: siteID, + Timestamp: editedAt.UnixMilli(), + Message: model.Message{ + ID: "reply-1", + RoomID: roomID, + UserAccount: "alice", + Content: "updated thread reply", + CreatedAt: msgTime, + EditedAt: &editedAt, + UpdatedAt: &editedAt, + ThreadParentMessageID: parentMsgID, + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + + require.Len(t, pub.records, 2) + for _, r := range pub.records { + var roomEvt model.EditRoomEvent + require.NoError(t, json.Unmarshal(r.data, &roomEvt)) + assert.Equal(t, model.RoomEventMessageEdited, roomEvt.Type) + assert.Equal(t, "reply-1", roomEvt.MessageID) + assert.Equal(t, "updated thread reply", roomEvt.NewContent) + assert.Equal(t, editedAt.UnixMilli(), roomEvt.Timestamp) + } +} + +func TestHandleThreadUpdated_ChannelRoom_GetThreadFollowersError(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + editedAt := msgTime.Add(time.Minute) + + room := &model.Room{ID: "r1", Type: model.RoomTypeChannel, SiteID: "site-a"} + store.EXPECT().GetRoom(gomock.Any(), "r1").Return(room, nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), "parent-1", 
"site-a").Return(nil, errors.New("db error")) + + evt := model.MessageEvent{ + Event: model.EventUpdated, + SiteID: "site-a", + Timestamp: editedAt.UnixMilli(), + Message: model.Message{ + ID: "reply-1", + RoomID: "r1", + UserAccount: "alice", + Content: "edit", + CreatedAt: msgTime, + EditedAt: &editedAt, + UpdatedAt: &editedAt, + ThreadParentMessageID: "parent-1", + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + err := h.HandleMessage(context.Background(), data) + require.Error(t, err) + assert.Contains(t, err.Error(), "thread fan-out") + assert.Empty(t, pub.records) +} + +func TestHandleThreadUpdated_DMRoom_FansOutToAllMembers(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 11, 0, 0, 0, time.UTC) + editedAt := msgTime.Add(time.Minute) + + room := &model.Room{ + ID: "dm-alice-bob", + Type: model.RoomTypeDM, + SiteID: "site-a", + Accounts: []string{"alice", "bob"}, + } + store.EXPECT().GetRoom(gomock.Any(), "dm-alice-bob").Return(room, nil) + + evt := model.MessageEvent{ + Event: model.EventUpdated, + SiteID: "site-a", + Timestamp: editedAt.UnixMilli(), + Message: model.Message{ + ID: "reply-1", + RoomID: "dm-alice-bob", + UserAccount: "alice", + Content: "dm thread edit", + CreatedAt: msgTime, + EditedAt: &editedAt, + UpdatedAt: &editedAt, + ThreadParentMessageID: "parent-dm", + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + + require.Len(t, pub.records, 2) + subjects := map[string]bool{} + for _, r := range pub.records { + subjects[r.subject] = true + var roomEvt model.EditRoomEvent + require.NoError(t, json.Unmarshal(r.data, &roomEvt)) + assert.Equal(t, model.RoomEventMessageEdited, roomEvt.Type) + 
assert.Equal(t, "dm thread edit", roomEvt.NewContent) + } + assert.True(t, subjects[subject.UserRoomEvent("alice")]) + assert.True(t, subjects[subject.UserRoomEvent("bob")]) +} + +func TestHandleThreadDeleted_ChannelRoom_FansOutToFollowers(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + deletedAt := msgTime.Add(time.Minute) + parentMsgID := "parent-1" + siteID := "site-a" + roomID := "r1" + + room := &model.Room{ID: roomID, Type: model.RoomTypeChannel, SiteID: siteID} + followers := map[string]struct{}{"bob": {}, "carol": {}} + store.EXPECT().GetRoom(gomock.Any(), roomID).Return(room, nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), parentMsgID, siteID).Return(followers, nil) + // No NewTCount → no badge update. + + evt := model.MessageEvent{ + Event: model.EventDeleted, + SiteID: siteID, + Timestamp: deletedAt.UnixMilli(), + Message: model.Message{ + ID: "reply-1", + RoomID: roomID, + UserAccount: "alice", + Content: "deleted thread reply", + CreatedAt: msgTime, + UpdatedAt: &deletedAt, + ThreadParentMessageID: parentMsgID, + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + + require.Len(t, pub.records, 2) + for _, r := range pub.records { + var roomEvt model.DeleteRoomEvent + require.NoError(t, json.Unmarshal(r.data, &roomEvt)) + assert.Equal(t, model.RoomEventMessageDeleted, roomEvt.Type) + assert.Equal(t, "reply-1", roomEvt.MessageID) + assert.Equal(t, deletedAt.UnixMilli(), roomEvt.Timestamp) + } +} + +func TestHandleThreadDeleted_ChannelRoom_WithBadgeUpdate(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + 
msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) + deletedAt := msgTime.Add(time.Minute) + tcount := 4 + + room := &model.Room{ID: "r1", Type: model.RoomTypeChannel, SiteID: "site-a"} + store.EXPECT().GetRoom(gomock.Any(), "r1").Return(room, nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), "parent-1", "site-a").Return(map[string]struct{}{"bob": {}}, nil) + + evt := model.MessageEvent{ + Event: model.EventDeleted, + SiteID: "site-a", + Timestamp: deletedAt.UnixMilli(), + NewTCount: &tcount, + Message: model.Message{ + ID: "reply-1", + RoomID: "r1", + UserAccount: "alice", + Content: "", + CreatedAt: msgTime, + UpdatedAt: &deletedAt, + ThreadParentMessageID: "parent-1", + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + + // 1 delete event (to bob) + 1 badge update (to room channel) + require.Len(t, pub.records, 2) + var sawDelete, sawBadge bool + for _, r := range pub.records { + if r.subject == subject.RoomEvent("r1") { + var tmEvt model.ThreadMetadataUpdatedEvent + require.NoError(t, json.Unmarshal(r.data, &tmEvt)) + assert.Equal(t, model.ThreadActionReplyDeleted, tmEvt.Action) + assert.Equal(t, 4, tmEvt.NewTCount) + sawBadge = true + } else { + var roomEvt model.DeleteRoomEvent + require.NoError(t, json.Unmarshal(r.data, &roomEvt)) + assert.Equal(t, model.RoomEventMessageDeleted, roomEvt.Type) + sawDelete = true + } + } + assert.True(t, sawDelete, "delete event must be published to follower") + assert.True(t, sawBadge, "badge update must be published to room channel") +} + +func TestHandleThreadDeleted_DMRoom_FansOutToAllMembers(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + msgTime := time.Date(2026, 4, 1, 11, 0, 0, 0, time.UTC) + deletedAt := msgTime.Add(time.Minute) + + room := 
&model.Room{ + ID: "dm-alice-bob", + Type: model.RoomTypeDM, + SiteID: "site-a", + Accounts: []string{"alice", "bob"}, + } + store.EXPECT().GetRoom(gomock.Any(), "dm-alice-bob").Return(room, nil) + + evt := model.MessageEvent{ + Event: model.EventDeleted, + SiteID: "site-a", + Timestamp: deletedAt.UnixMilli(), + Message: model.Message{ + ID: "reply-1", + RoomID: "dm-alice-bob", + UserAccount: "alice", + CreatedAt: msgTime, + UpdatedAt: &deletedAt, + ThreadParentMessageID: "parent-dm", + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), data)) + + require.Len(t, pub.records, 2) + subjects := map[string]bool{} + for _, r := range pub.records { + subjects[r.subject] = true + var roomEvt model.DeleteRoomEvent + require.NoError(t, json.Unmarshal(r.data, &roomEvt)) + assert.Equal(t, model.RoomEventMessageDeleted, roomEvt.Type) + } + assert.True(t, subjects[subject.UserRoomEvent("alice")]) + assert.True(t, subjects[subject.UserRoomEvent("bob")]) +} + +func TestPublishToThreadAccounts_AllFail_ReturnsError(t *testing.T) { + failPub := &failingPublisher{failAfter: 0} + + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + keyStore := NewMockRoomKeyProvider(ctrl) + + h := NewHandler(store, us, failPub, keyStore, false) + err := h.publishToThreadAccounts(context.Background(), []string{"alice", "bob"}, []byte(`{}`), "parent-1") + require.Error(t, err) + assert.Contains(t, err.Error(), "publish thread event") +} + +func TestPublishToThreadAccounts_Empty_NoOp(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.publishToThreadAccounts(context.Background(), nil, []byte(`{}`), "parent-1")) + assert.Empty(t, pub.records) +} 
diff --git a/broadcast-worker/integration_test.go b/broadcast-worker/integration_test.go index c106b901c..f9e8dc8f3 100644 --- a/broadcast-worker/integration_test.go +++ b/broadcast-worker/integration_test.go @@ -77,7 +77,7 @@ func TestBroadcastWorker_ChannelRoom_Integration(t *testing.T) { require.NoError(t, err) seedUsers(t, db) - store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) us := userstore.NewMongoStore(db.Collection("users")) pub := &recordingPublisher{} key := testRoomKey(t) @@ -123,7 +123,7 @@ func TestBroadcastWorker_ChannelRoom_MentionAll_Integration(t *testing.T) { require.NoError(t, err) seedUsers(t, db) - store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) us := userstore.NewMongoStore(db.Collection("users")) pub := &recordingPublisher{} key := testRoomKey(t) @@ -163,7 +163,7 @@ func TestBroadcastWorker_ChannelRoom_IndividualMention_Integration(t *testing.T) require.NoError(t, err) seedUsers(t, db) - store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) us := userstore.NewMongoStore(db.Collection("users")) pub := &recordingPublisher{} key := testRoomKey(t) @@ -214,7 +214,7 @@ func TestBroadcastWorker_DMRoom_Integration(t *testing.T) { require.NoError(t, err) seedUsers(t, db) - store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) us := userstore.NewMongoStore(db.Collection("users")) pub := &recordingPublisher{} keyStore := &fakeRoomKeyProvider{pair: nil} @@ -275,7 +275,7 @@ func 
TestBroadcastWorker_ChannelRoom_EncryptionDisabled_Integration(t *testing.T require.NoError(t, err) seedUsers(t, db) - store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) us := userstore.NewMongoStore(db.Collection("users")) pub := &recordingPublisher{} @@ -326,7 +326,7 @@ func TestBroadcastWorker_PersistsLastMessage_Integration(t *testing.T) { require.NoError(t, err) seedUsers(t, db) - store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) cached, err := newCachedMetaStore(store, 10, time.Minute) require.NoError(t, err) @@ -371,7 +371,7 @@ func TestBroadcastWorker_BulkUpdateRoomLastMessage_Integration(t *testing.T) { }) require.NoError(t, err) - store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) t1 := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC) t2 := t1.Add(time.Second) @@ -400,7 +400,94 @@ func TestBroadcastWorker_BulkUpdateRoomLastMessage_Integration(t *testing.T) { func TestBroadcastWorker_BulkUpdateRoomLastMessage_EmptyIsNoOp_Integration(t *testing.T) { db := setupMongo(t) - store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) require.NoError(t, store.BulkUpdateRoomLastMessage(context.Background(), nil)) require.NoError(t, store.BulkUpdateRoomLastMessage(context.Background(), map[string]roomLastMsgUpdate{})) } + +func TestBroadcastWorker_GetThreadFollowers_Integration(t *testing.T) { + db := setupMongo(t) + ctx := context.Background() + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), 
db.Collection("thread_rooms")) + + // Seed a thread room document with replyAccounts and a siteId. + _, err := db.Collection("thread_rooms").InsertMany(ctx, []interface{}{ + bson.M{ + "_id": "tr-1", + "parentMessageId": "parent-1", + "siteId": "site-a", + "replyAccounts": []string{"bob", "carol", ""}, + }, + // Same parentMessageId but different siteId — must NOT be returned. + bson.M{ + "_id": "tr-2", + "parentMessageId": "parent-1", + "siteId": "site-b", + "replyAccounts": []string{"dave"}, + }, + }) + require.NoError(t, err) + + t.Run("returns followers for correct siteId", func(t *testing.T) { + followers, err := store.GetThreadFollowers(ctx, "parent-1", "site-a") + require.NoError(t, err) + // Empty string is filtered out. + assert.Equal(t, map[string]struct{}{"bob": {}, "carol": {}}, followers) + }) + + t.Run("cross-siteId isolation: different siteId returns empty", func(t *testing.T) { + followers, err := store.GetThreadFollowers(ctx, "parent-1", "site-b") + require.NoError(t, err) + assert.Equal(t, map[string]struct{}{"dave": {}}, followers) + }) + + t.Run("no document returns empty map", func(t *testing.T) { + followers, err := store.GetThreadFollowers(ctx, "nonexistent-parent", "site-a") + require.NoError(t, err) + assert.Empty(t, followers) + }) +} + +func TestBroadcastWorker_EnsureIndexes_Integration(t *testing.T) { + db := setupMongo(t) + ctx := context.Background() + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) + + // EnsureIndexes should be idempotent — call it twice without error. + require.NoError(t, store.EnsureIndexes(ctx)) + require.NoError(t, store.EnsureIndexes(ctx)) + + // Verify the compound index was created by listing indexes. + // MongoDB driver v2 decodes nested documents as bson.D (not bson.M), so we + // decode the index list into []bson.D and iterate element-by-element. 
+ cursor, err := db.Collection("thread_rooms").Indexes().List(ctx) + require.NoError(t, err) + var idxes []bson.D + require.NoError(t, cursor.All(ctx, &idxes)) + + var found bool + for _, idx := range idxes { + var gotKeys bson.D + for _, elem := range idx { + if elem.Key == "key" { + if kd, ok := elem.Value.(bson.D); ok { + gotKeys = kd + } + } + } + var hasParent, hasSite bool + for _, kv := range gotKeys { + if kv.Key == "parentMessageId" { + hasParent = true + } + if kv.Key == "siteId" { + hasSite = true + } + } + if hasParent && hasSite { + found = true + break + } + } + assert.True(t, found, "compound index on (parentMessageId, siteId) must exist") +} diff --git a/broadcast-worker/main.go b/broadcast-worker/main.go index b7923eeef..0dce33c70 100644 --- a/broadcast-worker/main.go +++ b/broadcast-worker/main.go @@ -72,7 +72,11 @@ func main() { os.Exit(1) } db := mongoClient.Database(cfg.MongoDB) - store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) + if err := store.EnsureIndexes(ctx); err != nil { + slog.Error("ensure indexes failed", "error", err) + os.Exit(1) + } cachedStore, err := newCachedMetaStore(store, cfg.RoomMetaCacheSize, cfg.RoomMetaCacheTTL) if err != nil { slog.Error("init room meta cache failed", "error", err) diff --git a/broadcast-worker/mock_store_test.go b/broadcast-worker/mock_store_test.go index c47a04394..9b84c5e1b 100644 --- a/broadcast-worker/mock_store_test.go +++ b/broadcast-worker/mock_store_test.go @@ -1,9 +1,9 @@ // Code generated by MockGen. DO NOT EDIT. -// Source: github.com/hmchangw/chat/broadcast-worker (interfaces: Store) +// Source: ./.claude/worktrees/agent-a820856a32d038b68/broadcast-worker/store.go // // Generated by this command: // -// mockgen -destination=mock_store_test.go -package=main . 
Store +// mockgen -source=./.claude/worktrees/agent-a820856a32d038b68/broadcast-worker/store.go -destination=./.claude/worktrees/agent-a820856a32d038b68/broadcast-worker/mock_store_test.go -package=main // // Package main is a generated GoMock package. @@ -73,6 +73,21 @@ func (mr *MockStoreMockRecorder) GetRoomMeta(ctx, roomID any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetRoomMeta", reflect.TypeOf((*MockStore)(nil).GetRoomMeta), ctx, roomID) } +// GetThreadFollowers mocks base method. +func (m *MockStore) GetThreadFollowers(ctx context.Context, parentMessageID, siteID string) (map[string]struct{}, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetThreadFollowers", ctx, parentMessageID, siteID) + ret0, _ := ret[0].(map[string]struct{}) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetThreadFollowers indicates an expected call of GetThreadFollowers. +func (mr *MockStoreMockRecorder) GetThreadFollowers(ctx, parentMessageID, siteID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetThreadFollowers", reflect.TypeOf((*MockStore)(nil).GetThreadFollowers), ctx, parentMessageID, siteID) +} + // ListSubscriptions mocks base method. 
func (m *MockStore) ListSubscriptions(ctx context.Context, roomID string) ([]model.Subscription, error) { m.ctrl.T.Helper() diff --git a/broadcast-worker/store.go b/broadcast-worker/store.go index 82f1fb2d6..45bc35b01 100644 --- a/broadcast-worker/store.go +++ b/broadcast-worker/store.go @@ -17,6 +17,7 @@ type Store interface { GetRoom(ctx context.Context, roomID string) (*model.Room, error) GetRoomMeta(ctx context.Context, roomID string) (roommetacache.Meta, error) ListSubscriptions(ctx context.Context, roomID string) ([]model.Subscription, error) + GetThreadFollowers(ctx context.Context, parentMessageID, siteID string) (map[string]struct{}, error) UpdateRoomLastMessage(ctx context.Context, roomID, msgID string, msgAt time.Time, mentionAll bool) error SetSubscriptionMentions(ctx context.Context, roomID string, accounts []string) error } diff --git a/broadcast-worker/store_mongo.go b/broadcast-worker/store_mongo.go index 71c577fb2..c50260f78 100644 --- a/broadcast-worker/store_mongo.go +++ b/broadcast-worker/store_mongo.go @@ -2,6 +2,7 @@ package main import ( "context" + "errors" "fmt" "time" @@ -13,13 +14,26 @@ import ( "github.com/hmchangw/chat/pkg/roommetacache" ) +// EnsureIndexes creates indexes that back the store's read paths. +// Must be called once at startup; index creation is idempotent when the key +// spec matches. 
+func (m *mongoStore) EnsureIndexes(ctx context.Context) error { + if _, err := m.threadRoomCol.Indexes().CreateOne(ctx, mongo.IndexModel{ + Keys: bson.D{{Key: "parentMessageId", Value: 1}, {Key: "siteId", Value: 1}}, + }); err != nil { + return fmt.Errorf("ensure thread_rooms (parentMessageId, siteId) index: %w", err) + } + return nil +} + type mongoStore struct { - roomCol *mongo.Collection - subCol *mongo.Collection + roomCol *mongo.Collection + subCol *mongo.Collection + threadRoomCol *mongo.Collection } -func NewMongoStore(roomCol, subCol *mongo.Collection) *mongoStore { - return &mongoStore{roomCol: roomCol, subCol: subCol} +func NewMongoStore(roomCol, subCol, threadRoomCol *mongo.Collection) *mongoStore { + return &mongoStore{roomCol: roomCol, subCol: subCol, threadRoomCol: threadRoomCol} } func (m *mongoStore) GetRoom(ctx context.Context, roomID string) (*model.Room, error) { @@ -111,3 +125,24 @@ func (m *mongoStore) SetSubscriptionMentions(ctx context.Context, roomID string, } return nil } + +func (m *mongoStore) GetThreadFollowers(ctx context.Context, parentMessageID, siteID string) (map[string]struct{}, error) { + var doc struct { + ReplyAccounts []string `bson:"replyAccounts"` + } + opts := options.FindOne().SetProjection(bson.M{"replyAccounts": 1, "_id": 0}) + err := m.threadRoomCol.FindOne(ctx, bson.M{"parentMessageId": parentMessageID, "siteId": siteID}, opts).Decode(&doc) + if err != nil { + if errors.Is(err, mongo.ErrNoDocuments) { + return map[string]struct{}{}, nil + } + return nil, fmt.Errorf("find thread room by parent %s site %s: %w", parentMessageID, siteID, err) + } + out := make(map[string]struct{}, len(doc.ReplyAccounts)) + for _, a := range doc.ReplyAccounts { + if a != "" { + out[a] = struct{}{} + } + } + return out, nil +} diff --git a/docs/client-api.md b/docs/client-api.md index d4823e27b..f424a2dd6 100644 --- a/docs/client-api.md +++ b/docs/client-api.md @@ -624,7 +624,11 @@ See [Error envelope](#6-error-envelope-reference). 
Returned synchronously when v - `"invalid X-Request-ID format"` — the header value is not a valid hyphenated UUID. ```json -{ "error": "rename is only allowed in channel rooms" } +{ + "error": "rename is only allowed in channel rooms", + "code": "bad_request", + "reason": "non_channel_operation" +} ``` ##### Triggered events — success path @@ -867,7 +871,7 @@ See [Error envelope](#6-error-envelope-reference). Common errors: ##### Behaviour notes -- **Alert recomputation:** new `alert = oldSub.alert && len(newThreadUnread) > 0`. A thread-read can only clear an alert, never set one. When the post-removal `threadUnread` is empty, `alert` becomes false. +- **Alert recomputation:** `alert = oldSub.alert && len(newThreadUnread) > 0`. A thread-read can only clear an alert, never set one. When the post-removal `threadUnread` is empty, `alert` becomes false. This computation runs atomically inside the MongoDB aggregation pipeline on the handler's site — not derived client-side. - **Concurrent local writes:** the room-`Subscription` update and the `ThreadSubscription` update run in parallel inside an `errgroup`. Both must succeed before the handler proceeds. - **Cross-site federation:** if the user's home site differs from the handler's site, a `thread_read` event is published to `outbox.{handlerSite}.to.{userSite}.thread_read` with payload `{account, roomId, threadRoomId, parentMessageId, newThreadUnread, alert, lastSeenAt, timestamp}` (timestamps as `int64` UnixMilli). The destination `inbox-worker` applies the supplied `newThreadUnread`+`alert` to the local Subscription cache and applies `lastSeenAt`+`updatedAt`+`hasMention=false` to the local ThreadSubscription with an `$lt` order-safety guard so out-of-order delivery cannot regress the thread's read position. - **Defensive `roomId` filter:** the thread-subscription lookup additionally enforces that the supplied `threadId` belongs to the room named in the subject. 
Mismatches return `thread subscription not found` (rather than silently clearing an unrelated thread). @@ -1692,10 +1696,12 @@ See [Error envelope](#6-error-envelope-reference). Errors: ##### Triggered events — success path -A `DeleteRoomEvent` is fanned out by `broadcast-worker` (not published when the request hits an already-deleted message or loses a concurrent-delete CAS). The subject depends on room type: +A `DeleteRoomEvent` is fanned out by `broadcast-worker` (not published when the request hits an already-deleted message or loses a concurrent-delete CAS). The subject and recipients depend on message type: -- **Channel rooms — `chat.room.{roomID}.event`** — one publish to the room stream. -- **DM/botDM rooms — `chat.user.{recipient}.event.room`** — published once per non-bot member. +- **Top-level channel message — `chat.room.{roomID}.event`** — one publish to the room stream; all room subscribers receive it. +- **Thread reply (TShow=false) in a channel — `chat.user.{recipient}.event.room`** — published once per thread subscriber (followers + @-mentioned accounts). Non-subscribers do not receive this event. +- **Thread reply (TShow=true) in a channel — `chat.room.{roomID}.event`** — visible in the main channel, so the full room stream receives it. +- **DM/botDM message — `chat.user.{recipient}.event.room`** — published once per non-bot member. The payload is flat: @@ -1704,7 +1710,7 @@ The payload is flat: | `type` | string | Always `"message_deleted"`. | | `roomId` | string | | | `siteId` | string | | -| `timestamp` | number | Milliseconds since Unix epoch (UTC). Event publish time. | +| `timestamp` | number | Milliseconds since Unix epoch (UTC). Propagated from the canonical event's publish time. | | `messageId` | string | The deleted message's ID. | | `deletedBy` | string | The sender's account. | | `deletedAt` | string | RFC 3339 timestamp. Domain time of the delete. 
| @@ -1723,6 +1729,8 @@ The payload is flat: } ``` +**Thread-reply deletes additionally emit a `ThreadMetadataUpdatedEvent`** (see [§4.1 Thread Metadata Event](#41-thread-metadata-event)) to update the parent message's reply-count badge. The `DeleteRoomEvent` and `ThreadMetadataUpdatedEvent` are published independently; clients must handle each on its own. + ##### Triggered events — error path `None — error returned only via the reply subject.` @@ -2683,6 +2691,8 @@ A `RoomEvent` (same struct as above) published once per DM participant. Recipien } ``` +**Thread replies additionally emit a `ThreadMetadataUpdatedEvent`** (see [§4.1 Thread Metadata Event](#41-thread-metadata-event)) to update the parent message's reply-count badge. This event is published to all room members (not only thread subscribers) so every client can show the correct badge without subscribing to the thread. + #### Triggered events — error path When validation fails, the gatekeeper publishes the error envelope to `chat.user.{account}.response.{requestId}` and **no downstream events are emitted**. The client should display the error and offer a retry. @@ -2715,6 +2725,56 @@ The worker filters recipients per message: --- +## 4.1 Thread Metadata Event + +### ThreadMetadataUpdatedEvent + +Pushed by `broadcast-worker` whenever a thread reply is **created** (`action: "reply_added"`) or **deleted** (`action: "reply_deleted"`). Its purpose is to let clients update the reply-count badge on the parent message without reloading the thread. + +#### Subjects + +| Room type | Subject | +|-----------|---------| +| Channel | `chat.room.{roomID}.event` | +| DM / botDM | `chat.user.{account}.event.room` — published once per non-bot member | + +#### Payload + +| Field | Type | Notes | +|-------|------|-------| +| `type` | string | Always `"thread_metadata_updated"`. | +| `roomId` | string | The room the thread lives in. | +| `siteId` | string | | +| `parentMessageId` | string | The thread parent message's ID. 
Clients use this to locate the message in their cache and update its badge. | +| `newTcount` | number | Authoritative reply count for the parent message, recomputed by counting its non-deleted replies after the write. Replaces any locally-computed count — do not apply it as a delta. | +| `action` | string | `"reply_added"` or `"reply_deleted"`. | +| `replyMessageId` | string | The reply that was added or deleted. | +| `timestamp` | number | Milliseconds since Unix epoch (UTC). Propagated from the canonical event's publish time. | + +```json +{ + "type": "thread_metadata_updated", + "roomId": "01970a4f8c2d7c9aQ", + "siteId": "siteA", + "parentMessageId": "01970a4f8c2d7c9aQRST", + "newTcount": 4, + "action": "reply_added", + "replyMessageId": "01970a4f8c2d7c9aUVWX", + "timestamp": 1746518100123 +} +``` + +#### When it fires + +- **Reply added (`action: "reply_added"`):** fired when a new thread reply is successfully persisted (triggered by a `Send Message` RPC with `threadParentId` set). Published in addition to the per-subscriber `new_message` `RoomEvent` that carries the reply content. +- **Reply deleted (`action: "reply_deleted"`):** fired when a thread reply is soft-deleted (triggered by a `Delete Message` RPC). Published in addition to the `DeleteRoomEvent` that carries the delete notification. + +#### Client handling + +Apply `newTcount` directly to the parent message's badge — do not compute a delta. Events for the same parent may arrive out of order due to JetStream redelivery; always prefer the event with the larger `timestamp` for badge state. + +--- + ## 5. Room Encryption Channel messages can be end-to-end encrypted. The key material reaches clients as `RoomKeyEvent`s, which are triggered by the Create Room / Add Members / Remove Member RPCs (see their "Triggered events" sections). This section describes the event payload and how a client uses it to decrypt. 
@@ -2840,7 +2900,7 @@ Every error response — NATS reply subjects, JetStream async results, and HTTP | Field | Type | Notes | |------------|-----------------------|-------| | `error` | string | Human-readable, user-safe (never carries an internal cause). Do not parse or pattern-match against the text. | -| `code` | string | **Always present.** One of the 7 categories below. Drives HTTP status. | +| `code` | string | **Always present.** One of the 8 categories below. Drives HTTP status. | | `reason` | string (optional) | Domain-specific machine code (e.g. `max_room_size_reached`, `not_subscribed`). When present, the client should branch on `reason ?? code`. | | `metadata` | object (optional) | Free-form `string→string` map for structured detail (e.g. `{ "limit": "500" }`). | diff --git a/docs/superpowers/plans/2026-05-28-broadcast-worker-thread-handling.md b/docs/superpowers/plans/2026-05-28-broadcast-worker-thread-handling.md new file mode 100644 index 000000000..c968ae50b --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-broadcast-worker-thread-handling.md @@ -0,0 +1,1156 @@ +# Broadcast Worker Thread Message Handling — Implementation Plan + +> **Status: IMPLEMENTED** — PR #245 (`claude/gallant-galileo-ice0C`). All tasks below are complete. See the design spec's "Implementation Notes" section for what diverged from the original plan. For notification-worker work that was intentionally left out, see `docs/thread-reply-notifications.md`. +> +> **Post-plan work:** After the initial implementation, three rounds of high-effort code review and a simplification pass produced additional commits that are not reflected in the four tasks below. See the "Post-Plan Fixes and Refactoring" section at the bottom of this file. + +**Goal:** Add real-time fan-out of thread reply events (created, updated, deleted) to thread subscribers in broadcast-worker. 
+ +**Architecture:** Three new handler methods (`handleThreadCreated`, `handleThreadUpdated`, `handleThreadDeleted`) are added, each routing through a TShow gate added to the existing `handleCreated`/`handleUpdated`/`handleDeleted`. A new `ListThreadSubscriptions` store method queries the `thread_subscriptions` MongoDB collection. Fan-out publishes to `subject.UserRoomEvent(account)` per subscriber — the same per-user subject used for DMs. `Subscription.ThreadUnread` updates are out of scope; that gap lives in message-worker. + +**Tech Stack:** Go 1.25, `go.mongodb.org/mongo-driver/v2`, `go.uber.org/mock`, `github.com/stretchr/testify`, `pkg/model`, `pkg/subject`, `pkg/mention` + +--- + +## File Map + +| File | Change | +|------|--------| +| `broadcast-worker/store.go` | Add `ListThreadSubscriptions` to `Store` interface | +| `broadcast-worker/store_mongo.go` | Add `threadSubCol` field, update `NewMongoStore`, implement `ListThreadSubscriptions` | +| `broadcast-worker/main.go` | Pass `db.Collection("thread_subscriptions")` to `NewMongoStore` | +| `broadcast-worker/mock_store_test.go` | Regenerated — never edit manually | +| `broadcast-worker/handler.go` | Add TShow routing gate to each event handler + three new thread handler methods | +| `broadcast-worker/handler_test.go` | New table-driven tests for all three thread handler methods | +| `broadcast-worker/integration_test.go` | Update all 6 existing `NewMongoStore` calls + add `ListThreadSubscriptions` integration test | + +--- + +## Task 1: Store — interface, implementation, and integration test + +**Files:** +- Modify: `broadcast-worker/store.go` +- Modify: `broadcast-worker/store_mongo.go` +- Modify: `broadcast-worker/main.go` +- Modify: `broadcast-worker/integration_test.go` +- Regenerate: `broadcast-worker/mock_store_test.go` + +- [x] **Step 1: Add `ListThreadSubscriptions` to the `Store` interface** + +In `broadcast-worker/store.go`, replace the current interface block with: + +```go +//go:generate mockgen 
-destination=mock_store_test.go -package=main . Store +//go:generate mockgen -destination=mock_userstore_test.go -package=main github.com/hmchangw/chat/pkg/userstore UserStore +//go:generate mockgen -destination=mock_keystore_test.go -package=main . RoomKeyProvider + +// Store defines data access operations for the broadcast worker. +type Store interface { + GetRoom(ctx context.Context, roomID string) (*model.Room, error) + GetRoomMeta(ctx context.Context, roomID string) (roommetacache.Meta, error) + ListSubscriptions(ctx context.Context, roomID string) ([]model.Subscription, error) + ListThreadSubscriptions(ctx context.Context, parentMessageID, siteID string) ([]model.ThreadSubscription, error) + UpdateRoomLastMessage(ctx context.Context, roomID, msgID string, msgAt time.Time, mentionAll bool) error + SetSubscriptionMentions(ctx context.Context, roomID string, accounts []string) error +} +``` + +- [x] **Step 2: Update `mongoStore` to hold the thread subscriptions collection** + +In `broadcast-worker/store_mongo.go`, replace the struct and constructor: + +```go +type mongoStore struct { + roomCol *mongo.Collection + subCol *mongo.Collection + threadSubCol *mongo.Collection +} + +func NewMongoStore(roomCol, subCol, threadSubCol *mongo.Collection) *mongoStore { + return &mongoStore{roomCol: roomCol, subCol: subCol, threadSubCol: threadSubCol} +} +``` + +- [x] **Step 3: Implement `ListThreadSubscriptions` in `store_mongo.go`** + +Add this method to `mongoStore` (after `SetSubscriptionMentions`): + +```go +func (m *mongoStore) ListThreadSubscriptions(ctx context.Context, parentMessageID, siteID string) ([]model.ThreadSubscription, error) { + filter := bson.M{"parentMessageId": parentMessageID, "siteId": siteID} + cursor, err := m.threadSubCol.Find(ctx, filter) + if err != nil { + return nil, fmt.Errorf("query thread subscriptions for parent %s: %w", parentMessageID, err) + } + defer cursor.Close(ctx) + var subs []model.ThreadSubscription + if err := cursor.All(ctx, 
&subs); err != nil { + return nil, fmt.Errorf("decode thread subscriptions: %w", err) + } + return subs, nil +} +``` + +- [x] **Step 4: Update `main.go` to pass the thread subscriptions collection** + +In `broadcast-worker/main.go`, change line 74 from: + +```go +store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) +``` + +to: + +```go +store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_subscriptions")) +``` + +- [x] **Step 5: Update all `NewMongoStore` calls in `integration_test.go`** + +There are **six** calls to `NewMongoStore` in `broadcast-worker/integration_test.go`, one per existing integration test function: + +- `TestBroadcastWorker_ChannelRoom_Integration` (~line 80) +- `TestBroadcastWorker_ChannelRoom_MentionAll_Integration` (~line 126) +- `TestBroadcastWorker_ChannelRoom_IndividualMention_Integration` (~line 162) +- `TestBroadcastWorker_DMRoom_Integration` (~line 217) +- `TestBroadcastWorker_ChannelRoom_EncryptionDisabled_Integration` (~line 279) +- `TestBroadcastWorker_PersistsLastMessage_Integration` (~line 330) + +Each currently passes two collections. 
Replace **all six** occurrences: + +```go +// Before (all six occurrences): +store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions")) + +// After (all six occurrences): +store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_subscriptions")) +``` + +- [x] **Step 6: Add `ListThreadSubscriptions` integration test to `integration_test.go`** + +Append this test at the end of `broadcast-worker/integration_test.go`: + +```go +func TestBroadcastWorker_ListThreadSubscriptions_Integration(t *testing.T) { + db := setupMongo(t) + ctx := context.Background() + + parentMsgID := "parent-msg-1" + siteID := "site-a" + + _, err := db.Collection("thread_subscriptions").InsertMany(ctx, []interface{}{ + model.ThreadSubscription{ + ID: "ts1", ParentMessageID: parentMsgID, RoomID: "r1", + ThreadRoomID: "tr1", UserAccount: "alice", UserID: "u-alice", SiteID: siteID, + }, + model.ThreadSubscription{ + ID: "ts2", ParentMessageID: parentMsgID, RoomID: "r1", + ThreadRoomID: "tr1", UserAccount: "bob", UserID: "u-bob", SiteID: siteID, + }, + // different parent — must NOT be returned + model.ThreadSubscription{ + ID: "ts3", ParentMessageID: "other-parent", RoomID: "r1", + ThreadRoomID: "tr2", UserAccount: "charlie", UserID: "u-charlie", SiteID: siteID, + }, + // different siteID — must NOT be returned + model.ThreadSubscription{ + ID: "ts4", ParentMessageID: parentMsgID, RoomID: "r1", + ThreadRoomID: "tr1", UserAccount: "diana", UserID: "u-diana", SiteID: "site-b", + }, + }) + require.NoError(t, err) + + store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_subscriptions")) + subs, err := store.ListThreadSubscriptions(ctx, parentMsgID, siteID) + require.NoError(t, err) + require.Len(t, subs, 2) + accounts := []string{subs[0].UserAccount, subs[1].UserAccount} + assert.ElementsMatch(t, []string{"alice", "bob"}, accounts) +} +``` + +- [x] **Step 7: Regenerate mocks** + +```bash +make 
generate SERVICE=broadcast-worker +``` + +Expected: `broadcast-worker/mock_store_test.go` is updated with a `ListThreadSubscriptions` mock method. No other files change. + +- [x] **Step 8: Verify compilation** + +```bash +make build SERVICE=broadcast-worker +``` + +Expected: exits 0, binary produced. + +- [x] **Step 9: Run unit tests** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: all existing tests pass. + +- [x] **Step 10: Run integration tests** + +```bash +make test-integration SERVICE=broadcast-worker +``` + +Expected: all tests pass including `TestBroadcastWorker_ListThreadSubscriptions_Integration`. + +- [x] **Step 11: Commit** + +```bash +git add broadcast-worker/store.go broadcast-worker/store_mongo.go broadcast-worker/main.go \ + broadcast-worker/mock_store_test.go broadcast-worker/integration_test.go +git commit -m "feat(broadcast-worker): add ListThreadSubscriptions to store" +``` + +--- + +## Task 2: `handleThreadCreated` — TDD + +**Files:** +- Modify: `broadcast-worker/handler_test.go` +- Modify: `broadcast-worker/handler.go` + +- [x] **Step 1: Add failing tests for `handleThreadCreated` to `handler_test.go`** + +Append the following test function at the end of `broadcast-worker/handler_test.go`: + +```go +func TestHandler_HandleThreadCreated(t *testing.T) { + msgTime := time.Date(2026, 5, 28, 9, 0, 0, 0, time.UTC) + const parentMsgID = "parent-msg-1" + const siteID = "site-a" + const sender = "alice" + + tests := []struct { + name string + content string + threadSubs []model.ThreadSubscription + metaErr error + listErr error + userLookupErr error + wantSubjects []string + wantErrContains string + }{ + { + name: "fans out to thread subscribers excluding sender", + content: "hello thread", + threadSubs: []model.ThreadSubscription{ + {UserAccount: sender}, + {UserAccount: "bob"}, + {UserAccount: "carol"}, + }, + wantSubjects: []string{ + subject.UserRoomEvent("bob"), + subject.UserRoomEvent("carol"), + }, + }, + { + name: "mentioned 
non-subscriber included in fan-out", + content: "hey @dave", + threadSubs: []model.ThreadSubscription{ + {UserAccount: "bob"}, + }, + wantSubjects: []string{ + subject.UserRoomEvent("bob"), + subject.UserRoomEvent("dave"), + }, + }, + { + name: "mentioned user already a thread subscriber - deduped", + content: "hey @bob", + threadSubs: []model.ThreadSubscription{ + {UserAccount: "bob"}, + }, + wantSubjects: []string{subject.UserRoomEvent("bob")}, + }, + { + name: "only sender in subscriber list - no publish", + content: "hello", + threadSubs: []model.ThreadSubscription{{UserAccount: sender}}, + wantSubjects: nil, + }, + { + name: "empty subscriber list and no mentions - no publish", + content: "hello", + threadSubs: []model.ThreadSubscription{}, + wantSubjects: nil, + }, + { + name: "GetRoomMeta error - returns error", + content: "hello", + metaErr: errors.New("mongo down"), + wantErrContains: "get room meta", + }, + { + name: "ListThreadSubscriptions error - returns error", + content: "hello", + listErr: errors.New("db error"), + wantErrContains: "list thread subscriptions", + }, + { + name: "user lookup error - warns and continues, subscriber still notified", + content: "hello", + threadSubs: []model.ThreadSubscription{{UserAccount: "bob"}}, + userLookupErr: errors.New("db error"), + wantSubjects: []string{subject.UserRoomEvent("bob")}, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + evt := model.MessageEvent{ + Event: model.EventCreated, + SiteID: siteID, + Message: model.Message{ + ID: "reply-1", + RoomID: "room-1", + UserID: "u-alice", + UserAccount: sender, + Content: tc.content, + CreatedAt: msgTime, + ThreadParentMessageID: parentMsgID, + TShow: false, + }, + } + data, _ := json.Marshal(evt) + + // Parse content to know what FindUsersByAccounts will be called 
with. + // mention.Parse("hello thread") → [] + // mention.Parse("hey @dave") → ["dave"] + // mention.Parse("hey @bob") → ["bob"] + // mention.Parse("hello") → [] + var expectedLookup []string + switch tc.content { + case "hey @dave": + expectedLookup = []string{sender, "dave"} + case "hey @bob": + expectedLookup = []string{sender, "bob"} + default: + expectedLookup = []string{sender} + } + if tc.userLookupErr != nil { + us.EXPECT().FindUsersByAccounts(gomock.Any(), expectedLookup).Return(nil, tc.userLookupErr) + } else { + us.EXPECT().FindUsersByAccounts(gomock.Any(), expectedLookup).Return(nil, nil) + } + + if tc.metaErr != nil { + store.EXPECT().GetRoomMeta(gomock.Any(), "room-1").Return(roommetacache.Meta{}, tc.metaErr) + } else if tc.listErr != nil { + store.EXPECT().GetRoomMeta(gomock.Any(), "room-1").Return(metaOf(testChannelRoom), nil) + store.EXPECT().ListThreadSubscriptions(gomock.Any(), parentMsgID, siteID).Return(nil, tc.listErr) + } else { + store.EXPECT().GetRoomMeta(gomock.Any(), "room-1").Return(metaOf(testChannelRoom), nil) + store.EXPECT().ListThreadSubscriptions(gomock.Any(), parentMsgID, siteID).Return(tc.threadSubs, nil) + } + + h := NewHandler(store, us, pub, keyStore, false) + err := h.HandleMessage(context.Background(), data) + + if tc.wantErrContains != "" { + require.Error(t, err) + assert.Contains(t, err.Error(), tc.wantErrContains) + assert.Empty(t, pub.records) + return + } + + require.NoError(t, err) + gotSubjects := make([]string, len(pub.records)) + for i, r := range pub.records { + gotSubjects[i] = r.subject + } + assert.ElementsMatch(t, tc.wantSubjects, gotSubjects) + + // Verify event payload on each record. 
+ for _, r := range pub.records { + var roomEvt model.RoomEvent + require.NoError(t, json.Unmarshal(r.data, &roomEvt)) + assert.Equal(t, model.RoomEventNewMessage, roomEvt.Type) + assert.Equal(t, "room-1", roomEvt.RoomID) + assert.Equal(t, siteID, roomEvt.SiteID) + require.NotNil(t, roomEvt.Message) + assert.Equal(t, "reply-1", roomEvt.Message.ID) + assert.Equal(t, parentMsgID, roomEvt.Message.ThreadParentMessageID) + } + }) + } +} + +func TestHandler_ThreadCreated_TShow_FallsThroughToRoomBroadcast(t *testing.T) { + // TShow=true thread replies must NOT go through handleThreadCreated. + // They fall through to the existing channel broadcast path. + msgTime := time.Date(2026, 5, 28, 9, 0, 0, 0, time.UTC) + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + evt := model.MessageEvent{ + Event: model.EventCreated, + SiteID: "site-a", + Message: model.Message{ + ID: "reply-tshow", + RoomID: "room-1", + UserID: "u-alice", + UserAccount: "alice", + Content: "also in channel", + CreatedAt: msgTime, + ThreadParentMessageID: "parent-msg-1", + TShow: true, // falls through to room broadcast + }, + } + data, _ := json.Marshal(evt) + + // Existing room broadcast path is called (UpdateRoomLastMessage + GetRoomMeta). + // ListThreadSubscriptions must NOT be called. + key := testRoomKey(t) + keyStore.EXPECT().Get(gomock.Any(), "room-1").Return(key, nil) + us.EXPECT().FindUsersByAccounts(gomock.Any(), []string{"alice"}).Return(nil, nil) + store.EXPECT().UpdateRoomLastMessage(gomock.Any(), "room-1", "reply-tshow", msgTime, false).Return(nil) + store.EXPECT().GetRoomMeta(gomock.Any(), "room-1").Return(metaOf(testChannelRoom), nil) + // NO store.EXPECT().ListThreadSubscriptions(...) + + h := NewHandler(store, us, pub, keyStore, true) + require.NoError(t, h.HandleMessage(context.Background(), data)) + + // Published to the room channel subject, not per-user. 
+ require.Len(t, pub.records, 1) + assert.Equal(t, subject.RoomEvent("room-1"), pub.records[0].subject) +} +``` + +- [x] **Step 2: Run tests to confirm failure** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: `TestHandler_HandleThreadCreated` FAILS — the mock for `UpdateRoomLastMessage` is called unexpectedly (the routing gate does not exist yet), or `ListThreadSubscriptions` is never called. + +- [x] **Step 3: Add the TShow routing gate in `handleCreated` and a stub `handleThreadCreated`** + +In `broadcast-worker/handler.go`, at the top of `handleCreated` (after `msg := evt.Message`), add: + +```go +func (h *Handler) handleCreated(ctx context.Context, evt *model.MessageEvent) error { + msg := evt.Message + + if msg.ThreadParentMessageID != "" && !msg.TShow { + return h.handleThreadCreated(ctx, evt) + } + + // ... rest of existing handleCreated code unchanged ... +``` + +Then add the stub method after `handleCreated`: + +```go +func (h *Handler) handleThreadCreated(ctx context.Context, evt *model.MessageEvent) error { + return nil +} +``` + +- [x] **Step 4: Run tests — confirm different failure** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: `TestHandler_HandleThreadCreated` FAILS — `GetRoomMeta` and `ListThreadSubscriptions` are expected by mocks but not called (stub returns nil immediately). `TestHandler_ThreadCreated_TShow_FallsThroughToRoomBroadcast` PASSES (existing path unchanged). All previously passing tests still pass. + +- [x] **Step 5: Implement `handleThreadCreated`** + +Replace the stub in `broadcast-worker/handler.go` with the full implementation: + +```go +func (h *Handler) handleThreadCreated(ctx context.Context, evt *model.MessageEvent) error { + msg := evt.Message + + // Parse mentions first so we know which accounts to look up for sender enrichment. 
+ // Use parsed.Accounts (not ResolveFromParsed) for the fan-out set — raw account names + // are sufficient and work even when the user store can't resolve a mentioned account. + parsed := mention.Parse(msg.Content) + lookupAccounts := dedupedAccounts(msg.UserAccount, parsed.Accounts) + users, lookupErr := h.userStore.FindUsersByAccounts(ctx, lookupAccounts) + if lookupErr != nil { + slog.Warn("user lookup failed for thread reply, falling back to account", + "error", lookupErr, "parentMessageID", msg.ThreadParentMessageID) + } + userByAccount := make(map[string]model.User, len(users)) + for i := range users { + userByAccount[users[i].Account] = users[i] + } + + meta, err := h.store.GetRoomMeta(ctx, msg.RoomID) + if err != nil { + return fmt.Errorf("get room meta %s: %w", msg.RoomID, err) + } + + threadSubs, err := h.store.ListThreadSubscriptions(ctx, msg.ThreadParentMessageID, evt.SiteID) + if err != nil { + return fmt.Errorf("list thread subscriptions for parent %s: %w", msg.ThreadParentMessageID, err) + } + + // Union of thread subscribers + mentioned accounts, dedup, exclude sender. + seen := map[string]struct{}{msg.UserAccount: {}} + var fanOut []string + for i := range threadSubs { + acc := threadSubs[i].UserAccount + if _, ok := seen[acc]; ok { + continue + } + seen[acc] = struct{}{} + fanOut = append(fanOut, acc) + } + for _, acc := range parsed.Accounts { + if _, ok := seen[acc]; ok { + continue + } + seen[acc] = struct{}{} + fanOut = append(fanOut, acc) + } + + if len(fanOut) == 0 { + slog.Debug("no thread subscribers to notify for thread reply", + "parentMessageID", msg.ThreadParentMessageID) + return nil + } + + clientMsg := buildClientMessage(&msg, userByAccount) + + // Encrypt once for channel rooms when encryption is enabled. 
+ var encJSON json.RawMessage + if meta.Type == model.RoomTypeChannel && h.encrypt { + msgJSON, err := json.Marshal(clientMsg) + if err != nil { + return fmt.Errorf("marshal thread client message: %w", err) + } + key, err := h.currentRoomKey(ctx, meta.ID) + if err != nil { + return err + } + encrypted, err := h.encoder.Encode(meta.ID, string(msgJSON), key.KeyPair.PrivateKey, key.Version) + if err != nil { + return fmt.Errorf("encrypt thread message for room %s: %w", meta.ID, err) + } + encJSON, err = json.Marshal(encrypted) + if err != nil { + return fmt.Errorf("marshal encrypted thread message: %w", err) + } + } + + for _, account := range fanOut { + roomEvt := buildRoomEvent(meta, clientMsg) + if encJSON != nil { + roomEvt.EncryptedMessage = encJSON + roomEvt.Message = nil + } + payload, err := json.Marshal(roomEvt) + if err != nil { + return fmt.Errorf("marshal thread event for user %s: %w", account, err) + } + if err := h.pub.Publish(ctx, subject.UserRoomEvent(account), payload); err != nil { + slog.Error("publish thread event failed", + "error", err, "account", account, "parentMessageID", msg.ThreadParentMessageID) + } + } + return nil +} +``` + +- [x] **Step 6: Run tests — confirm pass** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: all tests pass including `TestHandler_HandleThreadCreated` and `TestHandler_ThreadCreated_TShow_FallsThroughToRoomBroadcast`. 
+ +- [x] **Step 7: Commit** + +```bash +git add broadcast-worker/handler.go broadcast-worker/handler_test.go +git commit -m "feat(broadcast-worker): fan-out thread reply created events to thread subscribers" +``` + +--- + +## Task 3: `handleThreadUpdated` — TDD + +**Files:** +- Modify: `broadcast-worker/handler_test.go` +- Modify: `broadcast-worker/handler.go` + +- [x] **Step 1: Add failing tests for `handleThreadUpdated` to `handler_test.go`** + +Append at the end of `broadcast-worker/handler_test.go`: + +```go +func TestHandler_HandleThreadUpdated(t *testing.T) { + const parentMsgID = "parent-msg-1" + const siteID = "site-a" + edited := time.Date(2026, 5, 28, 10, 5, 0, 0, time.UTC) + + makeThreadEditEvt := func(tshow bool) []byte { + evt := model.MessageEvent{ + Event: model.EventUpdated, + SiteID: siteID, + Message: model.Message{ + ID: "reply-1", + RoomID: "room-1", + UserID: "u-alice", + UserAccount: "alice", + Content: "edited content", + ThreadParentMessageID: parentMsgID, + TShow: tshow, + EditedAt: &edited, + UpdatedAt: &edited, + }, + } + data, _ := json.Marshal(evt) + return data + } + + t.Run("fans out edit to thread subscribers excluding sender", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + room := &model.Room{ID: "room-1", Type: model.RoomTypeChannel, SiteID: siteID} + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(room, nil) + store.EXPECT().ListThreadSubscriptions(gomock.Any(), parentMsgID, siteID).Return([]model.ThreadSubscription{ + {UserAccount: "alice"}, // sender — excluded + {UserAccount: "bob"}, + {UserAccount: "carol"}, + }, nil) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), makeThreadEditEvt(false))) + + require.Len(t, pub.records, 2) + gotSubjects := []string{pub.records[0].subject, pub.records[1].subject} + 
assert.ElementsMatch(t, []string{ + subject.UserRoomEvent("bob"), + subject.UserRoomEvent("carol"), + }, gotSubjects) + + var editEvt model.EditRoomEvent + require.NoError(t, json.Unmarshal(pub.records[0].data, &editEvt)) + assert.Equal(t, model.RoomEventMessageEdited, editEvt.Type) + assert.Equal(t, "room-1", editEvt.RoomID) + assert.Equal(t, siteID, editEvt.SiteID) + assert.Equal(t, "reply-1", editEvt.MessageID) + assert.Equal(t, "edited content", editEvt.NewContent) + assert.Equal(t, "alice", editEvt.EditedBy) + assert.True(t, editEvt.EditedAt.Equal(edited)) + }) + + t.Run("empty subscriber list - no publish", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + room := &model.Room{ID: "room-1", Type: model.RoomTypeChannel, SiteID: siteID} + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(room, nil) + store.EXPECT().ListThreadSubscriptions(gomock.Any(), parentMsgID, siteID).Return([]model.ThreadSubscription{}, nil) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), makeThreadEditEvt(false))) + assert.Empty(t, pub.records) + }) + + t.Run("GetRoom error - returns error", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(nil, errors.New("mongo down")) + + h := NewHandler(store, us, pub, keyStore, false) + err := h.HandleMessage(context.Background(), makeThreadEditEvt(false)) + require.Error(t, err) + assert.Contains(t, err.Error(), "fetch room") + assert.Empty(t, pub.records) + }) + + t.Run("ListThreadSubscriptions error - returns error", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := 
&mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + room := &model.Room{ID: "room-1", Type: model.RoomTypeChannel, SiteID: siteID} + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(room, nil) + store.EXPECT().ListThreadSubscriptions(gomock.Any(), parentMsgID, siteID).Return(nil, errors.New("db error")) + + h := NewHandler(store, us, pub, keyStore, false) + err := h.HandleMessage(context.Background(), makeThreadEditEvt(false)) + require.Error(t, err) + assert.Contains(t, err.Error(), "list thread subscriptions") + assert.Empty(t, pub.records) + }) + + t.Run("TShow=true falls through to room broadcast not thread handler", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + room := &model.Room{ID: "room-1", Type: model.RoomTypeChannel, SiteID: siteID} + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(room, nil) + // ListThreadSubscriptions must NOT be called for TShow=true + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), makeThreadEditEvt(true))) + + require.Len(t, pub.records, 1) + assert.Equal(t, subject.RoomEvent("room-1"), pub.records[0].subject) + }) +} +``` + +- [x] **Step 2: Run tests — confirm failure** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: `TestHandler_HandleThreadUpdated` FAILS — `ListThreadSubscriptions` is expected but not called (no routing gate yet for updated path). 
+ +- [x] **Step 3: Add routing gate in `handleUpdated` and a stub `handleThreadUpdated`** + +In `broadcast-worker/handler.go`, add the gate at the top of `handleUpdated`, after the `EditedAt`/`UpdatedAt` guard: + +```go +func (h *Handler) handleUpdated(ctx context.Context, evt *model.MessageEvent) error { + msg := evt.Message + if msg.EditedAt == nil || msg.UpdatedAt == nil { + return fmt.Errorf("updated event missing EditedAt or UpdatedAt: %s", msg.ID) + } + + if msg.ThreadParentMessageID != "" && !msg.TShow { + return h.handleThreadUpdated(ctx, evt) + } + + // ... rest of existing handleUpdated code unchanged ... +``` + +Add the stub: + +```go +func (h *Handler) handleThreadUpdated(ctx context.Context, evt *model.MessageEvent) error { + return nil +} +``` + +- [x] **Step 4: Run tests — confirm different failure** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: `TestHandler_HandleThreadUpdated` still FAILS, but differently — the stub returns nil, so `GetRoom` and `ListThreadSubscriptions` are expected but never called and nothing is published. All pre-existing tests still pass. 
+ +- [x] **Step 5: Implement `handleThreadUpdated`** + +Replace the stub in `broadcast-worker/handler.go`: + +```go +func (h *Handler) handleThreadUpdated(ctx context.Context, evt *model.MessageEvent) error { + msg := evt.Message + + room, err := h.store.GetRoom(ctx, msg.RoomID) + if err != nil { + return fmt.Errorf("fetch room %s: %w", msg.RoomID, err) + } + + threadSubs, err := h.store.ListThreadSubscriptions(ctx, msg.ThreadParentMessageID, evt.SiteID) + if err != nil { + return fmt.Errorf("list thread subscriptions for parent %s: %w", msg.ThreadParentMessageID, err) + } + + edit := model.EditRoomEvent{ + Type: model.RoomEventMessageEdited, + RoomID: room.ID, + SiteID: room.SiteID, + Timestamp: time.Now().UTC().UnixMilli(), + MessageID: msg.ID, + NewContent: msg.Content, + EditedBy: msg.UserAccount, + EditedAt: *msg.EditedAt, + UpdatedAt: *msg.UpdatedAt, + } + if room.Type == model.RoomTypeChannel && h.encrypt { + if err := h.encryptEditedContent(ctx, room.ID, &edit); err != nil { + return err + } + } + + payload, err := json.Marshal(edit) + if err != nil { + return fmt.Errorf("marshal thread edit event: %w", err) + } + + for i := range threadSubs { + if threadSubs[i].UserAccount == msg.UserAccount { + continue + } + if err := h.pub.Publish(ctx, subject.UserRoomEvent(threadSubs[i].UserAccount), payload); err != nil { + slog.Error("publish thread edit event failed", + "error", err, + "account", threadSubs[i].UserAccount, + "parentMessageID", msg.ThreadParentMessageID, + ) + } + } + return nil +} +``` + +- [x] **Step 6: Run tests — confirm pass** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: all tests pass including `TestHandler_HandleThreadUpdated`. 
+ +- [x] **Step 7: Commit** + +```bash +git add broadcast-worker/handler.go broadcast-worker/handler_test.go +git commit -m "feat(broadcast-worker): fan-out thread reply edit events to thread subscribers" +``` + +--- + +## Task 4: `handleThreadDeleted` — TDD + +**Files:** +- Modify: `broadcast-worker/handler_test.go` +- Modify: `broadcast-worker/handler.go` + +- [x] **Step 1: Add failing tests for `handleThreadDeleted` to `handler_test.go`** + +Append at the end of `broadcast-worker/handler_test.go`: + +```go +func TestHandler_HandleThreadDeleted(t *testing.T) { + const parentMsgID = "parent-msg-1" + const siteID = "site-a" + deletedAt := time.Date(2026, 5, 28, 10, 10, 0, 0, time.UTC) + + makeThreadDelEvt := func(tshow bool) []byte { + evt := model.MessageEvent{ + Event: model.EventDeleted, + SiteID: siteID, + Message: model.Message{ + ID: "reply-1", + RoomID: "room-1", + UserID: "u-alice", + UserAccount: "alice", + ThreadParentMessageID: parentMsgID, + TShow: tshow, + UpdatedAt: &deletedAt, + }, + } + data, _ := json.Marshal(evt) + return data + } + + t.Run("fans out delete to thread subscribers excluding sender", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + room := &model.Room{ID: "room-1", Type: model.RoomTypeChannel, SiteID: siteID} + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(room, nil) + store.EXPECT().ListThreadSubscriptions(gomock.Any(), parentMsgID, siteID).Return([]model.ThreadSubscription{ + {UserAccount: "alice"}, // sender — excluded + {UserAccount: "bob"}, + }, nil) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), makeThreadDelEvt(false))) + + require.Len(t, pub.records, 1) + assert.Equal(t, subject.UserRoomEvent("bob"), pub.records[0].subject) + + var delEvt model.DeleteRoomEvent + require.NoError(t, json.Unmarshal(pub.records[0].data, 
&delEvt)) + assert.Equal(t, model.RoomEventMessageDeleted, delEvt.Type) + assert.Equal(t, "room-1", delEvt.RoomID) + assert.Equal(t, siteID, delEvt.SiteID) + assert.Equal(t, "reply-1", delEvt.MessageID) + assert.Equal(t, "alice", delEvt.DeletedBy) + assert.True(t, delEvt.DeletedAt.Equal(deletedAt)) + }) + + t.Run("empty subscriber list - no publish", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + room := &model.Room{ID: "room-1", Type: model.RoomTypeChannel, SiteID: siteID} + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(room, nil) + store.EXPECT().ListThreadSubscriptions(gomock.Any(), parentMsgID, siteID).Return([]model.ThreadSubscription{}, nil) + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), makeThreadDelEvt(false))) + assert.Empty(t, pub.records) + }) + + t.Run("GetRoom error - returns error", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(nil, errors.New("mongo down")) + + h := NewHandler(store, us, pub, keyStore, false) + err := h.HandleMessage(context.Background(), makeThreadDelEvt(false)) + require.Error(t, err) + assert.Contains(t, err.Error(), "fetch room") + assert.Empty(t, pub.records) + }) + + t.Run("ListThreadSubscriptions error - returns error", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + room := &model.Room{ID: "room-1", Type: model.RoomTypeChannel, SiteID: siteID} + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(room, nil) + store.EXPECT().ListThreadSubscriptions(gomock.Any(), parentMsgID, 
siteID).Return(nil, errors.New("db error")) + + h := NewHandler(store, us, pub, keyStore, false) + err := h.HandleMessage(context.Background(), makeThreadDelEvt(false)) + require.Error(t, err) + assert.Contains(t, err.Error(), "list thread subscriptions") + assert.Empty(t, pub.records) + }) + + t.Run("TShow=true falls through to room broadcast not thread handler", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + pub := &mockPublisher{} + keyStore := NewMockRoomKeyProvider(ctrl) + + room := &model.Room{ID: "room-1", Type: model.RoomTypeChannel, SiteID: siteID} + store.EXPECT().GetRoom(gomock.Any(), "room-1").Return(room, nil) + // ListThreadSubscriptions must NOT be called for TShow=true + + h := NewHandler(store, us, pub, keyStore, false) + require.NoError(t, h.HandleMessage(context.Background(), makeThreadDelEvt(true))) + + require.Len(t, pub.records, 1) + assert.Equal(t, subject.RoomEvent("room-1"), pub.records[0].subject) + }) +} +``` + +- [x] **Step 2: Run tests — confirm failure** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: `TestHandler_HandleThreadDeleted` FAILS — `ListThreadSubscriptions` expected but not called. + +- [x] **Step 3: Add routing gate in `handleDeleted` and a stub `handleThreadDeleted`** + +In `broadcast-worker/handler.go`, add the gate at the top of `handleDeleted`, after the `UpdatedAt` guard: + +```go +func (h *Handler) handleDeleted(ctx context.Context, evt *model.MessageEvent) error { + msg := evt.Message + if msg.UpdatedAt == nil { + return fmt.Errorf("deleted event missing UpdatedAt: %s", msg.ID) + } + + if msg.ThreadParentMessageID != "" && !msg.TShow { + return h.handleThreadDeleted(ctx, evt) + } + + // ... rest of existing handleDeleted code unchanged ... 
+``` + +Add the stub: + +```go +func (h *Handler) handleThreadDeleted(ctx context.Context, evt *model.MessageEvent) error { + return nil +} +``` + +- [x] **Step 4: Run tests — confirm different failure** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: `TestHandler_HandleThreadDeleted` FAILS — stub returns nil, no publishes happen. All pre-existing tests still pass. + +- [x] **Step 5: Implement `handleThreadDeleted`** + +Replace the stub in `broadcast-worker/handler.go`: + +```go +func (h *Handler) handleThreadDeleted(ctx context.Context, evt *model.MessageEvent) error { + msg := evt.Message + + room, err := h.store.GetRoom(ctx, msg.RoomID) + if err != nil { + return fmt.Errorf("fetch room %s: %w", msg.RoomID, err) + } + + threadSubs, err := h.store.ListThreadSubscriptions(ctx, msg.ThreadParentMessageID, evt.SiteID) + if err != nil { + return fmt.Errorf("list thread subscriptions for parent %s: %w", msg.ThreadParentMessageID, err) + } + + del := model.DeleteRoomEvent{ + Type: model.RoomEventMessageDeleted, + RoomID: room.ID, + SiteID: room.SiteID, + Timestamp: time.Now().UTC().UnixMilli(), + MessageID: msg.ID, + DeletedBy: msg.UserAccount, + DeletedAt: *msg.UpdatedAt, + UpdatedAt: *msg.UpdatedAt, + } + + payload, err := json.Marshal(del) + if err != nil { + return fmt.Errorf("marshal thread delete event: %w", err) + } + + for i := range threadSubs { + if threadSubs[i].UserAccount == msg.UserAccount { + continue + } + if err := h.pub.Publish(ctx, subject.UserRoomEvent(threadSubs[i].UserAccount), payload); err != nil { + slog.Error("publish thread delete event failed", + "error", err, + "account", threadSubs[i].UserAccount, + "parentMessageID", msg.ThreadParentMessageID, + ) + } + } + return nil +} +``` + +- [x] **Step 6: Run tests — confirm pass** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: all tests pass including `TestHandler_HandleThreadDeleted`. 
+ +- [x] **Step 7: Commit** + +```bash +git add broadcast-worker/handler.go broadcast-worker/handler_test.go +git commit -m "feat(broadcast-worker): fan-out thread reply delete events to thread subscribers" +``` + +--- + +## Task 5: Final verification + +- [x] **Step 1: Run lint** + +```bash +make lint +``` + +Expected: exits 0, no errors. + +- [x] **Step 2: Run all unit tests with race detector** + +```bash +make test SERVICE=broadcast-worker +``` + +Expected: all tests pass with `-race`. + +- [x] **Step 3: Run integration tests** + +```bash +make test-integration SERVICE=broadcast-worker +``` + +Expected: all tests pass. + +- [x] **Step 4: Push branch** + +```bash +git push -u origin claude/gallant-galileo-ice0C +``` + +--- + +## Out-of-scope reminder + +`Subscription.ThreadUnread` — the array on a user's room subscription that tracks unread thread parent message IDs — is NOT updated by broadcast-worker. This is a known gap in message-worker that must be addressed in a separate task. Without it, the unread thread badge on the client will not reflect new thread replies until the user reads the thread explicitly. + +--- + +## Post-Plan Fixes and Refactoring + +After the four tasks above were complete, three rounds of high-effort code review (`/code-review --effort high`) and a simplification pass (`/simplify`) identified and fixed additional issues. All changes are in PR #245 on branch `claude/gallant-galileo-ice0C`. + +### Correctness fixes (broadcast-worker) + +- **`evt.Timestamp` propagation** (`fix(broadcast-worker): propagate evt.Timestamp`): `EditRoomEvent` and `DeleteRoomEvent` were stamping `Timestamp` with `time.Now()` at broadcast time instead of forwarding `evt.Timestamp` from the canonical event. This caused the timestamp to differ across JetStream redeliveries and drift from the canonical timeline. Fixed in all four edit/delete handlers; unit and integration tests updated with exact-equality assertions. 
+ +- **TShow=true badge on delete** (`fix(broadcast-worker): publish tcount badge for TShow=true deleted thread replies`): When a `TShow=true` thread reply is deleted, `handleDeleted` takes the normal room broadcast path and `handleThreadDeleted` is never called. The tcount badge update was therefore never published. Fixed by detecting `ThreadParentMessageID != ""` in `handleDeleted` and calling `publishThreadBadge` there. + +- **history-service tcount errors best-effort** (`fix(history-service): treat messages_by_room tcount errors as best-effort in decrementParentTcount`): Cassandra errors on the secondary `messages_by_room` tcount mirror were propagating and causing JetStream redelivery, re-running a CAS-decrement that was already committed on `messages_by_id`. Fixed by logging the mirror error and returning the already-decremented value. + +### Simplification and defensive fixes (broadcast-worker) + +- **`shouldUseThreadFanOut` rename** (was `isThreadReply`): the predicate encodes a routing decision, not structural identity — renamed for clarity at all 3 call sites. + +- **`buildEditRoomEvent` / `buildDeleteRoomEvent` helpers**: eliminated duplicated 9-field struct literals that appeared independently in the non-thread and thread handler variants. + +- **`publishThreadBadge` helper**: consolidated the `publishThreadMetadata` + error-log pattern duplicated in `handleThreadDeleted` and `handleDeleted`. + +- **`handleThreadDeleted` default-branch `return nil` removed**: the early return silently skipped the tcount badge block for unknown room types. Badge block now runs unconditionally after the switch. + +- **Nil guard in `handleThreadUpdated`**: defensive `EditedAt`/`UpdatedAt` nil check added before dereferencing, mirroring the outer guard in `handleUpdated`. 
+ +- **Integration test timestamp assertion** (`TestBroadcastWorker_ThreadDeleted_Integration`): updated from a wall-clock range check (stale `#13` comment) to an exact `evt.Timestamp` equality assertion, consistent with the corrected implementation. diff --git a/docs/superpowers/plans/2026-06-04-tcount-count-based.md b/docs/superpowers/plans/2026-06-04-tcount-count-based.md new file mode 100644 index 000000000..1af201d05 --- /dev/null +++ b/docs/superpowers/plans/2026-06-04-tcount-count-based.md @@ -0,0 +1,630 @@ +# tcount: COUNT-Based Approach Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace the CAS increment/decrement approach for `tcount` with a COUNT of non-deleted rows from `thread_messages_by_thread`, eliminating the crash window and simplifying the code in both `message-worker` and `history-service`. + +**Architecture:** Each time a thread reply is added or deleted, derive the authoritative `tcount` by iterating the `thread_messages_by_thread` partition (one partition = one thread) and counting non-deleted rows in Go. Then blind-SET that value on the parent row in both `messages_by_id` and `messages_by_room`. Because the count is re-derived from the source of truth on every write — including JetStream redeliveries — there is no crash window and no need for CAS loops, sentinel columns, or schema changes. + +**Tech Stack:** Go 1.25, gocql, testify, testcontainers-go (via `pkg/testutil`). All tests run via `make test` (unit) and `make test-integration SERVICE=` (integration). + +--- + +## Background + +### Why tcount is broken today + +`SaveThreadMessage` (message-worker) does three things: +1. LWT INSERT reply into `messages_by_id` — idempotency gate (`IF NOT EXISTS`) +2. Plain INSERT into `thread_messages_by_thread` — idempotent +3. 
If `applied=true` → `incrementParentTcount` (CAS increment on parent in both tables) + If `applied=false` → `readParentTcount` (read-only, no increment) + +**Crash window:** process crashes after step 1 but before step 3. On JetStream redelivery, `applied=false` → step 3 is skipped permanently → the missed increment is never applied, so `tcount` permanently undercounts by one (e.g. it stays at 0 forever when the lost reply was the thread's first). + +`decrementParentTcount` in history-service has the same pattern for deletes. + +### Fix + +After the INSERT into `thread_messages_by_thread`, always derive the count from that table and SET it on the parent — regardless of `applied`. The partition contains every reply for the thread. Counting non-deleted rows is always correct and always repeatable. + +--- + +## Files Changed + +| File | Action | +|------|--------| +| `message-worker/store_cassandra.go` | Remove `casMaxRetries`, `casIncrement`, `readParentTcount`, `incrementParentTcount`. Add `countThreadReplies`, `setParentTcount`, `countAndSetParentTcount`. Simplify `SaveThreadMessage` and `saveThreadMessageEncrypted`. | +| `message-worker/integration_test.go` | Add `TestCassandraStore_SaveThreadMessage_CountBasedTcount` (the new failing test). Existing tests stay; they pass because the observable behavior is unchanged. | +| `history-service/internal/cassrepo/write.go` | Remove `casMaxRetries`, `casDecrement`, `decrementParentTcount`. Add `countThreadReplies`, `setParentTcount`, `countAndSetParentTcount`. Update `SoftDeleteMessage` to call `countAndSetParentTcount`. | +| `history-service/internal/cassrepo/cas_test.go` | Delete — tests `casDecrement` which is being removed. | +| `history-service/internal/cassrepo/write_integration_test.go` | Rewrite `TestRepository_SoftDeleteMessage_DecrementsParentTcount` to seed `thread_messages_by_thread` rows (not a hard-coded `tcount=3` in the parent) and verify the COUNT-derived result. 
| + +--- + +## Task 1 — message-worker: Write the failing integration test + +**Files:** +- Modify: `message-worker/integration_test.go` + +The new test pre-seeds 2 reply rows directly into `thread_messages_by_thread` before calling `SaveThreadMessage` for a third reply. With the old CAS approach the parent starts at `tcount=null` → increments to 1. With the new COUNT approach it reads 3 rows in the partition → sets tcount=3. + +- [ ] **Step 1: Write the failing test** + +Add this function at the end of `message-worker/integration_test.go`, as a new top-level function after the last existing one: + +```go +func TestCassandraStore_SaveThreadMessage_CountBasedTcount(t *testing.T) { + cassSession := setupCassandra(t) + store := NewCassandraStore(cassSession, msgbucket.New(24*time.Hour), nil) + ctx := context.Background() + + parentCreatedAt := time.Now().UTC().Truncate(time.Millisecond) + parentBucket := msgbucket.New(24 * time.Hour).Of(parentCreatedAt) + + parentSender := &cassParticipant{ID: "u-cnt-parent", Account: "alice", EngName: "Alice"} + parentMsg := &model.Message{ + ID: "cnt-parent", + RoomID: "cnt-room", + UserID: "u-cnt-parent", + CreatedAt: parentCreatedAt, + Content: "parent message", + } + require.NoError(t, store.SaveMessage(ctx, parentMsg, parentSender, "site-a")) + + // Pre-seed two existing replies directly in thread_messages_by_thread. + // This simulates replies that were already processed before a crash — + // the kind of state that the CAS approach can't recover from but COUNT can. 
+ threadRoomID := "tr-cnt-1" + t1 := parentCreatedAt.Add(1 * time.Minute) + t2 := parentCreatedAt.Add(2 * time.Minute) + for _, row := range []struct { + msgID string + createdAt time.Time + }{ + {"cnt-reply-pre-1", t1}, + {"cnt-reply-pre-2", t2}, + } { + require.NoError(t, cassSession.Query( + `INSERT INTO thread_messages_by_thread + (thread_room_id, created_at, message_id, room_id, thread_parent_id, deleted) + VALUES (?, ?, ?, ?, ?, ?)`, + threadRoomID, row.createdAt, row.msgID, "cnt-room", "cnt-parent", false, + ).Exec()) + } + + // Now process a third reply via SaveThreadMessage. + t3 := parentCreatedAt.Add(3 * time.Minute) + replySender := &cassParticipant{ID: "u-cnt-replier", Account: "bob", EngName: "Bob"} + replyMsg := &model.Message{ + ID: "cnt-reply-3", + RoomID: "cnt-room", + UserID: "u-cnt-replier", + Content: "third reply", + CreatedAt: t3, + ThreadParentMessageID: "cnt-parent", + ThreadParentMessageCreatedAt: &parentCreatedAt, + } + newTcount, err := store.SaveThreadMessage(ctx, replyMsg, replySender, "site-a", threadRoomID) + require.NoError(t, err) + + // COUNT of non-deleted rows in the partition = 3 (2 pre-seeded + 1 just inserted). + // The old CAS approach would give 1 (increments from null). + require.NotNil(t, newTcount, "newTcount must not be nil — parent tcount must be updated") + assert.Equal(t, 3, *newTcount, "tcount must equal the COUNT of non-deleted replies in the partition") + + t.Run("tcount=3 written to messages_by_id", func(t *testing.T) { + var tcount int + require.NoError(t, cassSession.Query( + `SELECT tcount FROM messages_by_id WHERE message_id = ? AND created_at = ?`, + "cnt-parent", parentCreatedAt, + ).Scan(&tcount)) + assert.Equal(t, 3, tcount) + }) + + t.Run("tcount=3 written to messages_by_room", func(t *testing.T) { + var tcount int + require.NoError(t, cassSession.Query( + `SELECT tcount FROM messages_by_room WHERE room_id = ? AND bucket = ? AND created_at = ? 
AND message_id = ?`, + "cnt-room", parentBucket, parentCreatedAt, "cnt-parent", + ).Scan(&tcount)) + assert.Equal(t, 3, tcount) + }) +} +``` + +- [ ] **Step 2: Run the test to verify it FAILS** + +```bash +make test-integration SERVICE=message-worker +``` + +Expected: `TestCassandraStore_SaveThreadMessage_CountBasedTcount` FAILS with something like `assert.Equal: expected 3, got 1`. + +--- + +## Task 2 — message-worker: Implement COUNT-based approach + +**Files:** +- Modify: `message-worker/store_cassandra.go` + +- [ ] **Step 1: Add the three new functions after the `buildCassandraMessage` function (around line 330)** + +Insert the following block after `buildCassandraMessage` and before the `casMaxRetries` constant: + +```go +// countThreadReplies returns the number of non-deleted replies in the thread +// by iterating the thread_messages_by_thread partition and counting in Go. +// Using Go-side filtering handles the null/false/true ambiguity: message-worker +// never writes deleted on INSERT (null), history-service writes true on delete. +// Both null and false are treated as "active". +func (s *CassandraStore) countThreadReplies(ctx context.Context, threadRoomID string) (int, error) { + iter := s.cassSession.Query( + `SELECT deleted FROM thread_messages_by_thread WHERE thread_room_id = ?`, + threadRoomID, + ).WithContext(ctx).Iter() + var deleted *bool + n := 0 + for iter.Scan(&deleted) { + if deleted == nil || !*deleted { + n++ + } + } + if err := iter.Close(); err != nil { + return 0, fmt.Errorf("count thread replies for thread %s: %w", threadRoomID, err) + } + return n, nil +} + +// setParentTcount blindly overwrites tcount on the parent message row in both +// messages_by_id and messages_by_room. No IF guard — idempotent on redelivery +// because the source value is always re-derived from countThreadReplies. 
+func (s *CassandraStore) setParentTcount(ctx context.Context, msg *model.Message, n int) error { + parentID := msg.ThreadParentMessageID + parentCreatedAt := *msg.ThreadParentMessageCreatedAt + parentBucket := s.bucket.Of(parentCreatedAt) + + if err := s.cassSession.Query( + `UPDATE messages_by_id SET tcount = ? WHERE message_id = ? AND created_at = ?`, + n, parentID, parentCreatedAt, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("set tcount on parent %s in messages_by_id: %w", parentID, err) + } + if err := s.cassSession.Query( + `UPDATE messages_by_room SET tcount = ? WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ?`, + n, msg.RoomID, parentBucket, parentCreatedAt, parentID, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("set tcount on parent %s in messages_by_room: %w", parentID, err) + } + return nil +} + +// countAndSetParentTcount counts non-deleted replies in the thread and writes +// the result to the parent row in both tables. Safe to call on any delivery +// (first or redelivery) — both the count and the SET are idempotent. +// Returns (nil, nil) when ThreadParentMessageCreatedAt is unset (same semantics +// as the old incrementParentTcount). 
+func (s *CassandraStore) countAndSetParentTcount(ctx context.Context, msg *model.Message, threadRoomID string) (*int, error) { + if msg.ThreadParentMessageCreatedAt == nil { + return nil, nil + } + n, err := s.countThreadReplies(ctx, threadRoomID) + if err != nil { + return nil, fmt.Errorf("count thread replies: %w", err) + } + if err := s.setParentTcount(ctx, msg, n); err != nil { + return nil, err + } + return &n, nil +} +``` + +- [ ] **Step 2: Simplify `SaveThreadMessage` — replace the applied branch** + +In `SaveThreadMessage` (around line 217), replace: +```go + if !applied { + return s.readParentTcount(ctx, msg) + } + return s.incrementParentTcount(ctx, msg) +``` +with: +```go + return s.countAndSetParentTcount(ctx, msg, threadRoomID) +``` + +Also change `applied, err :=` to `_, err :=` on the `MapScanCAS` call (line 189) since `applied` is no longer used: +```go + _, err := s.cassSession.Query( +``` + +- [ ] **Step 3: Simplify `saveThreadMessageEncrypted` — same replacement** + +In `saveThreadMessageEncrypted` (around line 275), replace: +```go + if !applied { + return s.readParentTcount(ctx, msg) + } + return s.incrementParentTcount(ctx, msg) +``` +with: +```go + return s.countAndSetParentTcount(ctx, msg, threadRoomID) +``` + +Also change `applied, err :=` to `_, err :=` on that function's `MapScanCAS` call (line 245). + +- [ ] **Step 4: Remove the three dead functions and the `casMaxRetries` constant** + +Delete the following from `store_cassandra.go`: +- `const casMaxRetries = 16` (line 336) +- The entire `readParentTcount` function (lines 288–304) +- The entire `casIncrement` function (lines 343–360) +- The entire `incrementParentTcount` function (lines 374–427) + +Also remove the comment block above `casMaxRetries` (lines 332–336). + +- [ ] **Step 5: Run integration tests** + +```bash +make test-integration SERVICE=message-worker +``` + +Expected: ALL tests pass, including `TestCassandraStore_SaveThreadMessage_CountBasedTcount`. 
+ +- [ ] **Step 6: Run unit tests and lint** + +```bash +make test SERVICE=message-worker && make lint +``` + +Expected: PASS with no errors. + +- [ ] **Step 7: Commit** + +```bash +git add message-worker/store_cassandra.go message-worker/integration_test.go +git commit -m "message-worker: replace tcount CAS increment with COUNT-based approach + +tcount is now derived from thread_messages_by_thread on every write +(add or redelivery) and blind-SET on the parent row. Eliminates the +crash window between LWT INSERT and the old incrementParentTcount. + +Removes casMaxRetries, casIncrement, readParentTcount, and +incrementParentTcount. Adds countThreadReplies, setParentTcount, +and countAndSetParentTcount." +``` + +--- + +## Task 3 — history-service: Write the failing integration test + +**Files:** +- Modify: `history-service/internal/cassrepo/write_integration_test.go` + +The existing `TestRepository_SoftDeleteMessage_DecrementsParentTcount` seeds `tcount=3` directly in the parent tables so the CAS decrement has something to work from. Rewrite it to instead seed 3 rows in `thread_messages_by_thread` and verify the result is the COUNT (2 remaining after delete), not the CAS decrement. 
+ +- [ ] **Step 1: Rewrite the test** + +Find `TestRepository_SoftDeleteMessage_DecrementsParentTcount` (around line 396) and replace the entire function with: + +```go +func TestRepository_SoftDeleteMessage_DecrementsParentTcount(t *testing.T) { + session := setupCassandra(t) + repo := NewRepository(session, msgbucket.New(24*time.Hour), 365, nil) + ctx := context.Background() + + sender := models.Participant{ID: "u1", Account: "alice"} + roomID := "room-tcount" + threadRoomID := "thread-tcount" + parentID := "m-tcount-parent" + parentCreatedAt := time.Now().UTC().Truncate(time.Millisecond) + parentBucket := msgbucket.New(24 * time.Hour).Of(parentCreatedAt) + replyID := "m-tcount-reply" + replyCreatedAt := parentCreatedAt.Add(10 * time.Second) + + // Seed parent WITHOUT a pre-existing tcount — the COUNT approach derives + // the authoritative value from thread_messages_by_thread, not from a + // materialized counter in the parent tables. + require.NoError(t, session.Query( + `INSERT INTO messages_by_id (message_id, room_id, created_at, sender, msg, deleted) VALUES (?, ?, ?, ?, ?, ?)`, + parentID, roomID, parentCreatedAt, sender, "parent", false, + ).Exec()) + require.NoError(t, session.Query( + `INSERT INTO messages_by_room (room_id, bucket, created_at, message_id, sender, msg, deleted) VALUES (?, ?, ?, ?, ?, ?, ?)`, + roomID, parentBucket, parentCreatedAt, parentID, sender, "parent", false, + ).Exec()) + + // Seed three reply rows in thread_messages_by_thread: two survivors and one + // being deleted. This is the source of truth for the COUNT. 
+ for _, row := range []struct { + id string + off time.Duration + }{ + {"m-tcount-survivor-1", 1 * time.Second}, + {"m-tcount-survivor-2", 2 * time.Second}, + {replyID, 10 * time.Second}, + } { + require.NoError(t, session.Query( + `INSERT INTO thread_messages_by_thread + (thread_room_id, created_at, message_id, room_id, thread_parent_id, deleted) + VALUES (?, ?, ?, ?, ?, ?)`, + threadRoomID, parentCreatedAt.Add(row.off), row.id, roomID, parentID, false, + ).Exec()) + } + + // Seed the reply being deleted in messages_by_id. + require.NoError(t, session.Query( + `INSERT INTO messages_by_id + (message_id, room_id, created_at, sender, msg, thread_parent_id, thread_parent_created_at, thread_room_id, deleted) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + replyID, roomID, replyCreatedAt, sender, "reply", parentID, parentCreatedAt, threadRoomID, false, + ).Exec()) + + parentCreatedAtPtr := parentCreatedAt + msg := &models.Message{ + MessageID: replyID, + RoomID: roomID, + CreatedAt: replyCreatedAt, + Sender: sender, + ThreadParentID: parentID, + ThreadParentCreatedAt: &parentCreatedAtPtr, + ThreadRoomID: threadRoomID, + } + _, applied, newTcount, err := repo.SoftDeleteMessage(ctx, msg, replyCreatedAt.Add(time.Minute)) + require.NoError(t, err) + require.True(t, applied, "first delete should apply") + require.NotNil(t, newTcount, "newTcount must be non-nil after a successful thread-reply delete") + assert.Equal(t, 2, *newTcount, "tcount must equal COUNT of non-deleted rows: 3 seeded - 1 deleted = 2") + + var gotTcount int + require.NoError(t, session.Query( + `SELECT tcount FROM messages_by_id WHERE message_id = ? AND created_at = ?`, + parentID, parentCreatedAt, + ).Scan(&gotTcount)) + assert.Equal(t, 2, gotTcount, "messages_by_id.tcount must be set to COUNT result") + + require.NoError(t, session.Query( + `SELECT tcount FROM messages_by_room WHERE room_id = ? AND bucket = ? AND created_at = ? 
AND message_id = ?`, + roomID, parentBucket, parentCreatedAt, parentID, + ).Scan(&gotTcount)) + assert.Equal(t, 2, gotTcount, "messages_by_room.tcount must be set to COUNT result") +} +``` + +- [ ] **Step 2: Run the test to verify it FAILS** + +```bash +make test-integration SERVICE=history-service +``` + +Expected: `TestRepository_SoftDeleteMessage_DecrementsParentTcount` FAILS — `newTcount` is `<nil>`, so the `require.NotNil` guard trips before `assert.Equal` can compare it with 2 (casDecrement returns nil for a nil initial value, since no tcount was seeded in the parent). + +--- + +## Task 4 — history-service: Implement COUNT-based approach + +**Files:** +- Modify: `history-service/internal/cassrepo/write.go` +- Delete: `history-service/internal/cassrepo/cas_test.go` + +- [ ] **Step 1: Add the three new functions to `write.go`** + +Insert after `ErrMessageNotFound` (around line 60), before the `casMaxRetries` constant: + +```go +// countThreadReplies returns the number of non-deleted replies in the thread +// by iterating the thread_messages_by_thread partition and counting in Go. +// null and false are both treated as "active" — message-worker never writes +// deleted on INSERT (null), history-service writes true on delete. +func (r *Repository) countThreadReplies(ctx context.Context, threadRoomID string) (int, error) { + iter := r.session.Query( + `SELECT deleted FROM thread_messages_by_thread WHERE thread_room_id = ?`, + threadRoomID, + ).WithContext(ctx).Iter() + var deleted *bool + n := 0 + for iter.Scan(&deleted) { + if deleted == nil || !*deleted { + n++ + } + } + if err := iter.Close(); err != nil { + return 0, fmt.Errorf("count thread replies for thread %s: %w", threadRoomID, err) + } + return n, nil +} + +// setParentTcount blindly overwrites tcount on the parent message row in both +// messages_by_id and messages_by_room. No IF guard — idempotent because the +// value is always re-derived from countThreadReplies. 
+func (r *Repository) setParentTcount(ctx context.Context, msg *models.Message, n int) error { + parentID := msg.ThreadParentID + parentCreatedAt := *msg.ThreadParentCreatedAt + parentBucket := r.bucket.Of(parentCreatedAt) + + if err := r.session.Query( + `UPDATE messages_by_id SET tcount = ? WHERE message_id = ? AND created_at = ?`, + n, parentID, parentCreatedAt, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("set tcount on parent %s in messages_by_id: %w", parentID, err) + } + if err := r.session.Query( + `UPDATE messages_by_room SET tcount = ? WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ?`, + n, msg.RoomID, parentBucket, parentCreatedAt, parentID, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("set tcount on parent %s in messages_by_room: %w", parentID, err) + } + return nil +} + +// countAndSetParentTcount counts non-deleted replies for the thread and +// writes the result to the parent row in both tables. Safe on any delivery. +// Returns (nil, nil) when ThreadParentCreatedAt is unset. +func (r *Repository) countAndSetParentTcount(ctx context.Context, msg *models.Message) (*int, error) { + if msg.ThreadParentCreatedAt == nil { + return nil, nil + } + n, err := r.countThreadReplies(ctx, msg.ThreadRoomID) + if err != nil { + return nil, fmt.Errorf("count thread replies: %w", err) + } + if err := r.setParentTcount(ctx, msg, n); err != nil { + return nil, err + } + return &n, nil +} +``` + +- [ ] **Step 2: Replace `decrementParentTcount` call in `SoftDeleteMessage`** + +In `SoftDeleteMessage` (around line 362), replace: +```go + newTcount, err := r.decrementParentTcount(ctx, msg) + if err != nil { + // The LWT delete already committed — return applied=true so callers correctly + // identify this as a decrement failure rather than a concurrent-winner race. 
+ return deletedAt, true, nil, fmt.Errorf("decrement parent tcount for message %s: %w", msg.MessageID, err) + } + return deletedAt, true, newTcount, nil +``` +with: +```go + newTcount, err := r.countAndSetParentTcount(ctx, msg) + if err != nil { + return deletedAt, true, nil, fmt.Errorf("set parent tcount for message %s: %w", msg.MessageID, err) + } + return deletedAt, true, newTcount, nil +``` + +- [ ] **Step 3: Remove dead code from `write.go`** + +Delete the following from `write.go`: +- `const casMaxRetries = 16` (line 25) and its comment block (lines 17–25) +- The entire `casDecrement` function (lines 76–104) and its comment +- The entire `decrementParentTcount` function (lines 371–438) and its comment + +- [ ] **Step 4: Delete `cas_test.go`** + +```bash +rm /home/user/chat/history-service/internal/cassrepo/cas_test.go +``` + +This file only tests `casDecrement`, which is now removed. + +- [ ] **Step 5: Run integration tests** + +```bash +make test-integration SERVICE=history-service +``` + +Expected: ALL tests pass, including the rewritten `TestRepository_SoftDeleteMessage_DecrementsParentTcount`. + +- [ ] **Step 6: Run unit tests and lint** + +```bash +make test SERVICE=history-service && make lint +``` + +Expected: PASS. + +- [ ] **Step 7: Commit** + +```bash +git add history-service/internal/cassrepo/write.go \ + history-service/internal/cassrepo/write_integration_test.go +git rm history-service/internal/cassrepo/cas_test.go +git commit -m "history-service: replace tcount CAS decrement with COUNT-based approach + +tcount on delete is now derived from thread_messages_by_thread COUNT +and blind-SET on the parent row, matching the message-worker add path. +Eliminates the crash window between the LWT soft-delete and the old +decrementParentTcount. + +Removes casMaxRetries, casDecrement, decrementParentTcount, and +cas_test.go. Adds countThreadReplies, setParentTcount, and +countAndSetParentTcount." 
+``` + +--- + +## Task 5 — Final verification + +- [ ] **Step 1: Run all unit tests** + +```bash +make test +``` + +Expected: PASS. + +- [ ] **Step 2: Run all integration tests for both services** + +```bash +make test-integration SERVICE=message-worker +make test-integration SERVICE=history-service +``` + +Expected: PASS. + +- [ ] **Step 3: Run lint across the repo** + +```bash +make lint +``` + +Expected: PASS with no errors or warnings. + +- [ ] **Step 4: Push** + +```bash +git push -u origin claude/gallant-galileo-ice0C +``` + +--- + +## Self-Review + +**Spec coverage:** +- ✅ crash window on add path (message-worker) — fixed by `countAndSetParentTcount` +- ✅ crash window on delete path (history-service) — fixed by `countAndSetParentTcount` +- ✅ no schema change — confirmed, no DDL touched +- ✅ redelivery safe — COUNT + blind SET is idempotent +- ✅ existing behavior preserved — tcount still updated in both `messages_by_id` and `messages_by_room` +- ✅ nil `ThreadParentMessageCreatedAt` returns `(nil, nil)` — preserved + +**Placeholder scan:** None found. + +**Type consistency:** +- `countAndSetParentTcount` in message-worker: `(ctx, *model.Message, threadRoomID string) (*int, error)` — consistent across Task 2 steps +- `countAndSetParentTcount` in history-service: `(ctx, *models.Message) (*int, error)` — consistent across Task 4 steps (ThreadRoomID is a field on `models.Message`, so it's not a separate arg) + +**One known gap:** The `casRow` map in `SaveThreadMessage` and `saveThreadMessageEncrypted` is allocated but its contents are never read after the change. It is still needed because `MapScanCAS` must absorb all existing columns when the LWT is not applied — switching to `ScanCAS` with no destinations would cause the "not enough columns to scan" panic that was already fixed once. Keep `casRow` allocated and passed to `MapScanCAS`; it is a required absorber even though its values are unused. 
+ +--- + +## Known Trade-offs and Future Work + +### O(N) partition scan in `countThreadReplies` + +**Current behavior:** `countThreadReplies` streams every row in the `thread_messages_by_thread` partition for the thread and counts non-deleted rows in Go. For a thread with *N* replies this is O(N) on every add or delete event. + +**Why it was designed this way:** The O(N) scan is the minimum-complexity design that achieves full crash-safety. The alternative — a stored CAS counter (increment/decrement) — has a 2PC crash window: a crash between the Cassandra write succeeding and the counter update leaves the counter permanently wrong. COUNT gives the ground truth at the moment of the SET, so any JetStream redelivery converges to the correct value regardless of how many times the handler ran. + +**Planned improvement (target: follow-up PR):** + +Replace the Go-side row scan with a dedicated Cassandra COUNTER table: + +```sql +CREATE TABLE thread_reply_counts ( + thread_room_id text PRIMARY KEY, + count counter +); +``` + +- **Add-path:** `UPDATE thread_reply_counts SET count = count + 1 WHERE thread_room_id = ?` after the LWT INSERT succeeds. +- **Delete-path:** `UPDATE thread_reply_counts SET count = count - 1 WHERE thread_room_id = ?` after the soft-delete. +- **Crash-safety:** a periodic reconciliation job (scheduled, low-frequency) overwrites the COUNTER with the true `SELECT COUNT(*)` scan. This bounds drift to the reconciliation interval — O(N) scan becomes a maintenance operation, not a hot path. +- **Read-path:** `tcount` value comes from this COUNTER table instead of the live scan, making it O(1). + +Until the COUNTER table ships, the current O(N) scan is correct behavior. Threads with fewer than ~500 replies see sub-millisecond scan latency on a well-partitioned Cassandra cluster; the trade-off is acceptable for the initial rollout. 
diff --git a/docs/superpowers/specs/2026-05-28-broadcast-worker-thread-handling-design.md b/docs/superpowers/specs/2026-05-28-broadcast-worker-thread-handling-design.md new file mode 100644 index 000000000..988e4df75 --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-broadcast-worker-thread-handling-design.md @@ -0,0 +1,215 @@ +# Broadcast Worker: Thread Message Handling + +**Date:** 2026-05-28 +**Status:** Implemented — PR #245 (`claude/gallant-galileo-ice0C`) + +## Implementation Notes (what changed from this design) + +The PR delivered everything in this spec plus several additions discovered during implementation. Key divergences: + +### broadcast-worker additions +- **DM/BotDM thread handling**: all three thread handlers (`handleThreadCreated`, `handleThreadUpdated`, `handleThreadDeleted`) gained a DM/BotDM branch. Channel thread replies fan out to thread subscribers only; DM thread replies fan out to all DM members via `publishDMEvents` (DMs have no concept of "only thread subscribers — deliver to everyone"). `handleCreated` delegates to `handleThreadCreated`, which routes by room type. +- **`EventThreadReplyAdded` badge handler**: broadcast-worker gained `handleThreadTCountUpdated` to process the new `EventThreadReplyAdded` event published by message-worker, sending a `ThreadMetadataUpdatedEvent` (reply count badge update) to channel rooms and DM members. +- **Thread delete badge**: `handleThreadDeleted` also publishes the badge update (`publishThreadMetadata`) when `evt.NewTCount != nil`, so clients see the reply count decrement immediately. +- **TShow=true thread-reply badge on delete**: when a `TShow=true` thread reply is deleted it flows through `handleDeleted` (normal room broadcast), bypassing `handleThreadDeleted`. `handleDeleted` detects `ThreadParentMessageID != ""` and publishes the tcount badge there, so the reply-count decrement reaches clients regardless of TShow. 
+- **@-mention fan-out on delete**: `handleThreadDeleted` channel path parses @-mentions from the deleted message content so non-subscriber recipients who received the `EventCreated` (via mention fan-out) also receive the `EventDeleted`. +- **`SetSubscriptionMentions` — DM/BotDM branch only**: `handleThreadCreated` sets the room-subscription mention flag for @-mentioned members *only on the DM/BotDM path*. The channel path deliberately does **not** call `SetSubscriptionMentions`: a `TShow=false` reply is invisible in the main channel feed, so a room-level mention badge would appear with no visible message to explain it. (Thread-level mention state for channel replies is handled upstream by message-worker via `MarkThreadSubscriptionMention`.) +- **`channelThreadFanOut` helper**: extracted to avoid repeating the follower-query + dedup logic across all three channel-path handlers. Uses `GetThreadFollowers` (a `FindOne` projection on `thread_rooms.replyAccounts`) instead of a `thread_subscriptions` cursor scan — same pattern as PR #237's `notification-worker`. The `siteID` parameter was dropped because broadcast-worker is a single-site service. +- **Dead code removed**: `EventThreadReplyDeleted` (never published by any service) and `MsgCanonicalThreadReply` (non-standard subject replaced by routing badge events over `.created`) were both removed. Badge events (`EventThreadReplyAdded`) are routed over `chat.msg.canonical.{siteID}.created` with the event discriminator in the payload — consumers use `evt.Event == model.EventThreadReplyAdded` to distinguish them from real new messages. +- **`evt.Timestamp` propagation**: `EditRoomEvent` and `DeleteRoomEvent` set `Timestamp` from `evt.Timestamp` (the canonical event's publish time), not from `time.Now()` at broadcast time. Using wall-clock time at broadcast would make the timestamp differ across redeliveries and lag behind the canonical timeline. 
+- **`shouldUseThreadFanOut` rename** (was `isThreadReply`): the predicate encodes a routing decision (fan-out to thread subscribers vs. room broadcast), not a structural "is a thread reply" test — the new name makes that intent explicit. +- **`buildEditRoomEvent` / `buildDeleteRoomEvent` helpers**: extracted to eliminate the duplicated 9-field struct literals that appeared independently in the non-thread and thread variants of each handler. +- **`publishThreadBadge` helper**: consolidates the `publishThreadMetadata` call + error-log pattern that was duplicated in `handleThreadDeleted` and the TShow=true path of `handleDeleted`. +- **`handleThreadDeleted` default-branch `return nil` removed**: the premature return was silently skipping the tcount badge block for unknown room types. The badge block now runs unconditionally after the switch; `publishThreadMetadata` handles unknown room types gracefully via its own default branch. +- **Nil guard in `handleThreadUpdated`**: added a defensive `EditedAt`/`UpdatedAt` nil check (mirrors the outer guard in `handleUpdated`) to prevent a panic if the function is ever called without the outer guard in a future refactor. + +### message-worker additions +- **`SaveThreadMessage` returns `*int` (new tcount)**: the store method was extended to return the post-CAS tcount from Cassandra so message-worker can publish the `EventThreadReplyAdded` event with an authoritative count. +- **Idempotent CAS increment via `IF NOT EXISTS`**: `SaveThreadMessage` inserts the reply into `messages_by_id` with an `IF NOT EXISTS` LWT. On a JetStream redelivery the insert reports `applied=false`, so the handler reads back the existing tcount (`readParentTcount`) instead of re-incrementing (`incrementParentTcount`). This is what makes publish-error retries safe — a redelivered reply never double-counts the badge. +- **`publishThreadReplyEvent`**: new handler method that publishes `EventThreadReplyAdded` to `subject.MsgCanonicalCreated(siteID)` (`chat.msg.canonical.{siteID}.created`; the dedicated `MsgCanonicalThreadReply` subject was removed). 
Publish errors are **propagated** (NAK → JetStream redelivery), not swallowed — otherwise a dropped publish would permanently lose the badge update. Redelivery is safe because of the `IF NOT EXISTS` LWT above (the increment is idempotent), so there is no double-increment risk. +- **Thread subscription outbox**: `message-worker` publishes `OutboxThreadSubscriptionUpserted` events for remote-site thread subscribers. + +### history-service additions +- **`SoftDeleteMessage` returns `newTcount`**: the cassrepo method now runs a CAS decrement on `messages_by_id.tcount` (and mirrors to `messages_by_room`) and returns the post-CAS value so the service layer can include it in the `EventDeleted` publish. +- **`decrementParentTcount`**: new cassrepo helper that does the Cassandra CAS tcount decrement for deleted thread replies. Returns `nil` for legacy rows with NULL tcount (nothing authoritative to publish). +- **Already-deleted retry path re-publishes with tcount**: the already-deleted short-circuit in `DeleteMessage` re-fetches the parent's current tcount and re-publishes the `EventDeleted` event so a lost badge update gets retried. +- **`EventDeleted` carries `Content`**: history-service populates `Message.Content` on `EventDeleted` payloads so broadcast-worker's mention parser can build the correct fan-out list for deleted thread replies. + +### notification-worker — intentionally unchanged +notification-worker was scoped out of this PR. All planned changes to it are documented in [`docs/thread-reply-notifications.md`](../../thread-reply-notifications.md). The engineer who owns notification-worker should read that file before starting. The three things that need to be built there are: (1) filter to `EventCreated` only, (2) route thread replies to thread subscribers only, (3) notify @-mentioned non-subscribers via `EventThreadReplyAdded`. 
+ +### Known remaining gaps +- **`Subscription.ThreadUnread`** — the array that drives the unread thread badge on the client — is still not updated when a thread reply arrives. This was out of scope here (noted in the original design) and must be addressed in a separate task. +- **Parent-message mentionees are not subscribed.** A user @-mentioned only in the parent message who never replies is never added to the thread subscription set (only the parent author, repliers, and reply-mentionees are). They receive no thread events. Closing this requires carrying the parent's resolved mention list onto the reply event so message-worker can seed those subscriptions on thread-room creation. +- **Edit/delete fan-out uses the current subscriber set.** A user @-mentioned in the *original* reply but later un-mentioned (or whose mention was edited out) will not receive the edit/delete event, because the fan-out re-derives recipients from current subscribers + current content. Fixing this would require persisting the original recipient list per reply. + +--- + +## Background + +The broadcast-worker consumes from `MESSAGES_CANONICAL` and fans out message events (Created, Updated, Deleted) to room members. Currently it has no awareness of thread messages — thread reply messages are treated identically to regular room messages and broadcast to all room members, regardless of thread membership. + +The message model already supports threads via `ThreadParentMessageID`, `ThreadParentMessageCreatedAt`, and `TShow` fields on `model.Message`. The message-worker already creates `ThreadRoom` and `ThreadSubscription` records when thread replies arrive. What is missing is real-time delivery of thread events to the correct set of recipients. 
+ +## What Message-Worker Already Handles + +The following thread state is already managed by message-worker and is **not** broadcast-worker's responsibility: + +- Creating `ThreadRoom` on the first reply to a parent message +- Inserting/upserting `ThreadSubscription` records for the **parent message author** and the **replier** (on every reply) +- Setting `ThreadSubscription.hasMention=true` (auto-creating the subscription if absent) for users **@-mentioned in a reply**, via `MarkThreadSubscriptionMention` + - **Gap:** a user @-mentioned only in the *parent* message who never replies is **not** subscribed — no code path reads the parent's mention list. Such a user receives no thread events. Subscribing them would require carrying the parent's resolved mentions onto the reply event (see "Known remaining gaps" above). +- Updating `ThreadRoom.LastMsgAt` and `ThreadRoom.LastMsgID` +- Publishing cross-site outbox events (`thread_subscription_upserted`) + +## Known Gap in Message-Worker (Out of Scope Here) + +`Subscription.ThreadUnread` — the array on a user's room subscription that tracks which parent message IDs have unread thread replies — is **not currently updated** when a thread reply arrives. This is the field that drives the unread thread badge on the client. Message-worker must be updated (in a separate task) to add the `parentMessageID` to `Subscription.ThreadUnread` for all thread subscribers who are not the sender. + +## Design Goals + +1. Fan-out thread reply events in real time to the correct set of recipients. +2. Leave all persistent state mutations to message-worker. +3. Reuse existing subjects, helpers, and patterns — no new NATS subject namespace. +4. Keep broadcast-worker as pure real-time delivery: no MongoDB writes. + +## Routing Gate + +Thread messages are identified by a non-empty `ThreadParentMessageID`. 
The `TShow` flag determines which broadcast path to use: + +| Condition | Path | +|-----------|------| +| `ThreadParentMessageID == ""` | Existing room/DM broadcast (unchanged) | +| `ThreadParentMessageID != ""` AND `TShow=true` | Existing room/DM broadcast (unchanged) — message appears in main room feed; all thread subscribers receive it as room members | +| `ThreadParentMessageID != ""` AND `TShow=false` | New thread subscriber fan-out | + +For `TShow=true`: thread subscribers are always room members, so the room broadcast already delivers the message to them. The `ClientMessage` payload retains `ThreadParentMessageID`, allowing clients to render the message in both the main room feed and the thread view. No duplicate fan-out is needed. + +The routing check is placed at the top of each event handler method (`handleCreated`, `handleUpdated`, `handleDeleted`), delegating to a dedicated thread handler method when conditions are met. + +## Store Interface Change + +One new method added to the `Store` interface in `store.go`: + +```go +GetThreadFollowers(ctx context.Context, parentMessageID string) (map[string]struct{}, error) +``` + +Implemented in `store_mongo.go` against the `thread_rooms` collection. Uses a `FindOne` projection on `replyAccounts` — the field that message-worker populates with the parent author, replier, and @mentioned accounts on every reply. Returns an empty set (not an error) when no thread room exists yet. A single-field index on `parentMessageId` covers this query. + +This follows the same pattern introduced in PR #237 for notification-worker (`mongoThreadFollowers.Followers`). The `siteID` filter used by the old `ListThreadSubscriptions` is no longer needed — broadcast-worker is a single-site service and the `thread_rooms` collection is scoped to its own site's MongoDB. + +The `//go:generate mockgen` directive in `store.go` regenerates `mock_store_test.go` via `make generate`. 
+ +## New Handler Methods + +Three new unexported methods on the handler struct, mirroring the structure of the existing `handleCreated`, `handleUpdated`, `handleDeleted` methods. Each **branches on room type** (`meta.Type` / `room.Type`): channel rooms fan out to the thread-subscriber set, while DM/BotDM rooms fan out to all human members (see the DM/BotDM note in "Implementation Notes"). The channel-path subscriber-query + dedup logic is shared via the `channelThreadFanOut` helper. + +> The numbered flows below describe the **channel path**. The DM/BotDM path delegates to `publishDMEvents` and is summarized in "Implementation Notes". + +### `handleThreadCreated` (channel path) + +1. Look up users — sender + mentioned accounts from the message payload (same user lookup as existing `handleCreated`). +2. `channelThreadFanOut(parentMessageID, sender, mentions)` → query `GetThreadFollowers`, then build the fan-out set via `threadFanOutAccounts`: + - Union of `thread_rooms.replyAccounts` + mentioned accounts from the message payload. + - Deduplicate using a `seen` map seeded with the sender's account (sender excluded). + - **Bot accounts are excluded** (`isBot`), consistent with every other fan-out path (`publishMutation`, `publishDMEvents`). +3. Build `ClientMessage` — same enrichment flow as existing created handler (sender display name, user IDs, etc.). +4. `publishToThreadAccounts` → publish to `subject.UserRoomEvent(account)` for each account; on the first publish failure it **returns the error** so JetStream redelivers. + +**Why union of DB subscribers + mentioned accounts?** +Message-worker and broadcast-worker consume `MESSAGES_CANONICAL` independently with no guaranteed ordering. If broadcast-worker processes the event before message-worker creates the `ThreadSubscription` for a newly @mentioned user, the DB query will not include that user. 
Including mentioned accounts directly from the message payload closes this race, ensuring @mentioned users always receive the real-time notification. All mentioned accounts are guaranteed to be room members (enforced by message-gatekeeper). + +### `handleThreadUpdated` (channel path) + +1. Defensive `EditedAt`/`UpdatedAt` nil guard (mirrors `handleUpdated`). +2. `channelThreadFanOut` → follower accounts from `replyAccounts` (sender + bots excluded). Edits do not introduce new mentioned users, but the edited content is still parsed for @-mentions so the recipient set matches the original create fan-out. +3. Build the edit event via `buildEditRoomEvent` (timestamp from `evt.Timestamp`, not `time.Now()`). +4. `publishToThreadAccounts` → publish to each subscriber; returns error on failure for redelivery. + +### `handleThreadDeleted` (channel path) + +1. `channelThreadFanOut`, parsing @-mentions from the **deleted message content** so non-subscriber recipients who received the `EventCreated` (via mention fan-out) also receive the `EventDeleted`. +2. Build the delete event via `buildDeleteRoomEvent` (timestamp from `evt.Timestamp`). +3. `publishToThreadAccounts` → publish to each recipient; returns error on failure. +4. After the room-type switch, publish the reply-count badge (`publishThreadBadge`) when `evt.NewTCount != nil`, so clients see the decrement immediately. + +## Delivery Subject + +All thread events are published to `subject.UserRoomEvent(account)` — the same per-user subject used for DM deliveries. Clients already subscribe to this subject. The `ThreadParentMessageID` field in the payload distinguishes thread events from regular room events; no new subject namespace is required. 
+ +## Error Handling + +| Error | Behavior | +|-------|----------| +| `GetThreadFollowers` fails | Log error, return error → JetStream redelivers the message | +| Thread room not found in `thread_rooms` | Returns empty set (not an error) — no recipients to fan out | +| User lookup fails | Log warning, continue — sender display name falls back to the account string | +| NATS publish fails (channel thread path, `publishToThreadAccounts`) | Log error and **return the error** on the first failure → JetStream redelivers. Thread per-user events must have the same delivery guarantee as channel room events; redelivery is safe because the upstream CAS increment is idempotent. | +| NATS publish fails (DM/BotDM path, `publishDMEvents`/`publishMutation`) | Log error, continue to remaining members — partial delivery accepted (consistent with existing DM mutation handling). | +| No thread followers found | Log at debug level, return nil — nothing to fan-out | + +## Encryption + +Thread messages in encrypted channel rooms use the same `RoomKeyProvider` path as the existing `publishChannelEvent`: encrypt once, then publish the same ciphertext to each subscriber. DM rooms are never encrypted, so the DM delivery path has no encryption — thread replies follow the channel path, not the DM path. No new design is needed. + +## Testing Plan + +All tests follow TDD: tests written and confirmed failing before implementation is written. 
+ +### Unit Tests (`handler_test.go`) + +Table-driven tests for each new handler method: + +**`handleThreadCreated`:** +- Thread reply with @mentioned user not yet a follower → mentioned user included in fan-out +- Thread reply to existing thread with multiple followers → all receive the event +- Mentioned user already a thread follower → deduped, not double-published +- Sender is excluded from fan-out +- `TShow=true` → falls through to room broadcast, thread handler not called +- `GetRoomMeta` error → error returned, no publish +- `GetThreadFollowers` error → error returned, no publish +- User lookup error → log warning, continue (sender display name falls back to account string) +- No followers and no mentions → no publish, no error + +**`handleThreadUpdated`:** +- All thread followers receive the edit event, sender excluded +- Empty follower set → no publish, no error +- `GetRoom` error → error returned, no publish +- `GetThreadFollowers` error → error returned, no publish +- `TShow=true` → falls through to room broadcast, thread handler not called + +**`handleThreadDeleted`:** +- All thread followers receive the delete event, sender excluded +- Empty follower set → no publish, no error +- `GetRoom` error → error returned, no publish +- `GetThreadFollowers` error → error returned, no publish +- `TShow=true` → falls through to room broadcast, thread handler not called + +### Mock Regeneration + +`make generate` must be run after the store interface change to regenerate `mock_store_test.go` with the `GetThreadFollowers` mock method. + +## Commit Strategy + +Small, reviewable commits in this order: + +1. **Store** — add `GetThreadFollowers` to `Store`, implement in `store_mongo.go` against `thread_rooms.replyAccounts`, regenerate mocks, update all `NewMongoStore` call sites. +2. **`handleThreadCreated`** — write failing tests, add routing gate to `handleCreated`, implement method, commit once green. +3. 
**`handleThreadUpdated`** — write failing tests, add routing gate to `handleUpdated`, implement method, commit once green. +4. **`handleThreadDeleted`** — write failing tests, add routing gate to `handleDeleted`, implement method, commit once green. +5. **Final** — `make lint`, full test run, push. + +Each commit is independently compilable and does not break existing tests. + +## Files Changed + +| File | Change | +|------|--------| +| `broadcast-worker/store.go` | Add `GetThreadFollowers` to `Store` interface (reads `thread_rooms.replyAccounts`) | +| `broadcast-worker/store_mongo.go` | Implement `GetThreadFollowers` via `FindOne` projection on `thread_rooms`; add `parentMessageId` index | +| `broadcast-worker/main.go` | Pass `db.Collection("thread_rooms")` to `NewMongoStore` | +| `broadcast-worker/mock_store_test.go` | Regenerated — do not edit manually | +| `broadcast-worker/handler.go` | Add routing gate + three new thread handler methods; `channelThreadFanOut` drops `siteID` param | +| `broadcast-worker/handler_test.go` | New table-driven tests for thread handler methods | +| `broadcast-worker/integration_test.go` | Update `NewMongoStore` calls + integration test for thread fan-out | diff --git a/docs/thread-reply-notifications.md b/docs/thread-reply-notifications.md new file mode 100644 index 000000000..37c2d3768 --- /dev/null +++ b/docs/thread-reply-notifications.md @@ -0,0 +1,65 @@ +# Thread Reply Notifications — Out of Scope for PR #245 + +PR #245 ("feat: real-time thread reply fan-out (broadcast-worker) + reply-count badge pipeline") +implements thread reply fan-out in **broadcast-worker** and the reply-count badge pipeline in +**message-worker** and **history-service**. + +**notification-worker was intentionally left unchanged.** A separate engineer owns that service. + +> ⚠️ **Regression introduced by PR #245:** This PR publishes `EventThreadReplyAdded` events to +> `MESSAGES_CANONICAL` (subject `chat.msg.canonical.{siteID}.thread.reply`). 
The current +> notification-worker handler has no event-type guard, so every thread reply now fires a +> `"new_message"` push notification to **all room members** with a nearly empty `Message` body +> (`Content=""`, `UserID=""`). The sender-exclusion guard (`User.ID == senderID`) never fires +> because `senderID` is `""` on these events. **Priority #1 below is now a regression fix, +> not just a future improvement.** + +## What needs to be built in notification-worker + +### 1. Filter to EventCreated only + +The current handler fans out a `"new_message"` notification for every event type it receives +(EventCreated, EventUpdated, EventDeleted, EventThreadReplyAdded, …). It should return early for +anything that is not `EventCreated`. + +```go +if evt.Event != model.EventCreated { + return nil +} +``` + +### 2. Route thread replies to thread subscribers only + +Thread replies have `ThreadParentMessageID != ""` and `TShow == false`. They are invisible in the +main room and should notify only the thread's subscribers — not all room members. + +This requires: + +- A `ThreadSubscriberLookup` interface backed by the `thread_subscriptions` MongoDB collection + (same collection that broadcast-worker uses via its store). +- A `fanOutToThreadSubscribers` function that lists subscribers, excludes the sender, and + publishes `notifData` to each. +- Wiring the lookup into `NewHandler` and `main.go`. + +### 3. Notify @-mentioned non-subscribers + +When a thread reply @-mentions a user who is not yet a thread subscriber, that user should still +receive a notification. The resolved `Mentions []model.Participant` slice is **not** available on +`EventCreated` (message-gatekeeper publishes it before mention resolution). It is available on the +`EventThreadReplyAdded` event published by message-worker after saving to Cassandra. + +The correct approach: +- Handle `EventThreadReplyAdded` in `HandleMessage` in addition to `EventCreated`. 
+- For `EventCreated` thread replies: notify thread subscribers (no resolved Mentions yet). +- For `EventThreadReplyAdded`: notify only @-mentioned accounts that are **not** already thread + subscribers (they were notified on EventCreated). Skip `"all"` — it is not a real account. + +### Key files to read before starting + +| File | Why | +|------|-----| +| `notification-worker/handler.go` | Current handler — in-code TODOs at HandleMessage | +| `broadcast-worker/handler.go` | Reference implementation for thread subscriber fan-out | +| `broadcast-worker/store.go` | `ThreadSubscriptions` store interface shape | +| `pkg/model/event.go` | `EventCreated`, `EventThreadReplyAdded`, `MessageEvent.Mentions` | +| `pkg/subject/subject.go` | `subject.Notification(account)` builder | diff --git a/history-service/internal/cassrepo/write.go b/history-service/internal/cassrepo/write.go index 94a5c2a41..7e48043cd 100644 --- a/history-service/internal/cassrepo/write.go +++ b/history-service/internal/cassrepo/write.go @@ -13,16 +13,6 @@ import ( cassmodel "github.com/hmchangw/chat/pkg/model/cassandra" ) -// CAS = Compare-And-Set: a Cassandra UPDATE/INSERT with an `IF` clause, -// executed as a Paxos lightweight transaction (LWT) so the read-check- -// write is atomic across replicas. Costs ~4× a normal QUORUM write but -// is the only safe primitive when correctness depends on the row's -// current state (e.g. "edit only if not soft-deleted"). The `applied` -// boolean returned by gocql tells you whether the `IF` predicate held. -// -// casMaxRetries bounds the CAS loop; 16 retries cover realistic burst concurrency. -const casMaxRetries = 16 - const ( // Plaintext-path edits. enc_payload/enc_meta are nulled to keep a // cipher-disabled (rollback) edit from leaving stale ciphertext that @@ -72,31 +62,6 @@ const ( deleteThreadParentPinnedMsg = "UPDATE pinned_messages_by_room SET deleted = true, enc_payload = null, enc_meta = null, type = '" + MessageTypeRemoved + "', updated_at = ? 
WHERE room_id = ? AND pinned_at = ? AND message_id = ?" ) -// casDecrement atomically decrements a nullable INT toward zero (clamping at zero); mirrors message-worker/store_cassandra.go casIncrement. -// When initial is nil the column was never written and the function returns immediately — no LWT is issued and no zero is materialised. -func casDecrement(maxRetries int, initial *int, update func(newVal int, expected *int) (applied bool, current *int, err error)) error { - if initial == nil { - // tcount was never written — nothing to decrement; skip to avoid materialising a zero on a null column. - return nil - } - tcount := initial - for range maxRetries { - newVal := 0 - if tcount != nil && *tcount > 0 { - newVal = *tcount - 1 - } - applied, current, err := update(newVal, tcount) - if err != nil { - return err - } - if applied { - return nil - } - tcount = current - } - return fmt.Errorf("cas decrement exceeded %d retries", maxRetries) -} - // editPayload is the shared, pre-prepared edit payload passed to each // per-table UPDATE. It carries either the new plaintext body (cipher // disabled) or a pre-encrypted bundle (cipher enabled). Building it once @@ -288,9 +253,9 @@ func (r *Repository) UpdateMessageContent(ctx context.Context, msg *models.Messa // SoftDeleteMessage uses a Cassandra LWT on messages_by_id as a one-shot gate so only // the winning goroutine runs mirror-table updates and tcount decrement, preventing double-decrement. // `IF deleted != true` matches NULL (message-worker never writes deleted) and false, excluding true. 
-func (r *Repository) SoftDeleteMessage(ctx context.Context, msg *models.Message, deletedAt time.Time) (time.Time, bool, error) { +func (r *Repository) SoftDeleteMessage(ctx context.Context, msg *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { if msg.ThreadParentID != "" && msg.ThreadRoomID == "" { - return time.Time{}, false, fmt.Errorf("delete thread message %s: ThreadParentID %q is set but ThreadRoomID is empty", msg.MessageID, msg.ThreadParentID) + return time.Time{}, false, nil, fmt.Errorf("delete thread message %s: ThreadParentID %q is set but ThreadRoomID is empty", msg.MessageID, msg.ThreadParentID) } isThreadParent := msg.TCount != nil && *msg.TCount > 0 @@ -306,7 +271,7 @@ func (r *Repository) SoftDeleteMessage(ctx context.Context, msg *models.Message, deletedAt, msg.MessageID, msg.CreatedAt, ).WithContext(ctx).ScanCAS(&current) if err != nil { - return time.Time{}, false, fmt.Errorf("cas update messages_by_id for message %s: %w", msg.MessageID, err) + return time.Time{}, false, nil, fmt.Errorf("cas update messages_by_id for message %s: %w", msg.MessageID, err) } if !applied { // Concurrent delete won. Read the existing updated_at so the caller @@ -318,11 +283,11 @@ func (r *Repository) SoftDeleteMessage(ctx context.Context, msg *models.Message, ).WithContext(ctx).Scan(&existing); err != nil { if errors.Is(err, gocql.ErrNotFound) { // Row vanished between the CAS and the follow-up SELECT — abnormal race. 
- return time.Time{}, false, fmt.Errorf("message %s vanished after cas miss: %w", msg.MessageID, gocql.ErrNotFound) + return time.Time{}, false, nil, fmt.Errorf("message %s vanished after cas miss: %w", msg.MessageID, gocql.ErrNotFound) } - return time.Time{}, false, fmt.Errorf("read updated_at after cas miss for message %s: %w", msg.MessageID, err) + return time.Time{}, false, nil, fmt.Errorf("read updated_at after cas miss for message %s: %w", msg.MessageID, err) } - return existing, false, nil + return existing, false, nil, nil } msgByRoomQ := deleteMsgByRoom @@ -336,79 +301,90 @@ func (r *Repository) SoftDeleteMessage(ctx context.Context, msg *models.Message, if msg.ThreadParentID == "" { if err := r.deleteInMessagesByRoom(ctx, msgByRoomQ, msg, deletedAt); err != nil { - return time.Time{}, false, fmt.Errorf("update messages_by_room for message %s in room %s: %w", msg.MessageID, msg.RoomID, err) + return time.Time{}, false, nil, fmt.Errorf("update messages_by_room for message %s in room %s: %w", msg.MessageID, msg.RoomID, err) } } else { if err := r.session.Query(threadMsgQ, deletedAt, msg.ThreadRoomID, msg.CreatedAt, msg.MessageID).WithContext(ctx).Exec(); err != nil { - return time.Time{}, false, fmt.Errorf("update thread_messages_by_thread for message %s thread %s: %w", msg.MessageID, msg.ThreadRoomID, err) + return time.Time{}, false, nil, fmt.Errorf("update thread_messages_by_thread for message %s thread %s: %w", msg.MessageID, msg.ThreadRoomID, err) } } if msg.PinnedAt != nil { if err := r.deleteInPinnedMessagesByRoom(ctx, pinnedMsgQ, msg, deletedAt); err != nil { - return time.Time{}, false, fmt.Errorf("update pinned_messages_by_room for message %s in room %s: %w", msg.MessageID, msg.RoomID, err) + return time.Time{}, false, nil, fmt.Errorf("update pinned_messages_by_room for message %s in room %s: %w", msg.MessageID, msg.RoomID, err) } } - if msg.ThreadParentID != "" { - if err := r.decrementParentTcount(ctx, msg); err != nil { - return time.Time{}, 
false, fmt.Errorf("decrement parent tcount for message %s: %w", msg.MessageID, err) - } + if msg.ThreadParentID == "" { + return deletedAt, true, nil, nil } - - return deletedAt, true, nil + newTcount, err := r.countAndSetParentTcount(ctx, msg) + if err != nil { + // The LWT delete already committed — return applied=true so callers correctly + // identify this as a count-set failure rather than a concurrent-winner race. + return deletedAt, true, nil, fmt.Errorf("count and set parent tcount for message %s: %w", msg.MessageID, err) + } + return deletedAt, true, newTcount, nil } -// decrementParentTcount silently skips if ThreadParentCreatedAt is nil or if the parent row is missing. -func (r *Repository) decrementParentTcount(ctx context.Context, msg *models.Message) error { - if msg.ThreadParentCreatedAt == nil { - return nil +// countThreadReplies counts non-deleted rows in the thread_messages_by_thread +// partition for threadRoomID. The deleted column may be NULL (message-worker +// doesn't write it on INSERT), so Go-side filtering treats NULL as not-deleted. +func (r *Repository) countThreadReplies(ctx context.Context, threadRoomID string) (int, error) { + iter := r.session.Query( + `SELECT deleted FROM thread_messages_by_thread WHERE thread_room_id = ?`, + threadRoomID, + ).WithContext(ctx).Iter() + var deleted *bool + n := 0 + for iter.Scan(&deleted) { + if deleted == nil || !*deleted { + n++ + } + } + if err := iter.Close(); err != nil { + return 0, fmt.Errorf("count thread replies for thread %s: %w", threadRoomID, err) } + return n, nil +} + +// setParentTcount blind-SETs tcount on the parent row in both messages_by_id +// and messages_by_room. No IF clause — the value is always derived from the +// authoritative COUNT, so overwrites are idempotent on any redelivery. 
+func (r *Repository) setParentTcount(ctx context.Context, msg *models.Message, n int) error { parentID := msg.ThreadParentID parentCreatedAt := *msg.ThreadParentCreatedAt - - // CAS decrement on messages_by_id. - var tcount *int if err := r.session.Query( - `SELECT tcount FROM messages_by_id WHERE message_id = ? AND created_at = ?`, - parentID, parentCreatedAt, - ).WithContext(ctx).Scan(&tcount); err != nil { - if errors.Is(err, gocql.ErrNotFound) { - return nil - } - return fmt.Errorf("read tcount for parent %s in messages_by_id: %w", parentID, err) + `UPDATE messages_by_id SET tcount = ? WHERE message_id = ? AND created_at = ?`, + n, parentID, parentCreatedAt, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("set tcount on parent %s in messages_by_id: %w", parentID, err) } - if err := casDecrement(casMaxRetries, tcount, func(newVal int, expected *int) (bool, *int, error) { - var current *int - applied, err := r.session.Query( - `UPDATE messages_by_id SET tcount = ? WHERE message_id = ? AND created_at = ? IF tcount = ?`, - newVal, parentID, parentCreatedAt, expected, - ).WithContext(ctx).ScanCAS(&current) - return applied, current, err - }); err != nil { - return fmt.Errorf("cas tcount decrement in messages_by_id for parent %s: %w", parentID, err) - } - - // CAS decrement on messages_by_room. parentBucket := r.bucket.Of(parentCreatedAt) if err := r.session.Query( - `SELECT tcount FROM messages_by_room WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ?`, - msg.RoomID, parentBucket, parentCreatedAt, parentID, - ).WithContext(ctx).Scan(&tcount); err != nil { - if errors.Is(err, gocql.ErrNotFound) { - return nil - } - return fmt.Errorf("read tcount for parent %s in messages_by_room: %w", parentID, err) - } - if err := casDecrement(casMaxRetries, tcount, func(newVal int, expected *int) (bool, *int, error) { - var current *int - applied, err := r.session.Query( - `UPDATE messages_by_room SET tcount = ? WHERE room_id = ? AND bucket = ? 
AND created_at = ? AND message_id = ? IF tcount = ?`, - newVal, msg.RoomID, parentBucket, parentCreatedAt, parentID, expected, - ).WithContext(ctx).ScanCAS(&current) - return applied, current, err - }); err != nil { - return fmt.Errorf("cas tcount decrement in messages_by_room for parent %s: %w", parentID, err) + `UPDATE messages_by_room SET tcount = ? WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ?`, + n, msg.RoomID, parentBucket, parentCreatedAt, parentID, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("set tcount on parent %s in messages_by_room: %w", parentID, err) } return nil } + +// countAndSetParentTcount derives tcount from the thread partition COUNT and +// blind-SETs it on the parent row in both Cassandra tables. Returns (nil, nil) +// when ThreadParentCreatedAt is unset (no parent key available). +// This approach is crash-safe: COUNT + blind SET is idempotent on redelivery, +// avoiding the 2PC window of the old CAS decrement. +func (r *Repository) countAndSetParentTcount(ctx context.Context, msg *models.Message) (*int, error) { + if msg.ThreadParentCreatedAt == nil { + return nil, nil + } + n, err := r.countThreadReplies(ctx, msg.ThreadRoomID) + if err != nil { + return nil, fmt.Errorf("count thread replies: %w", err) + } + if err := r.setParentTcount(ctx, msg, n); err != nil { + return nil, err + } + return &n, nil +} diff --git a/history-service/internal/cassrepo/write_integration_test.go b/history-service/internal/cassrepo/write_integration_test.go index 771cafafc..fd54dfebc 100644 --- a/history-service/internal/cassrepo/write_integration_test.go +++ b/history-service/internal/cassrepo/write_integration_test.go @@ -220,7 +220,7 @@ func TestRepository_SoftDeleteMessage_TopLevel(t *testing.T) { ThreadParentID: "", } deletedAt := createdAt.Add(time.Minute) - _, applied, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) + _, applied, _, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) require.NoError(t, err) 
require.True(t, applied, "first delete should apply") @@ -295,7 +295,7 @@ func TestRepository_SoftDeleteMessage_ThreadReply(t *testing.T) { ThreadRoomID: threadRoomID, } deletedAt := replyCreatedAt.Add(time.Minute) - _, applied, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) + _, applied, _, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) require.NoError(t, err) require.True(t, applied, "first delete should apply") @@ -367,7 +367,7 @@ func TestRepository_SoftDeleteMessage_Pinned(t *testing.T) { PinnedAt: &pinnedAt, } deletedAt := createdAt.Add(time.Minute) - _, applied, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) + _, applied, _, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) require.NoError(t, err) require.True(t, applied, "first delete should apply") @@ -406,24 +406,36 @@ func TestRepository_SoftDeleteMessage_DecrementsParentTcount(t *testing.T) { replyID := "m-tcount-reply" replyCreatedAt := parentCreatedAt.Add(10 * time.Second) - // Parent has tcount = 3 (three replies, of which we're about to delete one). + // Parent has no pre-seeded tcount — countAndSetParentTcount computes it from + // thread_messages_by_thread rather than CAS-decrementing a stored value. 
require.NoError(t, session.Query( - `INSERT INTO messages_by_id (message_id, room_id, created_at, sender, msg, thread_parent_id, tcount, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, - parentID, roomID, parentCreatedAt, sender, "parent", "", 3, false, + `INSERT INTO messages_by_id (message_id, room_id, created_at, sender, msg, thread_parent_id, deleted) VALUES (?, ?, ?, ?, ?, ?, ?)`, + parentID, roomID, parentCreatedAt, sender, "parent", "", false, ).Exec()) require.NoError(t, session.Query( - `INSERT INTO messages_by_room (room_id, bucket, created_at, message_id, sender, msg, thread_parent_id, tcount, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, - roomID, msgbucket.New(24*time.Hour).Of(parentCreatedAt), parentCreatedAt, parentID, sender, "parent", "", 3, false, + `INSERT INTO messages_by_room (room_id, bucket, created_at, message_id, sender, msg, thread_parent_id, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, + roomID, msgbucket.New(24*time.Hour).Of(parentCreatedAt), parentCreatedAt, parentID, sender, "parent", "", false, ).Exec()) - // Seed the reply we're deleting. + // Seed 3 replies in thread_messages_by_thread: 2 survivors + the reply being deleted. + // After SoftDeleteMessage marks replyID as deleted=true, COUNT gives 2. 
+ survivor1At := parentCreatedAt.Add(5 * time.Second) + survivor2At := parentCreatedAt.Add(7 * time.Second) + require.NoError(t, session.Query( + `INSERT INTO thread_messages_by_thread (thread_room_id, created_at, message_id, room_id, sender, msg, thread_parent_id, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, + threadRoomID, survivor1At, "m-tcount-survivor-1", roomID, sender, "survivor 1", parentID, false, + ).Exec()) + require.NoError(t, session.Query( + `INSERT INTO thread_messages_by_thread (thread_room_id, created_at, message_id, room_id, sender, msg, thread_parent_id, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, + threadRoomID, survivor2At, "m-tcount-survivor-2", roomID, sender, "survivor 2", parentID, false, + ).Exec()) require.NoError(t, session.Query( `INSERT INTO messages_by_id (message_id, room_id, created_at, sender, msg, thread_parent_id, thread_parent_created_at, thread_room_id, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, - replyID, roomID, replyCreatedAt, sender, "reply", parentID, parentCreatedAt, threadRoomID, false, + replyID, roomID, replyCreatedAt, sender, "reply to delete", parentID, parentCreatedAt, threadRoomID, false, ).Exec()) require.NoError(t, session.Query( `INSERT INTO thread_messages_by_thread (thread_room_id, created_at, message_id, room_id, sender, msg, thread_parent_id, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, - threadRoomID, replyCreatedAt, replyID, roomID, sender, "reply", parentID, false, + threadRoomID, replyCreatedAt, replyID, roomID, sender, "reply to delete", parentID, false, ).Exec()) parentCreatedAtPtr := parentCreatedAt @@ -436,9 +448,13 @@ func TestRepository_SoftDeleteMessage_DecrementsParentTcount(t *testing.T) { ThreadParentCreatedAt: &parentCreatedAtPtr, ThreadRoomID: threadRoomID, } - _, applied, err := repo.SoftDeleteMessage(ctx, msg, replyCreatedAt.Add(time.Minute)) + _, applied, newTcount, err := repo.SoftDeleteMessage(ctx, msg, replyCreatedAt.Add(time.Minute)) require.NoError(t, err) require.True(t, applied, 
"first delete should apply") + // SoftDeleteMessage returns the COUNT of non-deleted thread replies so the + // caller can publish a ThreadMetadataUpdatedEvent without an extra round-trip. + require.NotNil(t, newTcount, "newTcount must be non-nil after a successful thread-reply delete") + assert.Equal(t, 2, *newTcount, "tcount = non-deleted COUNT (3 seeded - 1 deleted = 2)") // Both tables' tcount should now be 2. var gotTcount int @@ -446,13 +462,13 @@ func TestRepository_SoftDeleteMessage_DecrementsParentTcount(t *testing.T) { `SELECT tcount FROM messages_by_id WHERE message_id = ? AND created_at = ?`, parentID, parentCreatedAt, ).Scan(&gotTcount)) - assert.Equal(t, 2, gotTcount, "messages_by_id.tcount should decrement 3 -> 2") + assert.Equal(t, 2, gotTcount, "messages_by_id.tcount = count-based 2") require.NoError(t, session.Query( `SELECT tcount FROM messages_by_room WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ?`, roomID, msgbucket.New(24*time.Hour).Of(parentCreatedAt), parentCreatedAt, parentID, ).Scan(&gotTcount)) - assert.Equal(t, 2, gotTcount, "messages_by_room.tcount should decrement 3 -> 2") + assert.Equal(t, 2, gotTcount, "messages_by_room.tcount = count-based 2") } func TestRepository_SoftDeleteMessage_TopLevelDoesNotTouchTcount(t *testing.T) { @@ -482,7 +498,7 @@ func TestRepository_SoftDeleteMessage_TopLevelDoesNotTouchTcount(t *testing.T) { Sender: sender, ThreadParentID: "", } - _, applied, err := repo.SoftDeleteMessage(ctx, msg, createdAt.Add(time.Minute)) + _, applied, _, err := repo.SoftDeleteMessage(ctx, msg, createdAt.Add(time.Minute)) require.NoError(t, err) require.True(t, applied, "first delete should apply") @@ -549,7 +565,7 @@ func TestRepository_SoftDeleteMessage_MissingThreadRoomID_ReturnsError(t *testin ThreadParentID: "m-parent", ThreadRoomID: "", } - _, _, err := repo.SoftDeleteMessage(ctx, msg, createdAt.Add(time.Minute)) + _, _, _, err := repo.SoftDeleteMessage(ctx, msg, createdAt.Add(time.Minute)) 
require.Error(t, err, "expected error when ThreadRoomID is empty for a thread reply") // Validation must fire before any DB write — messages_by_id must be unchanged. @@ -616,7 +632,7 @@ func TestRepository_SoftDeleteMessage_LWTGatesDoubleDecrement(t *testing.T) { // First delete: LWT applies (deleted was NULL → matches != true). firstAt := replyCreatedAt.Add(time.Minute) - gotAt1, applied1, err := repo.SoftDeleteMessage(ctx, msg, firstAt) + gotAt1, applied1, _, err := repo.SoftDeleteMessage(ctx, msg, firstAt) require.NoError(t, err) require.True(t, applied1, "first delete must apply (deleted was NULL)") assert.Equal(t, firstAt.UnixMilli(), gotAt1.UnixMilli()) @@ -639,7 +655,7 @@ func TestRepository_SoftDeleteMessage_LWTGatesDoubleDecrement(t *testing.T) { // hydrated msg (Deleted=false) to simulate a stale read; the repo's CAS // is authoritative. secondAt := firstAt.Add(time.Second) - gotAt2, applied2, err := repo.SoftDeleteMessage(ctx, msg, secondAt) + gotAt2, applied2, _, err := repo.SoftDeleteMessage(ctx, msg, secondAt) require.NoError(t, err) require.False(t, applied2, "second delete must NOT apply — deleted is already true") assert.Equal(t, firstAt.UnixMilli(), gotAt2.UnixMilli(), "actualDeletedAt should reflect the winning goroutine's timestamp") @@ -727,7 +743,7 @@ func TestRepository_SoftDeleteMessage_RoundTrip(t *testing.T) { ThreadParentID: "", } deletedAt := createdAt.Add(time.Minute) - gotAt, applied, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) + gotAt, applied, _, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) require.NoError(t, err) require.True(t, applied) assert.Equal(t, deletedAt.UnixMilli(), gotAt.UnixMilli()) @@ -757,7 +773,7 @@ func TestRepository_SoftDeleteMessage_RowCreatedByLWT(t *testing.T) { } deletedAt := msg.CreatedAt.Add(time.Minute) - _, _, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) + _, _, _, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) require.NoError(t, err, "SoftDeleteMessage must not return an error on a 
non-existent row") } @@ -796,7 +812,7 @@ func TestRepository_SoftDeleteMessage_ThreadParent_SetsTypeRemoved(t *testing.T) } deletedAt := createdAt.Add(time.Minute) - _, applied, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) + _, applied, _, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) require.NoError(t, err) require.True(t, applied) @@ -850,7 +866,7 @@ func TestRepository_SoftDeleteMessage_NonThreadParent_NoTypeChange(t *testing.T) } deletedAt := createdAt.Add(time.Minute) - _, applied, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) + _, applied, _, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) require.NoError(t, err) require.True(t, applied) @@ -904,7 +920,7 @@ func TestRepository_SoftDeleteMessage_ReplyThreadParent_SetsTypeRemoved(t *testi } deletedAt := createdAt.Add(time.Minute) - _, applied, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) + _, applied, _, err := repo.SoftDeleteMessage(ctx, msg, deletedAt) require.NoError(t, err) require.True(t, applied) @@ -1118,7 +1134,7 @@ func TestDeleteMessage_NullsEncryptedColumns(t *testing.T) { "m1", now, roomID, payload, &cassmodel.EncMeta{Nonce: meta.Nonce}, "site-a", ).Exec()) - _, applied, err := repo.SoftDeleteMessage(ctx, &models.Message{ + _, applied, _, err := repo.SoftDeleteMessage(ctx, &models.Message{ RoomID: roomID, MessageID: "m1", CreatedAt: now, }, now.Add(time.Minute)) require.NoError(t, err) diff --git a/history-service/internal/publisher/publisher.go b/history-service/internal/publisher/publisher.go index f260bb572..f8e610280 100644 --- a/history-service/internal/publisher/publisher.go +++ b/history-service/internal/publisher/publisher.go @@ -1,5 +1,4 @@ -// Package publisher adapts a JetStream context to the service.EventPublisher -// interface. +// Package publisher adapts NATS connections to the service.EventPublisher interface. 
package publisher import ( @@ -12,8 +11,7 @@ import ( "github.com/hmchangw/chat/pkg/natsutil" ) -// Publisher publishes byte payloads to JetStream subjects. Each publish blocks -// on PubAck so transient JetStream failures surface as errors to the caller. +// Publisher publishes byte payloads to NATS JetStream with dedup support. type Publisher struct { js oteljetstream.JetStream } diff --git a/history-service/internal/service/integration_test.go b/history-service/internal/service/integration_test.go index 8561291dd..3ecee2074 100644 --- a/history-service/internal/service/integration_test.go +++ b/history-service/internal/service/integration_test.go @@ -120,6 +120,7 @@ func (p *recordingPublisher) Publish(_ context.Context, subj string, data []byte return nil } +// alwaysSubscribedRepo stubs SubscriptionRepository so the subscription gate passes. type alwaysSubscribedRepo struct{} func (alwaysSubscribedRepo) GetHistorySharedSince(_ context.Context, _, _ string) (*time.Time, bool, error) { @@ -340,3 +341,71 @@ func TestDeleteMessage_ParentWithReplies_NoCascade(t *testing.T) { ).Scan(&gotTcount)) assert.Equal(t, 1, gotTcount, "parent tcount should be unchanged (replies still exist and are counted)") } + +func TestDeleteMessage_Integration_ThreadReplyPublishesMetadataEvent(t *testing.T) { + session := setupCassandra(t) + repo := cassrepo.NewRepository(session, msgbucket.New(24*time.Hour), 365, nil) + pub := &recordingPublisher{} + svc := New(repo, alwaysSubscribedRepo{}, stubRoomRepo{}, pub, nil, &config.Config{ + MessageHistoryFloorDays: 730, + LargeRoomThreshold: 500, + MaxPinnedPerRoom: 10, + PinEnabled: true, + }) + + sender := models.Participant{ID: "u1", Account: "alice"} + roomID := "r-thread-meta-event" + threadRoomID := "thread-meta-event" + parentID := "m-parent-meta" + parentCreatedAt := time.Now().UTC().Truncate(time.Millisecond) + replyID := "m-reply-meta" + replyCreatedAt := parentCreatedAt.Add(10 * time.Second) + + // Seed parent message with tcount = 1 
(one existing reply). + require.NoError(t, session.Query( + `INSERT INTO messages_by_id (message_id, room_id, created_at, sender, msg, thread_parent_id, tcount, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, + parentID, roomID, parentCreatedAt, sender, "parent message", "", 1, false, + ).Exec()) + require.NoError(t, session.Query( + `INSERT INTO messages_by_room (room_id, bucket, created_at, message_id, sender, msg, thread_parent_id, tcount, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + roomID, msgbucket.New(24*time.Hour).Of(parentCreatedAt), parentCreatedAt, parentID, sender, "parent message", "", 1, false, + ).Exec()) + + // Seed thread reply referencing the parent. + require.NoError(t, session.Query( + `INSERT INTO messages_by_id (message_id, room_id, created_at, sender, msg, thread_parent_id, thread_parent_created_at, thread_room_id, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + replyID, roomID, replyCreatedAt, sender, "thread reply", parentID, parentCreatedAt, threadRoomID, false, + ).Exec()) + require.NoError(t, session.Query( + `INSERT INTO thread_messages_by_thread (thread_room_id, created_at, message_id, room_id, sender, msg, thread_parent_id, deleted) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, + threadRoomID, replyCreatedAt, replyID, roomID, sender, "thread reply", parentID, false, + ).Exec()) + + // Delete the thread reply as alice. + c := natsrouter.NewContext(map[string]string{"account": "alice", "roomID": roomID}) + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: replyID}) + require.NoError(t, err) + assert.Equal(t, replyID, resp.MessageID) + assert.NotZero(t, resp.DeletedAt) + + // Collect all published messages. + pub.mu.Lock() + sent := make([]recordedMessage, len(pub.sent)) + copy(sent, pub.sent) + pub.mu.Unlock() + + // Expect exactly one publish: the canonical .deleted event with NewTCount embedded. 
+ // Badge routing (ThreadMetadataUpdatedEvent) is now broadcast-worker's responsibility; + // history-service no longer publishes directly to subject.RoomEvent. + require.Len(t, sent, 1, "expected exactly one canonical delete publish") + + assert.Equal(t, subject.MsgCanonicalDeleted("site-test"), sent[0].Subject) + + var canonicalEvt model.MessageEvent + require.NoError(t, json.Unmarshal(sent[0].Data, &canonicalEvt)) + assert.Equal(t, model.EventDeleted, canonicalEvt.Event) + assert.Equal(t, replyID, canonicalEvt.Message.ID) + assert.Equal(t, parentID, canonicalEvt.Message.ThreadParentMessageID) + require.NotNil(t, canonicalEvt.NewTCount, "canonical delete for a thread reply must carry NewTCount") + assert.Equal(t, 0, *canonicalEvt.NewTCount, "tcount seeded at 1 minus one decrement must equal 0") +} diff --git a/history-service/internal/service/messages.go b/history-service/internal/service/messages.go index 96b03039b..5a89c6ce1 100644 --- a/history-service/internal/service/messages.go +++ b/history-service/internal/service/messages.go @@ -330,18 +330,24 @@ func (s *HistoryService) EditMessage(c *natsrouter.Context, siteID string, req m editedAtMs := editedAt.UnixMilli() - // Mentions intentionally omitted — broadcast-worker re-resolves them from Content. + // Carry the fields downstream actually reads: search-sync-worker reindexes + // by Content/EditedAt/UpdatedAt; broadcast-worker emits a slim + // MessageEditedPayload of {ID, Content, EditedBy, EditedAt, UpdatedAt} and + // routes thread-reply edits via ThreadParentMessageID + TShow. + // Mentions intentionally omitted — broadcast-worker re-resolves from Content. 
canonicalEvt := model.MessageEvent{ Event: model.EventUpdated, Message: model.Message{ - ID: msg.MessageID, - RoomID: msg.RoomID, - UserID: msg.Sender.ID, - UserAccount: msg.Sender.Account, - Content: req.NewMsg, - CreatedAt: msg.CreatedAt, - EditedAt: &editedAt, - UpdatedAt: &editedAt, + ID: msg.MessageID, + RoomID: msg.RoomID, + UserID: msg.Sender.ID, + UserAccount: msg.Sender.Account, + Content: req.NewMsg, + CreatedAt: msg.CreatedAt, + EditedAt: &editedAt, + UpdatedAt: &editedAt, + ThreadParentMessageID: msg.ThreadParentID, + TShow: msg.TShow, }, SiteID: siteID, Timestamp: editedAtMs, @@ -374,11 +380,67 @@ func (s *HistoryService) DeleteMessage(c *natsrouter.Context, siteID string, req return nil, errcode.Forbidden("only the sender can delete") } + // Already-deleted short-circuit: echo the current updated_at as the DeletedAt. + // Prevents tcount double-decrement on caller retry and avoids duplicate events. + // Re-publishes the canonical deleted event so a badge update that was lost on + // the first attempt (publishCanonicalBestEffort is best-effort) gets retried. + // JetStream dedup ("{messageID}:deleted") prevents double-delivery if the first + // publish actually succeeded. if msg.Deleted { var deletedAtMs int64 if msg.UpdatedAt != nil { deletedAtMs = msg.UpdatedAt.UnixMilli() } + var newTcount *int + // Gate parent lookup on UpdatedAt != nil: nil-UpdatedAt records can never produce + // a valid EventDeleted, so the lookup result would be unconsumed anyway. + if msg.ThreadParentID != "" && msg.UpdatedAt != nil { + parent, parentErr := s.msgReader.GetMessageByID(c, msg.ThreadParentID) + switch { + case parentErr != nil: + // Return error so the caller retries the delete handler. On retry the + // lookup will either succeed (returning the correct tcount) or find the + // parent gone (default branch, which skips the publish). 
Publishing now + // with NewTCount=nil risks permanently dropping the badge update — the + // same reason the default branch skips the publish entirely. + return nil, fmt.Errorf("already-deleted retry: look up parent tcount for %s: %w", msg.ThreadParentID, parentErr) + case parent != nil: + newTcount = parent.TCount + default: + // Parent was concurrently hard-deleted. No badge to update — skip the + // canonical republish entirely to avoid publishing EventDeleted with + // NewTCount=nil, which would cause broadcast-worker to permanently drop + // the tcount decrement. + return &models.DeleteMessageResponse{ + MessageID: req.MessageID, + DeletedAt: deletedAtMs, + }, nil + } + } + // Only republish when UpdatedAt is available. Legacy records with nil + // UpdatedAt cannot produce a valid EventDeleted — downstream handlers + // (broadcast-worker, search-sync) reject nil UpdatedAt and would NAK, + // causing an infinite redelivery loop. + if msg.UpdatedAt != nil { + canonicalEvt := model.MessageEvent{ + Event: model.EventDeleted, + Message: model.Message{ + ID: msg.MessageID, + RoomID: msg.RoomID, + UserID: msg.Sender.ID, + UserAccount: msg.Sender.Account, + Content: msg.Msg, + CreatedAt: msg.CreatedAt, + UpdatedAt: msg.UpdatedAt, + ThreadParentMessageID: msg.ThreadParentID, + TShow: msg.TShow, + }, + SiteID: siteID, + Timestamp: deletedAtMs, + NewTCount: newTcount, + } + s.publishCanonicalBestEffort(c, subject.MsgCanonicalDeleted(siteID), &canonicalEvt) + } return &models.DeleteMessageResponse{ MessageID: req.MessageID, DeletedAt: deletedAtMs, @@ -386,7 +448,7 @@ func (s *HistoryService) DeleteMessage(c *natsrouter.Context, siteID string, req } deletedAt := time.Now().UTC() - actualDeletedAt, applied, err := s.msgWriter.SoftDeleteMessage(c, msg, deletedAt) + actualDeletedAt, applied, newTcount, err := s.msgWriter.SoftDeleteMessage(c, msg, deletedAt) if err != nil { return nil, fmt.Errorf("deleting message %s: %w", req.MessageID, err) } @@ -403,15 +465,19 @@ func (s 
*HistoryService) DeleteMessage(c *natsrouter.Context, siteID string, req canonicalEvt := model.MessageEvent{ Event: model.EventDeleted, Message: model.Message{ - ID: msg.MessageID, - RoomID: msg.RoomID, - UserID: msg.Sender.ID, - UserAccount: msg.Sender.Account, - CreatedAt: msg.CreatedAt, - UpdatedAt: &actualDeletedAt, + ID: msg.MessageID, + RoomID: msg.RoomID, + UserID: msg.Sender.ID, + UserAccount: msg.Sender.Account, + Content: msg.Msg, + CreatedAt: msg.CreatedAt, + UpdatedAt: &actualDeletedAt, + ThreadParentMessageID: msg.ThreadParentID, + TShow: msg.TShow, }, SiteID: siteID, Timestamp: deletedAtMs, + NewTCount: newTcount, } s.publishCanonicalBestEffort(c, subject.MsgCanonicalDeleted(siteID), &canonicalEvt) diff --git a/history-service/internal/service/messages_test.go b/history-service/internal/service/messages_test.go index f0ba06e63..e4f441c82 100644 --- a/history-service/internal/service/messages_test.go +++ b/history-service/internal/service/messages_test.go @@ -1221,7 +1221,7 @@ func TestHistoryService_EditMessage_PassesDedupMessageID(t *testing.T) { // --- DeleteMessage --- func TestHistoryService_DeleteMessage_AlreadyDeleted_ShortCircuits(t *testing.T) { - svc, msgs, subs, _, _ := newService(t) + svc, msgs, subs, pub, _ := newService(t) c := testContext() subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) @@ -1236,8 +1236,19 @@ func TestHistoryService_DeleteMessage_AlreadyDeleted_ShortCircuits(t *testing.T) } msgs.EXPECT().GetMessageByID(gomock.Any(), "m-abc").Return(hydrated, nil) - // No SoftDeleteMessage call expected. No Publish call expected. gomock will - // fail the test if either is invoked unexpectedly. + // Non-thread-reply: no parent lookup expected. Publish fires to re-deliver + // any badge event that was lost if the original publish failed. + pub.EXPECT(). + Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). 
+ DoAndReturn(func(_ context.Context, _ string, data []byte, dedupID string) error { + var evt model.MessageEvent + require.NoError(t, json.Unmarshal(data, &evt)) + assert.Equal(t, model.EventDeleted, evt.Event) + assert.Equal(t, "m-abc", evt.Message.ID) + assert.Nil(t, evt.NewTCount, "non-thread-reply should have nil NewTCount") + assert.Equal(t, natsutil.CanonicalDedupID(&evt), dedupID) + return nil + }) resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-abc"}) require.NoError(t, err) @@ -1245,6 +1256,182 @@ func TestHistoryService_DeleteMessage_AlreadyDeleted_ShortCircuits(t *testing.T) assert.Equal(t, priorUpdatedAt.UnixMilli(), resp.DeletedAt, "short-circuit should echo the existing updated_at") } +func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_RepublishesWithParentTCount(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + priorUpdatedAt := time.Now().UTC().Add(-time.Hour).Truncate(time.Millisecond) + hydrated := &models.Message{ + MessageID: "reply-abc", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + Deleted: true, + UpdatedAt: &priorUpdatedAt, + ThreadParentID: "parent-xyz", + TShow: false, + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-abc").Return(hydrated, nil) + + parentTcount := 3 + parent := &models.Message{ + MessageID: "parent-xyz", + RoomID: "r1", + TCount: &parentTcount, + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "parent-xyz").Return(parent, nil) + + pub.EXPECT(). + Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). 
+ DoAndReturn(func(_ context.Context, _ string, data []byte, _ string) error { + var evt model.MessageEvent + require.NoError(t, json.Unmarshal(data, &evt)) + assert.Equal(t, model.EventDeleted, evt.Event) + assert.Equal(t, "reply-abc", evt.Message.ID) + assert.Equal(t, "parent-xyz", evt.Message.ThreadParentMessageID) + require.NotNil(t, evt.NewTCount) + assert.Equal(t, 3, *evt.NewTCount) + return nil + }) + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-abc"}) + require.NoError(t, err) + assert.Equal(t, "reply-abc", resp.MessageID) + assert.Equal(t, priorUpdatedAt.UnixMilli(), resp.DeletedAt) +} + +// TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_ParentHardDeleted_SkipsRepublish +// verifies that when GetMessageByID returns (nil, nil) for the parent (concurrent hard-delete), +// the already-deleted short-circuit skips the canonical republish entirely. There is no badge +// to update when the parent row is gone, so publishing EventDeleted with NewTCount=nil would +// cause broadcast-worker to permanently skip a tcount decrement it can never apply. +func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_ParentHardDeleted_SkipsRepublish(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + priorUpdatedAt := time.Now().UTC().Add(-time.Hour).Truncate(time.Millisecond) + hydrated := &models.Message{ + MessageID: "reply-abc", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + Deleted: true, + UpdatedAt: &priorUpdatedAt, + ThreadParentID: "parent-xyz", + TShow: false, + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-abc").Return(hydrated, nil) + + // Parent was concurrently hard-deleted — GetMessageByID returns (nil, nil). 
+ msgs.EXPECT().GetMessageByID(gomock.Any(), "parent-xyz").Return(nil, nil) + + // No publish expected: parent is gone, no badge to update. + _ = pub + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-abc"}) + require.NoError(t, err, "already-deleted retry must return success even when parent is gone") + assert.Equal(t, "reply-abc", resp.MessageID) + assert.Equal(t, priorUpdatedAt.UnixMilli(), resp.DeletedAt) +} + +// TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_ParentLookupError_ReturnsError +// verifies that when the parent-tcount lookup fails on an already-deleted retry, the handler +// returns an error instead of publishing with NewTCount=nil. Publishing nil tcount would cause +// broadcast-worker to permanently drop the badge update — the same reason the hard-deleted +// parent branch (default:) skips the publish entirely. Returning an error lets the client +// retry the delete; on the next attempt the lookup will either succeed or find the parent gone. +func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_ParentLookupError_ReturnsError(t *testing.T) { + svc, msgs, subs, _, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + priorUpdatedAt := time.Now().UTC().Add(-time.Hour).Truncate(time.Millisecond) + hydrated := &models.Message{ + MessageID: "reply-abc", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + Deleted: true, + UpdatedAt: &priorUpdatedAt, + ThreadParentID: "parent-xyz", + TShow: false, + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-abc").Return(hydrated, nil) + + // Parent lookup fails — transient error + msgs.EXPECT().GetMessageByID(gomock.Any(), "parent-xyz").Return(nil, fmt.Errorf("cassandra: unavailable")) + + // No publish: publishing with NewTCount=nil would permanently drop the badge update. 
+ _, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-abc"}) + require.Error(t, err, "already-deleted retry must return error when parent tcount lookup fails") +} + +// TestHistoryService_DeleteMessage_AlreadyDeleted_NilUpdatedAt_SkipsRepublish verifies +// that when a deleted record has nil UpdatedAt (legacy row written before the field was +// added), the already-deleted short-circuit does NOT publish a canonical event. +// Downstream handlers (broadcast-worker handleThreadDeleted / handleDeleted) guard on +// msg.UpdatedAt != nil and would NAK, causing an infinite redelivery loop. +func TestHistoryService_DeleteMessage_AlreadyDeleted_NilUpdatedAt_SkipsRepublish(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + hydrated := &models.Message{ + MessageID: "m-legacy", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + Deleted: true, + UpdatedAt: nil, // legacy record: no delete timestamp stored + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "m-legacy").Return(hydrated, nil) + + // pub must NOT be called — a nil UpdatedAt cannot produce a valid EventDeleted. + // If it were published, broadcast-worker would NAK and redelivery would loop. + _ = pub // no EXPECT needed; gomock strict controller will fail if Publish is called + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-legacy"}) + require.NoError(t, err, "already-deleted with nil UpdatedAt must still return success") + assert.Equal(t, "m-legacy", resp.MessageID) + assert.Equal(t, int64(0), resp.DeletedAt, "DeletedAt should be 0 when UpdatedAt is nil") +} + +// TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_NilUpdatedAt_SkipsRepublish +// verifies the nil-UpdatedAt guard for thread replies. 
When UpdatedAt is nil the handler +// skips both the parent-tcount lookup AND the canonical event — no wasted Cassandra read +// for records that will never produce a valid EventDeleted anyway. +func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_NilUpdatedAt_SkipsRepublish(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + hydrated := &models.Message{ + MessageID: "reply-legacy", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + Deleted: true, + UpdatedAt: nil, // legacy thread reply with no stored delete timestamp + ThreadParentID: "parent-xyz", + TShow: false, + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-legacy").Return(hydrated, nil) + + // Parent lookup must NOT be called: UpdatedAt=nil means we can't produce a valid + // EventDeleted, so the lookup result is never consumed. Gomock strict controller + // will fail if GetMessageByID("parent-xyz") is called unexpectedly. + + // No publish expected — nil UpdatedAt suppresses the canonical event. + _ = pub + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-legacy"}) + require.NoError(t, err, "already-deleted thread reply with nil UpdatedAt must return success") + assert.Equal(t, "reply-legacy", resp.MessageID) + assert.Equal(t, int64(0), resp.DeletedAt) +} + func TestHistoryService_DeleteMessage_NotSubscribed(t *testing.T) { svc, _, subs, _, _ := newService(t) c := testContext() @@ -1333,7 +1520,7 @@ func TestHistoryService_DeleteMessage_SoftDeleteFails(t *testing.T) { msgs.EXPECT().GetMessageByID(gomock.Any(), "m-abc").Return(hydrated, nil) msgs.EXPECT(). SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). - Return(time.Time{}, false, fmt.Errorf("cassandra timeout")) + Return(time.Time{}, false, (*int)(nil), fmt.Errorf("cassandra timeout")) // No Publish expected when the UPDATE fails. 
@@ -1365,7 +1552,7 @@ func TestHistoryService_DeleteMessage_ConcurrentDeleteSkipsPublish(t *testing.T) winnerWrote := time.Date(2026, 4, 28, 9, 0, 0, 0, time.UTC) msgs.EXPECT(). SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). - Return(winnerWrote, false, nil) + Return(winnerWrote, false, (*int)(nil), nil) // Critically, NO Publish call is expected — gomock will fail the test if // the handler tries to publish on the LWT-not-applied path. @@ -1393,8 +1580,8 @@ func TestHistoryService_DeleteMessage_PublishFails(t *testing.T) { msgs.EXPECT().GetMessageByID(gomock.Any(), "m-abc").Return(hydrated, nil) msgs.EXPECT(). SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). - DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, error) { - return deletedAt, true, nil + DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { + return deletedAt, true, nil, nil }) pub.EXPECT(). @@ -1423,8 +1610,8 @@ func TestHistoryService_DeleteMessage_PublishesCanonicalDeletedEvent(t *testing. msgs.EXPECT().GetMessageByID(gomock.Any(), "msg-1").Return(hydrated, nil) msgs.EXPECT(). SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). - DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, error) { - return deletedAt, true, nil + DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { + return deletedAt, true, nil, nil }) pub.EXPECT(). @@ -1446,6 +1633,84 @@ func TestHistoryService_DeleteMessage_PublishesCanonicalDeletedEvent(t *testing. require.NotNil(t, resp) } +// Editing a thread reply must carry ThreadParentMessageID and TShow on the +// canonical event so broadcast-worker can route the edit to thread subscribers +// (via handleThreadUpdated) and search-sync-worker preserves the thread linkage +// when re-upserting the search-index doc. 
+func TestHistoryService_EditMessage_ThreadReply_CarriesThreadFields(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + hydrated := &models.Message{ + MessageID: "reply-1", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + CreatedAt: time.Date(2026, 5, 14, 12, 0, 0, 0, time.UTC), + Msg: "original reply", + ThreadParentID: "parent-1", + TShow: false, + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-1").Return(hydrated, nil) + msgs.EXPECT().UpdateMessageContent(gomock.Any(), hydrated, "edited reply", gomock.Any()).Return(nil) + + pub.EXPECT(). + Publish(gomock.Any(), subject.MsgCanonicalUpdated("site-test"), gomock.Any(), gomock.Any()). + DoAndReturn(func(_ context.Context, _ string, data []byte, _ string) error { + var evt model.MessageEvent + require.NoError(t, json.Unmarshal(data, &evt)) + assert.Equal(t, "parent-1", evt.Message.ThreadParentMessageID, "edit event must carry ThreadParentMessageID for thread routing") + assert.False(t, evt.Message.TShow, "edit event must carry TShow") + return nil + }) + + resp, err := svc.EditMessage(c, "site-test", models.EditMessageRequest{ + MessageID: "reply-1", + NewMsg: "edited reply", + }) + require.NoError(t, err) + require.NotNil(t, resp) +} + +// Deleting a thread reply must carry ThreadParentMessageID and TShow on the +// canonical event so broadcast-worker can route the delete to thread subscribers +// (via handleThreadDeleted). 
+func TestHistoryService_DeleteMessage_ThreadReply_CarriesThreadFields(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + hydrated := &models.Message{ + MessageID: "reply-1", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + CreatedAt: time.Date(2026, 5, 14, 12, 0, 0, 0, time.UTC), + Msg: "reply", + ThreadParentID: "parent-1", + TShow: false, + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-1").Return(hydrated, nil) + msgs.EXPECT(). + SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). + DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { + return deletedAt, true, nil, nil + }) + + pub.EXPECT(). + Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). + DoAndReturn(func(_ context.Context, _ string, data []byte, _ string) error { + var evt model.MessageEvent + require.NoError(t, json.Unmarshal(data, &evt)) + assert.Equal(t, "parent-1", evt.Message.ThreadParentMessageID, "delete event must carry ThreadParentMessageID for thread routing") + assert.False(t, evt.Message.TShow, "delete event must carry TShow") + return nil + }) + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-1"}) + require.NoError(t, err) + require.NotNil(t, resp) +} + // Nats-Msg-Id shape "{messageID}:deleted": distinct from the `.created` key // so the JetStream dedup window doesn't collapse a delete against an earlier // create. @@ -1463,8 +1728,8 @@ func TestHistoryService_DeleteMessage_PassesDedupMessageID(t *testing.T) { msgs.EXPECT().GetMessageByID(gomock.Any(), "msg-1").Return(hydrated, nil) msgs.EXPECT(). SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). 
- DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, error) { - return deletedAt, true, nil + DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { + return deletedAt, true, nil, nil }) pub.EXPECT(). @@ -1480,6 +1745,126 @@ func TestHistoryService_DeleteMessage_PassesDedupMessageID(t *testing.T) { require.NoError(t, err) } +// TestHistoryService_DeleteMessage_ThreadReply_PublishesThreadMetadataEvent verifies +// that deleting a thread reply sets NewTCount on the canonical deleted event so that +// broadcast-worker can do DM-aware routing. +func TestHistoryService_DeleteMessage_ThreadReply_PublishesThreadMetadataEvent(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + parentCreatedAt := time.Date(2026, 5, 14, 12, 0, 0, 0, time.UTC) + hydrated := &models.Message{ + MessageID: "reply-1", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + CreatedAt: time.Date(2026, 5, 14, 13, 0, 0, 0, time.UTC), + Msg: "reply content", + ThreadParentID: "parent-1", + ThreadParentCreatedAt: &parentCreatedAt, + TShow: false, + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-1").Return(hydrated, nil) + + newTcount := 4 + msgs.EXPECT(). + SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). + DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { + return deletedAt, true, &newTcount, nil + }) + + pub.EXPECT(). + Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). 
+ DoAndReturn(func(_ context.Context, _ string, data []byte, _ string) error { + var evt model.MessageEvent + require.NoError(t, json.Unmarshal(data, &evt)) + assert.Equal(t, model.EventDeleted, evt.Event) + require.NotNil(t, evt.NewTCount) + assert.Equal(t, 4, *evt.NewTCount) + assert.Equal(t, "reply-1", evt.Message.ID) + assert.Equal(t, "r1", evt.Message.RoomID) + assert.Equal(t, "parent-1", evt.Message.ThreadParentMessageID) + return nil + }) + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-1"}) + require.NoError(t, err) + require.NotNil(t, resp) + assert.Equal(t, "reply-1", resp.MessageID) +} + +// TestHistoryService_DeleteMessage_ThreadReply_PublishFailsButDeleteSucceeds verifies +// the best-effort contract for thread reply deletes: if publishCanonicalBestEffort +// fails to publish the canonical deleted event (e.g. NATS is disconnected), +// DeleteMessage still returns success — Cassandra is the source of truth. +func TestHistoryService_DeleteMessage_ThreadReply_PublishFailsButDeleteSucceeds(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + parentCreatedAt := time.Date(2026, 5, 14, 12, 0, 0, 0, time.UTC) + hydrated := &models.Message{ + MessageID: "reply-1", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + CreatedAt: time.Date(2026, 5, 14, 13, 0, 0, 0, time.UTC), + Msg: "reply content", + ThreadParentID: "parent-1", + ThreadParentCreatedAt: &parentCreatedAt, + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-1").Return(hydrated, nil) + + newTcount := 4 + msgs.EXPECT(). + SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). + DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { + return deletedAt, true, &newTcount, nil + }) + + pub.EXPECT(). 
+ Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). + Return(fmt.Errorf("nats disconnected")) + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-1"}) + require.NoError(t, err, "best-effort publish: failure must be logged, not returned") + require.NotNil(t, resp) + assert.Equal(t, "reply-1", resp.MessageID) +} + +// TestHistoryService_DeleteMessage_ThreadReply_NoMetadataEventWhenTCountNil verifies +// that when the repository returns a nil tcount (e.g. the parent row was not found), the +// only publish is the canonical deleted event — no separate ThreadMetadataUpdatedEvent. +func TestHistoryService_DeleteMessage_ThreadReply_NoMetadataEventWhenTCountNil(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + hydrated := &models.Message{ + MessageID: "reply-1", + RoomID: "r1", + Sender: models.Participant{Account: "u1"}, + CreatedAt: time.Date(2026, 5, 14, 13, 0, 0, 0, time.UTC), + ThreadParentID: "parent-1", + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-1").Return(hydrated, nil) + msgs.EXPECT(). + SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). + DoAndReturn(func(_ context.Context, _ *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { + return deletedAt, true, nil, nil + }) + + pub.EXPECT(). + Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). + Return(nil) + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-1"}) + require.NoError(t, err) + require.NotNil(t, resp) +} + // ============================================================ // Quote redaction // ============================================================ @@ -1721,6 +2106,80 @@ func TestHistoryService_TShow_TwoMessagesWithSameParent_BothRedacted(t *testing. 
assert.Equal(t, service.UnavailableQuoteMsg, resp.Messages[1].QuotedParentMessage.Msg) } +// TestHistoryService_DeleteMessage_EventDeletedCarriesContent verifies that the +// canonical EventDeleted published on delete includes the message body so that +// broadcast-worker can parse @-mentions for the thread-delete fan-out. +func TestHistoryService_DeleteMessage_EventDeletedCarriesContent(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + hydrated := &models.Message{ + MessageID: "m-content", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + Deleted: false, + Msg: "hey @dave check this out", + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "m-content").Return(hydrated, nil) + + deletedAt := time.Now().UTC() + msgs.EXPECT(). + SoftDeleteMessage(gomock.Any(), hydrated, gomock.Any()). + Return(deletedAt, true, (*int)(nil), nil) + + pub.EXPECT(). + Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). + DoAndReturn(func(_ context.Context, _ string, data []byte, _ string) error { + var evt model.MessageEvent + require.NoError(t, json.Unmarshal(data, &evt)) + assert.Equal(t, model.EventDeleted, evt.Event) + assert.Equal(t, "hey @dave check this out", evt.Message.Content, + "EventDeleted must carry Content so broadcast-worker can parse @-mentions for thread-delete fan-out") + return nil + }) + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-content"}) + require.NoError(t, err) + assert.Equal(t, "m-content", resp.MessageID) +} + +// TestHistoryService_DeleteMessage_AlreadyDeleted_EventDeletedCarriesContent verifies +// that the already-deleted retry path also includes Content in EventDeleted. 
+func TestHistoryService_DeleteMessage_AlreadyDeleted_EventDeletedCarriesContent(t *testing.T) { + svc, msgs, subs, pub, _ := newService(t) + c := testContext() + + subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) + + priorUpdatedAt := time.Now().UTC().Add(-time.Hour).Truncate(time.Millisecond) + hydrated := &models.Message{ + MessageID: "m-retry", + RoomID: "r1", + Sender: models.Participant{Account: "u1", ID: "u1-id"}, + Deleted: true, + UpdatedAt: &priorUpdatedAt, + Msg: "hey @carol look at this", + } + msgs.EXPECT().GetMessageByID(gomock.Any(), "m-retry").Return(hydrated, nil) + + pub.EXPECT(). + Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). + DoAndReturn(func(_ context.Context, _ string, data []byte, _ string) error { + var evt model.MessageEvent + require.NoError(t, json.Unmarshal(data, &evt)) + assert.Equal(t, model.EventDeleted, evt.Event) + assert.Equal(t, "hey @carol look at this", evt.Message.Content, + "already-deleted retry EventDeleted must carry Content for thread-delete fan-out") + return nil + }) + + resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-retry"}) + require.NoError(t, err) + assert.Equal(t, "m-retry", resp.MessageID) +} + // TShow message where ThreadParentCreatedAt is nil (message-worker didn't populate it) → // conservatively redacted because the access window cannot be verified. func TestHistoryService_TShow_ThreadParentCreatedAtNil_ConservativeRedaction(t *testing.T) { diff --git a/history-service/internal/service/mocks/mock_repository.go b/history-service/internal/service/mocks/mock_repository.go index f35270595..5647e2c16 100644 --- a/history-service/internal/service/mocks/mock_repository.go +++ b/history-service/internal/service/mocks/mock_repository.go @@ -247,13 +247,14 @@ func (mr *MockMessageWriterMockRecorder) RemoveReaction(ctx, msg, key, updatedAt } // SoftDeleteMessage mocks base method. 
-func (m *MockMessageWriter) SoftDeleteMessage(ctx context.Context, msg *models.Message, deletedAt time.Time) (time.Time, bool, error) { +func (m *MockMessageWriter) SoftDeleteMessage(ctx context.Context, msg *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "SoftDeleteMessage", ctx, msg, deletedAt) ret0, _ := ret[0].(time.Time) ret1, _ := ret[1].(bool) - ret2, _ := ret[2].(error) - return ret0, ret1, ret2 + ret2, _ := ret[2].(*int) + ret3, _ := ret[3].(error) + return ret0, ret1, ret2, ret3 } // SoftDeleteMessage indicates an expected call of SoftDeleteMessage. @@ -492,13 +493,14 @@ func (mr *MockMessageRepositoryMockRecorder) RemoveReaction(ctx, msg, key, updat } // SoftDeleteMessage mocks base method. -func (m *MockMessageRepository) SoftDeleteMessage(ctx context.Context, msg *models.Message, deletedAt time.Time) (time.Time, bool, error) { +func (m *MockMessageRepository) SoftDeleteMessage(ctx context.Context, msg *models.Message, deletedAt time.Time) (time.Time, bool, *int, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "SoftDeleteMessage", ctx, msg, deletedAt) ret0, _ := ret[0].(time.Time) ret1, _ := ret[1].(bool) - ret2, _ := ret[2].(error) - return ret0, ret1, ret2 + ret2, _ := ret[2].(*int) + ret3, _ := ret[3].(error) + return ret0, ret1, ret2, ret3 } // SoftDeleteMessage indicates an expected call of SoftDeleteMessage. diff --git a/history-service/internal/service/service.go b/history-service/internal/service/service.go index 99de98b0a..8afd81e46 100644 --- a/history-service/internal/service/service.go +++ b/history-service/internal/service/service.go @@ -35,7 +35,9 @@ type MessageWriter interface { // runs the mirror-table and parent-tcount work when the LWT applies. // Returns the updated_at value now persisted (the deletedAt argument when // applied; the existing value when a concurrent delete won the race). 
- SoftDeleteMessage(ctx context.Context, msg *models.Message, deletedAt time.Time) (actualDeletedAt time.Time, applied bool, err error) + // newTcount is non-nil when the parent's tcount was recomputed (COUNT of non-deleted replies, then SET); + // nil means the recompute was skipped (e.g. parent row not found, or msg is not a thread reply). + SoftDeleteMessage(ctx context.Context, msg *models.Message, deletedAt time.Time) (actualDeletedAt time.Time, applied bool, newTcount *int, err error) PinMessage(ctx context.Context, msg *models.Message, pinnedAt time.Time, pinnedBy models.Participant) error UnpinMessage(ctx context.Context, msg *models.Message) error // AddReaction writes one (emoji, user_account) map-cell to every mirror; idempotent. @@ -64,9 +66,7 @@ type RoomRepository interface { GetRoomUserCount(ctx context.Context, roomID string) (int, error) } -// EventPublisher publishes canonical events to a JetStream-backed NATS -// subject. msgID is sent as the Nats-Msg-Id header so the server collapses -// duplicate publishes within the stream's dedup window. +// EventPublisher publishes events to NATS with a Nats-Msg-Id dedup header. type EventPublisher interface { Publish(ctx context.Context, subject string, data []byte, msgID string) error } diff --git a/inbox-worker/handler.go b/inbox-worker/handler.go index 202f1ae68..1609b11fc 100644 --- a/inbox-worker/handler.go +++ b/inbox-worker/handler.go @@ -13,11 +13,8 @@ import ( "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/idgen" "github.com/hmchangw/chat/pkg/model" - "github.com/hmchangw/chat/pkg/natsutil" ) -//go:generate mockgen -destination=mock_store_test.go -package=main . InboxStore - // InboxStore abstracts the data store operations needed by the inbox worker.
type InboxStore interface { CreateSubscription(ctx context.Context, sub *model.Subscription) error @@ -39,13 +36,12 @@ type InboxStore interface { UpdateSubscriptionMute(ctx context.Context, roomID, account string, muted bool) error // UpdateSubscriptionFavorite silently no-ops on missing-sub (federation race — user left mid-flight). UpdateSubscriptionFavorite(ctx context.Context, roomID, account string, favorite bool) error - - // UpdateSubscriptionNamesForRoom mass-renames subscription mirrors on this site. + // UpdateSubscriptionNamesForRoom sets name on every subscription in the room. + // Used when a channel is renamed — replicated via the outbox to remote sites. UpdateSubscriptionNamesForRoom(ctx context.Context, roomID, newName string) error - - // ApplySubscriptionVisibility mirrors room-worker's counterpart (same 3 branches). - // On a remote site this only updates mirrored subs whose users are homed here; - // OwnerAccount is load-bearing so $cond can promote the chosen owner's local mirror. + // ApplySubscriptionVisibility writes {restricted, externalAccess, roles} to all subs + // in the room. When restricted=true and ownerAccount is non-empty, a $cond pipeline + // demotes all accounts except ownerAccount to RoleMember. 
ApplySubscriptionVisibility(ctx context.Context, roomID string, restricted, externalAccess bool, ownerAccount string) error } @@ -85,10 +81,10 @@ func (h *Handler) HandleEvent(ctx context.Context, data []byte) error { return h.handleThreadSubscriptionUpserted(ctx, &evt) case "thread_read": return h.handleThreadRead(ctx, &evt) - case "room_renamed": + case model.OutboxRoomRenamed: return h.handleRoomRenamed(ctx, &evt) - case "room_restricted": - return h.handleRoomRestricted(ctx, &evt) + case model.OutboxRoomRestricted: + return h.handleRoomVisibilityChanged(ctx, &evt) default: slog.Warn("unknown event type, skipping", "type", evt.Type) return nil @@ -278,40 +274,30 @@ func (h *Handler) handleThreadRead(ctx context.Context, evt *model.OutboxEvent) } lastSeenAt := time.UnixMilli(e.LastSeenAt).UTC() if err := h.store.ApplyThreadRead(ctx, e.RoomID, e.ThreadRoomID, e.Account, e.NewThreadUnread, e.Alert, lastSeenAt); err != nil { - return fmt.Errorf("apply thread read (room %q, parent %q, account %q): %w", - e.RoomID, e.ParentMessageID, e.Account, err) + return fmt.Errorf("apply thread read (room %q, thread %q, account %q): %w", + e.RoomID, e.ThreadRoomID, e.Account, err) } return nil } func (h *Handler) handleRoomRenamed(ctx context.Context, evt *model.OutboxEvent) error { - var payload model.RoomRenamedOutboxPayload - if err := json.Unmarshal(evt.Payload, &payload); err != nil { - return fmt.Errorf("unmarshal room_renamed payload: %w", err) + var p model.RoomRenamedOutboxPayload + if err := json.Unmarshal(evt.Payload, &p); err != nil { + return errcode.Permanent(errcode.BadRequest("unmarshal room_renamed payload")) } - slog.Info("processing room_renamed", - "roomID", payload.RoomID, - "newName", payload.NewName, - "requestID", natsutil.RequestIDFromContext(ctx)) - if err := h.store.UpdateSubscriptionNamesForRoom(ctx, payload.RoomID, payload.NewName); err != nil { - return fmt.Errorf("update subscription names for room %s: %w", payload.RoomID, err) + if err := 
h.store.UpdateSubscriptionNamesForRoom(ctx, p.RoomID, p.NewName); err != nil { + return fmt.Errorf("update subscription names for room %s: %w", p.RoomID, err) } return nil } -func (h *Handler) handleRoomRestricted(ctx context.Context, evt *model.OutboxEvent) error { - var payload model.RoomRestrictedOutboxPayload - if err := json.Unmarshal(evt.Payload, &payload); err != nil { - return fmt.Errorf("unmarshal room_restricted payload: %w", err) - } - slog.Info("processing room_restricted", - "roomID", payload.RoomID, - "restricted", payload.Restricted, - "externalAccess", payload.ExternalAccess, - "ownerAccount", payload.OwnerAccount, - "requestID", natsutil.RequestIDFromContext(ctx)) - if err := h.store.ApplySubscriptionVisibility(ctx, payload.RoomID, payload.Restricted, payload.ExternalAccess, payload.OwnerAccount); err != nil { - return fmt.Errorf("apply restricted for room %s: %w", payload.RoomID, err) +func (h *Handler) handleRoomVisibilityChanged(ctx context.Context, evt *model.OutboxEvent) error { + var p model.RoomRestrictedOutboxPayload + if err := json.Unmarshal(evt.Payload, &p); err != nil { + return errcode.Permanent(errcode.BadRequest("unmarshal room_restricted payload")) + } + if err := h.store.ApplySubscriptionVisibility(ctx, p.RoomID, p.Restricted, p.ExternalAccess, p.OwnerAccount); err != nil { + return fmt.Errorf("apply subscription visibility for room %s: %w", p.RoomID, err) } return nil } diff --git a/inbox-worker/handler_test.go b/inbox-worker/handler_test.go index 5ac0f2054..7c2b735f3 100644 --- a/inbox-worker/handler_test.go +++ b/inbox-worker/handler_test.go @@ -12,7 +12,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.mongodb.org/mongo-driver/v2/mongo" - "go.uber.org/mock/gomock" "github.com/hmchangw/chat/pkg/errcode" "github.com/hmchangw/chat/pkg/idgen" @@ -238,14 +237,6 @@ func (s *stubInboxStore) UpsertThreadSubscription(_ context.Context, sub *model. 
return nil } -func (s *stubInboxStore) getThreadSubs() []model.ThreadSubscription { - s.mu.Lock() - defer s.mu.Unlock() - cp := make([]model.ThreadSubscription, len(s.threadSubs)) - copy(cp, s.threadSubs) - return cp -} - func (s *stubInboxStore) UpdateSubscriptionNamesForRoom(_ context.Context, _, _ string) error { return nil } @@ -254,6 +245,14 @@ func (s *stubInboxStore) ApplySubscriptionVisibility(_ context.Context, _ string return nil } +func (s *stubInboxStore) getThreadSubs() []model.ThreadSubscription { + s.mu.Lock() + defer s.mu.Unlock() + cp := make([]model.ThreadSubscription, len(s.threadSubs)) + copy(cp, s.threadSubs) + return cp +} + // --- Tests --- func TestHandleEvent_MemberAdded(t *testing.T) { @@ -1427,52 +1426,6 @@ func TestHandler_SubscriptionFavoriteToggled(t *testing.T) { assert.True(t, subs[0].Favorite) } -func TestHandleRoomRenamed_HappyPath(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockInboxStore(ctrl) - store.EXPECT().UpdateSubscriptionNamesForRoom(gomock.Any(), "r1", "new").Return(nil) - - h := NewHandler(store) - payload, _ := json.Marshal(model.RoomRenamedOutboxPayload{RoomID: "r1", NewName: "new", Timestamp: 1700000000000}) - data, _ := json.Marshal(model.OutboxEvent{Type: model.OutboxRoomRenamed, Payload: payload, Timestamp: 1700000000000}) - require.NoError(t, h.HandleEvent(context.Background(), data)) -} - -func TestHandleRoomRenamed_ErrorOnUnmarshal(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockInboxStore(ctrl) - h := NewHandler(store) - data, _ := json.Marshal(model.OutboxEvent{Type: model.OutboxRoomRenamed, Payload: []byte("not json")}) - err := h.HandleEvent(context.Background(), data) - require.Error(t, err) -} - -func TestHandleRoomRestricted_HappyPath(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockInboxStore(ctrl) - store.EXPECT().ApplySubscriptionVisibility(gomock.Any(), "r1", true, false, 
"bob").Return(nil) - - h := NewHandler(store) - payload, _ := json.Marshal(model.RoomRestrictedOutboxPayload{ - RoomID: "r1", Restricted: true, ExternalAccess: false, OwnerAccount: "bob", Timestamp: 1700000000000, - }) - data, _ := json.Marshal(model.OutboxEvent{Type: model.OutboxRoomRestricted, Payload: payload, Timestamp: 1700000000000}) - require.NoError(t, h.HandleEvent(context.Background(), data)) -} - -func TestHandleRoomRestricted_ErrorOnUnmarshal(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockInboxStore(ctrl) - h := NewHandler(store) - data, _ := json.Marshal(model.OutboxEvent{Type: model.OutboxRoomRestricted, Payload: []byte("not json")}) - err := h.HandleEvent(context.Background(), data) - require.Error(t, err) -} - func TestHandler_SubscriptionFavoriteToggled_MissingSubscriptionNoOp(t *testing.T) { store := &stubInboxStore{} h := NewHandler(store) @@ -1502,111 +1455,3 @@ func TestHandler_SubscriptionFavoriteToggled_MalformedPayload(t *testing.T) { require.Error(t, h.HandleEvent(context.Background(), evt)) } - -func TestHandleRoomRenamed_StoreError(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockInboxStore(ctrl) - store.EXPECT().UpdateSubscriptionNamesForRoom(gomock.Any(), "r1", "new").Return(errors.New("mongo timeout")) - - h := NewHandler(store) - payload, _ := json.Marshal(model.RoomRenamedOutboxPayload{RoomID: "r1", NewName: "new", Timestamp: 1700000000000}) - data, _ := json.Marshal(model.OutboxEvent{Type: model.OutboxRoomRenamed, Payload: payload}) - err := h.HandleEvent(context.Background(), data) - require.Error(t, err) - assert.Contains(t, err.Error(), "update subscription names") -} - -func TestHandleRoomRestricted_StoreError(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockInboxStore(ctrl) - store.EXPECT().ApplySubscriptionVisibility(gomock.Any(), "r1", true, false, "bob").Return(errors.New("mongo timeout")) - - h := 
NewHandler(store) - payload, _ := json.Marshal(model.RoomRestrictedOutboxPayload{ - RoomID: "r1", Restricted: true, ExternalAccess: false, OwnerAccount: "bob", Timestamp: 1700000000000, - }) - data, _ := json.Marshal(model.OutboxEvent{Type: model.OutboxRoomRestricted, Payload: payload}) - err := h.HandleEvent(context.Background(), data) - require.Error(t, err) - assert.Contains(t, err.Error(), "apply restricted") -} - -func TestHandleRoomRenamed_EdgeCases(t *testing.T) { - tests := []struct { - name string - payload model.RoomRenamedOutboxPayload - }{ - { - name: "empty room ID propagates to store call", - payload: model.RoomRenamedOutboxPayload{RoomID: "", NewName: "new", Timestamp: 1700000000000}, - }, - { - name: "empty new name propagates to store call", - payload: model.RoomRenamedOutboxPayload{RoomID: "r1", NewName: "", Timestamp: 1700000000000}, - }, - { - name: "zero timestamp accepted (inbox handler does not validate)", - payload: model.RoomRenamedOutboxPayload{RoomID: "r1", NewName: "new", Timestamp: 0}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - store := NewMockInboxStore(ctrl) - store.EXPECT().UpdateSubscriptionNamesForRoom(gomock.Any(), tt.payload.RoomID, tt.payload.NewName).Return(nil) - - h := NewHandler(store) - payload, _ := json.Marshal(tt.payload) - data, _ := json.Marshal(model.OutboxEvent{Type: model.OutboxRoomRenamed, Payload: payload}) - require.NoError(t, h.HandleEvent(context.Background(), data)) - }) - } -} - -func TestHandleRoomRestricted_EdgeCases(t *testing.T) { - tests := []struct { - name string - payload model.RoomRestrictedOutboxPayload - }{ - { - name: "empty room ID propagates to store call", - payload: model.RoomRestrictedOutboxPayload{ - RoomID: "", Restricted: true, ExternalAccess: false, OwnerAccount: "bob", Timestamp: 1700000000000, - }, - }, - { - name: "missing owner account on restrict propagates to store (branch (b) flags-only)", - payload: 
model.RoomRestrictedOutboxPayload{ - RoomID: "r1", Restricted: true, ExternalAccess: true, OwnerAccount: "", Timestamp: 1700000000000, - }, - }, - { - name: "missing owner account on unrestrict propagates to store (branch (c) flags-only)", - payload: model.RoomRestrictedOutboxPayload{ - RoomID: "r1", Restricted: false, ExternalAccess: false, OwnerAccount: "", Timestamp: 1700000000000, - }, - }, - { - name: "zero timestamp accepted (inbox handler does not validate)", - payload: model.RoomRestrictedOutboxPayload{ - RoomID: "r1", Restricted: true, ExternalAccess: false, OwnerAccount: "bob", Timestamp: 0, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - store := NewMockInboxStore(ctrl) - store.EXPECT().ApplySubscriptionVisibility( - gomock.Any(), tt.payload.RoomID, tt.payload.Restricted, tt.payload.ExternalAccess, tt.payload.OwnerAccount, - ).Return(nil) - - h := NewHandler(store) - payload, _ := json.Marshal(tt.payload) - data, _ := json.Marshal(model.OutboxEvent{Type: model.OutboxRoomRestricted, Payload: payload}) - require.NoError(t, h.HandleEvent(context.Background(), data)) - }) - } -} diff --git a/message-worker/handler.go b/message-worker/handler.go index c8e339e5d..1c373c107 100644 --- a/message-worker/handler.go +++ b/message-worker/handler.go @@ -60,6 +60,13 @@ func (h *Handler) processMessage(ctx context.Context, data []byte) error { return fmt.Errorf("unmarshal message event: %w", err) } + // Badge events published by this worker back onto .created are handled + // by broadcast-worker, not here. Skip them to avoid re-processing our + // own publishes as new messages. 
+ if evt.Event == model.EventThreadReplyAdded { + return nil + } + resolved, err := mention.Resolve(ctx, evt.Message.Content, h.userStore.FindUsersByAccounts) if err != nil { return fmt.Errorf("resolve mentions: %w", err) @@ -96,9 +103,15 @@ func (h *Handler) processMessage(ctx context.Context, data []byte) error { if err := h.markThreadMentions(ctx, &evt.Message, threadRoomID, evt.SiteID); err != nil { return fmt.Errorf("mark thread mentions: %w", err) } - if err := h.store.SaveThreadMessage(ctx, &evt.Message, sender, evt.SiteID, threadRoomID); err != nil { + newTcount, err := h.store.SaveThreadMessage(ctx, &evt.Message, sender, evt.SiteID, threadRoomID) + if err != nil { return fmt.Errorf("save thread message: %w", err) } + if newTcount != nil { + if err := h.publishThreadReplyEvent(ctx, &evt.Message, *newTcount); err != nil { + return fmt.Errorf("publish thread reply event: %w", err) + } + } } else { if err := h.store.SaveMessage(ctx, &evt.Message, sender, evt.SiteID); err != nil { return fmt.Errorf("save message: %w", err) @@ -433,3 +446,28 @@ func (h *Handler) publishThreadSubOutboxIfRemote(ctx context.Context, sub *model } return nil } + +// publishThreadReplyEvent publishes an EventThreadReplyAdded badge event to +// the MESSAGES_CANONICAL stream on the .created subject so broadcast-worker +// can do DM-aware routing of the reply-count badge update. The dedup ID is +// stable across redeliveries so JetStream stream-level dedup absorbs +// duplicates within the dedup window. 
+func (h *Handler) publishThreadReplyEvent(ctx context.Context, msg *model.Message, newTcount int) error { + evt := model.MessageEvent{ + Event: model.EventThreadReplyAdded, + Message: model.Message{ + ID: msg.ID, + RoomID: msg.RoomID, + ThreadParentMessageID: msg.ThreadParentMessageID, + }, + SiteID: h.siteID, + Timestamp: time.Now().UTC().UnixMilli(), + NewTCount: &newTcount, + } + data, err := json.Marshal(evt) + if err != nil { + return fmt.Errorf("marshal thread reply event: %w", err) + } + dedupID := fmt.Sprintf("thread-reply-added:%s:%s", h.siteID, msg.ID) + return h.publish(ctx, subject.MsgCanonicalCreated(h.siteID), data, dedupID) +} diff --git a/message-worker/handler_test.go b/message-worker/handler_test.go index 90e95e1e7..8be9669c8 100644 --- a/message-worker/handler_test.go +++ b/message-worker/handler_test.go @@ -16,6 +16,7 @@ import ( "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/model/cassandra" + "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/userstore" ) @@ -226,7 +227,7 @@ func TestHandler_ProcessMessage(t *testing.T) { ts.EXPECT().UpsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) ts.EXPECT().UpdateThreadRoomLastMessage(gomock.Any(), "tr-1", "msg-2", gomock.Any(), now).Return(nil) // SaveThreadMessage receives the resolved threadRoomID. - store.EXPECT().SaveThreadMessage(gomock.Any(), &threadMsg, &expectedSender, "site-a", "tr-1").Return(nil) + store.EXPECT().SaveThreadMessage(gomock.Any(), &threadMsg, &expectedSender, "site-a", "tr-1").Return((*int)(nil), nil) }, }, { @@ -246,7 +247,7 @@ func TestHandler_ProcessMessage(t *testing.T) { ts.EXPECT().UpsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) ts.EXPECT().UpdateThreadRoomLastMessage(gomock.Any(), "tr-1", "msg-2", gomock.Any(), now).Return(nil) store.EXPECT().SaveThreadMessage(gomock.Any(), &threadMsg, &expectedSender, "site-a", "tr-1"). 
- Return(errors.New("cassandra: write timeout")) + Return((*int)(nil), errors.New("cassandra: write timeout")) }, wantErr: true, }, @@ -338,7 +339,7 @@ func TestHandler_ProcessMessage(t *testing.T) { assert.Nil(t, sub.LastSeenAt) return nil }) - store.EXPECT().SaveThreadMessage(gomock.Any(), gomock.Any(), gomock.Any(), "site-a", gomock.Any()).Return(nil) + store.EXPECT().SaveThreadMessage(gomock.Any(), gomock.Any(), gomock.Any(), "site-a", gomock.Any()).Return((*int)(nil), nil) }, }, { @@ -357,7 +358,7 @@ func TestHandler_ProcessMessage(t *testing.T) { ts.EXPECT().InsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) ts.EXPECT().InsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) // MarkThreadSubscriptionMention must NOT be called — sender excluded. - store.EXPECT().SaveThreadMessage(gomock.Any(), gomock.Any(), gomock.Any(), "site-a", gomock.Any()).Return(nil) + store.EXPECT().SaveThreadMessage(gomock.Any(), gomock.Any(), gomock.Any(), "site-a", gomock.Any()).Return((*int)(nil), nil) }, }, { @@ -374,7 +375,7 @@ func TestHandler_ProcessMessage(t *testing.T) { ts.EXPECT().InsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) ts.EXPECT().InsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) // MarkThreadSubscriptionMention must NOT be called — @all is thread-ignored. 
- store.EXPECT().SaveThreadMessage(gomock.Any(), gomock.Any(), gomock.Any(), "site-a", gomock.Any()).Return(nil) + store.EXPECT().SaveThreadMessage(gomock.Any(), gomock.Any(), gomock.Any(), "site-a", gomock.Any()).Return((*int)(nil), nil) }, }, { @@ -397,7 +398,7 @@ func TestHandler_ProcessMessage(t *testing.T) { assert.True(t, sub.HasMention) return nil }) - store.EXPECT().SaveThreadMessage(gomock.Any(), gomock.Any(), gomock.Any(), "site-a", gomock.Any()).Return(nil) + store.EXPECT().SaveThreadMessage(gomock.Any(), gomock.Any(), gomock.Any(), "site-a", gomock.Any()).Return((*int)(nil), nil) }, }, { @@ -444,6 +445,67 @@ func TestHandler_ProcessMessage(t *testing.T) { } } +// TestHandler_ProcessMessage_ThreadReply_PublishesBadgeEvent verifies that when +// SaveThreadMessage returns a non-nil tcount (first write or redelivery recovery), +// the handler calls publishThreadReplyEvent on the MESSAGES_CANONICAL stream. +func TestHandler_ProcessMessage_ThreadReply_PublishesBadgeEvent(t *testing.T) { + now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC) + user := &model.User{ID: "u-1", Account: "alice", SiteID: "site-a", EngName: "Alice Wang", ChineseName: "愛麗絲"} + threadMsg := model.Message{ + ID: "msg-reply", + RoomID: "r1", + UserID: "u-1", + UserAccount: "alice", + Content: "reply", + CreatedAt: now, + ThreadParentMessageID: "msg-parent", + } + threadEvt := model.MessageEvent{Message: threadMsg, SiteID: "site-a", Timestamp: now.UnixMilli()} + data, _ := json.Marshal(threadEvt) + expectedSender := cassParticipant{ID: "u-1", EngName: "Alice Wang", CompanyName: "愛麗絲", Account: "alice"} + expectedTcount := 3 + + ctrl := gomock.NewController(t) + mockStore := NewMockStore(ctrl) + mockUserStore := NewMockUserStore(ctrl) + mockThreadStore := NewMockThreadStore(ctrl) + + mockUserStore.EXPECT().FindUserByID(gomock.Any(), "u-1").Return(user, nil) + mockThreadStore.EXPECT().CreateThreadRoom(gomock.Any(), gomock.Any()).Return(errThreadRoomExists) + 
mockThreadStore.EXPECT().GetThreadRoomByParentMessageID(gomock.Any(), "msg-parent"). + Return(&model.ThreadRoom{ID: "tr-99"}, nil) + mockStore.EXPECT().GetMessageSender(gomock.Any(), "msg-parent"). + Return(&cassParticipant{ID: "u-parent", Account: "parent-user"}, nil) + mockUserStore.EXPECT().FindUserByID(gomock.Any(), "u-parent"). + Return(&model.User{ID: "u-parent", Account: "parent-user", SiteID: "site-a"}, nil) + mockThreadStore.EXPECT().UpsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) + mockThreadStore.EXPECT().UpsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) + mockThreadStore.EXPECT().UpdateThreadRoomLastMessage(gomock.Any(), "tr-99", "msg-reply", gomock.Any(), now).Return(nil) + // SaveThreadMessage returns a non-nil tcount (simulates first write or redelivery recovery). + mockStore.EXPECT().SaveThreadMessage(gomock.Any(), &threadMsg, &expectedSender, "site-a", "tr-99"). + Return(&expectedTcount, nil) + + var capturedSubj string + var capturedData []byte + h := NewHandler(mockStore, mockUserStore, mockThreadStore, "site-a", + func(_ context.Context, subj string, data []byte, _ string) error { + capturedSubj = subj + capturedData = data + return nil + }, + ) + require.NoError(t, h.processMessage(context.Background(), data)) + + assert.Equal(t, subject.MsgCanonicalCreated("site-a"), capturedSubj, + "badge event must be published to the canonical created subject") + var badgeEvt model.MessageEvent + require.NoError(t, json.Unmarshal(capturedData, &badgeEvt)) + assert.Equal(t, model.EventThreadReplyAdded, badgeEvt.Event) + assert.Equal(t, "msg-reply", badgeEvt.Message.ID) + require.NotNil(t, badgeEvt.NewTCount) + assert.Equal(t, expectedTcount, *badgeEvt.NewTCount) +} + func TestHandler_HandleThreadRoomAndSubscriptions(t *testing.T) { now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC) @@ -1741,3 +1803,181 @@ func TestHandler_ProcessMessage_Quote(t *testing.T) { require.NoError(t, err) }) } + +func 
TestHandler_ProcessMessage_ThreadReplyPublish(t *testing.T) { + now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC) + parentCreatedAt := now.Add(-10 * time.Minute) + + user := &model.User{ + ID: "u-1", + Account: "alice", + SiteID: "site-a", + EngName: "Alice Wang", + ChineseName: "愛麗絲", + } + expectedSender := cassParticipant{ + ID: user.ID, + EngName: user.EngName, + CompanyName: user.ChineseName, + Account: "alice", + } + threadMsg := model.Message{ + ID: "msg-reply", + RoomID: "r1", + UserID: "u-1", + UserAccount: "alice", + Content: "thread reply", + CreatedAt: now, + ThreadParentMessageID: "msg-parent", + ThreadParentMessageCreatedAt: &parentCreatedAt, + } + threadEvt := model.MessageEvent{Message: threadMsg, SiteID: "site-a", Timestamp: now.UnixMilli()} + threadData, _ := json.Marshal(threadEvt) + + setupCommonMocks := func(store *MockStore, us *MockUserStore, ts *MockThreadStore) { + us.EXPECT().FindUserByID(gomock.Any(), "u-1").Return(user, nil) + ts.EXPECT().CreateThreadRoom(gomock.Any(), gomock.Any()).Return(errThreadRoomExists) + ts.EXPECT().GetThreadRoomByParentMessageID(gomock.Any(), "msg-parent"). + Return(&model.ThreadRoom{ID: "tr-1"}, nil) + store.EXPECT().GetMessageSender(gomock.Any(), "msg-parent"). + Return(&cassParticipant{ID: "u-parent", Account: "parent-user"}, nil) + us.EXPECT().FindUserByID(gomock.Any(), "u-parent"). + Return(&model.User{ID: "u-parent", Account: "parent-user", SiteID: "site-a"}, nil) + ts.EXPECT().UpsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) + ts.EXPECT().UpsertThreadSubscription(gomock.Any(), gomock.Any()).Return(nil) + ts.EXPECT().UpdateThreadRoomLastMessage(gomock.Any(), "tr-1", "msg-reply", gomock.Any(), now).Return(nil) + // parentFound && ThreadParentMessageCreatedAt != nil → stamps thread_room_id on parent. 
+ store.EXPECT().UpdateParentMessageThreadRoomID( + gomock.Any(), "msg-parent", "r1", parentCreatedAt, "tr-1", + ).Return(nil) + } + + t.Run("publishes MessageEvent to canonical thread reply subject", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + ts := NewMockThreadStore(ctrl) + setupCommonMocks(store, us, ts) + + newTcount := 3 + store.EXPECT().SaveThreadMessage(gomock.Any(), &threadMsg, &expectedSender, "site-a", "tr-1"). + Return(&newTcount, nil) + + var capturedSubj string + var capturedData []byte + var capturedMsgID string + publishCount := 0 + h := NewHandler(store, us, ts, "site-a", func(_ context.Context, subj string, data []byte, msgID string) error { + publishCount++ + capturedSubj = subj + capturedData = data + capturedMsgID = msgID + return nil + }) + + require.NoError(t, h.processMessage(context.Background(), threadData)) + + require.Equal(t, 1, publishCount, "exactly one publish call for thread reply event") + assert.Equal(t, "chat.msg.canonical.site-a.created", capturedSubj) + assert.Equal(t, "thread-reply-added:site-a:msg-reply", capturedMsgID) + + var evt model.MessageEvent + require.NoError(t, json.Unmarshal(capturedData, &evt)) + assert.Equal(t, model.EventThreadReplyAdded, evt.Event) + assert.Equal(t, "msg-reply", evt.Message.ID) + assert.Equal(t, "r1", evt.Message.RoomID) + assert.Equal(t, "msg-parent", evt.Message.ThreadParentMessageID) + assert.Equal(t, "site-a", evt.SiteID) + require.NotNil(t, evt.NewTCount) + assert.Equal(t, 3, *evt.NewTCount) + assert.Greater(t, evt.Timestamp, int64(0)) + }) + + t.Run("publish error propagates for JetStream retry", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + ts := NewMockThreadStore(ctrl) + setupCommonMocks(store, us, ts) + + newTcount := 3 + store.EXPECT().SaveThreadMessage(gomock.Any(), &threadMsg, &expectedSender, "site-a", "tr-1"). 
+ Return(&newTcount, nil) + + h := NewHandler(store, us, ts, "site-a", func(_ context.Context, _ string, _ []byte, _ string) error { + return errors.New("nats: publish failed") + }) + + // Publish failure MUST propagate: on redelivery the store re-COUNTs the thread's + // non-deleted replies and re-SETs the parent tcount, so retrying converges to the + // correct value with no double-count risk. Swallowing the + // error would permanently drop the badge tcount event. + require.Error(t, h.processMessage(context.Background(), threadData)) + }) + + t.Run("no publish when SaveThreadMessage returns nil tcount (CAS skipped)", func(t *testing.T) { + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + ts := NewMockThreadStore(ctrl) + setupCommonMocks(store, us, ts) + + store.EXPECT().SaveThreadMessage(gomock.Any(), &threadMsg, &expectedSender, "site-a", "tr-1"). + Return((*int)(nil), nil) + + publishCount := 0 + h := NewHandler(store, us, ts, "site-a", func(_ context.Context, _ string, _ []byte, _ string) error { + publishCount++ + return nil + }) + + require.NoError(t, h.processMessage(context.Background(), threadData)) + assert.Equal(t, 0, publishCount, "must not publish when CAS was skipped") + }) +} + +func TestHandler_PublishThreadReplyEvent(t *testing.T) { + type publishCall struct { + subj string + data []byte + msgID string + } + + msg := &model.Message{ + ID: "msg-2", + RoomID: "r1", + ThreadParentMessageID: "msg-1", + } + + var captured publishCall + h := NewHandler(nil, nil, nil, "site-a", func(_ context.Context, subj string, data []byte, msgID string) error { + captured = publishCall{subj: subj, data: data, msgID: msgID} + return nil + }) + + err := h.publishThreadReplyEvent(context.Background(), msg, 5) + require.NoError(t, err) + + assert.Equal(t, "chat.msg.canonical.site-a.created", captured.subj) + assert.Equal(t, "thread-reply-added:site-a:msg-2", captured.msgID) + + var evt model.MessageEvent + 
require.NoError(t, json.Unmarshal(captured.data, &evt)) + assert.Equal(t, model.EventThreadReplyAdded, evt.Event) + require.NotNil(t, evt.NewTCount) + assert.Equal(t, 5, *evt.NewTCount) + assert.Equal(t, "msg-2", evt.Message.ID) + assert.Equal(t, "r1", evt.Message.RoomID) + assert.Equal(t, "msg-1", evt.Message.ThreadParentMessageID) + assert.Equal(t, "site-a", evt.SiteID) + assert.Greater(t, evt.Timestamp, int64(0)) +} + +func TestHandler_PublishThreadReplyEvent_PublishError(t *testing.T) { + msg := &model.Message{ID: "msg-2", RoomID: "r1", ThreadParentMessageID: "msg-1"} + h := NewHandler(nil, nil, nil, "site-a", func(_ context.Context, _ string, _ []byte, _ string) error { + return errors.New("nats: connection closed") + }) + err := h.publishThreadReplyEvent(context.Background(), msg, 3) + require.Error(t, err) +} diff --git a/message-worker/integration_test.go b/message-worker/integration_test.go index f67ebd81d..82b79a0c0 100644 --- a/message-worker/integration_test.go +++ b/message-worker/integration_test.go @@ -158,6 +158,7 @@ func setupCassandra(t *testing.T) *gocql.Session { quoted_parent_message FROZEN<"QuotedParentMessage">, enc_payload BLOB, enc_meta FROZEN<"EncMeta">, + deleted BOOLEAN, PRIMARY KEY ((thread_room_id), created_at, message_id) ) WITH CLUSTERING ORDER BY (created_at DESC, message_id DESC)`, keyspace), } @@ -306,7 +307,7 @@ func TestCassandraStore_SaveThreadMessage(t *testing.T) { } const threadRoomID = "tr-test-1" - err := store.SaveThreadMessage(ctx, msg, sender, "site-a", threadRoomID) + _, err := store.SaveThreadMessage(ctx, msg, sender, "site-a", threadRoomID) require.NoError(t, err) t.Run("thread_messages_by_thread mentions persisted", func(t *testing.T) { @@ -668,17 +669,6 @@ func TestHandler_Integration_ThreadReplyWithMention(t *testing.T) { require.NoError(t, err) assert.Equal(t, int64(3), count) }) - - t.Run("thread_rooms.replyAccounts contains replier + parent author + mentioned user", func(t *testing.T) { - var got 
model.ThreadRoom - err := db.Collection("thread_rooms").FindOne(ctx, bson.M{ - "parentMessageId": "msg-parent-mention", - }).Decode(&got) - require.NoError(t, err) - assert.ElementsMatch(t, []string{"replier", "parent-user", "bob"}, got.ReplyAccounts, - "replyAccounts should match thread_subscriptions members so notification-worker "+ - "can use this single field as the follower set") - }) } func TestThreadStoreMongo_CreateThreadRoom(t *testing.T) { @@ -951,7 +941,8 @@ func TestCassandraStore_SaveThreadMessage_IncrementsParentTcount(t *testing.T) { ThreadParentMessageID: "tcount-parent", ThreadParentMessageCreatedAt: &parentCreatedAt, } - require.NoError(t, store.SaveThreadMessage(ctx, replyMsg, replySender, "site-a", "tr-tcount-1")) + _, err := store.SaveThreadMessage(ctx, replyMsg, replySender, "site-a", "tr-tcount-1") + require.NoError(t, err) t.Run("tcount incremented to 1 in messages_by_id", func(t *testing.T) { var tcount int @@ -984,7 +975,8 @@ func TestCassandraStore_SaveThreadMessage_IncrementsParentTcount(t *testing.T) { ThreadParentMessageID: "tcount-parent", ThreadParentMessageCreatedAt: &parentCreatedAt, } - require.NoError(t, store.SaveThreadMessage(ctx, replyMsg2, replySender, "site-a", "tr-tcount-1")) + _, err2 := store.SaveThreadMessage(ctx, replyMsg2, replySender, "site-a", "tr-tcount-1") + require.NoError(t, err2) t.Run("tcount incremented to 2 in messages_by_id after second reply", func(t *testing.T) { var tcount int @@ -1016,7 +1008,7 @@ func TestCassandraStore_SaveThreadMessage_IncrementsParentTcount(t *testing.T) { ThreadParentMessageID: "tcount-parent", // ThreadParentMessageCreatedAt intentionally nil } - err := store.SaveThreadMessage(ctx, noTsReply, replySender, "site-a", "tr-tcount-1") + _, err := store.SaveThreadMessage(ctx, noTsReply, replySender, "site-a", "tr-tcount-1") assert.NoError(t, err) // tcount must stay at 2 — nil timestamp skips the increment @@ -1030,6 +1022,65 @@ func 
TestCassandraStore_SaveThreadMessage_IncrementsParentTcount(t *testing.T) { }) } +func TestCassandraStore_SaveThreadMessage_IdempotentOnRedelivery(t *testing.T) { + cassSession := setupCassandra(t) + store := NewCassandraStore(cassSession, msgbucket.New(24*time.Hour), nil) + ctx := context.Background() + + parentCreatedAt := time.Now().UTC().Truncate(time.Millisecond) + parentBucket := msgbucket.New(24 * time.Hour).Of(parentCreatedAt) + replyCreatedAt := parentCreatedAt.Add(5 * time.Minute) + + parentSender := &cassParticipant{ID: "u-idem-parent", Account: "alice", EngName: "Alice"} + parentMsg := &model.Message{ + ID: "idem-parent", + RoomID: "idem-room", + UserID: "u-idem-parent", + CreatedAt: parentCreatedAt, + Content: "parent message", + } + require.NoError(t, store.SaveMessage(ctx, parentMsg, parentSender, "site-a")) + + replySender := &cassParticipant{ID: "u-idem-replier", Account: "bob", EngName: "Bob"} + replyMsg := &model.Message{ + ID: "idem-reply-1", + RoomID: "idem-room", + UserID: "u-idem-replier", + Content: "reply message", + CreatedAt: replyCreatedAt, + ThreadParentMessageID: "idem-parent", + ThreadParentMessageCreatedAt: &parentCreatedAt, + } + + // First delivery. + _, err := store.SaveThreadMessage(ctx, replyMsg, replySender, "site-a", "tr-idem-1") + require.NoError(t, err) + + // JetStream redelivery — same message ID, must not increment tcount again. + _, err = store.SaveThreadMessage(ctx, replyMsg, replySender, "site-a", "tr-idem-1") + require.NoError(t, err) + + t.Run("tcount stays at 1 in messages_by_id after redelivery", func(t *testing.T) { + var tcount int + err := cassSession.Query( + `SELECT tcount FROM messages_by_id WHERE message_id = ? 
AND created_at = ?`, + "idem-parent", parentCreatedAt, + ).Scan(&tcount) + require.NoError(t, err) + assert.Equal(t, 1, tcount) + }) + + t.Run("tcount stays at 1 in messages_by_room after redelivery", func(t *testing.T) { + var tcount int + err := cassSession.Query( + `SELECT tcount FROM messages_by_room WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ?`, + "idem-room", parentBucket, parentCreatedAt, "idem-parent", + ).Scan(&tcount) + require.NoError(t, err) + assert.Equal(t, 1, tcount) + }) +} + func TestCassandraStore_SaveMessage_WithQuotedParent(t *testing.T) { cassSession := setupCassandra(t) store := NewCassandraStore(cassSession, msgbucket.New(24*time.Hour), nil) @@ -1187,7 +1238,8 @@ func TestSaveThreadMessage_PartitionsByThreadRoom(t *testing.T) { ThreadParentMessageID: parentID, ThreadParentMessageCreatedAt: &parentCreatedAt, } - require.NoError(t, store.SaveThreadMessage(ctx, reply, sender, "site-A", "thread-room-1")) + _, errSave := store.SaveThreadMessage(ctx, reply, sender, "site-A", "thread-room-1") + require.NoError(t, errSave) // 1. The reply must land in the partition keyed by thread_room_id. 
var gotRoomID string @@ -1207,6 +1259,72 @@ func TestSaveThreadMessage_PartitionsByThreadRoom(t *testing.T) { assert.Equal(t, 1, tcount) } +func TestCassandraStore_SaveThreadMessage_ReturnsTcount(t *testing.T) { + cassSession := setupCassandra(t) + store := NewCassandraStore(cassSession, msgbucket.New(24*time.Hour), nil) + ctx := context.Background() + + parentCreatedAt := time.Now().UTC().Truncate(time.Millisecond) + replyCreatedAt := parentCreatedAt.Add(5 * time.Minute) + + parentSender := &cassParticipant{ID: "u-parent-ret", Account: "alice", EngName: "Alice"} + parentMsg := &model.Message{ + ID: "ret-parent", + RoomID: "ret-room", + UserID: "u-parent-ret", + CreatedAt: parentCreatedAt, + Content: "parent for return-value test", + } + require.NoError(t, store.SaveMessage(ctx, parentMsg, parentSender, "site-a")) + + replySender := &cassParticipant{ID: "u-replier-ret", Account: "bob", EngName: "Bob"} + + // First reply: returned tcount must be non-nil and equal 1. + reply1 := &model.Message{ + ID: "ret-reply-1", + RoomID: "ret-room", + UserID: "u-replier-ret", + Content: "first reply", + CreatedAt: replyCreatedAt, + ThreadParentMessageID: "ret-parent", + ThreadParentMessageCreatedAt: &parentCreatedAt, + } + tcount1, err := store.SaveThreadMessage(ctx, reply1, replySender, "site-a", "tr-ret-1") + require.NoError(t, err) + require.NotNil(t, tcount1, "SaveThreadMessage must return non-nil tcount for a reply with ThreadParentMessageCreatedAt set") + assert.Equal(t, 1, *tcount1, "first reply must produce tcount == 1") + + // Second reply: returned tcount must be non-nil and equal 2. 
+ reply2CreatedAt := replyCreatedAt.Add(5 * time.Minute) + reply2 := &model.Message{ + ID: "ret-reply-2", + RoomID: "ret-room", + UserID: "u-replier-ret", + Content: "second reply", + CreatedAt: reply2CreatedAt, + ThreadParentMessageID: "ret-parent", + ThreadParentMessageCreatedAt: &parentCreatedAt, + } + tcount2, err := store.SaveThreadMessage(ctx, reply2, replySender, "site-a", "tr-ret-1") + require.NoError(t, err) + require.NotNil(t, tcount2, "SaveThreadMessage must return non-nil tcount after second reply") + assert.Equal(t, 2, *tcount2, "second reply must produce tcount == 2") + + // Reply with nil ThreadParentMessageCreatedAt: returned tcount must be nil. + reply3 := &model.Message{ + ID: "ret-reply-3", + RoomID: "ret-room", + UserID: "u-replier-ret", + Content: "reply without parent timestamp", + CreatedAt: reply2CreatedAt.Add(5 * time.Minute), + ThreadParentMessageID: "ret-parent", + // ThreadParentMessageCreatedAt intentionally nil + } + tcount3, err := store.SaveThreadMessage(ctx, reply3, replySender, "site-a", "tr-ret-1") + require.NoError(t, err) + assert.Nil(t, tcount3, "SaveThreadMessage must return nil tcount when ThreadParentMessageCreatedAt is nil") +} + func TestCassandraStore_SaveThreadMessage_WithQuotedParent(t *testing.T) { cassSession := setupCassandra(t) bucket := msgbucket.New(24 * time.Hour) @@ -1236,7 +1354,8 @@ func TestCassandraStore_SaveThreadMessage_WithQuotedParent(t *testing.T) { } const threadRoomID = "tr-quote-1" - require.NoError(t, store.SaveThreadMessage(ctx, msg, sender, "site-a", threadRoomID)) + _, errThread := store.SaveThreadMessage(ctx, msg, sender, "site-a", threadRoomID) + require.NoError(t, errThread) t.Run("thread_messages_by_thread round-trips QuotedParentMessage", func(t *testing.T) { var got cassandra.QuotedParentMessage @@ -1391,3 +1510,81 @@ func TestSaveMessage_RedeliveryOverLegacyRow_NullsPlaintextColumns(t *testing.T) assert.Equal(t, originalSysMsgData, sysMsgData, "%s: un-encrypted sys_msg_data must be 
preserved after redelivered encrypted insert", tableQuery.name) } } + +// TestSaveThreadMessage_EncryptedPath_SkipsTcountOnRedelivery verifies that +// saveThreadMessageEncrypted uses an IF NOT EXISTS guard on messages_by_id so +// that a JetStream redelivery of the same reply does not double-increment the +// parent's tcount. On first delivery the INSERT must be applied and tcount +// must reach 1; on redelivery the INSERT must be skipped and tcount must stay +// at 1. +func TestSaveThreadMessage_EncryptedPath_SkipsTcountOnRedelivery(t *testing.T) { + ctx := context.Background() + session := setupCassandra(t) + mongoDB := setupMongo(t) + + wrapper := newTestVaultWrapper(t, ctx) + cipher := atrest.NewCipher(wrapper, atrest.NewMongoDEKStore(mongoDB.Collection(atrest.CollectionName)), + atrest.Config{DEKCacheSize: 100, DEKCacheTTL: time.Hour}) + bucket := msgbucket.New(24 * time.Hour) + store := NewCassandraStore(session, bucket, cipher) + + parentCreatedAt := time.Now().UTC().Truncate(time.Millisecond) + replyCreatedAt := parentCreatedAt.Add(5 * time.Minute) + + parentSender := &cassParticipant{ID: "u-parent", Account: "alice", EngName: "Alice"} + parentMsg := &model.Message{ + ID: "enc-tcount-parent", + RoomID: "enc-tcount-room", + UserID: "u-parent", + CreatedAt: parentCreatedAt, + Content: "parent message", + } + require.NoError(t, store.SaveMessage(ctx, parentMsg, parentSender, "site-a")) + + replySender := &cassParticipant{ID: "u-replier", Account: "bob", EngName: "Bob"} + replyMsg := &model.Message{ + ID: "enc-tcount-reply", + RoomID: "enc-tcount-room", + UserID: "u-replier", + Content: "first reply", + CreatedAt: replyCreatedAt, + ThreadParentMessageID: "enc-tcount-parent", + ThreadParentMessageCreatedAt: &parentCreatedAt, + } + + // First delivery — must succeed and increment tcount to 1. 
+ _, err := store.SaveThreadMessage(ctx, replyMsg, replySender, "site-a", "enc-tr-tcount-1") + require.NoError(t, err) + + t.Run("tcount 1 after first delivery", func(t *testing.T) { + var tcount int + require.NoError(t, session.Query( + `SELECT tcount FROM messages_by_id WHERE message_id = ? AND created_at = ?`, + "enc-tcount-parent", parentCreatedAt, + ).Scan(&tcount)) + assert.Equal(t, 1, tcount) + }) + + // Redelivery — same message, same coordinates. Must NOT double-increment. + _, err = store.SaveThreadMessage(ctx, replyMsg, replySender, "site-a", "enc-tr-tcount-1") + require.NoError(t, err) + + t.Run("tcount still 1 after redelivery — no double-increment", func(t *testing.T) { + var tcount int + require.NoError(t, session.Query( + `SELECT tcount FROM messages_by_id WHERE message_id = ? AND created_at = ?`, + "enc-tcount-parent", parentCreatedAt, + ).Scan(&tcount)) + assert.Equal(t, 1, tcount, "redelivery must not double-increment tcount") + }) + + t.Run("tcount still 1 in messages_by_room after redelivery", func(t *testing.T) { + parentBucket := bucket.Of(parentCreatedAt) + var tcount int + require.NoError(t, session.Query( + `SELECT tcount FROM messages_by_room WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ?`, + "enc-tcount-room", parentBucket, parentCreatedAt, "enc-tcount-parent", + ).Scan(&tcount)) + assert.Equal(t, 1, tcount, "redelivery must not double-increment tcount in messages_by_room") + }) +} diff --git a/message-worker/mock_store_test.go b/message-worker/mock_store_test.go index 997cd228d..98dddf28d 100644 --- a/message-worker/mock_store_test.go +++ b/message-worker/mock_store_test.go @@ -72,11 +72,12 @@ func (mr *MockStoreMockRecorder) SaveMessage(ctx, msg, sender, siteID any) *gomo } // SaveThreadMessage mocks base method. 
-func (m *MockStore) SaveThreadMessage(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID, threadRoomID string) error { +func (m *MockStore) SaveThreadMessage(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID, threadRoomID string) (*int, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "SaveThreadMessage", ctx, msg, sender, siteID, threadRoomID) - ret0, _ := ret[0].(error) - return ret0 + ret0, _ := ret[0].(*int) + ret1, _ := ret[1].(error) + return ret0, ret1 } // SaveThreadMessage indicates an expected call of SaveThreadMessage. diff --git a/message-worker/store.go b/message-worker/store.go index c41f69988..5d0a25b0c 100644 --- a/message-worker/store.go +++ b/message-worker/store.go @@ -13,7 +13,7 @@ import ( // Store defines Cassandra persistence operations for the message worker. type Store interface { SaveMessage(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID string) error - SaveThreadMessage(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID string, threadRoomID string) error + SaveThreadMessage(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID string, threadRoomID string) (*int, error) GetMessageSender(ctx context.Context, messageID string) (*cassParticipant, error) UpdateParentMessageThreadRoomID(ctx context.Context, parentMessageID, roomID string, parentCreatedAt time.Time, threadRoomID string) error } diff --git a/message-worker/store_cassandra.go b/message-worker/store_cassandra.go index 999aa7c7a..55a4035a6 100644 --- a/message-worker/store_cassandra.go +++ b/message-worker/store_cassandra.go @@ -160,29 +160,47 @@ func (s *CassandraStore) saveMessageEncrypted(ctx context.Context, msg *model.Me return nil } -// SaveThreadMessage batches the two regular inserts (messages_by_id and -// thread_messages_by_thread) into one round-trip via UnloggedBatch — same -// rationale as SaveMessage. 
incrementParentTcount stays separate because -// it uses Lightweight Transactions (CAS), which cannot be combined with -// non-LWT statements in a single batch. -func (s *CassandraStore) SaveThreadMessage(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID string, threadRoomID string) error { +// SaveThreadMessage writes the reply to messages_by_id using an LWT +// (IF NOT EXISTS) and then unconditionally inserts into +// thread_messages_by_thread. +// +// The LWT only guarantees the messages_by_id row is written once; its +// applied result is not used to decide the tcount: +// - first delivery and redelivery both COUNT the thread partition and +// blind-SET tcount on the parent row (see countAndSetParentTcount), so +// the returned value converges without double-counting on retries. +// +// Using IF NOT EXISTS eliminates the SELECT-before-INSERT TOCTOU window of the +// previous pre-check design. The thread_messages_by_thread INSERT is plain +// (no LWT): re-writing an identical row is safe and keeps that write fast. +func (s *CassandraStore) SaveThreadMessage(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID string, threadRoomID string) (*int, error) { if s.cipher != nil { return s.saveThreadMessageEncrypted(ctx, msg, sender, siteID, threadRoomID) } + mentions := toMentionSet(msg.Mentions) - batch := s.cassSession.NewBatch(gocql.UnloggedBatch).WithContext(ctx) - batch.Query( + // MapScanCAS is required here instead of ScanCAS(). When IF NOT EXISTS is + // not applied (row already exists), Cassandra returns [applied]=false PLUS + // all existing row columns. ScanCAS() with no destinations cannot absorb + // those extra columns and returns "not enough columns to scan into". + // MapScanCAS scans everything into a map so no column count is needed. 
+ casRow := make(map[string]interface{}) + _, err := s.cassSession.Query( `INSERT INTO messages_by_id (message_id, created_at, room_id, sender, msg, site_id, updated_at, mentions, thread_room_id, thread_parent_id, thread_parent_created_at, type, sys_msg_data, tshow, quoted_parent_message, attachments, card, card_action, file) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) IF NOT EXISTS`, msg.ID, msg.CreatedAt, msg.RoomID, sender, msg.Content, siteID, msg.CreatedAt, mentions, threadRoomID, msg.ThreadParentMessageID, msg.ThreadParentMessageCreatedAt, msg.Type, msg.SysMsgData, msg.TShow, msg.QuotedParentMessage, msg.Attachments, msg.Card, msg.CardAction, msg.File, - ) - batch.Query( + ).WithContext(ctx).MapScanCAS(casRow) + if err != nil { + return nil, fmt.Errorf("lwt insert thread message %s into messages_by_id: %w", msg.ID, err) + } + + if err := s.cassSession.Query( `INSERT INTO thread_messages_by_thread (thread_room_id, created_at, message_id, room_id, thread_parent_id, sender, msg, site_id, updated_at, mentions, type, sys_msg_data, quoted_parent_message, @@ -192,49 +210,52 @@ func (s *CassandraStore) SaveThreadMessage(ctx context.Context, msg *model.Messa sender, msg.Content, siteID, msg.CreatedAt, mentions, msg.Type, msg.SysMsgData, msg.QuotedParentMessage, msg.Attachments, msg.Card, msg.CardAction, msg.File, - ) - if err := s.cassSession.ExecuteBatch(batch); err != nil { - return fmt.Errorf("save thread message %s: %w", msg.ID, err) + ).WithContext(ctx).Exec(); err != nil { + return nil, fmt.Errorf("insert thread message %s into thread_messages_by_thread: %w", msg.ID, err) } - if err := s.incrementParentTcount(ctx, msg); err != nil { - return err - } - - return nil + return s.countAndSetParentTcount(ctx, msg, threadRoomID) } // saveThreadMessageEncrypted is the cipher-enabled counterpart to -// SaveThreadMessage. 
The tcount increment at the end is shared with the -// legacy path. -func (s *CassandraStore) saveThreadMessageEncrypted(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID string, threadRoomID string) error { +// SaveThreadMessage. See SaveThreadMessage for the LWT idempotency rationale. +// +// Encrypted body columns (msg, attachments, card, card_action, file) are bound +// to NULL so a redelivered pre-encryption row cannot end up in a hybrid +// plaintext+encrypted state. sys_msg_data is unencrypted and written as +// plaintext in both rows. +func (s *CassandraStore) saveThreadMessageEncrypted(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID string, threadRoomID string) (*int, error) { cm := buildCassandraMessage(msg) enc := atrest.SplitForEncryption(&cm) payload, meta, err := s.cipher.Encrypt(ctx, cm.RoomID, enc) if err != nil { - return fmt.Errorf("encrypt message %s in room %s: %w", cm.MessageID, cm.RoomID, err) + return nil, fmt.Errorf("encrypt message %s in room %s: %w", cm.MessageID, cm.RoomID, err) } atrest.StripEncryptedFields(&cm) encMeta := &cassandra.EncMeta{Nonce: meta.Nonce} mentions := toMentionSet(msg.Mentions) - // See saveMessageEncrypted: encrypted body columns are bound to NULL so a - // redelivered pre-rollout row can't end up in a hybrid plaintext+encrypted - // state. sys_msg_data is not encrypted and is written as plaintext. - batch := s.cassSession.NewBatch(gocql.UnloggedBatch).WithContext(ctx) - batch.Query( + // Same MapScanCAS rationale as SaveThreadMessage: IF NOT EXISTS returns all + // existing columns on non-apply, which ScanCAS() cannot absorb without + // explicit scan destinations. 
+ casRow := make(map[string]interface{}) + _, err = s.cassSession.Query( `INSERT INTO messages_by_id (message_id, created_at, room_id, sender, site_id, updated_at, mentions, thread_room_id, thread_parent_id, thread_parent_created_at, type, tshow, quoted_parent_message, sys_msg_data, msg, attachments, card, card_action, file, enc_payload, enc_meta) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, null, null, null, null, null, ?, ?)`, + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, null, null, null, null, null, ?, ?) IF NOT EXISTS`, msg.ID, msg.CreatedAt, msg.RoomID, sender, siteID, msg.CreatedAt, mentions, threadRoomID, msg.ThreadParentMessageID, msg.ThreadParentMessageCreatedAt, msg.Type, msg.TShow, cm.QuotedParentMessage, msg.SysMsgData, payload, encMeta, - ) - batch.Query( + ).WithContext(ctx).MapScanCAS(casRow) + if err != nil { + return nil, fmt.Errorf("lwt insert thread message %s into messages_by_id: %w", msg.ID, err) + } + + if err := s.cassSession.Query( `INSERT INTO thread_messages_by_thread (thread_room_id, created_at, message_id, room_id, thread_parent_id, sender, site_id, updated_at, mentions, type, quoted_parent_message, sys_msg_data, @@ -244,16 +265,11 @@ func (s *CassandraStore) saveThreadMessageEncrypted(ctx context.Context, msg *mo threadRoomID, msg.CreatedAt, msg.ID, msg.RoomID, msg.ThreadParentMessageID, sender, siteID, msg.CreatedAt, mentions, msg.Type, cm.QuotedParentMessage, msg.SysMsgData, payload, encMeta, - ) - if err := s.cassSession.ExecuteBatch(batch); err != nil { - return fmt.Errorf("save thread message %s: %w", msg.ID, err) - } - - if err := s.incrementParentTcount(ctx, msg); err != nil { - return err + ).WithContext(ctx).Exec(); err != nil { + return nil, fmt.Errorf("insert thread message %s into thread_messages_by_thread: %w", msg.ID, err) } - return nil + return s.countAndSetParentTcount(ctx, msg, threadRoomID) } // buildCassandraMessage projects the user-authored fields of msg into a @@ -282,94 +298,67 @@ func 
buildCassandraMessage(msg *model.Message) cassandra.Message { return cm } -// casMaxRetries is the maximum number of CAS attempts per tcount increment. -// A conflict means another thread-reply landed between our read and write; -// 16 attempts is sufficient for any realistic burst while preventing an -// infinite loop if something unexpected keeps the row locked. -const casMaxRetries = 16 - -// casIncrement atomically increments the nullable INT counter starting at -// initial by calling update(newVal, expected) in a retry loop. On conflict -// (applied==false) it retries with the value returned by update. Returns an -// error after maxRetries consecutive failures. -func casIncrement(maxRetries int, initial *int, update func(newVal int, expected *int) (applied bool, current *int, err error)) error { - tcount := initial - for range maxRetries { - newVal := 1 - if tcount != nil { - newVal = *tcount + 1 - } - applied, current, err := update(newVal, tcount) - if err != nil { - return err +// countThreadReplies counts non-deleted rows in the thread_messages_by_thread +// partition for threadRoomID. message-worker does not write the deleted column +// on INSERT (it remains NULL), so the Go-side filter treats NULL the same as +// false — only rows where deleted is explicitly true are excluded. 
+func (s *CassandraStore) countThreadReplies(ctx context.Context, threadRoomID string) (int, error) { + iter := s.cassSession.Query( + `SELECT deleted FROM thread_messages_by_thread WHERE thread_room_id = ?`, + threadRoomID, + ).WithContext(ctx).Iter() + var deleted *bool + n := 0 + for iter.Scan(&deleted) { + if deleted == nil || !*deleted { + n++ } - if applied { - return nil - } - tcount = current } - return fmt.Errorf("cas increment exceeded %d retries", maxRetries) + if err := iter.Close(); err != nil { + return 0, fmt.Errorf("count thread replies for thread %s: %w", threadRoomID, err) + } + return n, nil } -// incrementParentTcount increments tcount on the parent message row in both -// messages_by_id and messages_by_room using Cassandra Lightweight Transactions -// (IF tcount = ?). Each table is incremented independently via casIncrement, -// which retries up to casMaxRetries times on CAS conflict. -// Binding a nil *int as the IF condition evaluates to IF tcount = null, which -// handles the initial case where tcount has never been set on the parent row. -// If ThreadParentMessageCreatedAt is nil the increment is silently skipped — -// tcount cannot be updated without the full primary key of the parent row. -func (s *CassandraStore) incrementParentTcount(ctx context.Context, msg *model.Message) error { - if msg.ThreadParentMessageCreatedAt == nil { - return nil - } +// setParentTcount blind-SETs tcount on the parent row in both messages_by_id +// and messages_by_room. No IF clause — the value is always derived from the +// authoritative COUNT, so overwrites are idempotent on any redelivery. +func (s *CassandraStore) setParentTcount(ctx context.Context, msg *model.Message, n int) error { parentID := msg.ThreadParentMessageID parentCreatedAt := *msg.ThreadParentMessageCreatedAt parentBucket := s.bucket.Of(parentCreatedAt) - - // CAS increment on messages_by_id (no bucket — table unchanged). 
- var tcount *int if err := s.cassSession.Query( - `SELECT tcount FROM messages_by_id WHERE message_id = ? AND created_at = ?`, - parentID, parentCreatedAt, - ).WithContext(ctx).Scan(&tcount); err != nil { - if errors.Is(err, gocql.ErrNotFound) { - return nil - } - return fmt.Errorf("read tcount for parent message %s: %w", parentID, err) + `UPDATE messages_by_id SET tcount = ? WHERE message_id = ? AND created_at = ?`, + n, parentID, parentCreatedAt, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("set tcount on parent %s in messages_by_id: %w", parentID, err) } - if err := casIncrement(casMaxRetries, tcount, func(newVal int, expected *int) (bool, *int, error) { - var current *int - applied, err := s.cassSession.Query( - `UPDATE messages_by_id SET tcount = ? WHERE message_id = ? AND created_at = ? IF tcount = ?`, - newVal, parentID, parentCreatedAt, expected, - ).WithContext(ctx).ScanCAS(&current) - return applied, current, err - }); err != nil { - return fmt.Errorf("cas tcount in messages_by_id for parent %s: %w", parentID, err) + if err := s.cassSession.Query( + `UPDATE messages_by_room SET tcount = ? WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ?`, + n, msg.RoomID, parentBucket, parentCreatedAt, parentID, + ).WithContext(ctx).Exec(); err != nil { + return fmt.Errorf("set tcount on parent %s in messages_by_room: %w", parentID, err) } + return nil +} - if err := s.cassSession.Query( - `SELECT tcount FROM messages_by_room WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ?`, - msg.RoomID, parentBucket, parentCreatedAt, parentID, - ).WithContext(ctx).Scan(&tcount); err != nil { - if errors.Is(err, gocql.ErrNotFound) { - return nil - } - return fmt.Errorf("read tcount in messages_by_room for parent %s: %w", parentID, err) +// countAndSetParentTcount derives tcount from the thread partition COUNT and +// blind-SETs it on the parent row in both Cassandra tables. 
Returns (nil, nil) +// when ThreadParentMessageCreatedAt is unset (no parent key available). +// This approach is crash-safe: COUNT + blind SET is idempotent on redelivery, +// avoiding the 2PC window of the old CAS increment. +func (s *CassandraStore) countAndSetParentTcount(ctx context.Context, msg *model.Message, threadRoomID string) (*int, error) { + if msg.ThreadParentMessageCreatedAt == nil { + return nil, nil + } + n, err := s.countThreadReplies(ctx, threadRoomID) + if err != nil { + return nil, fmt.Errorf("count thread replies: %w", err) } - if err := casIncrement(casMaxRetries, tcount, func(newVal int, expected *int) (bool, *int, error) { - var current *int - applied, err := s.cassSession.Query( - `UPDATE messages_by_room SET tcount = ? WHERE room_id = ? AND bucket = ? AND created_at = ? AND message_id = ? IF tcount = ?`, - newVal, msg.RoomID, parentBucket, parentCreatedAt, parentID, expected, - ).WithContext(ctx).ScanCAS(&current) - return applied, current, err - }); err != nil { - return fmt.Errorf("cas tcount in messages_by_room for parent %s: %w", parentID, err) + if err := s.setParentTcount(ctx, msg, n); err != nil { + return nil, err } - - return nil + return &n, nil } // IF EXISTS prevents phantom rows on missing parents; misses log at ERROR diff --git a/message-worker/store_cassandra_test.go b/message-worker/store_cassandra_test.go deleted file mode 100644 index eb93245fa..000000000 --- a/message-worker/store_cassandra_test.go +++ /dev/null @@ -1,112 +0,0 @@ -package main - -import ( - "errors" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func intPtr(i int) *int { return &i } - -func TestCasIncrement(t *testing.T) { - tests := []struct { - name string - maxRetries int - initial *int - updates []func(newVal int, expected *int) (bool, *int, error) - wantErr bool - wantCalls int - }{ - { - name: "first attempt succeeds — nil initial increments to 1", - maxRetries: 5, - initial: nil, - updates: 
[]func(int, *int) (bool, *int, error){ - func(newVal int, expected *int) (bool, *int, error) { - assert.Equal(t, 1, newVal) - assert.Nil(t, expected) - return true, nil, nil - }, - }, - wantCalls: 1, - }, - { - name: "first attempt succeeds — non-nil initial increments by 1", - maxRetries: 5, - initial: intPtr(3), - updates: []func(int, *int) (bool, *int, error){ - func(newVal int, expected *int) (bool, *int, error) { - assert.Equal(t, 4, newVal) - assert.Equal(t, intPtr(3), expected) - return true, nil, nil - }, - }, - wantCalls: 1, - }, - { - name: "one conflict then success — retries with current value", - maxRetries: 5, - initial: intPtr(3), - updates: []func(int, *int) (bool, *int, error){ - // concurrent writer bumped it to 5 - func(newVal int, expected *int) (bool, *int, error) { - assert.Equal(t, 4, newVal) - return false, intPtr(5), nil - }, - func(newVal int, expected *int) (bool, *int, error) { - assert.Equal(t, 6, newVal) - assert.Equal(t, intPtr(5), expected) - return true, nil, nil - }, - }, - wantCalls: 2, - }, - { - name: "retries exhausted — returns error after maxRetries attempts", - maxRetries: 3, - initial: nil, - updates: []func(int, *int) (bool, *int, error){ - func(int, *int) (bool, *int, error) { return false, intPtr(1), nil }, - func(int, *int) (bool, *int, error) { return false, intPtr(2), nil }, - func(int, *int) (bool, *int, error) { return false, intPtr(3), nil }, - }, - wantErr: true, - wantCalls: 3, - }, - { - name: "update error — returned immediately without further retries", - maxRetries: 5, - initial: nil, - updates: []func(int, *int) (bool, *int, error){ - func(int, *int) (bool, *int, error) { - return false, nil, errors.New("cassandra: write timeout") - }, - }, - wantErr: true, - wantCalls: 1, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - idx := 0 - update := func(newVal int, expected *int) (bool, *int, error) { - require.Less(t, idx, len(tt.updates), "unexpected extra call to update") - fn := 
tt.updates[idx] - idx++ - return fn(newVal, expected) - } - - err := casIncrement(tt.maxRetries, tt.initial, update) - - assert.Equal(t, tt.wantCalls, idx, "number of update calls") - if tt.wantErr { - require.Error(t, err) - } else { - require.NoError(t, err) - } - }) - } -} diff --git a/pkg/model/event.go b/pkg/model/event.go index ef6b087f1..a0f1b5ac9 100644 --- a/pkg/model/event.go +++ b/pkg/model/event.go @@ -8,12 +8,13 @@ import ( type EventType string const ( - EventCreated EventType = "created" - EventUpdated EventType = "updated" - EventDeleted EventType = "deleted" - EventPinned EventType = "pinned" - EventUnpinned EventType = "unpinned" - EventReacted EventType = "reacted" + EventCreated EventType = "created" + EventUpdated EventType = "updated" + EventDeleted EventType = "deleted" + EventPinned EventType = "pinned" + EventUnpinned EventType = "unpinned" + EventReacted EventType = "reacted" + EventThreadReplyAdded EventType = "thread_reply_added" ) type MessageEvent struct { @@ -23,6 +24,11 @@ type MessageEvent struct { // ReactionDelta is set only when Event == EventReacted. ReactionDelta *ReactionDelta `json:"reactionDelta,omitempty" bson:"reactionDelta,omitempty"` Timestamp int64 `json:"timestamp" bson:"timestamp"` + // NewTCount is the authoritative tcount of the parent message after a thread + // reply is added (EventThreadReplyAdded) or deleted (EventDeleted with + // ThreadParentMessageID set). Nil for all other event types. + // bson tag omits omitempty — zero is a valid count when the last reply is deleted. 
+ NewTCount *int `json:"newTcount,omitempty" bson:"newTcount"` } // ReactionAction is the toggle direction on ReactionDelta.Action; defined @@ -199,14 +205,23 @@ type ClientMessage struct { type RoomEventType string const ( - RoomEventNewMessage RoomEventType = "new_message" - RoomEventMessageEdited RoomEventType = "message_edited" - RoomEventMessageDeleted RoomEventType = "message_deleted" - RoomEventMessagePinned RoomEventType = "message_pinned" - RoomEventMessageUnpinned RoomEventType = "message_unpinned" - RoomEventRoomRenamed RoomEventType = "room_renamed" - RoomEventRoomRestricted RoomEventType = "room_restricted" - RoomEventMessageReacted RoomEventType = "message_reacted" + RoomEventNewMessage RoomEventType = "new_message" + RoomEventMessageEdited RoomEventType = "message_edited" + RoomEventMessageDeleted RoomEventType = "message_deleted" + RoomEventMessagePinned RoomEventType = "message_pinned" + RoomEventMessageUnpinned RoomEventType = "message_unpinned" + RoomEventRoomRenamed RoomEventType = "room_renamed" + RoomEventRoomRestricted RoomEventType = "room_restricted" + RoomEventMessageReacted RoomEventType = "message_reacted" + RoomEventThreadMetadataUpdated RoomEventType = "thread_metadata_updated" +) + +// ThreadAction identifies what operation triggered a ThreadMetadataUpdatedEvent. +type ThreadAction string + +const ( + ThreadActionReplyAdded ThreadAction = "reply_added" + ThreadActionReplyDeleted ThreadAction = "reply_deleted" ) // RoomEvent is the live fan-out event for a newly created message @@ -288,6 +303,20 @@ type UnpinRoomEvent struct { UnpinnedAt time.Time `json:"unpinnedAt" bson:"unpinnedAt"` } +// ThreadMetadataUpdatedEvent is published on the per-user NATS subject when a +// thread reply is added or deleted, so clients can update the reply-count badge +// on the parent message without re-fetching the full message. 
+type ThreadMetadataUpdatedEvent struct { + Type RoomEventType `json:"type" bson:"type"` + RoomID string `json:"roomId" bson:"roomId"` + SiteID string `json:"siteId" bson:"siteId"` + Timestamp int64 `json:"timestamp" bson:"timestamp"` + ParentMessageID string `json:"parentMessageId" bson:"parentMessageId"` + ReplyMessageID string `json:"replyMessageId" bson:"replyMessageId"` + NewTCount int `json:"newTcount" bson:"newTcount"` + Action ThreadAction `json:"action" bson:"action"` +} + // RoomRenamedRoomEvent is the live event published when a channel is renamed. // Flat shape (same convention as EditRoomEvent / DeleteRoomEvent) — no // zero-valued RoomEvent base fields shipped to clients. Drives the client's diff --git a/pkg/model/model_test.go b/pkg/model/model_test.go index 8a906739a..b0be0fc65 100644 --- a/pkg/model/model_test.go +++ b/pkg/model/model_test.go @@ -3174,3 +3174,74 @@ func TestPresenceSnapshotReply_RoundTrip(t *testing.T) { require.NoError(t, json.Unmarshal(data, &out)) assert.Equal(t, in, out) } + +func TestMessageEvent_NewTCount(t *testing.T) { + t.Run("NewTCount nil is omitted from JSON", func(t *testing.T) { + e := model.MessageEvent{ + Message: model.Message{ID: "m1", RoomID: "r1", UserID: "u1", UserAccount: "alice", CreatedAt: time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC)}, + SiteID: "site-a", + Timestamp: 1735689600000, + } + data, err := json.Marshal(e) + require.NoError(t, err) + var raw map[string]any + require.NoError(t, json.Unmarshal(data, &raw)) + _, present := raw["newTcount"] + assert.False(t, present, "nil NewTCount must be omitted from JSON") + }) + + t.Run("NewTCount zero is included in JSON", func(t *testing.T) { + zero := 0 + e := model.MessageEvent{ + Message: model.Message{ID: "m1", RoomID: "r1", UserID: "u1", UserAccount: "alice", CreatedAt: time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC)}, + SiteID: "site-a", + Timestamp: 1735689600000, + NewTCount: &zero, + } + data, err := json.Marshal(e) + require.NoError(t, err) + var raw 
map[string]any + require.NoError(t, json.Unmarshal(data, &raw)) + val, present := raw["newTcount"] + assert.True(t, present, "non-nil zero NewTCount must be present in JSON") + assert.Equal(t, float64(0), val, "zero NewTCount must marshal as 0") + + var dst model.MessageEvent + require.NoError(t, json.Unmarshal(data, &dst)) + require.NotNil(t, dst.NewTCount) + assert.Equal(t, 0, *dst.NewTCount) + }) + + t.Run("NewTCount positive round-trips", func(t *testing.T) { + count := 3 + e := model.MessageEvent{ + Message: model.Message{ID: "m1", RoomID: "r1", UserID: "u1", UserAccount: "alice", CreatedAt: time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC)}, + SiteID: "site-a", + Timestamp: 1735689600000, + NewTCount: &count, + } + data, err := json.Marshal(e) + require.NoError(t, err) + var dst model.MessageEvent + require.NoError(t, json.Unmarshal(data, &dst)) + require.NotNil(t, dst.NewTCount) + assert.Equal(t, 3, *dst.NewTCount) + }) + + t.Run("NewTCount zero in BSON round-trips — omitempty must not drop zero", func(t *testing.T) { + zero := 0 + e := model.MessageEvent{ + Message: model.Message{ID: "m1", RoomID: "r1"}, + SiteID: "site-a", + Timestamp: 1735689600000, + NewTCount: &zero, + } + data, err := bson.Marshal(e) + require.NoError(t, err) + var raw bson.M + require.NoError(t, bson.Unmarshal(data, &raw)) + val, present := raw["newTcount"] + assert.True(t, present, "zero NewTCount must be present in BSON — bson omitempty must not be used") + assert.EqualValues(t, 0, val, "zero BSON value must be 0, not missing") + }) +} diff --git a/room-service/handler.go b/room-service/handler.go index de2be71a5..cee445b97 100644 --- a/room-service/handler.go +++ b/room-service/handler.go @@ -8,7 +8,6 @@ import ( "fmt" "log/slog" "net/url" - "slices" "strconv" "strings" "time" @@ -1456,15 +1455,14 @@ func (h *Handler) handleMessageThreadRead(ctx context.Context, subj string, data // Plain errgroup.Group (not WithContext) so a NotFound from one goroutine does NOT cancel // the siblings — 
otherwise context.Canceled in subErr/userSiteErr would outrank tsubErr. var ( - sub *model.Subscription tsub *model.ThreadSubscription userSiteID string subErr, tsubErr, userSiteErr error ) var g errgroup.Group g.Go(func() error { - s, err := h.store.GetSubscription(ctx, account, roomID) - sub, subErr = s, err + _, err := h.store.GetSubscription(ctx, account, roomID) + subErr = err return err }) g.Go(func() error { @@ -1493,13 +1491,15 @@ func (h *Handler) handleMessageThreadRead(ctx context.Context, subj string, data return nil, fmt.Errorf("get user siteId: %w", userSiteErr) } - newThreadUnread := slices.DeleteFunc(slices.Clone(sub.ThreadUnread), func(s string) bool { return s == req.ThreadID }) - newAlert := sub.Alert && len(newThreadUnread) > 0 now := time.Now().UTC() + var newThreadUnread []string + var newAlert bool wg, wctx := errgroup.WithContext(ctx) wg.Go(func() error { - if err := h.store.UpdateSubscriptionThreadRead(wctx, roomID, account, newThreadUnread, newAlert); err != nil { + var err error + newThreadUnread, newAlert, err = h.store.UpdateSubscriptionThreadRead(wctx, roomID, account, req.ThreadID) + if err != nil { return fmt.Errorf("update subscription thread-read: %w", err) } return nil diff --git a/room-service/handler_test.go b/room-service/handler_test.go index f4c79d232..eb660e0e4 100644 --- a/room-service/handler_test.go +++ b/room-service/handler_test.go @@ -3828,8 +3828,8 @@ func TestHandler_MessageThreadRead_HappyAlertClears(t *testing.T) { Return(baseSubForThreadRead("alice", "r1", []string{"p1"}, true), nil) f.store.EXPECT().GetThreadSubscriptionByParent(gomock.Any(), "alice", "p1", "r1"). Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", - gomock.Len(0), false).Return(nil) + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", "p1"). 
+ Return(nil, false, nil) f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), "tr1", "alice", gomock.Any()). Return(nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("site-a", nil) @@ -3849,8 +3849,8 @@ func TestHandler_MessageThreadRead_HappyAlertStays(t *testing.T) { Return(baseSubForThreadRead("alice", "r1", []string{"p1", "p2"}, true), nil) f.store.EXPECT().GetThreadSubscriptionByParent(gomock.Any(), "alice", "p1", "r1"). Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", - []string{"p2"}, true).Return(nil) + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", "p1"). + Return([]string{"p2"}, true, nil) f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), "tr1", "alice", gomock.Any()). Return(nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("site-a", nil) @@ -3866,8 +3866,8 @@ func TestHandler_MessageThreadRead_IdempotentIDNotInArray(t *testing.T) { Return(baseSubForThreadRead("alice", "r1", []string{"p2"}, true), nil) f.store.EXPECT().GetThreadSubscriptionByParent(gomock.Any(), "alice", "p1", "r1"). Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", - []string{"p2"}, true).Return(nil) + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", "p1"). + Return([]string{"p2"}, true, nil) f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), "tr1", "alice", gomock.Any()). Return(nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("site-a", nil) @@ -3883,8 +3883,8 @@ func TestHandler_MessageThreadRead_AlertAlreadyFalse(t *testing.T) { Return(baseSubForThreadRead("alice", "r1", []string{"p1"}, false), nil) f.store.EXPECT().GetThreadSubscriptionByParent(gomock.Any(), "alice", "p1", "r1"). 
Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", - gomock.Len(0), false).Return(nil) + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", "p1"). + Return(nil, false, nil) f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), "tr1", "alice", gomock.Any()). Return(nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("site-a", nil) @@ -3900,8 +3900,8 @@ func TestHandler_MessageThreadRead_CrossSite_PublishesOutbox(t *testing.T) { Return(baseSubForThreadRead("alice", "r1", []string{"p1", "p2"}, true), nil) f.store.EXPECT().GetThreadSubscriptionByParent(gomock.Any(), "alice", "p1", "r1"). Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", - []string{"p2"}, true).Return(nil) + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", "p1"). + Return([]string{"p2"}, true, nil) f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), "tr1", "alice", gomock.Any()). Return(nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("site-b", nil) @@ -3937,8 +3937,8 @@ func TestHandler_MessageThreadRead_GetUserSiteID_Empty(t *testing.T) { Return(baseSubForThreadRead("alice", "r1", []string{"p1"}, true), nil) f.store.EXPECT().GetThreadSubscriptionByParent(gomock.Any(), "alice", "p1", "r1"). Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", gomock.Any(), gomock.Any()). - Return(nil) + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", gomock.Any()). + Return(nil, false, nil) f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), "tr1", "alice", gomock.Any()). 
Return(nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("", nil) @@ -3957,8 +3957,8 @@ func TestHandler_MessageThreadRead_GetUserSiteID_Error(t *testing.T) { Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("", fmt.Errorf("boom")) // Writes are short-circuited by the read-phase error, but may race ahead. - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). - Return(nil).AnyTimes() + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + Return(nil, false, nil).AnyTimes() f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). Return(nil).AnyTimes() @@ -3975,8 +3975,8 @@ func TestHandler_MessageThreadRead_OutboxPublishError(t *testing.T) { Return(baseSubForThreadRead("alice", "r1", []string{"p1"}, true), nil) f.store.EXPECT().GetThreadSubscriptionByParent(gomock.Any(), "alice", "p1", "r1"). Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", gomock.Any(), gomock.Any()). - Return(nil) + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", gomock.Any()). + Return(nil, false, nil) f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), "tr1", "alice", gomock.Any()). Return(nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("site-b", nil) @@ -3994,8 +3994,8 @@ func TestHandler_MessageThreadRead_UpdateSubscriptionError(t *testing.T) { f.store.EXPECT().GetThreadSubscriptionByParent(gomock.Any(), "alice", "p1", "r1"). Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("site-a", nil) - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", gomock.Any(), gomock.Any()). 
- Return(fmt.Errorf("mongo down")) + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", gomock.Any()). + Return(nil, false, fmt.Errorf("mongo down")) f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), "tr1", "alice", gomock.Any()). Return(nil).AnyTimes() @@ -4012,8 +4012,8 @@ func TestHandler_MessageThreadRead_UpdateThreadSubscriptionError(t *testing.T) { f.store.EXPECT().GetThreadSubscriptionByParent(gomock.Any(), "alice", "p1", "r1"). Return(baseThreadSub("alice", "r1", "p1", "tr1"), nil) f.store.EXPECT().GetUserSiteID(gomock.Any(), "alice").Return("site-a", nil) - f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", gomock.Any(), gomock.Any()). - Return(nil).AnyTimes() + f.store.EXPECT().UpdateSubscriptionThreadRead(gomock.Any(), "r1", "alice", gomock.Any()). + Return(nil, false, nil).AnyTimes() f.store.EXPECT().UpdateThreadSubscriptionRead(gomock.Any(), "tr1", "alice", gomock.Any()). Return(fmt.Errorf("mongo down")) diff --git a/room-service/integration_test.go b/room-service/integration_test.go index 95a11514a..4fe58ec64 100644 --- a/room-service/integration_test.go +++ b/room-service/integration_test.go @@ -2098,16 +2098,22 @@ func TestMongoStore_UpdateSubscriptionThreadRead(t *testing.T) { _, err := db.Collection("subscriptions").InsertOne(ctx, &sub) require.NoError(t, err) - t.Run("non-empty array path", func(t *testing.T) { - require.NoError(t, store.UpdateSubscriptionThreadRead(ctx, "r1", "alice", []string{"t2"}, true)) + t.Run("removes specified threadID and returns remaining", func(t *testing.T) { + newUnread, newAlert, err := store.UpdateSubscriptionThreadRead(ctx, "r1", "alice", "t1") + require.NoError(t, err) + assert.Equal(t, []string{"t2"}, newUnread) + assert.True(t, newAlert) var got model.Subscription require.NoError(t, db.Collection("subscriptions").FindOne(ctx, bson.M{"_id": "sub-1"}).Decode(&got)) assert.Equal(t, []string{"t2"}, got.ThreadUnread) assert.True(t, got.Alert) }) - 
t.Run("empty array path unsets threadUnread", func(t *testing.T) { - require.NoError(t, store.UpdateSubscriptionThreadRead(ctx, "r1", "alice", nil, false)) + t.Run("last element removed unsets threadUnread field and clears alert", func(t *testing.T) { + newUnread, newAlert, err := store.UpdateSubscriptionThreadRead(ctx, "r1", "alice", "t2") + require.NoError(t, err) + assert.Nil(t, newUnread) + assert.False(t, newAlert) var raw bson.M require.NoError(t, db.Collection("subscriptions").FindOne(ctx, bson.M{"_id": "sub-1"}).Decode(&raw)) _, present := raw["threadUnread"] @@ -2116,9 +2122,36 @@ func TestMongoStore_UpdateSubscriptionThreadRead(t *testing.T) { }) t.Run("missing subscription returns sentinel", func(t *testing.T) { - err := store.UpdateSubscriptionThreadRead(ctx, "r-missing", "alice", nil, false) + _, _, err := store.UpdateSubscriptionThreadRead(ctx, "r-missing", "alice", "t1") require.ErrorIs(t, err, model.ErrSubscriptionNotFound) }) + + t.Run("concurrent removals do not lose updates", func(t *testing.T) { + // Reset subscription to ["c1", "c2"] with alert=true + _, err := db.Collection("subscriptions").UpdateOne(ctx, bson.M{"_id": "sub-1"}, + bson.M{"$set": bson.M{"threadUnread": []string{"c1", "c2"}, "alert": true}}) + require.NoError(t, err) + + // Two concurrent calls each remove a different threadID + done := make(chan error, 2) + go func() { + _, _, err := store.UpdateSubscriptionThreadRead(ctx, "r1", "alice", "c1") + done <- err + }() + go func() { + _, _, err := store.UpdateSubscriptionThreadRead(ctx, "r1", "alice", "c2") + done <- err + }() + require.NoError(t, <-done) + require.NoError(t, <-done) + + // Both removals must have applied — threadUnread should be absent (empty) + var raw bson.M + require.NoError(t, db.Collection("subscriptions").FindOne(ctx, bson.M{"_id": "sub-1"}).Decode(&raw)) + _, present := raw["threadUnread"] + assert.False(t, present, "both concurrent removals must apply — no lost updates") + assert.Equal(t, false, 
raw["alert"]) + }) } func TestMongoStore_UpdateThreadSubscriptionRead(t *testing.T) { diff --git a/room-service/mock_store_test.go b/room-service/mock_store_test.go index 6c65dc399..7e5be6393 100644 --- a/room-service/mock_store_test.go +++ b/room-service/mock_store_test.go @@ -490,17 +490,19 @@ func (mr *MockRoomStoreMockRecorder) UpdateSubscriptionRead(ctx, roomID, account } // UpdateSubscriptionThreadRead mocks base method. -func (m *MockRoomStore) UpdateSubscriptionThreadRead(ctx context.Context, roomID, account string, threadUnread []string, alert bool) error { +func (m *MockRoomStore) UpdateSubscriptionThreadRead(ctx context.Context, roomID, account, threadID string) ([]string, bool, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "UpdateSubscriptionThreadRead", ctx, roomID, account, threadUnread, alert) - ret0, _ := ret[0].(error) - return ret0 + ret := m.ctrl.Call(m, "UpdateSubscriptionThreadRead", ctx, roomID, account, threadID) + ret0, _ := ret[0].([]string) + ret1, _ := ret[1].(bool) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 } // UpdateSubscriptionThreadRead indicates an expected call of UpdateSubscriptionThreadRead. -func (mr *MockRoomStoreMockRecorder) UpdateSubscriptionThreadRead(ctx, roomID, account, threadUnread, alert any) *gomock.Call { +func (mr *MockRoomStoreMockRecorder) UpdateSubscriptionThreadRead(ctx, roomID, account, threadID any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateSubscriptionThreadRead", reflect.TypeOf((*MockRoomStore)(nil).UpdateSubscriptionThreadRead), ctx, roomID, account, threadUnread, alert) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateSubscriptionThreadRead", reflect.TypeOf((*MockRoomStore)(nil).UpdateSubscriptionThreadRead), ctx, roomID, account, threadID) } // UpdateThreadSubscriptionRead mocks base method. 
diff --git a/room-service/store.go b/room-service/store.go index c559933d4..097b798d9 100644 --- a/room-service/store.go +++ b/room-service/store.go @@ -146,8 +146,9 @@ type RoomStore interface { // filter rejects a threadId that belongs to a different room than the request subject. GetThreadSubscriptionByParent(ctx context.Context, account, parentMessageID, roomID string) (*model.ThreadSubscription, error) - // UpdateSubscriptionThreadRead overwrites threadUnread + alert; empty threadUnread is $unset. - UpdateSubscriptionThreadRead(ctx context.Context, roomID, account string, threadUnread []string, alert bool) error + // UpdateSubscriptionThreadRead atomically removes threadID from threadUnread and returns + // the updated slice (nil when empty) and the updated alert flag. + UpdateSubscriptionThreadRead(ctx context.Context, roomID, account, threadID string) (newThreadUnread []string, newAlert bool, err error) UpdateThreadSubscriptionRead(ctx context.Context, threadRoomID, account string, lastSeenAt time.Time) error diff --git a/room-service/store_mongo.go b/room-service/store_mongo.go index 67ea9b426..a6cdf6a71 100644 --- a/room-service/store_mongo.go +++ b/room-service/store_mongo.go @@ -1119,27 +1119,47 @@ func (s *MongoStore) GetThreadSubscriptionByParent(ctx context.Context, account, return &ts, nil } -// Empty threadUnread is $unset so it round-trips through JSON as nil (omitempty contract). -func (s *MongoStore) UpdateSubscriptionThreadRead(ctx context.Context, roomID, account string, threadUnread []string, alert bool) error { +// UpdateSubscriptionThreadRead atomically removes threadID from threadUnread via +// an aggregation-pipeline FindOneAndUpdate. When the result is empty, the field +// is removed ($$REMOVE) and alert is set to false. 
+func (s *MongoStore) UpdateSubscriptionThreadRead(ctx context.Context, roomID, account, threadID string) ([]string, bool, error) { filter := bson.M{"roomId": roomID, "u.account": account} - var update bson.M - if len(threadUnread) == 0 { - update = bson.M{ - "$set": bson.M{"alert": alert}, - "$unset": bson.M{"threadUnread": ""}, - } - } else { - update = bson.M{"$set": bson.M{"threadUnread": threadUnread, "alert": alert}} - } - res, err := s.subscriptions.UpdateOne(ctx, filter, update) - if err != nil { - return fmt.Errorf("update subscription thread-read for %q in room %q: %w", account, roomID, err) + + // Aggregation pipeline: filter out threadID in one atomic pass, then unset + // threadUnread if the result is empty ($$REMOVE) and derive alert from that. + // Stage 1 stores the filtered array in a temp field _tuf; stage 2 applies it. + update := bson.A{ + bson.M{"$set": bson.M{"_tuf": bson.M{"$filter": bson.M{ + "input": bson.M{"$ifNull": bson.A{"$threadUnread", bson.A{}}}, + "as": "item", + "cond": bson.M{"$ne": bson.A{"$$item", threadID}}, + }}}}, + bson.M{"$set": bson.M{ + "threadUnread": bson.M{"$cond": bson.A{ + bson.M{"$gt": bson.A{bson.M{"$size": "$_tuf"}, 0}}, + "$_tuf", + "$$REMOVE", + }}, + "alert": bson.M{"$cond": bson.A{ + bson.M{"$gt": bson.A{bson.M{"$size": "$_tuf"}, 0}}, + "$alert", + false, + }}, + }}, + bson.M{"$unset": "_tuf"}, } - if res.MatchedCount == 0 { - return fmt.Errorf("update subscription thread-read for %q in room %q: %w", + + opts := options.FindOneAndUpdate().SetReturnDocument(options.After) + var updated model.Subscription + err := s.subscriptions.FindOneAndUpdate(ctx, filter, update, opts).Decode(&updated) + if errors.Is(err, mongo.ErrNoDocuments) { + return nil, false, fmt.Errorf("update subscription thread-read for %q in room %q: %w", account, roomID, model.ErrSubscriptionNotFound) } - return nil + if err != nil { + return nil, false, fmt.Errorf("update subscription thread-read for %q in room %q: %w", account, roomID, err) + } 
+ return updated.ThreadUnread, updated.Alert, nil } // ListDefaultChannelTabApps returns apps whose channelTab.enabled AND diff --git a/room-worker/handler.go b/room-worker/handler.go index 1d48da83e..23435b2bc 100644 --- a/room-worker/handler.go +++ b/room-worker/handler.go @@ -74,13 +74,6 @@ func NewHandler(store SubscriptionStore, siteID string, publish PublishFunc, key } } -// publishSubscriptionUpdate fans out the per-user subscription.update event for the FE; best-effort. -func (h *Handler) publishSubscriptionUpdate(ctx context.Context, account string, subEvtData []byte) { - if err := h.publish(ctx, subject.SubscriptionUpdate(account), subEvtData, ""); err != nil { - slog.Error("subscription update publish failed", "error", err, "account", account) - } -} - // SetKeyFanoutWorkers overrides the bounded-worker pool size used by // fanOutKey. Values <= 0 are ignored so partial-deployment misconfig can't // disable the cap. main wires this from KEY_FANOUT_WORKERS at startup. @@ -366,7 +359,9 @@ func (h *Handler) processRemoveIndividual(ctx context.Context, req *model.Remove Timestamp: now.UnixMilli(), } subEvtData, _ := json.Marshal(subEvt) - h.publishSubscriptionUpdate(ctx, req.Account, subEvtData) + if err := h.publish(ctx, subject.SubscriptionUpdate(req.Account), subEvtData, ""); err != nil { + slog.ErrorContext(ctx, "subscription update publish failed", "error", err, "account", req.Account) + } // Member change event evtType := model.MessageTypeMemberLeft @@ -575,7 +570,9 @@ func (h *Handler) processRemoveOrg(ctx context.Context, req *model.RemoveMemberR Timestamp: now.UnixMilli(), } subEvtData, _ := json.Marshal(subEvt) - h.publishSubscriptionUpdate(ctx, m.Account, subEvtData) + if err := h.publish(ctx, subject.SubscriptionUpdate(m.Account), subEvtData, ""); err != nil { + slog.ErrorContext(ctx, "subscription update publish failed", "error", err, "account", m.Account) + } } // Member change event with all removed accounts @@ -979,7 +976,9 @@ func (h 
*Handler) processAddMembers(ctx context.Context, data []byte) (err error Timestamp: now.UnixMilli(), } subEvtData, _ := json.Marshal(subEvt) - h.publishSubscriptionUpdate(ctx, sub.User.Account, subEvtData) + if err := h.publish(ctx, subject.SubscriptionUpdate(sub.User.Account), subEvtData, ""); err != nil { + slog.ErrorContext(ctx, "subscription update publish failed", "error", err, "account", sub.User.Account) + } } // Fan out the room key only to newly-subscribed accounts. Accounts in @@ -1012,7 +1011,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error historySharedSince := historySharedSincePtr(req.History, req.Timestamp, req.RoomID) if len(actualAccounts) > 0 || len(req.Orgs) > 0 { memberAddEvt := model.MemberAddEvent{ - Type: model.OutboxMemberAdded, + Type: "member_added", RoomID: req.RoomID, RoomName: room.Name, RoomType: room.Type, @@ -1034,7 +1033,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error if len(actualAccounts) > 0 { inboxOutbox := model.OutboxEvent{ - Type: model.OutboxMemberAdded, + Type: "member_added", SiteID: room.SiteID, DestSiteID: room.SiteID, Payload: memberAddData, @@ -1088,28 +1087,22 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error } } - // 10. Outbox for cross-site members — one event per destination site. - // Single-pass bucket: accounts → home site, skipping the local site. The map - // keys are the distinct remote sites; each entry already carries the - // per-site filtered account list, so the downstream loop is O(sites) not - // O(sites × accounts). Sending the full list would over-pressure NATS and - // ship subscription identities to sites that have no business knowing them, - // even though inbox-worker would filter on the destination. - accountsBySite := make(map[string][]string) - for _, acc := range actualAccounts { - siteID := userMap[acc].SiteID - if siteID == "" || siteID == h.siteID { + // 10. 
Outbox for cross-site members — batched by destination site + remoteSiteMembers := make(map[string][]string) + for _, sub := range subs { + user, ok := userMap[sub.User.Account] + if !ok || user.SiteID == room.SiteID { continue } - accountsBySite[siteID] = append(accountsBySite[siteID], acc) + remoteSiteMembers[user.SiteID] = append(remoteSiteMembers[user.SiteID], sub.User.Account) } - for destSiteID, siteAccounts := range accountsBySite { + for destSiteID, accounts := range remoteSiteMembers { siteEvt := model.MemberAddEvent{ - Type: model.OutboxMemberAdded, + Type: "member_added", RoomID: req.RoomID, RoomName: room.Name, RoomType: room.Type, - Accounts: siteAccounts, + Accounts: accounts, SiteID: room.SiteID, RequesterAccount: req.RequesterAccount, JoinedAt: req.Timestamp, @@ -1118,13 +1111,13 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error } siteEvtData, _ := json.Marshal(siteEvt) outbox := model.OutboxEvent{ - Type: model.OutboxMemberAdded, SiteID: room.SiteID, DestSiteID: destSiteID, + Type: "member_added", SiteID: room.SiteID, DestSiteID: destSiteID, Payload: siteEvtData, Timestamp: now.UnixMilli(), } outboxData, _ := json.Marshal(outbox) payloadSeed := fmt.Sprintf("%s:%s:%d", req.RoomID, req.RequesterAccount, req.Timestamp) dedupID := natsutil.OutboxDedupID(ctx, destSiteID, payloadSeed) - if err := h.publish(ctx, subject.Outbox(room.SiteID, destSiteID, model.OutboxMemberAdded), outboxData, dedupID); err != nil { + if err := h.publish(ctx, subject.Outbox(room.SiteID, destSiteID, "member_added"), outboxData, dedupID); err != nil { return fmt.Errorf("outbox publish to %s failed: %w", destSiteID, err) } } @@ -1132,6 +1125,80 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error return nil } +func (h *Handler) processRoomRename(ctx context.Context, data []byte) error { + var req model.RenameRoomRequest + if err := json.Unmarshal(data, &req); err != nil { + return 
permanent(errcode.BadRequest("unmarshal RenameRoomRequest")) + } + + if err := h.store.UpdateRoomName(ctx, req.RoomID, req.NewName); err != nil { + return fmt.Errorf("update room name: %w", err) + } + if err := h.store.UpdateSubscriptionNamesForRoom(ctx, req.RoomID, req.NewName); err != nil { + return fmt.Errorf("update subscription names for room: %w", err) + } + + requester, err := h.store.GetUser(ctx, req.Account) + if err != nil { + if errors.Is(err, ErrUserNotFound) { + return permanent(errcode.NotFound(fmt.Sprintf("requester %s not found (room %s)", req.Account, req.RoomID), errcode.WithReason(errcode.RoomUserNotFound))) + } + return fmt.Errorf("get requester: %w", err) + } + + now := time.Now().UTC() + sysMsgData, _ := json.Marshal(model.RoomRenamedSysData{NewName: req.NewName, ByAccount: req.Account}) + seed := messageDedupSeed(ctx, "processRoomRename", req.RoomID, + fmt.Sprintf("%s:%s:%d", req.RoomID, req.Account, req.Timestamp)) + sysMsg := model.Message{ + ID: idgen.MessageIDFromRequestID(seed, "room_renamed"), + RoomID: req.RoomID, + UserID: requester.ID, + UserAccount: requester.Account, + Type: model.MessageTypeRoomRenamed, + Content: quoted(displayName(requester)) + " renamed the channel to " + quoted(req.NewName), + SysMsgData: sysMsgData, + CreatedAt: now, + } + if err := h.publishCanonical(ctx, &sysMsg, h.siteID, now); err != nil { + return fmt.Errorf("publish room_renamed sys message: %w", err) + } + + // Fan out outbox to remote sites that have members in this room. 
+ subs, err := h.store.ListByRoom(ctx, req.RoomID) + if err != nil { + return fmt.Errorf("list subscriptions for outbox fan-out: %w", err) + } + remoteSites := make(map[string]struct{}) + for i := range subs { + if subs[i].SiteID != h.siteID && subs[i].SiteID != "" { + remoteSites[subs[i].SiteID] = struct{}{} + } + } + payload := model.RoomRenamedOutboxPayload{ + RoomID: req.RoomID, + NewName: req.NewName, + Timestamp: req.Timestamp, + } + payloadData, _ := json.Marshal(payload) + for destSiteID := range remoteSites { + outbox := model.OutboxEvent{ + Type: model.OutboxRoomRenamed, + SiteID: h.siteID, + DestSiteID: destSiteID, + Payload: payloadData, + Timestamp: now.UnixMilli(), + } + outboxData, _ := json.Marshal(outbox) + payloadSeed := fmt.Sprintf("%s:%s:%d", req.RoomID, req.Account, req.Timestamp) + dedupID := natsutil.OutboxDedupID(ctx, destSiteID, payloadSeed) + if err := h.publish(ctx, subject.Outbox(h.siteID, destSiteID, model.OutboxRoomRenamed), outboxData, dedupID); err != nil { + return fmt.Errorf("publish room_renamed outbox to %s: %w", destSiteID, err) + } + } + return nil +} + func mustMarshal(v any) []byte { data, _ := json.Marshal(v) return data @@ -1411,7 +1478,9 @@ func (h *Handler) finishCreateRoom(ctx context.Context, req *model.CreateRoomReq slog.ErrorContext(ctx, "marshal subscription.update failed", "error", err, "account", sub.User.Account) continue } - h.publishSubscriptionUpdate(ctx, sub.User.Account, data) + if err := h.publish(ctx, subject.SubscriptionUpdate(sub.User.Account), data, ""); err != nil { + slog.ErrorContext(ctx, "publish subscription.update failed", "error", err, "account", sub.User.Account) + } } // Task 36: channel-only sys-messages @@ -1781,135 +1850,11 @@ func (h *Handler) publishSubscriptionUpdates(ctx context.Context, subs []*model. 
"error", err, "account", sub.User.Account, "request_id", requestID) continue } - h.publishSubscriptionUpdate(ctx, sub.User.Account, data) - } -} - -// findRemoteSitesForAccounts looks up the home site of each account and returns -// the deduplicated set of remote sites (siteID != h.siteID). Empty in → empty out. -func (h *Handler) findRemoteSitesForAccounts(ctx context.Context, accounts []string) ([]string, error) { - if len(accounts) == 0 { - return []string{}, nil - } - users, err := h.store.FindUsersByAccounts(ctx, accounts) - if err != nil { - return nil, fmt.Errorf("find users by accounts: %w", err) - } - seen := make(map[string]struct{}, len(users)) - out := make([]string, 0, len(users)) - for i := range users { - if users[i].SiteID == h.siteID { - continue - } - if _, dup := seen[users[i].SiteID]; dup { - continue - } - seen[users[i].SiteID] = struct{}{} - out = append(out, users[i].SiteID) - } - return out, nil -} - -func (h *Handler) processRoomRename(ctx context.Context, data []byte) (err error) { - var requesterAccount, roomID string - defer func() { - h.publishAsyncJobResult(ctx, requesterAccount, model.AsyncJobOpRoomRename, roomID, err) - }() - - requestID := natsutil.RequestIDFromContext(ctx) - if requestID == "" { - return permanent(errcode.BadRequest("missing X-Request-ID")) - } - if !idgen.IsValidUUID(requestID) { - return permanent(errcode.BadRequest("invalid X-Request-ID: must be a hyphenated UUID")) - } - - var req model.RenameRoomRequest - if err = json.Unmarshal(data, &req); err != nil { - return permanent(errcode.BadRequest(fmt.Sprintf("unmarshal rename request: %s", err.Error()))) - } - requesterAccount, roomID = req.Account, req.RoomID - slog.Info("processing room.rename", - "op", model.AsyncJobOpRoomRename, - "requester", req.Account, - "roomID", req.RoomID, - "requestID", requestID) - - if err = h.store.UpdateRoomName(ctx, req.RoomID, req.NewName); err != nil { - if errors.Is(err, ErrRoomNotFound) { - return 
permanent(errcode.NotFound("room not found")) - } - if errors.Is(err, ErrNotChannelRoom) { - return permanent(errcode.BadRequest("rename is only allowed in channel rooms", errcode.WithReason(errcode.RoomNonChannelOperation))) - } - return fmt.Errorf("update room name: %w", err) - } - if err = h.store.UpdateSubscriptionNamesForRoom(ctx, req.RoomID, req.NewName); err != nil { - return fmt.Errorf("update subscription names: %w", err) - } - - sysData, err := json.Marshal(model.RoomRenamedSysData{NewName: req.NewName, ByAccount: req.Account}) - if err != nil { - return fmt.Errorf("marshal sys data: %w", err) - } - requester, err := h.store.GetUser(ctx, req.Account) - if err != nil && !errors.Is(err, ErrUserNotFound) { - return fmt.Errorf("get requester for sys message: %w", err) - } - requesterLabel := req.Account - if requester != nil { - requesterLabel = displayName(requester) - } - msg := model.Message{ - ID: idgen.MessageIDFromRequestID(requestID, "room_renamed"), - RoomID: req.RoomID, - UserAccount: req.Account, - Type: model.MessageTypeRoomRenamed, - Content: fmt.Sprintf("%q renamed the channel to %q", requesterLabel, req.NewName), - SysMsgData: sysData, - CreatedAt: time.UnixMilli(req.Timestamp).UTC(), - } - if err = h.publishCanonical(ctx, &msg, h.siteID, time.Now().UTC()); err != nil { - return fmt.Errorf("publish room_renamed sys message: %w", err) - } - - // Single room-scoped event (the room_renamed sys message published above) - // is sufficient — clients update their subscription state from the room - // event without per-subscription fan-out. 
- subs, err := h.store.ListByRoom(ctx, req.RoomID) - if err != nil { - return fmt.Errorf("list subscriptions: %w", err) - } - - accounts := make([]string, 0, len(subs)) - for i := range subs { - accounts = append(accounts, subs[i].User.Account) - } - remoteSites, err := h.findRemoteSitesForAccounts(ctx, accounts) - if err != nil { - return fmt.Errorf("find remote sites for outbox fan-out: %w", err) - } - renamedPayload, err := json.Marshal(model.RoomRenamedOutboxPayload{ - RoomID: req.RoomID, NewName: req.NewName, Timestamp: req.Timestamp, - }) - if err != nil { - return fmt.Errorf("marshal rename outbox payload: %w", err) - } - for _, remoteSiteID := range remoteSites { - evt := model.OutboxEvent{ - Type: model.OutboxRoomRenamed, SiteID: h.siteID, DestSiteID: remoteSiteID, - Payload: renamedPayload, Timestamp: time.Now().UTC().UnixMilli(), - } - evtData, mErr := json.Marshal(evt) - if mErr != nil { - return fmt.Errorf("marshal rename outbox event: %w", mErr) - } - if err = h.publish(ctx, subject.Outbox(h.siteID, remoteSiteID, model.OutboxRoomRenamed), - evtData, natsutil.OutboxDedupID(ctx, remoteSiteID, requestID)); err != nil { - return fmt.Errorf("publish rename outbox to %s: %w", remoteSiteID, err) + if err := h.publish(ctx, subject.SubscriptionUpdate(sub.User.Account), data, ""); err != nil { + slog.ErrorContext(ctx, "sync DM: publish subscription.update failed", + "error", err, "account", sub.User.Account, "request_id", requestID) } } - return nil } func (h *Handler) publishSyncDMOutbox(ctx context.Context, room *model.Room, requester, other *model.User, joinedAt time.Time) error { diff --git a/room-worker/handler_test.go b/room-worker/handler_test.go index c4939d4d8..106cb7fc0 100644 --- a/room-worker/handler_test.go +++ b/room-worker/handler_test.go @@ -395,9 +395,7 @@ func TestHandler_ProcessAddMembers(t *testing.T) { // 2 SubscriptionUpdate + 1 MemberAddEvent + 1 system msg + 1 batched outbox (site-b) assert.GreaterOrEqual(t, len(published), 4) - // 
Exactly one outbox event to site-b, carrying only site-b-homed accounts - // (charlie). bob is on site-a (home), so does not appear in the cross-site - // payload. + // Verify exactly 1 outbox event for site-b (batched, not per-member) var outboxCount int for _, p := range published { if strings.Contains(p.subj, "outbox") { @@ -407,10 +405,10 @@ func TestHandler_ProcessAddMembers(t *testing.T) { require.NoError(t, json.Unmarshal(p.data, &outboxEvt)) var change model.MemberAddEvent require.NoError(t, json.Unmarshal(outboxEvt.Payload, &change)) - assert.ElementsMatch(t, []string{"charlie"}, change.Accounts) + assert.Equal(t, []string{"charlie"}, change.Accounts) } } - assert.Equal(t, 1, outboxCount, "should publish exactly 1 outbox event per remote site") + assert.Equal(t, 1, outboxCount, "should publish exactly 1 batched outbox event per destination site") } // TestHandler_ProcessAddMembers_PublishesSubscriptionUpdateBeforeRoomKey locks in @@ -795,12 +793,11 @@ func TestHandler_ProcessAddMembers_MultipleSiteOutbox(t *testing.T) { Return([]AddMemberCandidate{ {Account: "alice"}, {Account: "bob"}, {Account: "charlie"}, }, nil) - allUsers := []model.User{ + store.EXPECT().FindUsersByAccounts(gomock.Any(), []string{"alice", "bob", "charlie"}).Return([]model.User{ {ID: "u1", Account: "alice", SiteID: "site-b", EngName: "Alice", ChineseName: "愛"}, {ID: "u2", Account: "bob", SiteID: "site-b", EngName: "Bob", ChineseName: "鮑"}, {ID: "u3", Account: "charlie", SiteID: "site-c", EngName: "Charlie", ChineseName: "查"}, - } - store.EXPECT().FindUsersByAccounts(gomock.Any(), []string{"alice", "bob", "charlie"}).Return(allUsers, nil) + }, nil) store.EXPECT().GetUser(gomock.Any(), "alice").Return(&model.User{ ID: "u1", Account: "alice", SiteID: "site-b", EngName: "Alice", ChineseName: "愛", }, nil) @@ -827,23 +824,19 @@ func TestHandler_ProcessAddMembers_MultipleSiteOutbox(t *testing.T) { outboxEvents = append(outboxEvents, p) } } - assert.Len(t, outboxEvents, 2, "one outbox event 
per remote site: site-b and site-c") + assert.Len(t, outboxEvents, 2, "should batch outbox by site: 1 for site-b, 1 for site-c") - // Each remote site receives only its own homed accounts: site-b gets - // alice+bob, site-c gets charlie. - want := map[string][]string{ - "site-b": {"alice", "bob"}, - "site-c": {"charlie"}, - } for _, p := range outboxEvents { var outboxEvt model.OutboxEvent require.NoError(t, json.Unmarshal(p.data, &outboxEvt)) var change model.MemberAddEvent require.NoError(t, json.Unmarshal(outboxEvt.Payload, &change)) - expected, ok := want[outboxEvt.DestSiteID] - require.True(t, ok, "unexpected destSiteID %s", outboxEvt.DestSiteID) - assert.ElementsMatch(t, expected, change.Accounts, - "outbox to %s should carry only its homed accounts", outboxEvt.DestSiteID) + + if strings.Contains(p.subj, "site-b") { + assert.Len(t, change.Accounts, 2, "site-b should have alice and bob") + } else if strings.Contains(p.subj, "site-c") { + assert.Equal(t, []string{"charlie"}, change.Accounts) + } } } @@ -894,7 +887,7 @@ func TestHandler_ProcessRemoveMember_OwnerRemovesOrg(t *testing.T) { err := h.processRemoveMember(context.Background(), data) require.NoError(t, err) - // Expect: 2 sub updates + 1 member event + 1 local INBOX + 1 sys msg = 5 publishes + // Expect: 2 sub updates (carol, dave) + 1 member event + 1 local INBOX + 1 sys msg = 5 publishes assert.Len(t, published, 5, "expected 5 publishes: 2 sub updates, member event, local INBOX, sys msg") subjSet := make(map[string]bool) @@ -2561,7 +2554,7 @@ func TestHandleSyncCreateDM_SelfDM(t *testing.T) { // Reply returns the in-memory sub directly (no read-back round-trip). assert.Equal(t, *captured[0], reply.Subscription) - // subscription.update only — same-site self-DM; no outbox and no canonical event (Option C). + // One subscription.update; no outbox (same-site by definition). 
require.Len(t, capture.captured, 1) assert.Equal(t, subject.SubscriptionUpdate("alice"), capture.captured[0].subject) } @@ -4618,328 +4611,3 @@ func TestRunJobWithRecovery_PanicAcksAndDoesNotCrash(t *testing.T) { assert.True(t, msg.acked, "panic must Ack (poison-pill drop), not Nak — a deterministic panic would otherwise loop on redelivery") assert.False(t, msg.naked) } - -func TestFindRemoteSitesForAccounts(t *testing.T) { - t.Run("dedupes remote, drops local, preserves siteIDs", func(t *testing.T) { - ctrl := gomock.NewController(t) - store := NewMockSubscriptionStore(ctrl) - store.EXPECT().FindUsersByAccounts(gomock.Any(), []string{"alice", "bob", "carol", "dave"}).Return([]model.User{ - {Account: "alice", SiteID: "site-a"}, // local - {Account: "bob", SiteID: "site-b"}, // remote - {Account: "carol", SiteID: "site-c"}, // remote - {Account: "dave", SiteID: "site-b"}, // dup of bob's site - }, nil) - h := &Handler{store: store, siteID: "site-a"} - got, err := h.findRemoteSitesForAccounts(context.Background(), []string{"alice", "bob", "carol", "dave"}) - require.NoError(t, err) - assert.ElementsMatch(t, []string{"site-b", "site-c"}, got) - }) - - t.Run("empty input returns empty slice", func(t *testing.T) { - ctrl := gomock.NewController(t) - store := NewMockSubscriptionStore(ctrl) - h := &Handler{store: store, siteID: "site-a"} - got, err := h.findRemoteSitesForAccounts(context.Background(), nil) - require.NoError(t, err) - assert.Empty(t, got) - }) -} - -func TestFindRemoteSitesForAccounts_StoreError(t *testing.T) { - ctrl := gomock.NewController(t) - store := NewMockSubscriptionStore(ctrl) - store.EXPECT().FindUsersByAccounts(gomock.Any(), gomock.Any()).Return(nil, errors.New("mongo timeout")) - - h := &Handler{store: store, siteID: "site-a"} - got, err := h.findRemoteSitesForAccounts(context.Background(), []string{"alice"}) - require.Error(t, err) - assert.Nil(t, got) - assert.Contains(t, err.Error(), "find users by accounts") -} - -// --- processRoomRename 
tests --- - -// Test 1: Missing X-Request-ID → permanent error, no store calls. -func TestProcessRoomRename_MissingRequestID(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockSubscriptionStore(ctrl) - // No EXPECT calls — store must not be called. - - h := &Handler{store: store, siteID: "site-a", publish: func(_ context.Context, _ string, _ []byte, _ string) error { - return nil - }} - body, _ := json.Marshal(model.RenameRoomRequest{RoomID: "r1", NewName: "renamed", Account: "alice", Timestamp: 1700000000000}) - err := h.processRoomRename(context.Background(), body) - require.Error(t, err) - assert.True(t, errors.Is(err, errPermanent), "expected permanent error, got %v", err) -} - -// Test 2: Invalid UUID → permanent error. -func TestProcessRoomRename_InvalidUUID(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockSubscriptionStore(ctrl) - - h := &Handler{store: store, siteID: "site-a", publish: func(_ context.Context, _ string, _ []byte, _ string) error { - return nil - }} - ctx := natsutil.WithRequestID(context.Background(), "not-a-valid-uuid") - body, _ := json.Marshal(model.RenameRoomRequest{RoomID: "r1", NewName: "renamed", Account: "alice", Timestamp: 1700000000000}) - err := h.processRoomRename(ctx, body) - require.Error(t, err) - assert.True(t, errors.Is(err, errPermanent), "expected permanent error, got %v", err) -} - -// Test 3: Unmarshal failure → permanent + AsyncJobResult does NOT publish (empty requesterAccount short-circuits). 
-func TestProcessRoomRename_UnmarshalFailure(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockSubscriptionStore(ctrl) - - requestID := "01970a4f-8c2d-7c9a-abcd-e0123456789f" - var publishedSubjects []string - h := &Handler{store: store, siteID: "site-a", publish: func(_ context.Context, subj string, _ []byte, _ string) error { - publishedSubjects = append(publishedSubjects, subj) - return nil - }} - ctx := natsutil.WithRequestID(context.Background(), requestID) - err := h.processRoomRename(ctx, []byte("not-valid-json")) - require.Error(t, err) - assert.True(t, errors.Is(err, errPermanent), "expected permanent error, got %v", err) - // requesterAccount is empty after unmarshal failure, so publishAsyncJobResult must not publish. - for _, s := range publishedSubjects { - assert.False(t, - strings.HasPrefix(s, "chat.user.") && strings.Contains(s, ".response."+requestID), - "should not publish AsyncJobResult on unmarshal failure: got %s", s) - } -} - -// Test 4: UpdateRoomName returns ErrRoomNotFound → permanent + AsyncJobResult error. 
-func TestProcessRoomRename_RoomNotFound(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockSubscriptionStore(ctrl) - - requestID := "01970a4f-8c2d-7c9a-abcd-e0123456789f" - store.EXPECT().UpdateRoomName(gomock.Any(), "r1", "renamed").Return(ErrRoomNotFound) - - var asyncResults []model.AsyncJobResult - publish := func(_ context.Context, subj string, data []byte, _ string) error { - if subj == subject.UserResponse("alice", requestID) { - var r model.AsyncJobResult - require.NoError(t, json.Unmarshal(data, &r)) - asyncResults = append(asyncResults, r) - } - return nil - } - - h := &Handler{store: store, siteID: "site-a", publish: publish} - ctx := natsutil.WithRequestID(context.Background(), requestID) - body, _ := json.Marshal(model.RenameRoomRequest{RoomID: "r1", NewName: "renamed", Account: "alice", Timestamp: 1700000000000}) - err := h.processRoomRename(ctx, body) - require.Error(t, err) - assert.True(t, errors.Is(err, errPermanent), "expected permanent error, got %v", err) - require.Len(t, asyncResults, 1) - assert.Equal(t, model.AsyncJobStatusError, asyncResults[0].Status) -} - -// Test 5: UpdateRoomName returns ErrNotChannelRoom → permanent + AsyncJobResult error. 
-func TestProcessRoomRename_NotChannelRoom(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockSubscriptionStore(ctrl) - - requestID := "01970a4f-8c2d-7c9a-abcd-e0123456789f" - store.EXPECT().UpdateRoomName(gomock.Any(), "r1", "renamed").Return(ErrNotChannelRoom) - - var asyncResults []model.AsyncJobResult - publish := func(_ context.Context, subj string, data []byte, _ string) error { - if subj == subject.UserResponse("alice", requestID) { - var r model.AsyncJobResult - require.NoError(t, json.Unmarshal(data, &r)) - asyncResults = append(asyncResults, r) - } - return nil - } - - h := &Handler{store: store, siteID: "site-a", publish: publish} - ctx := natsutil.WithRequestID(context.Background(), requestID) - body, _ := json.Marshal(model.RenameRoomRequest{RoomID: "r1", NewName: "renamed", Account: "alice", Timestamp: 1700000000000}) - err := h.processRoomRename(ctx, body) - require.Error(t, err) - assert.True(t, errors.Is(err, errPermanent), "expected permanent error, got %v", err) - require.Len(t, asyncResults, 1) - assert.Equal(t, model.AsyncJobStatusError, asyncResults[0].Status) -} - -// Test 6: Transient error on UpdateSubscriptionNamesForRoom → non-permanent error returned. 
-func TestProcessRoomRename_TransientSubscriptionUpdateError(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockSubscriptionStore(ctrl) - - requestID := "01970a4f-8c2d-7c9a-abcd-e0123456789f" - store.EXPECT().UpdateRoomName(gomock.Any(), "r1", "renamed").Return(nil) - store.EXPECT().UpdateSubscriptionNamesForRoom(gomock.Any(), "r1", "renamed").Return(errors.New("mongo timeout")) - - h := &Handler{store: store, siteID: "site-a", publish: func(_ context.Context, _ string, _ []byte, _ string) error { - return nil - }} - ctx := natsutil.WithRequestID(context.Background(), requestID) - body, _ := json.Marshal(model.RenameRoomRequest{RoomID: "r1", NewName: "renamed", Account: "alice", Timestamp: 1700000000000}) - err := h.processRoomRename(ctx, body) - require.Error(t, err) - assert.False(t, errors.Is(err, errPermanent), "expected transient (non-permanent) error, got %v", err) -} - -// Test 7: Happy path no remote sites. -func TestProcessRoomRename_HappyPathNoRemoteSites(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockSubscriptionStore(ctrl) - - const roomID, newName = "r1", "renamed" - requestID := "01970a4f-8c2d-7c9a-abcd-e0123456789f" - - subs := []model.Subscription{ - {ID: "s1", User: model.SubscriptionUser{ID: "u1", Account: "alice"}, RoomID: roomID}, - {ID: "s2", User: model.SubscriptionUser{ID: "u2", Account: "bob"}, RoomID: roomID}, - } - - store.EXPECT().UpdateRoomName(gomock.Any(), roomID, newName).Return(nil) - store.EXPECT().UpdateSubscriptionNamesForRoom(gomock.Any(), roomID, newName).Return(nil) - store.EXPECT().GetUser(gomock.Any(), "alice").Return(&model.User{Account: "alice"}, nil) - store.EXPECT().ListByRoom(gomock.Any(), roomID).Return(subs, nil) - store.EXPECT().FindUsersByAccounts(gomock.Any(), gomock.Any()).Return([]model.User{ - {Account: "alice", SiteID: "site-a"}, {Account: "bob", SiteID: "site-a"}, - }, nil) - - var publishedSubjects []string - publish := func(_ 
context.Context, subj string, _ []byte, _ string) error { - publishedSubjects = append(publishedSubjects, subj) - return nil - } - - h := &Handler{store: store, siteID: "site-a", publish: publish} - ctx := natsutil.WithRequestID(context.Background(), requestID) - body, _ := json.Marshal(model.RenameRoomRequest{ - RoomID: roomID, NewName: newName, Account: "alice", Timestamp: time.Now().UTC().UnixMilli(), - }) - - require.NoError(t, h.processRoomRename(ctx, body)) - - assert.Contains(t, publishedSubjects, subject.MsgCanonicalCreated("site-a")) - assert.Contains(t, publishedSubjects, subject.UserResponse("alice", requestID)) - for _, subj := range publishedSubjects { - assert.NotContains(t, subj, "outbox.", "should not publish to outbox when all members are local") - assert.NotContains(t, subj, ".event.subscription.update", "rename publishes a single room-scoped sys message; no per-subscription fan-out") - } -} - -// Test 8: Happy path with one remote site. -func TestProcessRoomRename_HappyPathWithRemoteSite(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockSubscriptionStore(ctrl) - - const roomID, newName = "r1", "renamed" - requestID := "01970a4f-8c2d-7c9a-abcd-e0123456789f" - ts := int64(1700000000000) - - subs := []model.Subscription{ - {ID: "s1", User: model.SubscriptionUser{ID: "u1", Account: "alice"}, RoomID: roomID}, - {ID: "s2", User: model.SubscriptionUser{ID: "u2", Account: "bob"}, RoomID: roomID}, - } - - store.EXPECT().UpdateRoomName(gomock.Any(), roomID, newName).Return(nil) - store.EXPECT().UpdateSubscriptionNamesForRoom(gomock.Any(), roomID, newName).Return(nil) - store.EXPECT().GetUser(gomock.Any(), "alice").Return(&model.User{Account: "alice"}, nil) - store.EXPECT().ListByRoom(gomock.Any(), roomID).Return(subs, nil) - // Bob is on a remote site. 
- store.EXPECT().FindUsersByAccounts(gomock.Any(), gomock.Any()).Return([]model.User{ - {Account: "alice", SiteID: "site-a"}, - {Account: "bob", SiteID: "site-b"}, - }, nil) - - var publishedSubjects []string - var outboxPayloads []model.RoomRenamedOutboxPayload - publish := func(_ context.Context, subj string, data []byte, _ string) error { - publishedSubjects = append(publishedSubjects, subj) - if strings.Contains(subj, "outbox.") { - var env model.OutboxEvent - require.NoError(t, json.Unmarshal(data, &env)) - var payload model.RoomRenamedOutboxPayload - require.NoError(t, json.Unmarshal(env.Payload, &payload)) - outboxPayloads = append(outboxPayloads, payload) - } - return nil - } - - h := &Handler{store: store, siteID: "site-a", publish: publish} - ctx := natsutil.WithRequestID(context.Background(), requestID) - body, _ := json.Marshal(model.RenameRoomRequest{ - RoomID: roomID, NewName: newName, Account: "alice", Timestamp: ts, - }) - - require.NoError(t, h.processRoomRename(ctx, body)) - - assert.Contains(t, publishedSubjects, subject.MsgCanonicalCreated("site-a")) - assert.Contains(t, publishedSubjects, subject.UserResponse("alice", requestID)) - for _, subj := range publishedSubjects { - assert.NotContains(t, subj, ".event.subscription.update", "rename publishes a single room-scoped sys message; no per-subscription fan-out") - } - - // Exactly one outbox publish to site-b. - outboxSubjects := make([]string, 0) - for _, s := range publishedSubjects { - if strings.Contains(s, "outbox.") { - outboxSubjects = append(outboxSubjects, s) - } - } - require.Len(t, outboxSubjects, 1) - assert.Contains(t, outboxSubjects[0], "site-b") - require.Len(t, outboxPayloads, 1) - assert.Equal(t, roomID, outboxPayloads[0].RoomID) - assert.Equal(t, newName, outboxPayloads[0].NewName) - assert.Equal(t, ts, outboxPayloads[0].Timestamp) -} - -// Test 9: Error-then-ok retry sequence. 
-func TestProcessRoomRename_ErrorThenOkRetrySequence(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - store := NewMockSubscriptionStore(ctrl) - requestID := "01970a4f-8c2d-7c9a-abcd-e0123456789f" - - store.EXPECT().UpdateRoomName(gomock.Any(), "r1", "x").Return(errors.New("mongo timeout")) - store.EXPECT().UpdateRoomName(gomock.Any(), "r1", "x").Return(nil) - store.EXPECT().UpdateSubscriptionNamesForRoom(gomock.Any(), "r1", "x").Return(nil) - store.EXPECT().GetUser(gomock.Any(), "alice").Return(&model.User{Account: "alice"}, nil) - // Empty subs → accounts is empty → findRemoteSitesForAccounts short-circuits (no FindUsersByAccounts call). - store.EXPECT().ListByRoom(gomock.Any(), "r1").Return([]model.Subscription{}, nil) - - var asyncResults []model.AsyncJobResult - publish := func(_ context.Context, subj string, data []byte, _ string) error { - if subj == subject.UserResponse("alice", requestID) { - var r model.AsyncJobResult - require.NoError(t, json.Unmarshal(data, &r)) - asyncResults = append(asyncResults, r) - } - return nil - } - - h := &Handler{store: store, siteID: "site-a", publish: publish} - ctx := natsutil.WithRequestID(context.Background(), requestID) - body, _ := json.Marshal(model.RenameRoomRequest{RoomID: "r1", NewName: "x", Account: "alice", Timestamp: 1700000000000}) - - err := h.processRoomRename(ctx, body) - require.Error(t, err) - assert.False(t, errors.Is(err, errPermanent)) - - require.NoError(t, h.processRoomRename(ctx, body)) - - require.Len(t, asyncResults, 2) - assert.Equal(t, model.AsyncJobStatusError, asyncResults[0].Status) - assert.Equal(t, model.AsyncJobStatusOK, asyncResults[1].Status) -} diff --git a/room-worker/integration_test.go b/room-worker/integration_test.go index a56e995d4..471ed7408 100644 --- a/room-worker/integration_test.go +++ b/room-worker/integration_test.go @@ -1852,7 +1852,7 @@ func TestIntegration_ProcessRoomRename(t *testing.T) { }) mustInsertSub(t, db, &model.Subscription{ ID: 
idgen.GenerateUUIDv7(), User: model.SubscriptionUser{ID: "u3", Account: "carol"}, - RoomID: roomID, SiteID: siteID, Name: oldName, RoomType: model.RoomTypeChannel, + RoomID: roomID, SiteID: remoteSite, Name: oldName, RoomType: model.RoomTypeChannel, Roles: []model.Role{model.RoleMember}, JoinedAt: time.Now().UTC(), }) mustInsertUser(t, db, &model.User{ID: "u1", Account: "alice", SiteID: siteID}) diff --git a/search-sync-worker/messages.go b/search-sync-worker/messages.go index e74016766..c1cb7b1fb 100644 --- a/search-sync-worker/messages.go +++ b/search-sync-worker/messages.go @@ -68,6 +68,12 @@ func (c *messageCollection) BuildAction(data []byte) ([]searchengine.BulkAction, if err := json.Unmarshal(data, &evt); err != nil { return nil, fmt.Errorf("unmarshal message event: %w", err) } + // Thread-reply badge events carry only the parent's tcount, not a + // searchable document — skip them before the document-shape guards below, + // which would otherwise reject the sparse Message as a hard error. + if evt.Event == model.EventThreadReplyAdded { + return nil, nil + } if evt.Message.ID == "" { return nil, fmt.Errorf("build message action: missing message id") } diff --git a/search-sync-worker/messages_test.go b/search-sync-worker/messages_test.go index f9865cd81..02ae28b39 100644 --- a/search-sync-worker/messages_test.go +++ b/search-sync-worker/messages_test.go @@ -163,6 +163,39 @@ func TestBuildMessageAction(t *testing.T) { }) } +func TestMessageCollection_BuildAction_SkipsBadgeEvents(t *testing.T) { + // Badge-only thread events carry just the parent tcount (no CreatedAt / + // Content) and must be skipped rather than reaching the document-shape + // guards (which would log an error and ack). 
+ tests := []struct { + name string + event model.EventType + }{ + {name: "thread reply added", event: model.EventThreadReplyAdded}, + } + + coll := newMessageCollection("msgs-v1", time.Time{}) + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + evt := model.MessageEvent{ + Event: tc.event, + SiteID: "site-a", + Message: model.Message{ + ID: "reply-1", + RoomID: "room-1", + ThreadParentMessageID: "parent-1", + }, + Timestamp: 1737964678390, + } + data, _ := json.Marshal(evt) + + actions, err := coll.BuildAction(data) + require.NoError(t, err) + assert.Nil(t, actions) + }) + } +} + func TestMessageTemplateProperties_MatchesStruct(t *testing.T) { props := messageTemplateProperties() From 4b6964a7eea41a609307a70220da12f13bd61409 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 15:37:40 +0000 Subject: [PATCH 02/14] revert: restore room-worker/handler.go to pre-PR state The PR accidentally deleted the publishSubscriptionUpdate helper, inlined its three call sites, replaced model.OutboxMemberAdded constants with raw strings, deleted findRemoteSitesForAccounts, and replaced the full processRoomRename implementation with a stripped-down version that lost async-job-result publishing and request-ID validation. None of these changes are part of the thread-reply pipeline. https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- room-worker/handler.go | 259 +++++++++++++++++++++++++---------------- 1 file changed, 157 insertions(+), 102 deletions(-) diff --git a/room-worker/handler.go b/room-worker/handler.go index 23435b2bc..1d48da83e 100644 --- a/room-worker/handler.go +++ b/room-worker/handler.go @@ -74,6 +74,13 @@ func NewHandler(store SubscriptionStore, siteID string, publish PublishFunc, key } } +// publishSubscriptionUpdate fans out the per-user subscription.update event for the FE; best-effort. 
+func (h *Handler) publishSubscriptionUpdate(ctx context.Context, account string, subEvtData []byte) { + if err := h.publish(ctx, subject.SubscriptionUpdate(account), subEvtData, ""); err != nil { + slog.Error("subscription update publish failed", "error", err, "account", account) + } +} + // SetKeyFanoutWorkers overrides the bounded-worker pool size used by // fanOutKey. Values <= 0 are ignored so partial-deployment misconfig can't // disable the cap. main wires this from KEY_FANOUT_WORKERS at startup. @@ -359,9 +366,7 @@ func (h *Handler) processRemoveIndividual(ctx context.Context, req *model.Remove Timestamp: now.UnixMilli(), } subEvtData, _ := json.Marshal(subEvt) - if err := h.publish(ctx, subject.SubscriptionUpdate(req.Account), subEvtData, ""); err != nil { - slog.ErrorContext(ctx, "subscription update publish failed", "error", err, "account", req.Account) - } + h.publishSubscriptionUpdate(ctx, req.Account, subEvtData) // Member change event evtType := model.MessageTypeMemberLeft @@ -570,9 +575,7 @@ func (h *Handler) processRemoveOrg(ctx context.Context, req *model.RemoveMemberR Timestamp: now.UnixMilli(), } subEvtData, _ := json.Marshal(subEvt) - if err := h.publish(ctx, subject.SubscriptionUpdate(m.Account), subEvtData, ""); err != nil { - slog.ErrorContext(ctx, "subscription update publish failed", "error", err, "account", m.Account) - } + h.publishSubscriptionUpdate(ctx, m.Account, subEvtData) } // Member change event with all removed accounts @@ -976,9 +979,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error Timestamp: now.UnixMilli(), } subEvtData, _ := json.Marshal(subEvt) - if err := h.publish(ctx, subject.SubscriptionUpdate(sub.User.Account), subEvtData, ""); err != nil { - slog.ErrorContext(ctx, "subscription update publish failed", "error", err, "account", sub.User.Account) - } + h.publishSubscriptionUpdate(ctx, sub.User.Account, subEvtData) } // Fan out the room key only to newly-subscribed accounts. 
Accounts in @@ -1011,7 +1012,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error historySharedSince := historySharedSincePtr(req.History, req.Timestamp, req.RoomID) if len(actualAccounts) > 0 || len(req.Orgs) > 0 { memberAddEvt := model.MemberAddEvent{ - Type: "member_added", + Type: model.OutboxMemberAdded, RoomID: req.RoomID, RoomName: room.Name, RoomType: room.Type, @@ -1033,7 +1034,7 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error if len(actualAccounts) > 0 { inboxOutbox := model.OutboxEvent{ - Type: "member_added", + Type: model.OutboxMemberAdded, SiteID: room.SiteID, DestSiteID: room.SiteID, Payload: memberAddData, @@ -1087,22 +1088,28 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error } } - // 10. Outbox for cross-site members — batched by destination site - remoteSiteMembers := make(map[string][]string) - for _, sub := range subs { - user, ok := userMap[sub.User.Account] - if !ok || user.SiteID == room.SiteID { + // 10. Outbox for cross-site members — one event per destination site. + // Single-pass bucket: accounts → home site, skipping the local site. The map + // keys are the distinct remote sites; each entry already carries the + // per-site filtered account list, so the downstream loop is O(sites) not + // O(sites × accounts). Sending the full list would over-pressure NATS and + // ship subscription identities to sites that have no business knowing them, + // even though inbox-worker would filter on the destination. 
+ accountsBySite := make(map[string][]string) + for _, acc := range actualAccounts { + siteID := userMap[acc].SiteID + if siteID == "" || siteID == h.siteID { continue } - remoteSiteMembers[user.SiteID] = append(remoteSiteMembers[user.SiteID], sub.User.Account) + accountsBySite[siteID] = append(accountsBySite[siteID], acc) } - for destSiteID, accounts := range remoteSiteMembers { + for destSiteID, siteAccounts := range accountsBySite { siteEvt := model.MemberAddEvent{ - Type: "member_added", + Type: model.OutboxMemberAdded, RoomID: req.RoomID, RoomName: room.Name, RoomType: room.Type, - Accounts: accounts, + Accounts: siteAccounts, SiteID: room.SiteID, RequesterAccount: req.RequesterAccount, JoinedAt: req.Timestamp, @@ -1111,13 +1118,13 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error } siteEvtData, _ := json.Marshal(siteEvt) outbox := model.OutboxEvent{ - Type: "member_added", SiteID: room.SiteID, DestSiteID: destSiteID, + Type: model.OutboxMemberAdded, SiteID: room.SiteID, DestSiteID: destSiteID, Payload: siteEvtData, Timestamp: now.UnixMilli(), } outboxData, _ := json.Marshal(outbox) payloadSeed := fmt.Sprintf("%s:%s:%d", req.RoomID, req.RequesterAccount, req.Timestamp) dedupID := natsutil.OutboxDedupID(ctx, destSiteID, payloadSeed) - if err := h.publish(ctx, subject.Outbox(room.SiteID, destSiteID, "member_added"), outboxData, dedupID); err != nil { + if err := h.publish(ctx, subject.Outbox(room.SiteID, destSiteID, model.OutboxMemberAdded), outboxData, dedupID); err != nil { return fmt.Errorf("outbox publish to %s failed: %w", destSiteID, err) } } @@ -1125,80 +1132,6 @@ func (h *Handler) processAddMembers(ctx context.Context, data []byte) (err error return nil } -func (h *Handler) processRoomRename(ctx context.Context, data []byte) error { - var req model.RenameRoomRequest - if err := json.Unmarshal(data, &req); err != nil { - return permanent(errcode.BadRequest("unmarshal RenameRoomRequest")) - } - - if err := 
h.store.UpdateRoomName(ctx, req.RoomID, req.NewName); err != nil { - return fmt.Errorf("update room name: %w", err) - } - if err := h.store.UpdateSubscriptionNamesForRoom(ctx, req.RoomID, req.NewName); err != nil { - return fmt.Errorf("update subscription names for room: %w", err) - } - - requester, err := h.store.GetUser(ctx, req.Account) - if err != nil { - if errors.Is(err, ErrUserNotFound) { - return permanent(errcode.NotFound(fmt.Sprintf("requester %s not found (room %s)", req.Account, req.RoomID), errcode.WithReason(errcode.RoomUserNotFound))) - } - return fmt.Errorf("get requester: %w", err) - } - - now := time.Now().UTC() - sysMsgData, _ := json.Marshal(model.RoomRenamedSysData{NewName: req.NewName, ByAccount: req.Account}) - seed := messageDedupSeed(ctx, "processRoomRename", req.RoomID, - fmt.Sprintf("%s:%s:%d", req.RoomID, req.Account, req.Timestamp)) - sysMsg := model.Message{ - ID: idgen.MessageIDFromRequestID(seed, "room_renamed"), - RoomID: req.RoomID, - UserID: requester.ID, - UserAccount: requester.Account, - Type: model.MessageTypeRoomRenamed, - Content: quoted(displayName(requester)) + " renamed the channel to " + quoted(req.NewName), - SysMsgData: sysMsgData, - CreatedAt: now, - } - if err := h.publishCanonical(ctx, &sysMsg, h.siteID, now); err != nil { - return fmt.Errorf("publish room_renamed sys message: %w", err) - } - - // Fan out outbox to remote sites that have members in this room. 
- subs, err := h.store.ListByRoom(ctx, req.RoomID) - if err != nil { - return fmt.Errorf("list subscriptions for outbox fan-out: %w", err) - } - remoteSites := make(map[string]struct{}) - for i := range subs { - if subs[i].SiteID != h.siteID && subs[i].SiteID != "" { - remoteSites[subs[i].SiteID] = struct{}{} - } - } - payload := model.RoomRenamedOutboxPayload{ - RoomID: req.RoomID, - NewName: req.NewName, - Timestamp: req.Timestamp, - } - payloadData, _ := json.Marshal(payload) - for destSiteID := range remoteSites { - outbox := model.OutboxEvent{ - Type: model.OutboxRoomRenamed, - SiteID: h.siteID, - DestSiteID: destSiteID, - Payload: payloadData, - Timestamp: now.UnixMilli(), - } - outboxData, _ := json.Marshal(outbox) - payloadSeed := fmt.Sprintf("%s:%s:%d", req.RoomID, req.Account, req.Timestamp) - dedupID := natsutil.OutboxDedupID(ctx, destSiteID, payloadSeed) - if err := h.publish(ctx, subject.Outbox(h.siteID, destSiteID, model.OutboxRoomRenamed), outboxData, dedupID); err != nil { - return fmt.Errorf("publish room_renamed outbox to %s: %w", destSiteID, err) - } - } - return nil -} - func mustMarshal(v any) []byte { data, _ := json.Marshal(v) return data @@ -1478,9 +1411,7 @@ func (h *Handler) finishCreateRoom(ctx context.Context, req *model.CreateRoomReq slog.ErrorContext(ctx, "marshal subscription.update failed", "error", err, "account", sub.User.Account) continue } - if err := h.publish(ctx, subject.SubscriptionUpdate(sub.User.Account), data, ""); err != nil { - slog.ErrorContext(ctx, "publish subscription.update failed", "error", err, "account", sub.User.Account) - } + h.publishSubscriptionUpdate(ctx, sub.User.Account, data) } // Task 36: channel-only sys-messages @@ -1850,11 +1781,135 @@ func (h *Handler) publishSubscriptionUpdates(ctx context.Context, subs []*model. 
"error", err, "account", sub.User.Account, "request_id", requestID) continue } - if err := h.publish(ctx, subject.SubscriptionUpdate(sub.User.Account), data, ""); err != nil { - slog.ErrorContext(ctx, "sync DM: publish subscription.update failed", - "error", err, "account", sub.User.Account, "request_id", requestID) + h.publishSubscriptionUpdate(ctx, sub.User.Account, data) + } +} + +// findRemoteSitesForAccounts looks up the home site of each account and returns +// the deduplicated set of remote sites (siteID != h.siteID). Empty in → empty out. +func (h *Handler) findRemoteSitesForAccounts(ctx context.Context, accounts []string) ([]string, error) { + if len(accounts) == 0 { + return []string{}, nil + } + users, err := h.store.FindUsersByAccounts(ctx, accounts) + if err != nil { + return nil, fmt.Errorf("find users by accounts: %w", err) + } + seen := make(map[string]struct{}, len(users)) + out := make([]string, 0, len(users)) + for i := range users { + if users[i].SiteID == h.siteID { + continue + } + if _, dup := seen[users[i].SiteID]; dup { + continue + } + seen[users[i].SiteID] = struct{}{} + out = append(out, users[i].SiteID) + } + return out, nil +} + +func (h *Handler) processRoomRename(ctx context.Context, data []byte) (err error) { + var requesterAccount, roomID string + defer func() { + h.publishAsyncJobResult(ctx, requesterAccount, model.AsyncJobOpRoomRename, roomID, err) + }() + + requestID := natsutil.RequestIDFromContext(ctx) + if requestID == "" { + return permanent(errcode.BadRequest("missing X-Request-ID")) + } + if !idgen.IsValidUUID(requestID) { + return permanent(errcode.BadRequest("invalid X-Request-ID: must be a hyphenated UUID")) + } + + var req model.RenameRoomRequest + if err = json.Unmarshal(data, &req); err != nil { + return permanent(errcode.BadRequest(fmt.Sprintf("unmarshal rename request: %s", err.Error()))) + } + requesterAccount, roomID = req.Account, req.RoomID + slog.Info("processing room.rename", + "op", 
model.AsyncJobOpRoomRename, + "requester", req.Account, + "roomID", req.RoomID, + "requestID", requestID) + + if err = h.store.UpdateRoomName(ctx, req.RoomID, req.NewName); err != nil { + if errors.Is(err, ErrRoomNotFound) { + return permanent(errcode.NotFound("room not found")) + } + if errors.Is(err, ErrNotChannelRoom) { + return permanent(errcode.BadRequest("rename is only allowed in channel rooms", errcode.WithReason(errcode.RoomNonChannelOperation))) + } + return fmt.Errorf("update room name: %w", err) + } + if err = h.store.UpdateSubscriptionNamesForRoom(ctx, req.RoomID, req.NewName); err != nil { + return fmt.Errorf("update subscription names: %w", err) + } + + sysData, err := json.Marshal(model.RoomRenamedSysData{NewName: req.NewName, ByAccount: req.Account}) + if err != nil { + return fmt.Errorf("marshal sys data: %w", err) + } + requester, err := h.store.GetUser(ctx, req.Account) + if err != nil && !errors.Is(err, ErrUserNotFound) { + return fmt.Errorf("get requester for sys message: %w", err) + } + requesterLabel := req.Account + if requester != nil { + requesterLabel = displayName(requester) + } + msg := model.Message{ + ID: idgen.MessageIDFromRequestID(requestID, "room_renamed"), + RoomID: req.RoomID, + UserAccount: req.Account, + Type: model.MessageTypeRoomRenamed, + Content: fmt.Sprintf("%q renamed the channel to %q", requesterLabel, req.NewName), + SysMsgData: sysData, + CreatedAt: time.UnixMilli(req.Timestamp).UTC(), + } + if err = h.publishCanonical(ctx, &msg, h.siteID, time.Now().UTC()); err != nil { + return fmt.Errorf("publish room_renamed sys message: %w", err) + } + + // Single room-scoped event (the room_renamed sys message published above) + // is sufficient — clients update their subscription state from the room + // event without per-subscription fan-out. 
+ subs, err := h.store.ListByRoom(ctx, req.RoomID) + if err != nil { + return fmt.Errorf("list subscriptions: %w", err) + } + + accounts := make([]string, 0, len(subs)) + for i := range subs { + accounts = append(accounts, subs[i].User.Account) + } + remoteSites, err := h.findRemoteSitesForAccounts(ctx, accounts) + if err != nil { + return fmt.Errorf("find remote sites for outbox fan-out: %w", err) + } + renamedPayload, err := json.Marshal(model.RoomRenamedOutboxPayload{ + RoomID: req.RoomID, NewName: req.NewName, Timestamp: req.Timestamp, + }) + if err != nil { + return fmt.Errorf("marshal rename outbox payload: %w", err) + } + for _, remoteSiteID := range remoteSites { + evt := model.OutboxEvent{ + Type: model.OutboxRoomRenamed, SiteID: h.siteID, DestSiteID: remoteSiteID, + Payload: renamedPayload, Timestamp: time.Now().UTC().UnixMilli(), + } + evtData, mErr := json.Marshal(evt) + if mErr != nil { + return fmt.Errorf("marshal rename outbox event: %w", mErr) + } + if err = h.publish(ctx, subject.Outbox(h.siteID, remoteSiteID, model.OutboxRoomRenamed), + evtData, natsutil.OutboxDedupID(ctx, remoteSiteID, requestID)); err != nil { + return fmt.Errorf("publish rename outbox to %s: %w", remoteSiteID, err) } } + return nil } func (h *Handler) publishSyncDMOutbox(ctx context.Context, room *model.Room, requester, other *model.User, joinedAt time.Time) error { From 85898d58961f54c3a8bcc6106cd08145cea54bb2 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 15:37:40 +0000 Subject: [PATCH 03/14] fix(broadcast-worker): remove redundant siteID filter from GetThreadFollowers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parentMessageId is the unique key in thread_rooms — one document per parent message — so the siteId filter was redundant. Removing it simplifies the query, the store interface, and all callers. 
https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- broadcast-worker/handler.go | 10 +++++----- broadcast-worker/handler_test.go | 12 ++++++------ broadcast-worker/mock_store_test.go | 8 ++++---- broadcast-worker/store.go | 2 +- broadcast-worker/store_mongo.go | 6 +++--- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/broadcast-worker/handler.go b/broadcast-worker/handler.go index 2c0592ad6..5141e4f8c 100644 --- a/broadcast-worker/handler.go +++ b/broadcast-worker/handler.go @@ -165,7 +165,7 @@ func (h *Handler) handleThreadCreated(ctx context.Context, evt *model.MessageEve // event. Fetch the subscriber list and build fanOut before any further work. var fanOut []string if meta.Type == model.RoomTypeChannel { - fanOut, err = h.channelThreadFanOut(ctx, parentMsgID, msg.UserAccount, evt.SiteID, parsed.Accounts) + fanOut, err = h.channelThreadFanOut(ctx, parentMsgID, msg.UserAccount, parsed.Accounts) if err != nil { return fmt.Errorf("channel thread fan-out for parent %s: %w", parentMsgID, err) } @@ -273,7 +273,7 @@ func (h *Handler) handleThreadUpdated(ctx context.Context, evt *model.MessageEve switch room.Type { case model.RoomTypeChannel: parsed := mention.Parse(msg.Content) - fanOut, err := h.channelThreadFanOut(ctx, parentMsgID, msg.UserAccount, evt.SiteID, parsed.Accounts) + fanOut, err := h.channelThreadFanOut(ctx, parentMsgID, msg.UserAccount, parsed.Accounts) if err != nil { return fmt.Errorf("channel thread fan-out for thread update of parent %s: %w", parentMsgID, err) } @@ -331,7 +331,7 @@ func (h *Handler) handleThreadDeleted(ctx context.Context, evt *model.MessageEve // receive the delete. Only the channel path uses mentions; the DM path // fans out to all members. 
parsed := mention.Parse(msg.Content) - fanOut, err := h.channelThreadFanOut(ctx, parentMsgID, msg.UserAccount, evt.SiteID, parsed.Accounts) + fanOut, err := h.channelThreadFanOut(ctx, parentMsgID, msg.UserAccount, parsed.Accounts) if err != nil { return fmt.Errorf("channel thread fan-out for thread delete of parent %s: %w", parentMsgID, err) } @@ -838,8 +838,8 @@ func threadFanOutAccounts(senderAccount string, followers map[string]struct{}, e // thread event: it fetches the parent message's thread followers and merges // them with the @-mentioned accounts, excluding the sender. Shared by the // channel branch of every thread handler (created/updated/deleted). -func (h *Handler) channelThreadFanOut(ctx context.Context, parentMsgID, sender, siteID string, mentions []string) ([]string, error) { - followers, err := h.store.GetThreadFollowers(ctx, parentMsgID, siteID) +func (h *Handler) channelThreadFanOut(ctx context.Context, parentMsgID, sender string, mentions []string) ([]string, error) { + followers, err := h.store.GetThreadFollowers(ctx, parentMsgID) if err != nil { return nil, fmt.Errorf("get thread followers for parent %s: %w", parentMsgID, err) } diff --git a/broadcast-worker/handler_test.go b/broadcast-worker/handler_test.go index d3415898e..f746ea862 100644 --- a/broadcast-worker/handler_test.go +++ b/broadcast-worker/handler_test.go @@ -1700,7 +1700,7 @@ func TestHandleThreadCreated_ChannelRoom_FansOutToFollowers(t *testing.T) { followers := map[string]struct{}{"bob": {}, "carol": {}} store.EXPECT().GetRoomMeta(gomock.Any(), roomID).Return(metaOf(testChannelRoom), nil) - store.EXPECT().GetThreadFollowers(gomock.Any(), parentMsgID, siteID).Return(followers, nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), parentMsgID).Return(followers, nil) us.EXPECT().FindUsersByAccounts(gomock.Any(), []string{"alice"}).Return([]model.User{testUsers[0]}, nil) evt := model.MessageEvent{ @@ -1747,7 +1747,7 @@ func TestHandleThreadCreated_ChannelRoom_NoFollowers_Skips(t 
*testing.T) { msgTime := time.Date(2026, 4, 1, 10, 0, 0, 0, time.UTC) store.EXPECT().GetRoomMeta(gomock.Any(), "r1").Return(metaOf(testChannelRoom), nil) - store.EXPECT().GetThreadFollowers(gomock.Any(), "parent-1", "site-a").Return(map[string]struct{}{}, nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), "parent-1").Return(map[string]struct{}{}, nil) evt := model.MessageEvent{ Event: model.EventCreated, @@ -1866,7 +1866,7 @@ func TestHandleThreadUpdated_ChannelRoom_FansOutToFollowers(t *testing.T) { room := &model.Room{ID: roomID, Type: model.RoomTypeChannel, SiteID: siteID} followers := map[string]struct{}{"bob": {}, "carol": {}} store.EXPECT().GetRoom(gomock.Any(), roomID).Return(room, nil) - store.EXPECT().GetThreadFollowers(gomock.Any(), parentMsgID, siteID).Return(followers, nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), parentMsgID).Return(followers, nil) evt := model.MessageEvent{ Event: model.EventUpdated, @@ -1912,7 +1912,7 @@ func TestHandleThreadUpdated_ChannelRoom_GetThreadFollowersError(t *testing.T) { room := &model.Room{ID: "r1", Type: model.RoomTypeChannel, SiteID: "site-a"} store.EXPECT().GetRoom(gomock.Any(), "r1").Return(room, nil) - store.EXPECT().GetThreadFollowers(gomock.Any(), "parent-1", "site-a").Return(nil, errors.New("db error")) + store.EXPECT().GetThreadFollowers(gomock.Any(), "parent-1").Return(nil, errors.New("db error")) evt := model.MessageEvent{ Event: model.EventUpdated, @@ -2007,7 +2007,7 @@ func TestHandleThreadDeleted_ChannelRoom_FansOutToFollowers(t *testing.T) { room := &model.Room{ID: roomID, Type: model.RoomTypeChannel, SiteID: siteID} followers := map[string]struct{}{"bob": {}, "carol": {}} store.EXPECT().GetRoom(gomock.Any(), roomID).Return(room, nil) - store.EXPECT().GetThreadFollowers(gomock.Any(), parentMsgID, siteID).Return(followers, nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), parentMsgID).Return(followers, nil) // No NewTCount → no badge update. 
evt := model.MessageEvent{ @@ -2053,7 +2053,7 @@ func TestHandleThreadDeleted_ChannelRoom_WithBadgeUpdate(t *testing.T) { room := &model.Room{ID: "r1", Type: model.RoomTypeChannel, SiteID: "site-a"} store.EXPECT().GetRoom(gomock.Any(), "r1").Return(room, nil) - store.EXPECT().GetThreadFollowers(gomock.Any(), "parent-1", "site-a").Return(map[string]struct{}{"bob": {}}, nil) + store.EXPECT().GetThreadFollowers(gomock.Any(), "parent-1").Return(map[string]struct{}{"bob": {}}, nil) evt := model.MessageEvent{ Event: model.EventDeleted, diff --git a/broadcast-worker/mock_store_test.go b/broadcast-worker/mock_store_test.go index 9b84c5e1b..afac533dc 100644 --- a/broadcast-worker/mock_store_test.go +++ b/broadcast-worker/mock_store_test.go @@ -74,18 +74,18 @@ func (mr *MockStoreMockRecorder) GetRoomMeta(ctx, roomID any) *gomock.Call { } // GetThreadFollowers mocks base method. -func (m *MockStore) GetThreadFollowers(ctx context.Context, parentMessageID, siteID string) (map[string]struct{}, error) { +func (m *MockStore) GetThreadFollowers(ctx context.Context, parentMessageID string) (map[string]struct{}, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetThreadFollowers", ctx, parentMessageID, siteID) + ret := m.ctrl.Call(m, "GetThreadFollowers", ctx, parentMessageID) ret0, _ := ret[0].(map[string]struct{}) ret1, _ := ret[1].(error) return ret0, ret1 } // GetThreadFollowers indicates an expected call of GetThreadFollowers. 
-func (mr *MockStoreMockRecorder) GetThreadFollowers(ctx, parentMessageID, siteID any) *gomock.Call { +func (mr *MockStoreMockRecorder) GetThreadFollowers(ctx, parentMessageID any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetThreadFollowers", reflect.TypeOf((*MockStore)(nil).GetThreadFollowers), ctx, parentMessageID, siteID) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetThreadFollowers", reflect.TypeOf((*MockStore)(nil).GetThreadFollowers), ctx, parentMessageID) } // ListSubscriptions mocks base method. diff --git a/broadcast-worker/store.go b/broadcast-worker/store.go index 45bc35b01..3ac4cc207 100644 --- a/broadcast-worker/store.go +++ b/broadcast-worker/store.go @@ -17,7 +17,7 @@ type Store interface { GetRoom(ctx context.Context, roomID string) (*model.Room, error) GetRoomMeta(ctx context.Context, roomID string) (roommetacache.Meta, error) ListSubscriptions(ctx context.Context, roomID string) ([]model.Subscription, error) - GetThreadFollowers(ctx context.Context, parentMessageID, siteID string) (map[string]struct{}, error) + GetThreadFollowers(ctx context.Context, parentMessageID string) (map[string]struct{}, error) UpdateRoomLastMessage(ctx context.Context, roomID, msgID string, msgAt time.Time, mentionAll bool) error SetSubscriptionMentions(ctx context.Context, roomID string, accounts []string) error } diff --git a/broadcast-worker/store_mongo.go b/broadcast-worker/store_mongo.go index c50260f78..565834c5d 100644 --- a/broadcast-worker/store_mongo.go +++ b/broadcast-worker/store_mongo.go @@ -126,17 +126,17 @@ func (m *mongoStore) SetSubscriptionMentions(ctx context.Context, roomID string, return nil } -func (m *mongoStore) GetThreadFollowers(ctx context.Context, parentMessageID, siteID string) (map[string]struct{}, error) { +func (m *mongoStore) GetThreadFollowers(ctx context.Context, parentMessageID string) (map[string]struct{}, error) { var doc struct { ReplyAccounts []string 
`bson:"replyAccounts"` } opts := options.FindOne().SetProjection(bson.M{"replyAccounts": 1, "_id": 0}) - err := m.threadRoomCol.FindOne(ctx, bson.M{"parentMessageId": parentMessageID, "siteId": siteID}, opts).Decode(&doc) + err := m.threadRoomCol.FindOne(ctx, bson.M{"parentMessageId": parentMessageID}, opts).Decode(&doc) if err != nil { if errors.Is(err, mongo.ErrNoDocuments) { return map[string]struct{}{}, nil } - return nil, fmt.Errorf("find thread room by parent %s site %s: %w", parentMessageID, siteID, err) + return nil, fmt.Errorf("find thread room by parent %s: %w", parentMessageID, err) } out := make(map[string]struct{}, len(doc.ReplyAccounts)) for _, a := range doc.ReplyAccounts { From 8427934578a3986dfe34892d013de76c689bbfd9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 15:37:40 +0000 Subject: [PATCH 04/14] fix(broadcast-worker): remove encryption from thread fan-out paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thread replies are published per-user to chat.user.{account}.room.event subjects — each message goes to exactly one user's mailbox, so there is no shared stream to protect. Encrypting per-user subjects adds overhead with no security benefit. Encryption is retained for the shared room channel stream (chat.room.{roomID}.event) where all members subscribe. 
https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- broadcast-worker/handler.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/broadcast-worker/handler.go b/broadcast-worker/handler.go index 5141e4f8c..5f9ec9d67 100644 --- a/broadcast-worker/handler.go +++ b/broadcast-worker/handler.go @@ -201,9 +201,6 @@ func (h *Handler) handleThreadCreated(ctx context.Context, evt *model.MessageEve if len(resolved.Participants) > 0 { roomEvt.Mentions = resolved.Participants } - if err := h.encryptRoomEvent(ctx, meta.ID, clientMsg, &roomEvt); err != nil { - return fmt.Errorf("encrypt thread created event for parent %s: %w", parentMsgID, err) - } payload, err := json.Marshal(roomEvt) if err != nil { return fmt.Errorf("marshal thread created event for parent %s: %w", parentMsgID, err) @@ -283,11 +280,6 @@ func (h *Handler) handleThreadUpdated(ctx context.Context, evt *model.MessageEve "request_id", natsutil.RequestIDFromContext(ctx)) return nil } - if h.encrypt { - if err := h.encryptEditedContent(ctx, room.ID, &edit); err != nil { - return fmt.Errorf("encrypt thread updated event for parent %s: %w", parentMsgID, err) - } - } payload, err := json.Marshal(&edit) if err != nil { return fmt.Errorf("marshal thread edit event for parent %s: %w", parentMsgID, err) From a95079674c23728ba2b0378d94216c5e7f126d14 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 15:37:40 +0000 Subject: [PATCH 05/14] fix(broadcast-worker): only fail publishToThreadAccounts when all publishes fail Previously errgroup returned the first error, causing JetStream to redeliver to all accounts including those who already received the event, producing duplicate thread replies on the frontend. Now partial failure is tolerated: only return an error (triggering redelivery) when every single publish failed. Individual failures are logged but do not cause redelivery for successful recipients. 
https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- broadcast-worker/handler.go | 33 +++++++++++++++++++------------- broadcast-worker/handler_test.go | 17 +++++++++++++++- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/broadcast-worker/handler.go b/broadcast-worker/handler.go index 5f9ec9d67..fe06cc1c8 100644 --- a/broadcast-worker/handler.go +++ b/broadcast-worker/handler.go @@ -8,7 +8,8 @@ import ( "log/slog" "time" - "golang.org/x/sync/errgroup" + "sync" + "sync/atomic" "github.com/hmchangw/chat/pkg/mention" "github.com/hmchangw/chat/pkg/model" @@ -771,29 +772,35 @@ func buildClientMessage(msg *model.Message, userMap map[string]model.User) *mode } // publishToThreadAccounts publishes payload concurrently to every account in -// the list using an errgroup. On publish failure it logs and returns the error -// so the caller can propagate it to JetStream for redelivery — thread per-user -// events must have the same retry guarantee as room-channel events. +// the list. Only returns an error (triggering JetStream redelivery) when every +// publish fails — partial failure is tolerated to avoid duplicate delivery to +// accounts that already received the event on the first attempt. 
func (h *Handler) publishToThreadAccounts(ctx context.Context, accounts []string, payload []byte, parentMsgID string) error { if len(accounts) == 0 { return nil } - g, gctx := errgroup.WithContext(ctx) + var wg sync.WaitGroup + var failCount atomic.Int64 for _, account := range accounts { account := account - g.Go(func() error { - if err := h.pub.Publish(gctx, subject.UserRoomEvent(account), payload); err != nil { - slog.ErrorContext(gctx, "publish thread event failed", + wg.Add(1) + go func() { + defer wg.Done() + if err := h.pub.Publish(ctx, subject.UserRoomEvent(account), payload); err != nil { + slog.ErrorContext(ctx, "publish thread event failed", "error", err, "account", account, "parentMessageID", parentMsgID, - "request_id", natsutil.RequestIDFromContext(gctx)) - return fmt.Errorf("publish thread event to %s for parent %s: %w", account, parentMsgID, err) + "request_id", natsutil.RequestIDFromContext(ctx)) + failCount.Add(1) } - return nil - }) + }() } - return g.Wait() + wg.Wait() + if failCount.Load() == int64(len(accounts)) { + return fmt.Errorf("all %d thread account publishes failed for parent %s", len(accounts), parentMsgID) + } + return nil } // threadFanOutAccounts builds the deduplicated fan-out recipient list for diff --git a/broadcast-worker/handler_test.go b/broadcast-worker/handler_test.go index f746ea862..599fc6af1 100644 --- a/broadcast-worker/handler_test.go +++ b/broadcast-worker/handler_test.go @@ -2157,7 +2157,22 @@ func TestPublishToThreadAccounts_AllFail_ReturnsError(t *testing.T) { h := NewHandler(store, us, failPub, keyStore, false) err := h.publishToThreadAccounts(context.Background(), []string{"alice", "bob"}, []byte(`{}`), "parent-1") require.Error(t, err) - assert.Contains(t, err.Error(), "publish thread event") + assert.Contains(t, err.Error(), "all 2 thread account publishes failed") +} + +func TestPublishToThreadAccounts_PartialFail_ReturnsNil(t *testing.T) { + // failAfter=1: first publish succeeds, subsequent ones fail. 
+ failPub := &failingPublisher{failAfter: 1} + + ctrl := gomock.NewController(t) + store := NewMockStore(ctrl) + us := NewMockUserStore(ctrl) + keyStore := NewMockRoomKeyProvider(ctrl) + + h := NewHandler(store, us, failPub, keyStore, false) + // alice succeeds, bob fails — partial failure must not trigger redelivery. + err := h.publishToThreadAccounts(context.Background(), []string{"alice", "bob"}, []byte(`{}`), "parent-1") + require.NoError(t, err) } func TestPublishToThreadAccounts_Empty_NoOp(t *testing.T) { From 05dd9d82a705b45a24d3d946c1d823087c85f4cc Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 15:32:15 +0000 Subject: [PATCH 06/14] fix(integration-tests): resolve three CI build failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - history-service/internal/service/integration_test.go:349 — TestDeleteMessage_Integration_ThreadReplyPublishesMetadataEvent called New() with 6 args; function now requires 8 (UserStore + CustomEmojiStore were added). Add nil, nil for the two store params. - history-service/internal/cassrepo/reactions_integration_test.go — TestRepository_AddReaction_Pinned and TestRepository_RemoveReaction_Pinned inserted pinned_at into messages_by_room, which lacks that column per the Cassandra model doc. Remove the column from the INSERT statements. - broadcast-worker/integration_test.go — TestBroadcastWorker_GetThreadFollowers called GetThreadFollowers with a siteID arg after commit 37b55b7 removed it from the Store interface. Update calls to 1-arg form and replace the siteId-isolation subtest (not applicable per-site deployment model) with a distinct-parentMessageId subtest. 
--- broadcast-worker/integration_test.go | 20 +++++++++---------- .../cassrepo/reactions_integration_test.go | 8 ++++---- .../internal/service/integration_test.go | 2 +- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/broadcast-worker/integration_test.go b/broadcast-worker/integration_test.go index f9e8dc8f3..9f99ed8e9 100644 --- a/broadcast-worker/integration_test.go +++ b/broadcast-worker/integration_test.go @@ -410,39 +410,37 @@ func TestBroadcastWorker_GetThreadFollowers_Integration(t *testing.T) { ctx := context.Background() store := NewMongoStore(db.Collection("rooms"), db.Collection("subscriptions"), db.Collection("thread_rooms")) - // Seed a thread room document with replyAccounts and a siteId. + // Seed a thread room document with replyAccounts (siteID isolation is handled + // at the deployment level — each site has its own MongoDB instance). _, err := db.Collection("thread_rooms").InsertMany(ctx, []interface{}{ bson.M{ "_id": "tr-1", "parentMessageId": "parent-1", - "siteId": "site-a", "replyAccounts": []string{"bob", "carol", ""}, }, - // Same parentMessageId but different siteId — must NOT be returned. bson.M{ - "_id": "tr-2", - "parentMessageId": "parent-1", - "siteId": "site-b", + "_id": "tr-3", + "parentMessageId": "parent-2", "replyAccounts": []string{"dave"}, }, }) require.NoError(t, err) - t.Run("returns followers for correct siteId", func(t *testing.T) { - followers, err := store.GetThreadFollowers(ctx, "parent-1", "site-a") + t.Run("returns followers with empty strings filtered", func(t *testing.T) { + followers, err := store.GetThreadFollowers(ctx, "parent-1") require.NoError(t, err) // Empty string is filtered out. 
assert.Equal(t, map[string]struct{}{"bob": {}, "carol": {}}, followers) }) - t.Run("cross-siteId isolation: different siteId returns empty", func(t *testing.T) { - followers, err := store.GetThreadFollowers(ctx, "parent-1", "site-b") + t.Run("different parentMessageId returns correct subset", func(t *testing.T) { + followers, err := store.GetThreadFollowers(ctx, "parent-2") require.NoError(t, err) assert.Equal(t, map[string]struct{}{"dave": {}}, followers) }) t.Run("no document returns empty map", func(t *testing.T) { - followers, err := store.GetThreadFollowers(ctx, "nonexistent-parent", "site-a") + followers, err := store.GetThreadFollowers(ctx, "nonexistent-parent") require.NoError(t, err) assert.Empty(t, followers) }) diff --git a/history-service/internal/cassrepo/reactions_integration_test.go b/history-service/internal/cassrepo/reactions_integration_test.go index 743243e33..d7c479037 100644 --- a/history-service/internal/cassrepo/reactions_integration_test.go +++ b/history-service/internal/cassrepo/reactions_integration_test.go @@ -146,8 +146,8 @@ func TestRepository_AddReaction_Pinned(t *testing.T) { msgID, roomID, createdAt, sender, "pinned msg", "", pinnedAt, ).Exec()) require.NoError(t, repo.session.Query( - `INSERT INTO messages_by_room (room_id, bucket, created_at, message_id, sender, msg, thread_parent_id, pinned_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, - roomID, bucketSizer.Of(createdAt), createdAt, msgID, sender, "pinned msg", "", pinnedAt, + `INSERT INTO messages_by_room (room_id, bucket, created_at, message_id, sender, msg, thread_parent_id) VALUES (?, ?, ?, ?, ?, ?, ?)`, + roomID, bucketSizer.Of(createdAt), createdAt, msgID, sender, "pinned msg", "", ).Exec()) require.NoError(t, repo.session.Query( `INSERT INTO pinned_messages_by_room (room_id, created_at, message_id, sender, msg) VALUES (?, ?, ?, ?, ?)`, @@ -294,8 +294,8 @@ func TestRepository_RemoveReaction_Pinned(t *testing.T) { msgID, roomID, createdAt, sender, "pinned msg", "", pinnedAt, 
).Exec()) require.NoError(t, repo.session.Query( - `INSERT INTO messages_by_room (room_id, bucket, created_at, message_id, sender, msg, thread_parent_id, pinned_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)`, - roomID, bucketSizer.Of(createdAt), createdAt, msgID, sender, "pinned msg", "", pinnedAt, + `INSERT INTO messages_by_room (room_id, bucket, created_at, message_id, sender, msg, thread_parent_id) VALUES (?, ?, ?, ?, ?, ?, ?)`, + roomID, bucketSizer.Of(createdAt), createdAt, msgID, sender, "pinned msg", "", ).Exec()) require.NoError(t, repo.session.Query( `INSERT INTO pinned_messages_by_room (room_id, created_at, message_id, sender, msg) VALUES (?, ?, ?, ?, ?)`, diff --git a/history-service/internal/service/integration_test.go b/history-service/internal/service/integration_test.go index 3ecee2074..2c31680f0 100644 --- a/history-service/internal/service/integration_test.go +++ b/history-service/internal/service/integration_test.go @@ -346,7 +346,7 @@ func TestDeleteMessage_Integration_ThreadReplyPublishesMetadataEvent(t *testing. session := setupCassandra(t) repo := cassrepo.NewRepository(session, msgbucket.New(24*time.Hour), 365, nil) pub := &recordingPublisher{} - svc := New(repo, alwaysSubscribedRepo{}, stubRoomRepo{}, pub, nil, &config.Config{ + svc := New(repo, alwaysSubscribedRepo{}, stubRoomRepo{}, pub, nil, nil, nil, &config.Config{ MessageHistoryFloorDays: 730, LargeRoomThreshold: 500, MaxPinnedPerRoom: 10, From e219a4ad747e46d77ead0ec4045d0a7fede98189 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 15:40:13 +0000 Subject: [PATCH 07/14] fix(cassrepo): use pinned_at clustering key in pinned_messages_by_room test INSERTs/SELECTs TestRepository_AddReaction_Pinned and TestRepository_RemoveReaction_Pinned were inserting into pinned_messages_by_room using column name 'created_at' instead of 'pinned_at' (the actual clustering key), and the verification SELECTs had the same wrong column in the WHERE clause. 
https://claude.ai/code/session_013Vs7CusvrZFrRKJaSoFtCi --- .../internal/cassrepo/reactions_integration_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/history-service/internal/cassrepo/reactions_integration_test.go b/history-service/internal/cassrepo/reactions_integration_test.go index d7c479037..aae064e15 100644 --- a/history-service/internal/cassrepo/reactions_integration_test.go +++ b/history-service/internal/cassrepo/reactions_integration_test.go @@ -150,7 +150,7 @@ func TestRepository_AddReaction_Pinned(t *testing.T) { roomID, bucketSizer.Of(createdAt), createdAt, msgID, sender, "pinned msg", "", ).Exec()) require.NoError(t, repo.session.Query( - `INSERT INTO pinned_messages_by_room (room_id, created_at, message_id, sender, msg) VALUES (?, ?, ?, ?, ?)`, + `INSERT INTO pinned_messages_by_room (room_id, pinned_at, message_id, sender, msg) VALUES (?, ?, ?, ?, ?)`, roomID, pinnedAt, msgID, sender, "pinned msg", ).Exec()) @@ -176,7 +176,7 @@ func TestRepository_AddReaction_Pinned(t *testing.T) { // default (no UPDATE issued by AddReaction). var pinnedUpdatedAt time.Time err := repo.session.Query( - `SELECT updated_at FROM pinned_messages_by_room WHERE room_id = ? AND created_at = ? AND message_id = ?`, + `SELECT updated_at FROM pinned_messages_by_room WHERE room_id = ? AND pinned_at = ? 
AND message_id = ?`, roomID, pinnedAt, msgID, ).Scan(&pinnedUpdatedAt) require.NoError(t, err) @@ -298,7 +298,7 @@ func TestRepository_RemoveReaction_Pinned(t *testing.T) { roomID, bucketSizer.Of(createdAt), createdAt, msgID, sender, "pinned msg", "", ).Exec()) require.NoError(t, repo.session.Query( - `INSERT INTO pinned_messages_by_room (room_id, created_at, message_id, sender, msg) VALUES (?, ?, ?, ?, ?)`, + `INSERT INTO pinned_messages_by_room (room_id, pinned_at, message_id, sender, msg) VALUES (?, ?, ?, ?, ?)`, roomID, pinnedAt, msgID, sender, "pinned msg", ).Exec()) @@ -323,7 +323,7 @@ func TestRepository_RemoveReaction_Pinned(t *testing.T) { // pinned_messages_by_room.updated_at must NOT have been touched. var pinnedUpdatedAt time.Time require.NoError(t, repo.session.Query( - `SELECT updated_at FROM pinned_messages_by_room WHERE room_id = ? AND created_at = ? AND message_id = ?`, + `SELECT updated_at FROM pinned_messages_by_room WHERE room_id = ? AND pinned_at = ? AND message_id = ?`, roomID, pinnedAt, msgID, ).Scan(&pinnedUpdatedAt)) assert.True(t, pinnedUpdatedAt.IsZero() || pinnedUpdatedAt.Before(removedAt), From 8c6efd8f16caa8cb0df3ede27fbd8ea5619f8467 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 16:04:47 +0000 Subject: [PATCH 08/14] fix(history-service): simplify already-deleted short-circuit to skip parent lookup and re-publish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tcount is persisted durably by countAndSetParentTcount on the first delete. Re-publishing EventDeleted on retry adds unnecessary Cassandra reads and failure modes (parent-lookup error → retry loop) without any benefit. 
https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- history-service/internal/service/messages.go | 56 +----- .../internal/service/messages_test.go | 176 ++---------------- 2 files changed, 15 insertions(+), 217 deletions(-) diff --git a/history-service/internal/service/messages.go b/history-service/internal/service/messages.go index 5a89c6ce1..95fc11810 100644 --- a/history-service/internal/service/messages.go +++ b/history-service/internal/service/messages.go @@ -382,65 +382,13 @@ func (s *HistoryService) DeleteMessage(c *natsrouter.Context, siteID string, req // Already-deleted short-circuit: echo the current updated_at as the DeletedAt. // Prevents tcount double-decrement on caller retry and avoids duplicate events. - // Re-publishes the canonical deleted event so a badge update that was lost on - // the first attempt (publishCanonicalBestEffort is best-effort) gets retried. - // JetStream dedup (":deleted") prevents double-delivery if the first - // publish actually succeeded. + // countAndSetParentTcount already wrote the correct tcount on the first delete, + // so no re-publish is needed — the tcount is durable in Cassandra. if msg.Deleted { var deletedAtMs int64 if msg.UpdatedAt != nil { deletedAtMs = msg.UpdatedAt.UnixMilli() } - var newTcount *int - // Gate parent lookup on UpdatedAt != nil: nil-UpdatedAt records can never produce - // a valid EventDeleted, so the lookup result would be unconsumed anyway. - if msg.ThreadParentID != "" && msg.UpdatedAt != nil { - parent, parentErr := s.msgReader.GetMessageByID(c, msg.ThreadParentID) - switch { - case parentErr != nil: - // Return error so the caller retries the delete handler. On retry the - // lookup will either succeed (returning the correct tcount) or find the - // parent gone (default branch, which skips the publish). Publishing now - // with NewTCount=nil risks permanently dropping the badge update — the - // same reason the default branch skips the publish entirely. 
- return nil, fmt.Errorf("already-deleted retry: look up parent tcount for %s: %w", msg.ThreadParentID, parentErr) - case parent != nil: - newTcount = parent.TCount - default: - // Parent was concurrently hard-deleted. No badge to update — skip the - // canonical republish entirely to avoid publishing EventDeleted with - // NewTCount=nil, which would cause broadcast-worker to permanently drop - // the tcount decrement. - return &models.DeleteMessageResponse{ - MessageID: req.MessageID, - DeletedAt: deletedAtMs, - }, nil - } - } - // Only republish when UpdatedAt is available. Legacy records with nil - // UpdatedAt cannot produce a valid EventDeleted — downstream handlers - // (broadcast-worker, search-sync) reject nil UpdatedAt and would NAK, - // causing an infinite redelivery loop. - if msg.UpdatedAt != nil { - canonicalEvt := model.MessageEvent{ - Event: model.EventDeleted, - Message: model.Message{ - ID: msg.MessageID, - RoomID: msg.RoomID, - UserID: msg.Sender.ID, - UserAccount: msg.Sender.Account, - Content: msg.Msg, - CreatedAt: msg.CreatedAt, - UpdatedAt: msg.UpdatedAt, - ThreadParentMessageID: msg.ThreadParentID, - TShow: msg.TShow, - }, - SiteID: siteID, - Timestamp: deletedAtMs, - NewTCount: newTcount, - } - s.publishCanonicalBestEffort(c, subject.MsgCanonicalDeleted(siteID), &canonicalEvt) - } return &models.DeleteMessageResponse{ MessageID: req.MessageID, DeletedAt: deletedAtMs, diff --git a/history-service/internal/service/messages_test.go b/history-service/internal/service/messages_test.go index e4f441c82..319c932e7 100644 --- a/history-service/internal/service/messages_test.go +++ b/history-service/internal/service/messages_test.go @@ -1221,7 +1221,7 @@ func TestHistoryService_EditMessage_PassesDedupMessageID(t *testing.T) { // --- DeleteMessage --- func TestHistoryService_DeleteMessage_AlreadyDeleted_ShortCircuits(t *testing.T) { - svc, msgs, subs, pub, _ := newService(t) + svc, msgs, subs, _, _ := newService(t) c := testContext() 
subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) @@ -1236,28 +1236,16 @@ func TestHistoryService_DeleteMessage_AlreadyDeleted_ShortCircuits(t *testing.T) } msgs.EXPECT().GetMessageByID(gomock.Any(), "m-abc").Return(hydrated, nil) - // Non-thread-reply: no parent lookup expected. Publish fires to re-deliver - // any badge event that was lost if the original publish failed. - pub.EXPECT(). - Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). - DoAndReturn(func(_ context.Context, _ string, data []byte, dedupID string) error { - var evt model.MessageEvent - require.NoError(t, json.Unmarshal(data, &evt)) - assert.Equal(t, model.EventDeleted, evt.Event) - assert.Equal(t, "m-abc", evt.Message.ID) - assert.Nil(t, evt.NewTCount, "non-thread-reply should have nil NewTCount") - assert.Equal(t, natsutil.CanonicalDedupID(&evt), dedupID) - return nil - }) - + // Already-deleted: no parent lookup, no publish. tcount was persisted by + // countAndSetParentTcount on the first delete and is durable in Cassandra. 
resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-abc"}) require.NoError(t, err) assert.Equal(t, "m-abc", resp.MessageID) assert.Equal(t, priorUpdatedAt.UnixMilli(), resp.DeletedAt, "short-circuit should echo the existing updated_at") } -func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_RepublishesWithParentTCount(t *testing.T) { - svc, msgs, subs, pub, _ := newService(t) +func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_ShortCircuits(t *testing.T) { + svc, msgs, subs, _, _ := newService(t) c := testContext() subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) @@ -1274,111 +1262,21 @@ func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_RepublishesWith } msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-abc").Return(hydrated, nil) - parentTcount := 3 - parent := &models.Message{ - MessageID: "parent-xyz", - RoomID: "r1", - TCount: &parentTcount, - } - msgs.EXPECT().GetMessageByID(gomock.Any(), "parent-xyz").Return(parent, nil) - - pub.EXPECT(). - Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). - DoAndReturn(func(_ context.Context, _ string, data []byte, _ string) error { - var evt model.MessageEvent - require.NoError(t, json.Unmarshal(data, &evt)) - assert.Equal(t, model.EventDeleted, evt.Event) - assert.Equal(t, "reply-abc", evt.Message.ID) - assert.Equal(t, "parent-xyz", evt.Message.ThreadParentMessageID) - require.NotNil(t, evt.NewTCount) - assert.Equal(t, 3, *evt.NewTCount) - return nil - }) - + // No parent lookup, no publish: tcount is durable in Cassandra from the first delete. 
resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-abc"}) require.NoError(t, err) assert.Equal(t, "reply-abc", resp.MessageID) assert.Equal(t, priorUpdatedAt.UnixMilli(), resp.DeletedAt) } -// TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_ParentHardDeleted_SkipsRepublish -// verifies that when GetMessageByID returns (nil, nil) for the parent (concurrent hard-delete), -// the already-deleted short-circuit skips the canonical republish entirely. There is no badge -// to update when the parent row is gone, so publishing EventDeleted with NewTCount=nil would -// cause broadcast-worker to permanently skip a tcount decrement it can never apply. -func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_ParentHardDeleted_SkipsRepublish(t *testing.T) { - svc, msgs, subs, pub, _ := newService(t) - c := testContext() - - subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) - - priorUpdatedAt := time.Now().UTC().Add(-time.Hour).Truncate(time.Millisecond) - hydrated := &models.Message{ - MessageID: "reply-abc", - RoomID: "r1", - Sender: models.Participant{Account: "u1", ID: "u1-id"}, - Deleted: true, - UpdatedAt: &priorUpdatedAt, - ThreadParentID: "parent-xyz", - TShow: false, - } - msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-abc").Return(hydrated, nil) - - // Parent was concurrently hard-deleted — GetMessageByID returns (nil, nil). - msgs.EXPECT().GetMessageByID(gomock.Any(), "parent-xyz").Return(nil, nil) - - // No publish expected: parent is gone, no badge to update. 
- _ = pub - - resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-abc"}) - require.NoError(t, err, "already-deleted retry must return success even when parent is gone") - assert.Equal(t, "reply-abc", resp.MessageID) - assert.Equal(t, priorUpdatedAt.UnixMilli(), resp.DeletedAt) -} - -// TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_ParentLookupError_ReturnsError -// verifies that when the parent-tcount lookup fails on an already-deleted retry, the handler -// returns an error instead of publishing with NewTCount=nil. Publishing nil tcount would cause -// broadcast-worker to permanently drop the badge update — the same reason the hard-deleted -// parent branch (default:) skips the publish entirely. Returning an error lets the client -// retry the delete; on the next attempt the lookup will either succeed or find the parent gone. -func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_ParentLookupError_ReturnsError(t *testing.T) { +// TestHistoryService_DeleteMessage_AlreadyDeleted_NilUpdatedAt verifies that a +// deleted record with nil UpdatedAt returns success with DeletedAt=0. 
+func TestHistoryService_DeleteMessage_AlreadyDeleted_NilUpdatedAt(t *testing.T) { svc, msgs, subs, _, _ := newService(t) c := testContext() subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) - priorUpdatedAt := time.Now().UTC().Add(-time.Hour).Truncate(time.Millisecond) - hydrated := &models.Message{ - MessageID: "reply-abc", - RoomID: "r1", - Sender: models.Participant{Account: "u1", ID: "u1-id"}, - Deleted: true, - UpdatedAt: &priorUpdatedAt, - ThreadParentID: "parent-xyz", - TShow: false, - } - msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-abc").Return(hydrated, nil) - - // Parent lookup fails — transient error - msgs.EXPECT().GetMessageByID(gomock.Any(), "parent-xyz").Return(nil, fmt.Errorf("cassandra: unavailable")) - - // No publish: publishing with NewTCount=nil would permanently drop the badge update. - _, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-abc"}) - require.Error(t, err, "already-deleted retry must return error when parent tcount lookup fails") -} - -// TestHistoryService_DeleteMessage_AlreadyDeleted_NilUpdatedAt_SkipsRepublish verifies -// that when a deleted record has nil UpdatedAt (legacy row written before the field was -// added), the already-deleted short-circuit does NOT publish a canonical event. -// Downstream handlers (broadcast-worker handleThreadDeleted / handleDeleted) guard on -// msg.UpdatedAt != nil and would NAK, causing an infinite redelivery loop. 
-func TestHistoryService_DeleteMessage_AlreadyDeleted_NilUpdatedAt_SkipsRepublish(t *testing.T) { - svc, msgs, subs, pub, _ := newService(t) - c := testContext() - - subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) - hydrated := &models.Message{ MessageID: "m-legacy", RoomID: "r1", @@ -1388,22 +1286,16 @@ func TestHistoryService_DeleteMessage_AlreadyDeleted_NilUpdatedAt_SkipsRepublish } msgs.EXPECT().GetMessageByID(gomock.Any(), "m-legacy").Return(hydrated, nil) - // pub must NOT be called — a nil UpdatedAt cannot produce a valid EventDeleted. - // If it were published, broadcast-worker would NAK and redelivery would loop. - _ = pub // no EXPECT needed; gomock strict controller will fail if Publish is called - resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-legacy"}) require.NoError(t, err, "already-deleted with nil UpdatedAt must still return success") assert.Equal(t, "m-legacy", resp.MessageID) assert.Equal(t, int64(0), resp.DeletedAt, "DeletedAt should be 0 when UpdatedAt is nil") } -// TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_NilUpdatedAt_SkipsRepublish -// verifies the nil-UpdatedAt guard for thread replies. When UpdatedAt is nil the handler -// skips both the parent-tcount lookup AND the canonical event — no wasted Cassandra read -// for records that will never produce a valid EventDeleted anyway. -func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_NilUpdatedAt_SkipsRepublish(t *testing.T) { - svc, msgs, subs, pub, _ := newService(t) +// TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_NilUpdatedAt verifies that a +// deleted thread reply with nil UpdatedAt returns success with DeletedAt=0, no parent lookup. 
+func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_NilUpdatedAt(t *testing.T) { + svc, msgs, subs, _, _ := newService(t) c := testContext() subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) @@ -1419,13 +1311,6 @@ func TestHistoryService_DeleteMessage_AlreadyDeleted_ThreadReply_NilUpdatedAt_Sk } msgs.EXPECT().GetMessageByID(gomock.Any(), "reply-legacy").Return(hydrated, nil) - // Parent lookup must NOT be called: UpdatedAt=nil means we can't produce a valid - // EventDeleted, so the lookup result is never consumed. Gomock strict controller - // will fail if GetMessageByID("parent-xyz") is called unexpectedly. - - // No publish expected — nil UpdatedAt suppresses the canonical event. - _ = pub - resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "reply-legacy"}) require.NoError(t, err, "already-deleted thread reply with nil UpdatedAt must return success") assert.Equal(t, "reply-legacy", resp.MessageID) @@ -2145,41 +2030,6 @@ func TestHistoryService_DeleteMessage_EventDeletedCarriesContent(t *testing.T) { assert.Equal(t, "m-content", resp.MessageID) } -// TestHistoryService_DeleteMessage_AlreadyDeleted_EventDeletedCarriesContent verifies -// that the already-deleted retry path also includes Content in EventDeleted. -func TestHistoryService_DeleteMessage_AlreadyDeleted_EventDeletedCarriesContent(t *testing.T) { - svc, msgs, subs, pub, _ := newService(t) - c := testContext() - - subs.EXPECT().GetHistorySharedSince(gomock.Any(), "u1", "r1").Return(nil, true, nil) - - priorUpdatedAt := time.Now().UTC().Add(-time.Hour).Truncate(time.Millisecond) - hydrated := &models.Message{ - MessageID: "m-retry", - RoomID: "r1", - Sender: models.Participant{Account: "u1", ID: "u1-id"}, - Deleted: true, - UpdatedAt: &priorUpdatedAt, - Msg: "hey @carol look at this", - } - msgs.EXPECT().GetMessageByID(gomock.Any(), "m-retry").Return(hydrated, nil) - - pub.EXPECT(). 
- Publish(gomock.Any(), subject.MsgCanonicalDeleted("site-test"), gomock.Any(), gomock.Any()). - DoAndReturn(func(_ context.Context, _ string, data []byte, _ string) error { - var evt model.MessageEvent - require.NoError(t, json.Unmarshal(data, &evt)) - assert.Equal(t, model.EventDeleted, evt.Event) - assert.Equal(t, "hey @carol look at this", evt.Message.Content, - "already-deleted retry EventDeleted must carry Content for thread-delete fan-out") - return nil - }) - - resp, err := svc.DeleteMessage(c, "site-test", models.DeleteMessageRequest{MessageID: "m-retry"}) - require.NoError(t, err) - assert.Equal(t, "m-retry", resp.MessageID) -} - // TShow message where ThreadParentCreatedAt is nil (message-worker didn't populate it) → // conservatively redacted because the access window cannot be verified. func TestHistoryService_TShow_ThreadParentCreatedAtNil_ConservativeRedaction(t *testing.T) { From d32391c2010038ebfcdae031dbb17ca5f6cef228 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 16:06:07 +0000 Subject: [PATCH 09/14] fix(message-worker): remove LWT (IF NOT EXISTS) from SaveThreadMessage JetStream MsgID dedup prevents double-delivery at the consumer level, so IF NOT EXISTS was adding 5-10x Paxos overhead for no benefit. tcount is derived from a COUNT + blind SET via countAndSetParentTcount, which is idempotent on redelivery without any CAS. 
https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- message-worker/store_cassandra.go | 50 ++++++++++--------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/message-worker/store_cassandra.go b/message-worker/store_cassandra.go index 55a4035a6..81ed8349e 100644 --- a/message-worker/store_cassandra.go +++ b/message-worker/store_cassandra.go @@ -160,19 +160,12 @@ func (s *CassandraStore) saveMessageEncrypted(ctx context.Context, msg *model.Me return nil } -// SaveThreadMessage writes the reply to messages_by_id using an LWT -// (IF NOT EXISTS) and then unconditionally inserts into -// thread_messages_by_thread. -// -// The LWT is the idempotency gate for tcount: -// - applied=true → first delivery → increment parent tcount. -// - applied=false → redelivery → read and return the current tcount so -// the caller can still publish a badge event (no increment — avoids -// double-counting on publish-failure retries). -// -// Using IF NOT EXISTS eliminates the SELECT-before-INSERT TOCTOU window of the -// previous pre-check design. The thread_messages_by_thread INSERT is plain -// (no LWT): re-writing an identical row is safe and keeps that write fast. +// SaveThreadMessage writes the reply to messages_by_id and then inserts into +// thread_messages_by_thread. Both writes are plain INSERTs (no LWT): JetStream +// MsgID dedup prevents double-delivery at the consumer level, so re-inserting +// an identical row is safe and avoids the 5–10× Paxos overhead of IF NOT EXISTS. +// countAndSetParentTcount derives tcount from a COUNT query and blind-SETs it, +// which is idempotent on redelivery without any CAS. 
func (s *CassandraStore) SaveThreadMessage(ctx context.Context, msg *model.Message, sender *cassParticipant, siteID string, threadRoomID string) (*int, error) { if s.cipher != nil { return s.saveThreadMessageEncrypted(ctx, msg, sender, siteID, threadRoomID) @@ -180,24 +173,17 @@ func (s *CassandraStore) SaveThreadMessage(ctx context.Context, msg *model.Messa mentions := toMentionSet(msg.Mentions) - // MapScanCAS is required here instead of ScanCAS(). When IF NOT EXISTS is - // not applied (row already exists), Cassandra returns [applied]=false PLUS - // all existing row columns. ScanCAS() with no destinations cannot absorb - // those extra columns and returns "not enough columns to scan into". - // MapScanCAS scans everything into a map so no column count is needed. - casRow := make(map[string]interface{}) - _, err := s.cassSession.Query( + if err := s.cassSession.Query( `INSERT INTO messages_by_id (message_id, created_at, room_id, sender, msg, site_id, updated_at, mentions, thread_room_id, thread_parent_id, thread_parent_created_at, type, sys_msg_data, tshow, quoted_parent_message, attachments, card, card_action, file) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
IF NOT EXISTS`, + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, msg.ID, msg.CreatedAt, msg.RoomID, sender, msg.Content, siteID, msg.CreatedAt, mentions, threadRoomID, msg.ThreadParentMessageID, msg.ThreadParentMessageCreatedAt, msg.Type, msg.SysMsgData, msg.TShow, msg.QuotedParentMessage, msg.Attachments, msg.Card, msg.CardAction, msg.File, - ).WithContext(ctx).MapScanCAS(casRow) - if err != nil { - return nil, fmt.Errorf("lwt insert thread message %s into messages_by_id: %w", msg.ID, err) + ).WithContext(ctx).Exec(); err != nil { + return nil, fmt.Errorf("insert thread message %s into messages_by_id: %w", msg.ID, err) } if err := s.cassSession.Query( @@ -218,7 +204,8 @@ func (s *CassandraStore) SaveThreadMessage(ctx context.Context, msg *model.Messa } // saveThreadMessageEncrypted is the cipher-enabled counterpart to -// SaveThreadMessage. See SaveThreadMessage for the LWT idempotency rationale. +// SaveThreadMessage. Both writes are plain INSERTs — see SaveThreadMessage for +// the rationale (JetStream MsgID dedup + idempotent countAndSetParentTcount). // // Encrypted body columns (msg, attachments, card, card_action, file) are bound // to NULL so a redelivered pre-encryption row cannot end up in a hybrid @@ -235,24 +222,19 @@ func (s *CassandraStore) saveThreadMessageEncrypted(ctx context.Context, msg *mo encMeta := &cassandra.EncMeta{Nonce: meta.Nonce} mentions := toMentionSet(msg.Mentions) - // Same MapScanCAS rationale as SaveThreadMessage: IF NOT EXISTS returns all - // existing columns on non-apply, which ScanCAS() cannot absorb without - // explicit scan destinations. 
- casRow := make(map[string]interface{}) - _, err = s.cassSession.Query( + if err = s.cassSession.Query( `INSERT INTO messages_by_id (message_id, created_at, room_id, sender, site_id, updated_at, mentions, thread_room_id, thread_parent_id, thread_parent_created_at, type, tshow, quoted_parent_message, sys_msg_data, msg, attachments, card, card_action, file, enc_payload, enc_meta) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, null, null, null, null, null, ?, ?) IF NOT EXISTS`, + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, null, null, null, null, null, ?, ?)`, msg.ID, msg.CreatedAt, msg.RoomID, sender, siteID, msg.CreatedAt, mentions, threadRoomID, msg.ThreadParentMessageID, msg.ThreadParentMessageCreatedAt, msg.Type, msg.TShow, cm.QuotedParentMessage, msg.SysMsgData, payload, encMeta, - ).WithContext(ctx).MapScanCAS(casRow) - if err != nil { - return nil, fmt.Errorf("lwt insert thread message %s into messages_by_id: %w", msg.ID, err) + ).WithContext(ctx).Exec(); err != nil { + return nil, fmt.Errorf("insert thread message %s into messages_by_id: %w", msg.ID, err) } if err := s.cassSession.Query( From 1284a2fb95e188a3a04bda26d1c4fcee73989a4f Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 16:09:09 +0000 Subject: [PATCH 10/14] fix(room-service): replace aggregation pipeline in UpdateSubscriptionThreadRead with app-side logic Use $pull to atomically remove the threadID, then check in Go whether threadUnread is now empty. If empty, a second UpdateOne clears alert and unsets the field. Avoids aggregation pipeline CPU overhead on MongoDB. 
https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- room-service/store_mongo.go | 48 ++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/room-service/store_mongo.go b/room-service/store_mongo.go index a6cdf6a71..ab8881879 100644 --- a/room-service/store_mongo.go +++ b/room-service/store_mongo.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "log/slog" "regexp" "time" @@ -1119,39 +1120,18 @@ func (s *MongoStore) GetThreadSubscriptionByParent(ctx context.Context, account, return &ts, nil } -// UpdateSubscriptionThreadRead atomically removes threadID from threadUnread via -// an aggregation-pipeline FindOneAndUpdate. When the result is empty, the field -// is removed ($$REMOVE) and alert is set to false. +// UpdateSubscriptionThreadRead removes threadID from threadUnread using a $pull +// and returns the resulting state. If threadUnread becomes empty a second update +// clears alert and removes the field. func (s *MongoStore) UpdateSubscriptionThreadRead(ctx context.Context, roomID, account, threadID string) ([]string, bool, error) { filter := bson.M{"roomId": roomID, "u.account": account} - // Aggregation pipeline: filter out threadID in one atomic pass, then unset - // threadUnread if the result is empty ($$REMOVE) and derive alert from that. - // Stage 1 stores the filtered array in a temp field _tuf; stage 2 applies it. 
- update := bson.A{ - bson.M{"$set": bson.M{"_tuf": bson.M{"$filter": bson.M{ - "input": bson.M{"$ifNull": bson.A{"$threadUnread", bson.A{}}}, - "as": "item", - "cond": bson.M{"$ne": bson.A{"$$item", threadID}}, - }}}}, - bson.M{"$set": bson.M{ - "threadUnread": bson.M{"$cond": bson.A{ - bson.M{"$gt": bson.A{bson.M{"$size": "$_tuf"}, 0}}, - "$_tuf", - "$$REMOVE", - }}, - "alert": bson.M{"$cond": bson.A{ - bson.M{"$gt": bson.A{bson.M{"$size": "$_tuf"}, 0}}, - "$alert", - false, - }}, - }}, - bson.M{"$unset": "_tuf"}, - } - opts := options.FindOneAndUpdate().SetReturnDocument(options.After) var updated model.Subscription - err := s.subscriptions.FindOneAndUpdate(ctx, filter, update, opts).Decode(&updated) + err := s.subscriptions.FindOneAndUpdate(ctx, filter, + bson.M{"$pull": bson.M{"threadUnread": threadID}}, + opts, + ).Decode(&updated) if errors.Is(err, mongo.ErrNoDocuments) { return nil, false, fmt.Errorf("update subscription thread-read for %q in room %q: %w", account, roomID, model.ErrSubscriptionNotFound) @@ -1159,6 +1139,18 @@ func (s *MongoStore) UpdateSubscriptionThreadRead(ctx context.Context, roomID, a if err != nil { return nil, false, fmt.Errorf("update subscription thread-read for %q in room %q: %w", account, roomID, err) } + + if len(updated.ThreadUnread) == 0 { + if _, err = s.subscriptions.UpdateOne(ctx, filter, bson.M{ + "$set": bson.M{"alert": false}, + "$unset": bson.M{"threadUnread": ""}, + }); err != nil { + slog.WarnContext(ctx, "clear alert after empty threadUnread", + "error", err, "account", account, "roomID", roomID) + } + return nil, false, nil + } + return updated.ThreadUnread, updated.Alert, nil } From 581d2646d1512b16d7f627aa6507e2004aedaa9c Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 16:16:14 +0000 Subject: [PATCH 11/14] fix(broadcast-worker): set Timestamp to publish time, add EventTimestamp for canonical event time Timestamp on each room event struct now records when broadcast-worker publishes the event, enabling 
clients to detect JetStream redeliveries. EventTimestamp carries the original canonical event time from message-worker for correlation. ReactRoomEvent already used publish time; all other event types (RoomEvent, EditRoomEvent, DeleteRoomEvent, PinRoomEvent, UnpinRoomEvent, ThreadMetadataUpdatedEvent) are updated. https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- broadcast-worker/handler.go | 115 ++++++++++++++++--------------- broadcast-worker/handler_test.go | 21 ++++-- pkg/model/event.go | 53 +++++++------- 3 files changed, 105 insertions(+), 84 deletions(-) diff --git a/broadcast-worker/handler.go b/broadcast-worker/handler.go index fe06cc1c8..00040433c 100644 --- a/broadcast-worker/handler.go +++ b/broadcast-worker/handler.go @@ -382,7 +382,7 @@ func (h *Handler) handleThreadTCountUpdated(ctx context.Context, evt *model.Mess } func (h *Handler) publishThreadMetadata(ctx context.Context, room *model.Room, newTcount int, - parentMsgID, replyMsgID string, action model.ThreadAction, timestamp int64) error { + parentMsgID, replyMsgID string, action model.ThreadAction, eventTimestamp int64) error { evt := model.ThreadMetadataUpdatedEvent{ Type: model.RoomEventThreadMetadataUpdated, RoomID: room.ID, @@ -391,7 +391,8 @@ func (h *Handler) publishThreadMetadata(ctx context.Context, room *model.Room, n ReplyMessageID: replyMsgID, NewTCount: newTcount, Action: action, - Timestamp: timestamp, + Timestamp: time.Now().UTC().UnixMilli(), + EventTimestamp: eventTimestamp, } payload, err := json.Marshal(evt) if err != nil { @@ -472,13 +473,14 @@ func (h *Handler) handlePinned(ctx context.Context, evt *model.MessageEvent) err } pin := model.PinRoomEvent{ - Type: model.RoomEventMessagePinned, - RoomID: room.ID, - SiteID: room.SiteID, - Timestamp: evt.Timestamp, - MessageID: msg.ID, - PinnedBy: msg.PinnedBy, - PinnedAt: *msg.PinnedAt, + Type: model.RoomEventMessagePinned, + RoomID: room.ID, + SiteID: room.SiteID, + Timestamp: time.Now().UTC().UnixMilli(), + 
EventTimestamp: evt.Timestamp, + MessageID: msg.ID, + PinnedBy: msg.PinnedBy, + PinnedAt: *msg.PinnedAt, } return h.publishMutation(ctx, room, model.RoomEventMessagePinned, msg.ID, &pin) } @@ -495,13 +497,14 @@ func (h *Handler) handleUnpinned(ctx context.Context, evt *model.MessageEvent) e } unpin := model.UnpinRoomEvent{ - Type: model.RoomEventMessageUnpinned, - RoomID: room.ID, - SiteID: room.SiteID, - Timestamp: evt.Timestamp, - MessageID: msg.ID, - UnpinnedBy: msg.PinnedBy, - UnpinnedAt: time.UnixMilli(evt.Timestamp).UTC(), + Type: model.RoomEventMessageUnpinned, + RoomID: room.ID, + SiteID: room.SiteID, + Timestamp: time.Now().UTC().UnixMilli(), + EventTimestamp: evt.Timestamp, + MessageID: msg.ID, + UnpinnedBy: msg.PinnedBy, + UnpinnedAt: time.UnixMilli(evt.Timestamp).UTC(), } return h.publishMutation(ctx, room, model.RoomEventMessageUnpinned, msg.ID, &unpin) } @@ -534,16 +537,17 @@ func (h *Handler) handleReacted(ctx context.Context, evt *model.MessageEvent) er } react := model.ReactRoomEvent{ - Type: model.RoomEventMessageReacted, - RoomID: room.ID, - SiteID: room.SiteID, - Timestamp: time.Now().UTC().UnixMilli(), - MessageID: msg.ID, - Shortcode: evt.ReactionDelta.Shortcode, - Action: evt.ReactionDelta.Action, - Actor: evt.ReactionDelta.Actor, - ReactedAt: *msg.UpdatedAt, - UpdatedAt: *msg.UpdatedAt, + Type: model.RoomEventMessageReacted, + RoomID: room.ID, + SiteID: room.SiteID, + Timestamp: time.Now().UTC().UnixMilli(), + EventTimestamp: evt.Timestamp, + MessageID: msg.ID, + Shortcode: evt.ReactionDelta.Shortcode, + Action: evt.ReactionDelta.Action, + Actor: evt.ReactionDelta.Actor, + ReactedAt: *msg.UpdatedAt, + UpdatedAt: *msg.UpdatedAt, } return h.publishMutation(ctx, room, model.RoomEventMessageReacted, msg.ID, &react) } @@ -594,29 +598,31 @@ func (h *Handler) publishMutation(ctx context.Context, room *model.Room, roomEvt func buildEditRoomEvent(room *model.Room, evt *model.MessageEvent) model.EditRoomEvent { msg := evt.Message return 
model.EditRoomEvent{ - Type: model.RoomEventMessageEdited, - RoomID: room.ID, - SiteID: room.SiteID, - Timestamp: evt.Timestamp, - MessageID: msg.ID, - NewContent: msg.Content, - EditedBy: msg.UserAccount, - EditedAt: *msg.EditedAt, - UpdatedAt: *msg.UpdatedAt, + Type: model.RoomEventMessageEdited, + RoomID: room.ID, + SiteID: room.SiteID, + Timestamp: time.Now().UTC().UnixMilli(), + EventTimestamp: evt.Timestamp, + MessageID: msg.ID, + NewContent: msg.Content, + EditedBy: msg.UserAccount, + EditedAt: *msg.EditedAt, + UpdatedAt: *msg.UpdatedAt, } } func buildDeleteRoomEvent(room *model.Room, evt *model.MessageEvent) model.DeleteRoomEvent { msg := evt.Message return model.DeleteRoomEvent{ - Type: model.RoomEventMessageDeleted, - RoomID: room.ID, - SiteID: room.SiteID, - Timestamp: evt.Timestamp, - MessageID: msg.ID, - DeletedBy: msg.UserAccount, - DeletedAt: *msg.UpdatedAt, - UpdatedAt: *msg.UpdatedAt, + Type: model.RoomEventMessageDeleted, + RoomID: room.ID, + SiteID: room.SiteID, + Timestamp: time.Now().UTC().UnixMilli(), + EventTimestamp: evt.Timestamp, + MessageID: msg.ID, + DeletedBy: msg.UserAccount, + DeletedAt: *msg.UpdatedAt, + UpdatedAt: *msg.UpdatedAt, } } @@ -738,18 +744,19 @@ func (h *Handler) publishDMEvents(ctx context.Context, meta roommetacache.Meta, return nil } -func buildRoomEvent(meta roommetacache.Meta, clientMsg *model.ClientMessage, timestamp int64) model.RoomEvent { +func buildRoomEvent(meta roommetacache.Meta, clientMsg *model.ClientMessage, eventTimestamp int64) model.RoomEvent { return model.RoomEvent{ - Type: model.RoomEventNewMessage, - RoomID: meta.ID, - Timestamp: timestamp, - RoomName: meta.Name, - RoomType: meta.Type, - SiteID: meta.SiteID, - UserCount: meta.UserCount, - LastMsgAt: clientMsg.CreatedAt, - LastMsgID: clientMsg.ID, - Message: clientMsg, + Type: model.RoomEventNewMessage, + RoomID: meta.ID, + Timestamp: time.Now().UTC().UnixMilli(), + EventTimestamp: eventTimestamp, + RoomName: meta.Name, + RoomType: meta.Type, + 
SiteID: meta.SiteID, + UserCount: meta.UserCount, + LastMsgAt: clientMsg.CreatedAt, + LastMsgID: clientMsg.ID, + Message: clientMsg, } } diff --git a/broadcast-worker/handler_test.go b/broadcast-worker/handler_test.go index 599fc6af1..ceb745431 100644 --- a/broadcast-worker/handler_test.go +++ b/broadcast-worker/handler_test.go @@ -244,7 +244,8 @@ func TestHandler_HandleMessage_ChannelRoom(t *testing.T) { assert.Equal(t, "site-a", evt.SiteID) assert.Equal(t, 5, evt.UserCount) assert.Equal(t, "msg-1", evt.LastMsgID) - assert.Equal(t, msgTime.UnixMilli(), evt.Timestamp) + assert.Positive(t, evt.Timestamp, "Timestamp must be the broadcast-worker publish time") + assert.Equal(t, msgTime.UnixMilli(), evt.EventTimestamp) assert.Equal(t, tc.wantMentionAll, evt.MentionAll) assert.Equal(t, "msg-1", msg.ID) @@ -346,7 +347,8 @@ func TestHandler_HandleMessage_DMRoom(t *testing.T) { aliceEvt := evtBySubject[subject.UserRoomEvent("alice")] assert.Equal(t, model.RoomEventNewMessage, aliceEvt.Type) - assert.Equal(t, msgTime.UnixMilli(), aliceEvt.Timestamp) + assert.Positive(t, aliceEvt.Timestamp, "Timestamp must be the broadcast-worker publish time") + assert.Equal(t, msgTime.UnixMilli(), aliceEvt.EventTimestamp) require.NotNil(t, aliceEvt.Message, "DM events must carry Message payload") assert.Equal(t, "msg-1", aliceEvt.Message.ID) require.NotNil(t, aliceEvt.Message.Sender) @@ -356,7 +358,8 @@ func TestHandler_HandleMessage_DMRoom(t *testing.T) { bobEvt := evtBySubject[subject.UserRoomEvent("bob")] require.NotNil(t, bobEvt.Message) - assert.Equal(t, msgTime.UnixMilli(), bobEvt.Timestamp) + assert.Positive(t, bobEvt.Timestamp, "Timestamp must be the broadcast-worker publish time") + assert.Equal(t, msgTime.UnixMilli(), bobEvt.EventTimestamp) assert.Equal(t, "msg-1", bobEvt.Message.ID) require.NotNil(t, bobEvt.Message.Sender) assert.Equal(t, tc.bobHasMention, bobEvt.HasMention) @@ -1592,7 +1595,8 @@ func TestHandleMessage_ThreadReplyAdded_DispatchesToHandleThreadTCountUpdated(t 
assert.Equal(t, "reply-1", tmEvt.ReplyMessageID) assert.Equal(t, 3, tmEvt.NewTCount) assert.Equal(t, model.ThreadActionReplyAdded, tmEvt.Action) - assert.Equal(t, msgTime.UnixMilli(), tmEvt.Timestamp) + assert.Positive(t, tmEvt.Timestamp, "Timestamp must be the broadcast-worker publish time") + assert.Equal(t, msgTime.UnixMilli(), tmEvt.EventTimestamp) } func TestHandleThreadTCountUpdated_MissingNewTCount_Skips(t *testing.T) { @@ -1731,7 +1735,8 @@ func TestHandleThreadCreated_ChannelRoom_FansOutToFollowers(t *testing.T) { var roomEvt model.RoomEvent require.NoError(t, json.Unmarshal(r.data, &roomEvt)) assert.Equal(t, model.RoomEventNewMessage, roomEvt.Type) - assert.Equal(t, msgTime.UnixMilli(), roomEvt.Timestamp) + assert.Positive(t, roomEvt.Timestamp, "Timestamp must be the broadcast-worker publish time") + assert.Equal(t, msgTime.UnixMilli(), roomEvt.EventTimestamp) } assert.True(t, subjects[subject.UserRoomEvent("bob")]) assert.True(t, subjects[subject.UserRoomEvent("carol")]) @@ -1896,7 +1901,8 @@ func TestHandleThreadUpdated_ChannelRoom_FansOutToFollowers(t *testing.T) { assert.Equal(t, model.RoomEventMessageEdited, roomEvt.Type) assert.Equal(t, "reply-1", roomEvt.MessageID) assert.Equal(t, "updated thread reply", roomEvt.NewContent) - assert.Equal(t, editedAt.UnixMilli(), roomEvt.Timestamp) + assert.Positive(t, roomEvt.Timestamp, "Timestamp must be the broadcast-worker publish time") + assert.Equal(t, editedAt.UnixMilli(), roomEvt.EventTimestamp) } } @@ -2036,7 +2042,8 @@ func TestHandleThreadDeleted_ChannelRoom_FansOutToFollowers(t *testing.T) { require.NoError(t, json.Unmarshal(r.data, &roomEvt)) assert.Equal(t, model.RoomEventMessageDeleted, roomEvt.Type) assert.Equal(t, "reply-1", roomEvt.MessageID) - assert.Equal(t, deletedAt.UnixMilli(), roomEvt.Timestamp) + assert.Positive(t, roomEvt.Timestamp, "Timestamp must be the broadcast-worker publish time") + assert.Equal(t, deletedAt.UnixMilli(), roomEvt.EventTimestamp) } } diff --git a/pkg/model/event.go 
b/pkg/model/event.go index a0f1b5ac9..4ca3306b1 100644 --- a/pkg/model/event.go +++ b/pkg/model/event.go @@ -229,9 +229,10 @@ const ( // EditRoomEvent / DeleteRoomEvent / PinRoomEvent / UnpinRoomEvent so clients // are not handed zero-valued base fields. type RoomEvent struct { - Type RoomEventType `json:"type"` - RoomID string `json:"roomId"` - Timestamp int64 `json:"timestamp" bson:"timestamp"` + Type RoomEventType `json:"type"` + RoomID string `json:"roomId"` + Timestamp int64 `json:"timestamp" bson:"timestamp"` + EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` RoomName string `json:"roomName"` RoomType RoomType `json:"roomType"` @@ -258,6 +259,7 @@ type EditRoomEvent struct { RoomID string `json:"roomId" bson:"roomId"` SiteID string `json:"siteId" bson:"siteId"` Timestamp int64 `json:"timestamp" bson:"timestamp"` + EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` MessageID string `json:"messageId" bson:"messageId"` NewContent string `json:"newContent,omitempty" bson:"newContent,omitempty"` EncryptedNewContent json.RawMessage `json:"encryptedNewContent,omitempty" bson:"encryptedNewContent,omitempty"` @@ -269,11 +271,12 @@ type EditRoomEvent struct { // DeleteRoomEvent is the live event published when a message is deleted. Fields // are flat (no zero-valued RoomEvent base fields). 
type DeleteRoomEvent struct { - Type RoomEventType `json:"type" bson:"type"` - RoomID string `json:"roomId" bson:"roomId"` - SiteID string `json:"siteId" bson:"siteId"` - Timestamp int64 `json:"timestamp" bson:"timestamp"` - MessageID string `json:"messageId" bson:"messageId"` + Type RoomEventType `json:"type" bson:"type"` + RoomID string `json:"roomId" bson:"roomId"` + SiteID string `json:"siteId" bson:"siteId"` + Timestamp int64 `json:"timestamp" bson:"timestamp"` + EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` + MessageID string `json:"messageId" bson:"messageId"` DeletedBy string `json:"deletedBy" bson:"deletedBy"` DeletedAt time.Time `json:"deletedAt" bson:"deletedAt"` UpdatedAt time.Time `json:"updatedAt" bson:"updatedAt"` @@ -283,22 +286,24 @@ type DeleteRoomEvent struct { // are flat (no zero-valued RoomEvent base fields). Mirrors the // EditRoomEvent / DeleteRoomEvent pattern. type PinRoomEvent struct { - Type RoomEventType `json:"type" bson:"type"` - RoomID string `json:"roomId" bson:"roomId"` - SiteID string `json:"siteId" bson:"siteId"` - Timestamp int64 `json:"timestamp" bson:"timestamp"` - MessageID string `json:"messageId" bson:"messageId"` + Type RoomEventType `json:"type" bson:"type"` + RoomID string `json:"roomId" bson:"roomId"` + SiteID string `json:"siteId" bson:"siteId"` + Timestamp int64 `json:"timestamp" bson:"timestamp"` + EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` + MessageID string `json:"messageId" bson:"messageId"` PinnedBy *Participant `json:"pinnedBy,omitempty" bson:"pinnedBy,omitempty"` PinnedAt time.Time `json:"pinnedAt" bson:"pinnedAt"` } // UnpinRoomEvent is the live event published when a message is unpinned. 
type UnpinRoomEvent struct { - Type RoomEventType `json:"type" bson:"type"` - RoomID string `json:"roomId" bson:"roomId"` - SiteID string `json:"siteId" bson:"siteId"` - Timestamp int64 `json:"timestamp" bson:"timestamp"` - MessageID string `json:"messageId" bson:"messageId"` + Type RoomEventType `json:"type" bson:"type"` + RoomID string `json:"roomId" bson:"roomId"` + SiteID string `json:"siteId" bson:"siteId"` + Timestamp int64 `json:"timestamp" bson:"timestamp"` + EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` + MessageID string `json:"messageId" bson:"messageId"` UnpinnedBy *Participant `json:"unpinnedBy,omitempty" bson:"unpinnedBy,omitempty"` UnpinnedAt time.Time `json:"unpinnedAt" bson:"unpinnedAt"` } @@ -311,6 +316,7 @@ type ThreadMetadataUpdatedEvent struct { RoomID string `json:"roomId" bson:"roomId"` SiteID string `json:"siteId" bson:"siteId"` Timestamp int64 `json:"timestamp" bson:"timestamp"` + EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` ParentMessageID string `json:"parentMessageId" bson:"parentMessageId"` ReplyMessageID string `json:"replyMessageId" bson:"replyMessageId"` NewTCount int `json:"newTcount" bson:"newTcount"` @@ -352,11 +358,12 @@ type RoomRestrictedRoomEvent struct { // ReactRoomEvent is the live event published when a reaction is toggled. // Actor carries the full Participant so clients can render display names without a side lookup. 
type ReactRoomEvent struct { - Type RoomEventType `json:"type" bson:"type"` - RoomID string `json:"roomId" bson:"roomId"` - SiteID string `json:"siteId" bson:"siteId"` - Timestamp int64 `json:"timestamp" bson:"timestamp"` - MessageID string `json:"messageId" bson:"messageId"` + Type RoomEventType `json:"type" bson:"type"` + RoomID string `json:"roomId" bson:"roomId"` + SiteID string `json:"siteId" bson:"siteId"` + Timestamp int64 `json:"timestamp" bson:"timestamp"` + EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` + MessageID string `json:"messageId" bson:"messageId"` Shortcode string `json:"shortcode" bson:"shortcode"` Action ReactionAction `json:"action" bson:"action"` Actor Participant `json:"actor" bson:"actor"` From 9aefaad57289806516b21889d563170223cd5dc1 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 16:22:38 +0000 Subject: [PATCH 12/14] fix: move thread tcount badge from MESSAGES_CANONICAL to core NATS server-broadcast Publishing EventThreadReplyAdded back to MESSAGES_CANONICAL polluted the message CRUD event store with badge events and required a skip guard in message-worker to avoid reprocessing its own publishes. New flow: message-worker publishes via core NATS on chat.server.broadcast.{siteID}.thread.tcount; broadcast-worker subscribes to chat.server.broadcast.{siteID}.> with a queue group and handles the event via HandleServerBroadcast. Badge events are best-effort (fire-and-forget, no JetStream durability needed). 
https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- broadcast-worker/handler.go | 28 ++++++++++++++++++++++++++-- broadcast-worker/handler_test.go | 19 +++++++++---------- broadcast-worker/main.go | 16 ++++++++++++++++ message-worker/handler.go | 19 +++++-------------- message-worker/handler_test.go | 19 +++++++++---------- pkg/subject/subject.go | 14 ++++++++++++++ 6 files changed, 79 insertions(+), 36 deletions(-) diff --git a/broadcast-worker/handler.go b/broadcast-worker/handler.go index 00040433c..ac669227d 100644 --- a/broadcast-worker/handler.go +++ b/broadcast-worker/handler.go @@ -77,8 +77,6 @@ func (h *Handler) HandleMessage(ctx context.Context, data []byte) error { return h.handleUnpinned(ctx, &evt) case model.EventReacted: return h.handleReacted(ctx, &evt) - case model.EventThreadReplyAdded: - return h.handleThreadTCountUpdated(ctx, &evt) default: slog.WarnContext(ctx, "unknown message event type, skipping", "event", evt.Event, @@ -88,6 +86,32 @@ func (h *Handler) HandleMessage(ctx context.Context, data []byte) error { } } +// HandleServerBroadcast processes a single server-broadcast core-NATS message +// (chat.server.broadcast.{siteID}.>). Currently handles EventThreadReplyAdded +// badge events published by message-worker. 
+func (h *Handler) HandleServerBroadcast(ctx context.Context, data []byte) { + var evt model.MessageEvent + if err := json.Unmarshal(data, &evt); err != nil { + slog.ErrorContext(ctx, "unmarshal server-broadcast event failed; dropping", + "error", err, + "request_id", natsutil.RequestIDFromContext(ctx)) + return + } + switch evt.Event { + case model.EventThreadReplyAdded: + if err := h.handleThreadTCountUpdated(ctx, &evt); err != nil { + slog.ErrorContext(ctx, "handle thread tcount update failed", + "error", err, + "messageID", evt.Message.ID, + "request_id", natsutil.RequestIDFromContext(ctx)) + } + default: + slog.WarnContext(ctx, "unknown server-broadcast event type; dropping", + "event", evt.Event, + "request_id", natsutil.RequestIDFromContext(ctx)) + } +} + // shouldUseThreadFanOut reports whether a message should be routed through the // thread fan-out path (thread subscribers + @-mentions) rather than the room // broadcast path. True when the message is a thread reply hidden from the main diff --git a/broadcast-worker/handler_test.go b/broadcast-worker/handler_test.go index ceb745431..a532564a7 100644 --- a/broadcast-worker/handler_test.go +++ b/broadcast-worker/handler_test.go @@ -1555,7 +1555,7 @@ func TestThreadFanOutAccounts(t *testing.T) { } } -func TestHandleMessage_ThreadReplyAdded_DispatchesToHandleThreadTCountUpdated(t *testing.T) { +func TestHandleServerBroadcast_ThreadReplyAdded_FansOutBadge(t *testing.T) { ctrl := gomock.NewController(t) store := NewMockStore(ctrl) us := NewMockUserStore(ctrl) @@ -1583,7 +1583,7 @@ func TestHandleMessage_ThreadReplyAdded_DispatchesToHandleThreadTCountUpdated(t data, _ := json.Marshal(evt) h := NewHandler(store, us, pub, keyStore, false) - require.NoError(t, h.HandleMessage(context.Background(), data)) + h.HandleServerBroadcast(context.Background(), data) require.Len(t, pub.records, 1) var tmEvt model.ThreadMetadataUpdatedEvent @@ -1599,7 +1599,7 @@ func 
TestHandleMessage_ThreadReplyAdded_DispatchesToHandleThreadTCountUpdated(t assert.Equal(t, msgTime.UnixMilli(), tmEvt.EventTimestamp) } -func TestHandleThreadTCountUpdated_MissingNewTCount_Skips(t *testing.T) { +func TestHandleServerBroadcast_ThreadReplyAdded_MissingNewTCount_Skips(t *testing.T) { ctrl := gomock.NewController(t) store := NewMockStore(ctrl) us := NewMockUserStore(ctrl) @@ -1624,11 +1624,11 @@ func TestHandleThreadTCountUpdated_MissingNewTCount_Skips(t *testing.T) { data, _ := json.Marshal(evt) h := NewHandler(store, us, pub, keyStore, false) - require.NoError(t, h.HandleMessage(context.Background(), data)) + h.HandleServerBroadcast(context.Background(), data) assert.Empty(t, pub.records) } -func TestHandleThreadTCountUpdated_MissingParentMessageID_Skips(t *testing.T) { +func TestHandleServerBroadcast_ThreadReplyAdded_MissingParentMessageID_Skips(t *testing.T) { ctrl := gomock.NewController(t) store := NewMockStore(ctrl) us := NewMockUserStore(ctrl) @@ -1653,11 +1653,11 @@ func TestHandleThreadTCountUpdated_MissingParentMessageID_Skips(t *testing.T) { data, _ := json.Marshal(evt) h := NewHandler(store, us, pub, keyStore, false) - require.NoError(t, h.HandleMessage(context.Background(), data)) + h.HandleServerBroadcast(context.Background(), data) assert.Empty(t, pub.records) } -func TestHandleThreadTCountUpdated_GetRoomError_ReturnsError(t *testing.T) { +func TestHandleServerBroadcast_ThreadReplyAdded_GetRoomError_LogsAndContinues(t *testing.T) { ctrl := gomock.NewController(t) store := NewMockStore(ctrl) us := NewMockUserStore(ctrl) @@ -1683,10 +1683,9 @@ func TestHandleThreadTCountUpdated_GetRoomError_ReturnsError(t *testing.T) { } data, _ := json.Marshal(evt) + // HandleServerBroadcast is fire-and-forget: errors are logged, not returned. 
h := NewHandler(store, us, pub, keyStore, false) - err := h.HandleMessage(context.Background(), data) - require.Error(t, err) - assert.Contains(t, err.Error(), "get room") + h.HandleServerBroadcast(context.Background(), data) assert.Empty(t, pub.records) } diff --git a/broadcast-worker/main.go b/broadcast-worker/main.go index 0dce33c70..d984f27b5 100644 --- a/broadcast-worker/main.go +++ b/broadcast-worker/main.go @@ -20,6 +20,7 @@ import ( "github.com/hmchangw/chat/pkg/roomkeystore" "github.com/hmchangw/chat/pkg/shutdown" "github.com/hmchangw/chat/pkg/stream" + "github.com/hmchangw/chat/pkg/subject" "github.com/hmchangw/chat/pkg/userstore" ) @@ -149,6 +150,18 @@ func main() { handler := NewHandler(coalescer, us, publisher, keyStore, cfg.Encryption.Enabled) + // Core-NATS queue subscriber for server-broadcast events (e.g. thread tcount badge). + // Fire-and-forget: errors are logged inside HandleServerBroadcast; no retry path. + broadcastSub, err := nc.QueueSubscribe(subject.ServerBroadcastWildcard(cfg.SiteID), "broadcast-worker", + func(msg otelnats.Msg) { + broadcastCtx, _ := natsutil.StampRequestID(context.Background(), msg.Msg.Header, msg.Msg.Subject) + handler.HandleServerBroadcast(broadcastCtx, msg.Msg.Data) + }) + if err != nil { + slog.Error("subscribe server-broadcast failed", "error", err) + os.Exit(1) + } + iter, err := cons.Messages(jetstream.PullMaxMessages(2 * cfg.MaxWorkers)) if err != nil { slog.Error("messages failed", "error", err) @@ -189,6 +202,9 @@ func main() { slog.Info("broadcast-worker started", "site", cfg.SiteID, "encryption", cfg.Encryption.Enabled) hooks := []func(context.Context) error{ + func(_ context.Context) error { + return broadcastSub.Unsubscribe() + }, func(ctx context.Context) error { iter.Stop() return nil diff --git a/message-worker/handler.go b/message-worker/handler.go index 1c373c107..1a8d9947b 100644 --- a/message-worker/handler.go +++ b/message-worker/handler.go @@ -60,13 +60,6 @@ func (h *Handler) processMessage(ctx 
context.Context, data []byte) error { return fmt.Errorf("unmarshal message event: %w", err) } - // Badge events published by this worker back onto .created are handled - // by broadcast-worker, not here. Skip them to avoid re-processing our - // own publishes as new messages. - if evt.Event == model.EventThreadReplyAdded { - return nil - } - resolved, err := mention.Resolve(ctx, evt.Message.Content, h.userStore.FindUsersByAccounts) if err != nil { return fmt.Errorf("resolve mentions: %w", err) @@ -447,11 +440,10 @@ func (h *Handler) publishThreadSubOutboxIfRemote(ctx context.Context, sub *model return nil } -// publishThreadReplyEvent publishes an EventThreadReplyAdded badge event to -// the MESSAGES_CANONICAL stream on the .created subject so broadcast-worker -// can do DM-aware routing of the reply-count badge update. The dedup ID is -// stable across redeliveries so JetStream stream-level dedup absorbs -// duplicates within the dedup window. +// publishThreadReplyEvent fires a badge event via core NATS so broadcast-worker +// can update the reply-count badge for thread followers. Published to +// chat.server.broadcast.{siteID}.thread.tcount (not MESSAGES_CANONICAL) because +// badge updates are best-effort and do not belong in the message CRUD event store. 
func (h *Handler) publishThreadReplyEvent(ctx context.Context, msg *model.Message, newTcount int) error { evt := model.MessageEvent{ Event: model.EventThreadReplyAdded, @@ -468,6 +460,5 @@ func (h *Handler) publishThreadReplyEvent(ctx context.Context, msg *model.Messag if err != nil { return fmt.Errorf("marshal thread reply event: %w", err) } - dedupID := fmt.Sprintf("thread-reply-added:%s:%s", h.siteID, msg.ID) - return h.publish(ctx, subject.MsgCanonicalCreated(h.siteID), data, dedupID) + return h.publish(ctx, subject.ServerBroadcastThreadTCount(h.siteID), data, "") } diff --git a/message-worker/handler_test.go b/message-worker/handler_test.go index 8be9669c8..f11959c5e 100644 --- a/message-worker/handler_test.go +++ b/message-worker/handler_test.go @@ -496,8 +496,8 @@ func TestHandler_ProcessMessage_ThreadReply_PublishesBadgeEvent(t *testing.T) { ) require.NoError(t, h.processMessage(context.Background(), data)) - assert.Equal(t, subject.MsgCanonicalCreated("site-a"), capturedSubj, - "badge event must be published to the canonical created subject") + assert.Equal(t, subject.ServerBroadcastThreadTCount("site-a"), capturedSubj, + "badge event must be published via core NATS server-broadcast, not MESSAGES_CANONICAL") var badgeEvt model.MessageEvent require.NoError(t, json.Unmarshal(capturedData, &badgeEvt)) assert.Equal(t, model.EventThreadReplyAdded, badgeEvt.Event) @@ -1878,8 +1878,8 @@ func TestHandler_ProcessMessage_ThreadReplyPublish(t *testing.T) { require.NoError(t, h.processMessage(context.Background(), threadData)) require.Equal(t, 1, publishCount, "exactly one publish call for thread reply event") - assert.Equal(t, "chat.msg.canonical.site-a.created", capturedSubj) - assert.Equal(t, "thread-reply-added:site-a:msg-reply", capturedMsgID) + assert.Equal(t, subject.ServerBroadcastThreadTCount("site-a"), capturedSubj) + assert.Equal(t, "", capturedMsgID, "core NATS publish must have empty msgID") var evt model.MessageEvent require.NoError(t, 
json.Unmarshal(capturedData, &evt)) @@ -1908,10 +1908,9 @@ func TestHandler_ProcessMessage_ThreadReplyPublish(t *testing.T) { return errors.New("nats: publish failed") }) - // Publish failure MUST propagate: the IF NOT EXISTS LWT on messages_by_id means - // redelivery detects applied=false and calls readParentTcount instead of - // incrementParentTcount, so there is no double-increment risk. Swallowing the - // error would permanently drop the badge tcount event. + // Publish failure propagates so the caller can log it; badge events are + // best-effort via core NATS so a transient error is expected to be swallowed + // by the caller rather than retried via JetStream. require.Error(t, h.processMessage(context.Background(), threadData)) }) @@ -1958,8 +1957,8 @@ func TestHandler_PublishThreadReplyEvent(t *testing.T) { err := h.publishThreadReplyEvent(context.Background(), msg, 5) require.NoError(t, err) - assert.Equal(t, "chat.msg.canonical.site-a.created", captured.subj) - assert.Equal(t, "thread-reply-added:site-a:msg-2", captured.msgID) + assert.Equal(t, subject.ServerBroadcastThreadTCount("site-a"), captured.subj) + assert.Equal(t, "", captured.msgID, "core NATS publish must have empty msgID") var evt model.MessageEvent require.NoError(t, json.Unmarshal(captured.data, &evt)) diff --git a/pkg/subject/subject.go b/pkg/subject/subject.go index 04628f31e..f5a4d2e6a 100644 --- a/pkg/subject/subject.go +++ b/pkg/subject/subject.go @@ -878,6 +878,20 @@ func PushNotificationFilter(siteID string) string { return fmt.Sprintf("chat.server.notification.push.%s.>", siteID) } +// ServerBroadcastThreadTCount is the core-NATS subject on which message-worker +// publishes thread reply-count badge events. Broadcast-worker queue-subscribes +// using the wildcard ServerBroadcastWildcard so this stays fire-and-forget +// without polluting MESSAGES_CANONICAL (which is reserved for message CRUD). 
+func ServerBroadcastThreadTCount(siteID string) string { + return fmt.Sprintf("chat.server.broadcast.%s.thread.tcount", siteID) +} + +// ServerBroadcastWildcard is the queue-subscribe subject used by broadcast-worker +// to receive all server-broadcast events for a site. +func ServerBroadcastWildcard(siteID string) string { + return fmt.Sprintf("chat.server.broadcast.%s.>", siteID) +} + // PresenceSnapshot is the bulk presence RPC subject (request/reply). func PresenceSnapshot(siteID string) string { return fmt.Sprintf("chat.presence.%s.request.snapshot", siteID) From a47986f29e928acdb917c8a8797e9cb615f62fcc Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 16:23:22 +0000 Subject: [PATCH 13/14] docs(client-api): update timestamp/eventTimestamp semantics for broadcast events timestamp = when broadcast-worker published the event (allows clients to detect JetStream redeliveries). eventTimestamp = when message-worker published the canonical event (source-of-truth time for correlation). Updated DeleteRoomEvent and ThreadMetadataUpdatedEvent tables. https://claude.ai/code/session_01LjGg9QJU7QVZFU6pXXFbgb --- docs/client-api.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/client-api.md b/docs/client-api.md index f424a2dd6..1c041a045 100644 --- a/docs/client-api.md +++ b/docs/client-api.md @@ -1710,7 +1710,8 @@ The payload is flat: | `type` | string | Always `"message_deleted"`. | | `roomId` | string | | | `siteId` | string | | -| `timestamp` | number | Milliseconds since Unix epoch (UTC). Propagated from the canonical event's publish time. | +| `timestamp` | number | Milliseconds since Unix epoch (UTC). When broadcast-worker published this event. | +| `eventTimestamp` | number | Milliseconds since Unix epoch (UTC). Publish time of the upstream canonical event that triggered this broadcast (the value `timestamp` used to carry); use it for correlation and ordering. Omitted for legacy events. | | `messageId` | string | The deleted message's ID. | | `deletedBy` | string | The sender's account. 
| | `deletedAt` | string | RFC 3339 timestamp. Domain time of the delete. | @@ -2749,7 +2750,8 @@ Pushed by `broadcast-worker` whenever a thread reply is **created** (`action: "r | `newTcount` | number | Authoritative post-CAS reply count for the parent message. Replaces any locally-computed count — do not delta. | | `action` | string | `"reply_added"` or `"reply_deleted"`. | | `replyMessageId` | string | The reply that was added or deleted. | -| `timestamp` | number | Milliseconds since Unix epoch (UTC). Propagated from the canonical event's publish time. | +| `timestamp` | number | Milliseconds since Unix epoch (UTC). When broadcast-worker published this event. | +| `eventTimestamp` | number | Milliseconds since Unix epoch (UTC). Publish time of the upstream reply-added / reply-deleted event that triggered this update (the value `timestamp` used to carry); use it for correlation and ordering. Omitted for legacy events. | ```json { From 73309588caeb3062677b082600696b6d80daccf3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 16:27:36 +0000 Subject: [PATCH 14/14] fix(model): realign struct fields after EventTimestamp addition New commits added EventTimestamp to DeleteRoomEvent, PinRoomEvent, UnpinRoomEvent, and ReactRoomEvent but left older fields misaligned. goimports now requires all fields in a struct to be column-aligned. 
https://claude.ai/code/session_013Vs7CusvrZFrRKJaSoFtCi --- pkg/model/event.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/model/event.go b/pkg/model/event.go index 4ca3306b1..a1372a999 100644 --- a/pkg/model/event.go +++ b/pkg/model/event.go @@ -277,9 +277,9 @@ type DeleteRoomEvent struct { Timestamp int64 `json:"timestamp" bson:"timestamp"` EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` MessageID string `json:"messageId" bson:"messageId"` - DeletedBy string `json:"deletedBy" bson:"deletedBy"` - DeletedAt time.Time `json:"deletedAt" bson:"deletedAt"` - UpdatedAt time.Time `json:"updatedAt" bson:"updatedAt"` + DeletedBy string `json:"deletedBy" bson:"deletedBy"` + DeletedAt time.Time `json:"deletedAt" bson:"deletedAt"` + UpdatedAt time.Time `json:"updatedAt" bson:"updatedAt"` } // PinRoomEvent is the live event published when a message is pinned. Fields @@ -292,8 +292,8 @@ type PinRoomEvent struct { Timestamp int64 `json:"timestamp" bson:"timestamp"` EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` MessageID string `json:"messageId" bson:"messageId"` - PinnedBy *Participant `json:"pinnedBy,omitempty" bson:"pinnedBy,omitempty"` - PinnedAt time.Time `json:"pinnedAt" bson:"pinnedAt"` + PinnedBy *Participant `json:"pinnedBy,omitempty" bson:"pinnedBy,omitempty"` + PinnedAt time.Time `json:"pinnedAt" bson:"pinnedAt"` } // UnpinRoomEvent is the live event published when a message is unpinned. 
@@ -304,8 +304,8 @@ type UnpinRoomEvent struct { Timestamp int64 `json:"timestamp" bson:"timestamp"` EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` MessageID string `json:"messageId" bson:"messageId"` - UnpinnedBy *Participant `json:"unpinnedBy,omitempty" bson:"unpinnedBy,omitempty"` - UnpinnedAt time.Time `json:"unpinnedAt" bson:"unpinnedAt"` + UnpinnedBy *Participant `json:"unpinnedBy,omitempty" bson:"unpinnedBy,omitempty"` + UnpinnedAt time.Time `json:"unpinnedAt" bson:"unpinnedAt"` } // ThreadMetadataUpdatedEvent is published on the per-user NATS subject when a @@ -364,11 +364,11 @@ type ReactRoomEvent struct { Timestamp int64 `json:"timestamp" bson:"timestamp"` EventTimestamp int64 `json:"eventTimestamp,omitempty" bson:"eventTimestamp,omitempty"` MessageID string `json:"messageId" bson:"messageId"` - Shortcode string `json:"shortcode" bson:"shortcode"` - Action ReactionAction `json:"action" bson:"action"` - Actor Participant `json:"actor" bson:"actor"` - ReactedAt time.Time `json:"reactedAt" bson:"reactedAt"` - UpdatedAt time.Time `json:"updatedAt" bson:"updatedAt"` + Shortcode string `json:"shortcode" bson:"shortcode"` + Action ReactionAction `json:"action" bson:"action"` + Actor Participant `json:"actor" bson:"actor"` + ReactedAt time.Time `json:"reactedAt" bson:"reactedAt"` + UpdatedAt time.Time `json:"updatedAt" bson:"updatedAt"` } // RemovedSubscriptionRef is the minimal subscription identity carried on a