From bb7e01c39de1f56831ef794ed0ee2f4b9e99d60c Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 13:20:31 +0300
Subject: [PATCH 01/10] HNSW snapshot 01: block deep-copy primitive + design
 docs

ElementGraphData::copyTo / ElementLevelData::copyInto: value-semantic deep copy
of a graph block (fresh mutex, independent incoming-edges, FAM-aware), so a block
buffer can later be copied-on-write without sharing mutable state. Zero call-site
churn (the mutex stays in ElementGraphData; the copy never aliases its lock).

First branch of the snapshot-iterator stack. Includes the full design docs for
the feature (design, proposal, spec-delta, prototypes, task tracker).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/design/hnsw-snapshot/README.md           |  53 +++
 docs/design/hnsw-snapshot/design-note.md      | 217 +++++++++++
 docs/design/hnsw-snapshot/design.md           | 364 ++++++++++++++++++
 docs/design/hnsw-snapshot/proposal.md         |  72 ++++
 .../prototypes/block_deepcopy.cpp             | 148 +++++++
 .../hnsw-snapshot/prototypes/cow_snapshot.cpp | 197 ++++++++++
 .../prototypes/read_indirection_bench.cpp     | 205 ++++++++++
 .../hnsw-snapshot/prototypes/snapshot_poc.cpp | 279 ++++++++++++++
 docs/design/hnsw-snapshot/spec-delta.md       |  85 ++++
 docs/design/hnsw-snapshot/tasks.md            | 276 +++++++++++++
 src/VecSim/algorithms/hnsw/graph_data.h       |  48 +++
 .../algorithms/hnsw/hnsw_base_tests_friends.h |   1 +
 tests/unit/test_hnsw.cpp                      |  90 +++++
 13 files changed, 2035 insertions(+)
 create mode 100644 docs/design/hnsw-snapshot/README.md
 create mode 100644 docs/design/hnsw-snapshot/design-note.md
 create mode 100644 docs/design/hnsw-snapshot/design.md
 create mode 100644 docs/design/hnsw-snapshot/proposal.md
 create mode 100644 docs/design/hnsw-snapshot/prototypes/block_deepcopy.cpp
 create mode 100644 docs/design/hnsw-snapshot/prototypes/cow_snapshot.cpp
 create mode 100644 docs/design/hnsw-snapshot/prototypes/read_indirection_bench.cpp
 create mode 100644 docs/design/hnsw-snapshot/prototypes/snapshot_poc.cpp
 create mode 100644 docs/design/hnsw-snapshot/spec-delta.md
 create mode 100644 docs/design/hnsw-snapshot/tasks.md

diff --git a/docs/design/hnsw-snapshot/README.md b/docs/design/hnsw-snapshot/README.md
new file mode 100644
index 000000000..a2b29f992
--- /dev/null
+++ b/docs/design/hnsw-snapshot/README.md
@@ -0,0 +1,53 @@
+# HNSW snapshot iterator — design
+
+Design exploration for a **consistent, lock-free snapshot iterator** over HNSW:
+construct an iterator under the index read lock, release the lock, and iterate
+while concurrent inserts/deletes proceed — with results consistent as of
+construction time. Mechanism: persistent (rooted) copy-on-write where the
+**clone decision is generation-tag-driven** and **reclamation is refcount-driven**
+(superseded versions free themselves when the last snapshot referencing them
+drops); slot recycling (SWAP) is gated by a separate reclaim horizon.
+
+The foundational change (the block deep-copy primitive `ElementGraphData::copyTo`
+/ `ElementLevelData::copyInto`) is implemented in
+`src/VecSim/algorithms/hnsw/graph_data.h` on this branch.
+
+## Contents
+
+| file | what it is |
+|---|---|
+| `proposal.md`   | Why + what (the user-visible surface). No code. |
+| `design.md`     | Implementation plan: subsystems touched, data model, alternatives considered/rejected, testing strategy, open questions. |
+| `design-note.md`| Deeper technical exploration: why "filter the live graph" fails, the chosen `shared_ptr<vector<Block{shared_ptr buf}>>` structure, prototype + benchmark + POC findings, costs. |
+| `tasks.md`      | Phased implementation checklist (phase 1 = the `copyTo` primitive, already started). |
+| `spec-delta.md` | Behavior spec: requirements + WHEN/THEN scenarios. |
+| `prototypes/`   | Standalone C++ programs (below). |
+
+## Prototypes
+
+All build with `g++ -O2 -std=c++20 -pthread`. `snapshot_poc.cpp` additionally
+links two `src/VecSim/memory/*.cpp` files (see its header comment).
+
+| prototype | proves |
+|---|---|
+| `cow_snapshot.cpp`          | rooted-COW snapshot mechanism: O(1) capture, consistent lock-free reads under concurrent writes, automatic cleanup |
+| `read_indirection_bench.cpp`| read-path cost of candidate layouts — the per-block `shared_ptr` scatter is the cost; contiguous headers + refcounted buffer read at flat speed |
+| `block_deepcopy.cpp`        | the `copyTo` deep-copy algorithm (FAM links + fresh mutex + independent incoming-edges) on a multi-level node |
+| `snapshot_poc.cpp`          | **end-to-end feasibility** using the REAL `ElementGraphData` + `copyTo`: snapshot stays consistent through millions of concurrent writes; clean under TSan/ASan/UBSan |
+
+## Status
+
+The **vecsim integration is implemented and validated** (a stacked series of
+branches, `snapshot/01`–`10`): the COW block storage + generation-tag clone, the
+O(1)-ish snapshot capture, the lock-free KNN/batch iterator (plain **and**
+tiered, opt-in), concurrent-insert safety (atomic block-COW), the swap reclaim
+horizon, and COW-before-lock repair so deletes' graph repair runs under live
+cursors. Full unit suite green; the concurrent insert+delete+repair+cursor repro
+is clean under ASan. The original prototypes (below) proved feasibility first.
+
+**Not yet built** (tracked in `tasks.md` Phases 5–7): RediSearch wiring
+(`hybrid_reader.c`, holding the snapshot across `FT.CURSOR READ`, a string
+param-resolver token), the snapshot memory/lifetime cap + `FT.INFO`/`FT.DEBUG`
+observability, range-query snapshot support, and production-scale benchmarks.
+`spec-delta.md` uses the OpenSpec requirement/scenario shape from the RediSearch
+repo, which is framework-neutral.
diff --git a/docs/design/hnsw-snapshot/design-note.md b/docs/design/hnsw-snapshot/design-note.md
new file mode 100644
index 000000000..caeb50cce
--- /dev/null
+++ b/docs/design/hnsw-snapshot/design-note.md
@@ -0,0 +1,217 @@
+# Design: Consistent lock-free snapshot iterator for HNSW (VecSim)
+
+Status: **draft / prototype**. This is a design exploration, not a committed plan.
+
+## Problem & requirement
+
+A query/batch iterator over an HNSW index should be:
+
+1. **Constructed under the index read lock**, then **release the lock**, and
+2. **iterate while concurrent insertions/deletions run**, ideally
+3. returning results **consistent with the index state at construction time**.
+
+Today the multi-threaded path (tiered index `acquireSharedLocks`,
+`hnsw_tiered.h`) holds the shared lock for the *whole* query precisely because
+lock-free traversal during writes is unsafe: the block backbone can be
+reallocated, and slot reuse (SWAP) / in-place link rewrites can pull data out
+from under a reader. Releasing the lock is exactly the guarantee we must
+replace.
+
+## Why "filter the live graph" cannot work
+
+HNSW is **lossy on write**: inserting a node can *evict* an edge between two
+existing nodes (`revisitNeighborConnections` → `getNeighborsByHeuristic2` →
+`setLinks`), and delete/repair rewrites neighbor lists in place
+(`removeLink`, swap-remove). So a point-in-time view cannot be reconstructed by
+reading the live structure and filtering (by doc-id watermark, by prefix of an
+"append-only" list, etc.) — the old information is destroyed. **A snapshot must
+retain the old bytes.** The only real choice is the retention mechanism.
+
+## Chosen mechanism: persistent (rooted) copy-on-write
+
+Store the graph blocks behind a **shared, rooted structure**. Prototype B
+(below) settled the exact shape: block headers must stay **contiguous** (a
+`vector<shared_ptr<DataBlock>>` scatters them across the heap and costs +25–60%
+on traversal). The structure that keeps flat-speed reads *and* per-block COW is
+contiguous headers each holding a **refcounted data buffer**:
+
+```
+shared_ptr< vector< Block > >          // root over a CONTIGUOUS header vector
+   where  Block { shared_ptr<Buffer> data; }   // per-block refcounted buffer
+```
+
+- **root** (`shared_ptr<...>`) — swapped rarely (once per generation that writes).
+- **header vector** — contiguous; copied (headers only, cheap) on backbone COW.
+- **block buffer** (`shared_ptr<Buffer>`) — the edges; COW'd per block on write.
+
+Reads are `root` (hoisted) → contiguous `vec[b]` → `data.get()` → buffer — the
+same access shape as today's flat `vector<DataBlock>`.
+
+### Operations
+
+- **Snapshot capture** (under the *write*/exclusive lock — see "Locking model"):
+  hand out a generation id, **fork the backbone** (copy the header vector,
+  `N/blockSize` pointers, so the live index keeps mutating a private copy while
+  the snapshot owns the captured one), and snapshot the vector-block base
+  pointers. O(#blocks); no element/vector data is copied. Release the lock, then
+  iterate through the captured handle with **no lock**.
+- **No snapshot active**: every version's generation post-dates all (zero) live
+  snapshots → writers mutate in place. **Zero overhead — current behavior.**
+- **Backbone after capture**: the fork is stamped the *current* generation, so a
+  concurrent writer's backbone-COW check is a no-op — the live backbone is never
+  re-forked under the shared lock and stays structurally stable for lock-free
+  readers. Thereafter only per-block buffers are COW'd.
+- **Content write to a shared block**: clone the block buffer (deep-copy via
+  `copyTo`) iff a live snapshot can still see it, repoint its slot, mutate the
+  copy. Under concurrent tiered inserts (which mutate the live graph under the
+  *shared* lock + per-node mutexes) the slot is repointed with an **atomic
+  compare-exchange**, so two writers racing to COW the same block can't tear the
+  handle or lose a fork (the generation stamp makes the CAS idempotent — the
+  loser adopts the winner's fork).
+- **Cleanup**: **automatic** via refcounts. When the snapshot is dropped, old
+  blocks/backbones with no remaining references free themselves. No
+  hand-rolled epoch/horizon/retire-queue.
+
+**Clone decision: a generation tag, not `use_count`.** The initial sketch keyed
+the clone on `shared_ptr` `use_count` (`> 1` ⇒ shared ⇒ clone). That breaks: a
+container-level `use_count` goes **stale after the first COW** — the live root is
+unshared again while a snapshot still holds the *old* one — so it would wrongly
+mutate in place and corrupt the snapshot. Instead every version carries the
+**generation it was last written in**; a write clones iff `newestLive() >= gen`
+(some live snapshot predates it) and stamps the clone with the current
+generation. `use_count` is reserved purely for **reclamation**.
+
+**Locking model (revised for concurrency).** The first design assumed all writes
+run under the *exclusive* lock and capture under the *shared* lock (mutually
+exclusive), so the clone decision would need no atomics. That holds for the plain
+index, but the **tiered** index wires concurrent inserts into the live graph
+under the *shared* `mainIndexGuard` + per-node mutexes. So the implementation
+moved two things: **capture runs under the *exclusive* lock** (a brief quiescence
+point — no writer is mid-insert — where the backbone fork + generation handout
+can't race an in-flight insert), and **per-block COW publishes via an atomic
+CAS** so concurrent inserts COW'ing the same block are safe. After capture the
+reader still touches only its private immutable handle — no per-node locks, and
+no atomics on the snapshot read path.
+
+## Consistency guarantee
+
+The shipped guarantee is **strict for graph topology + membership + vectors**,
+**weak (best-effort) for per-element metadata flags**:
+
+- **Strict as-of-construction** for the graph the snapshot traverses: the set of
+  ids that existed at capture and their links/levels, plus each visible id's
+  vector bytes, are frozen — concurrent inserts COW, so they never perturb the
+  captured backbone/buffers. This is the property the requirement asks for and
+  the one the lock-free reader depends on.
+- **Weak** for the deleted / in-process **flags**: `markDelete` flips the flag in
+  place on the single-version metadata, so a delete that lands after capture MAY
+  or MAY NOT be observed by the reader (it tolerates either value). This matches
+  the live index's existing weak flag consistency; making it strict would mean
+  versioning the metadata flag too, which we chose not to do (see
+  `spec-delta.md`, "Deleted-flag consistency").
+
+A cheaper **safety-only** variant (drop COW entirely, just capture the backbone +
+gate slot reuse: "no crashes, no new nodes after T, but edges among old nodes may
+drift") was the recorded fallback; the implementation took the consistent COW
+version above.
+
+## Costs & open problems
+
+1. **Read-path indirection.** `root → backbone[i] → block → data` adds one
+   pointer hop vs today's flat `vector<DataBlock>` offset math, and the block
+   headers stop being contiguous. *The element-data locality is unchanged*
+   (it already lives in a separate `DataBlock::data` heap buffer). Net cost must
+   be measured — see prototype B.
+2. **Block must be deep-copyable for block-COW.** `ElementGraphData` embeds a
+   `std::mutex` (can't be copied) and a pointer to separately-allocated upper
+   levels. **Resolved by an explicit `copyTo` deep-copy** (re-inits a fresh mutex
+   on the copy, deep-copies each level's incoming-edges, walks the `others` upper
+   levels) — the mutex **stays in the struct**. The originally-sketched "move the
+   mutexes into a side array keyed by id" was implemented as a spike and
+   **rejected**: a stable side-array lock fixes lock *identity* but repair still
+   writes non-COW'd buffers, so it was neither necessary nor sufficient — the real
+   requirement is to **COW every touched block before taking any per-node lock**
+   (so a held lock's buffer can't be forked out from under it). With COW-before-
+   lock in place, deep-copy + in-struct mutex is sufficient and churns zero call
+   sites. (The mutex-identity hazard, and why the side array doesn't solve it, is
+   the central finding of the delete/repair work — see `design.md` "Subsystems".)
+3. **Pinned snapshot → retained memory.** A long-lived snapshot (open
+   `FT.CURSOR`) keeps old blocks alive; COW'd blocks double until it drops.
+   Mitigation = cap snapshot lifetime. (Same issue under any retention scheme.)
+4. **Granularity.** Block-COW copies a whole ~200 KB block to preserve one
+   node's edges. Fine if writes touch few blocks during a snapshot; approaches a
+   full copy if writes are spread across the whole index.
+
+## Feasibility: PROVEN (end-to-end POC)
+
+`prototypes/snapshot_poc.cpp` is a minimal graph index built on the **real
+VecSim `ElementGraphData`/`ElementLevelData`** (it `#include`s the production
+`graph_data.h`) and the **real `copyTo` primitive** added in Phase 1, with the
+rooted-COW block storage. It demonstrates the full requirement end to end:
+construct a snapshot under a read lock, release it, and traverse lock-free while
+a writer thread adds nodes and rewires edges (COW-ing real blocks via `copyTo`).
+
+Results (compiled against the real headers; `memory/vecsim_malloc.cpp` +
+`memory/vecsim_base.cpp` linked):
+
+- Snapshot query stayed **bit-identical** across thousands of lock-free reads
+  during ~1–2.4M concurrent writer ops; the live graph provably diverged.
+- **ThreadSanitizer: clean** (no data race) — the lock-free read path is sound.
+- **ASan/UBSan/LeakSan: clean** — the `copyTo` FAM logic and the refcounted
+  buffer deleter are memory-safe, no leaks.
+- Allocator bytes grew while the snapshot was held (pinned COW versions) and
+  **dropped on release** (automatic reclamation).
+
+This answers "is it implementable against VecSim's actual node representation?"
+— yes. The remaining work is integration scope (wiring COW through the real
+HNSW read/write paths, the batch iterator, the tiered index, and RediSearch),
+not a feasibility unknown.
+
+## Prototype findings
+
+Prototypes are in `prototypes/` (standalone). A/B/C use `g++ -O2 -std=c++20`;
+the POC additionally links two VecSim `memory/*.cpp` files (see its header).
+
+- **C — `block_deepcopy.cpp`:** validates the `copyTo` algorithm (FAM links +
+  fresh mutex + independent incoming-edges) on a multi-level node. PASSED.
+
+**A — `cow_snapshot.cpp` (mechanism): PASSED.** O(1) capture; a snapshot stayed
+bit-identical (checksum invariant) through **2.4M concurrent writes** with no
+lock during iteration; the live index diverged; dropping the snapshot freed
+exactly the COW'd-away versions automatically (276 → 212 live blocks). Confirms
+the refcount-driven COW is correct, lock-free for readers, and self-cleaning.
+
+**B — `read_indirection_bench.cpp` (read cost): layout matters a lot.**
+Dependent random walk, 1M elements, dim-128 distance per visit:
+
+| layout | lookup-only | vs flat |
+|---|---|---|
+| `vector<DataBlock>` (today, flat) | ~9.9 ns | — |
+| `shared_ptr<vector<Block>>` (blocks by value) | ~8.2 ns | ~0% |
+| **`shared_ptr<vector<Block{shared_ptr buf}>>`** (chosen) | ~9.0 ns | **~0%** |
+| `shared_ptr<vector<shared_ptr<Block>>>` | ~15.7 ns | **+58%** |
+
+With a realistic distance calc per visit, the scattered `shared_ptr<Block>`
+layout still cost **+23%** end-to-end. **Conclusion: the cost is the per-block
+`shared_ptr` heap scatter, not the root indirection.** Contiguous headers +
+refcounted buffer recover flat-speed reads, so the chosen structure imposes
+**no read regression** while keeping per-block COW.
+
+### Decision recorded: why not a "mutate-in-place + per-snapshot overlay"
+
+An overlay (keep the live index flat, copy a block into a side map just before a
+writer mutates it in place) would also avoid the read tax. **Rejected:** a
+lock-free snapshot reader that misses the overlay and reads the live block can
+**race** a writer that inserts-then-mutates that same block, yielding a
+*newer-than-T* read. In-place mutation is not cleanly lock-free. The rooted COW
+wins because snapshots read **immutable** versions — writers only ever create
+*new* blocks, never touch a block a snapshot holds. This is the property that
+makes "release the lock and iterate during writes" actually safe.
+
+## Fallback
+
+If the trivially-copyable-block refactor (cost #2) proves too invasive, fall
+back to **safety-only weak consistency**: keep the flat layout, capture the
+backbone + `curElementCount` under the lock, gate slot reuse by an owner
+horizon. Lock-free and crash-safe, excludes nodes added after T, but does not
+isolate edge drift among existing nodes.
diff --git a/docs/design/hnsw-snapshot/design.md b/docs/design/hnsw-snapshot/design.md
new file mode 100644
index 000000000..bf6d21715
--- /dev/null
+++ b/docs/design/hnsw-snapshot/design.md
@@ -0,0 +1,364 @@
+# Design: HNSW snapshot iterator
+
+> Backed by the standalone design note
+> [`docs/design/vecsim-hnsw-snapshot.md`](design-note.md)
+> and two compiled/run prototypes under `prototypes/`. Numbers cited
+> here come from those prototypes.
+
+## Overview
+
+A snapshot iterator captures an immutable, point-in-time view of the HNSW graph
+under the index read lock, then iterates without holding any lock. Concurrent
+writers never mutate **graph** data a snapshot can see — they create *new*
+versions via copy-on-write, and old graph versions are reclaimed automatically
+once no snapshot references them. This is a **persistent (rooted) copy-on-write**
+structure with refcount-driven reclamation.
+
+COW covers the graph container only. A slot's *vector* and *metadata* live in
+separate, un-versioned containers that SWAP overwrites in place, so they need a
+*second*, complementary mechanism — snapshot-gated slot reclamation — described
+in its own section below. Getting this split right is the crux of the design:
+**refcount COW for graph versions, horizon-gated deferral for slot recycling.**
+
+Why not a cheaper trick (doc-id watermark, append-only edge log, prefix reads):
+HNSW is **lossy on write** — inserting a node can evict an edge between two
+existing nodes (`revisitNeighborConnections` → `getNeighborsByHeuristic2` →
+`setLinks`), and delete/repair rewrites neighbor lists in place. A point-in-time
+view cannot be reconstructed by filtering the live graph; the old bytes must be
+**retained**. COW is the retention mechanism.
+
+## Data structure
+
+Replace the graph block storage
+
+```
+vecsim_stl::vector<DataBlock> graphDataBlocks;     // today
+```
+
+with a rooted, refcounted structure:
+
+```
+shared_ptr< vector< Block > >  root;          // contiguous header vector
+   where  Block { shared_ptr<Buffer> data; }  // per-block refcounted buffer
+```
+
+Prototype B established the exact shape by measuring a dependent random walk
+(1M elements, dim-128 distance per visit):
+
+| layout | lookup-only vs flat | notes |
+|---|---|---|
+| `vector<DataBlock>` (today) | baseline | |
+| `shared_ptr<vector<shared_ptr<Block>>>` | **+58–88%** | block headers scattered on heap |
+| **`shared_ptr<vector<Block{shared_ptr buf}>>`** | **~0% (noise)** | contiguous headers, refcounted buffer |
+
+The cost of the naive nested-`shared_ptr` form is the **per-block heap scatter**,
+not the root indirection (which hoists out of the loop). Contiguous headers +
+a refcounted data buffer recover flat-speed reads, so the chosen structure
+imposes **no read regression** while keeping per-block COW granularity.
+
+## Operations
+
+- **Snapshot capture (under read lock):** `snap = root` — an O(1) `shared_ptr`
+  copy. Release the lock. Iterate through `snap` with no lock.
+- **No snapshot active:** `root`/buffers are unshared (`use_count == 1`) →
+  writers mutate in place. **Zero overhead — identical to today.**
+- **First write after a snapshot:** the backbone (header vector) must be
+  preserved before its slots are repointed. Decide by **generation tag, not
+  refcount**: each backbone carries `gen`; clone it once iff
+  `max_live_snapshot_id >= backbone.gen`, stamp the clone with the current gen,
+  swap `root`.
+- **Content write to a block:** each block buffer carries `gen`; clone it iff
+  `max_live_snapshot_id >= block.gen`, stamp the clone with the current gen,
+  repoint its header slot, mutate the copy. A block written *after* all live
+  snapshots (`block.gen > max_live_id`) is mutated in place — including repeated
+  writes within the same generation (a freshly-cloned block carries the current
+  gen, so subsequent writes to it stay in place).
+- **Reclamation stays refcount-driven.** `shared_ptr` on the backbone + block
+  buffers frees a superseded version automatically once the last snapshot
+  referencing it drops. So the two halves split cleanly: **the generation tag
+  decides *when to clone* (`max` live id); the refcount decides *when to free*
+  (`min` live id, implicitly).** Same max/min duality as "Snapshot identity".
+
+**Locking (as implemented).** The plain index takes writes under the exclusive
+lock, so `max_live_snapshot_id` is trivially stable there. The **tiered** index,
+however, wires concurrent inserts into the live graph under the *shared*
+`mainIndexGuard` + per-node mutexes — so two things moved from the original
+sketch: **capture runs under the *exclusive* lock** (a brief quiescence point at
+which no writer is mid-insert, so the backbone fork + generation handout can't
+race an in-flight write), and **per-block COW publishes the new buffer with an
+atomic compare-exchange** (`std::atomic_*` free functions on the block handle —
+`std::atomic<shared_ptr>` is GCC-12+), so two inserts COW'ing the same block
+can't tear the handle; the generation stamp makes the CAS idempotent (the loser
+adopts the winner's fork). The backbone is forked once **at capture** and stamped
+the current generation, so it is never re-forked under the shared lock and stays
+structurally stable for readers. After capture, the reader touches only its
+private immutable handle — **no per-node locks**, and the only atomic on the read
+path is the block-handle load, taken **only while a snapshot is live** (gated by
+`anySnapshotLive()`; with none live the read is a plain pointer deref).
+
+> **Why not `use_count` for the clone decision.** A *container-level*
+> `root.use_count()` check goes **stale after the first backbone COW**: once the
+> live backbone is cloned, `root.use_count() == 1` even while a snapshot still
+> holds the old backbone — so a naive check wrongly mutates in place and corrupts
+> the snapshot. A *per-block-buffer* `use_count` is technically sound (a snapshot
+> transitively holds each buffer's `shared_ptr`), but it couples correctness to
+> refcount discipline and false-positives on any transient buffer copy. The
+> generation tag is refcount-independent and immune to both, which is why the
+> clone decision uses it and refcount is reserved for reclamation.
+
+> **What pins a block version (and why blocks don't need a horizon).** A snapshot
+> captures the **whole backbone** (`root` = `shared_ptr<vector<Block>>`), and that
+> vector holds a `shared_ptr<Buffer>` for **every** block. A block is therefore
+> pinned by the *captured backbone*, **not** by the reader traversing to it — a
+> block the snapshot never reaches is still held alive. This is correct only under
+> two invariants, both worth a test:
+> 1. **capture is eager and whole-backbone** (the handle holds `root`, not a
+>    lazily-grown set of touched nodes); and
+> 2. **backbone COW clones the header vector before repointing any slot**, so the
+>    captured backbone is never mutated in place.
+>
+> Violate either and it becomes a use-after-free: a block overwritten before the
+> reader reaches it loses its only ref and is freed. Given the invariants,
+> `shared_ptr` is sufficient for graph blocks and **no horizon is needed for
+> them** — and a horizon wouldn't help anyway (a consistent snapshot inherently
+> pins all blocks as of T).
+>
+> The horizon **is** needed for the **vector + metadata** containers: those are
+> not refcounted, so a snapshot holds **no** `shared_ptr` on a slot's embedding or
+> flags — nothing pins them. That is exactly why slot reclamation is a separate,
+> deferral/horizon-gated mechanism (see "Slot reclamation"), not a refcount one.
+
+Prototype A demonstrated correctness: a snapshot stayed bit-identical
+(checksum invariant) through 5.8M concurrent writes with no lock during
+iteration, the live index diverged, and dropping the snapshot freed exactly the
+COW'd-away versions.
+
+## Snapshot identity: the generation id
+
+Every snapshot carries a **monotonically increasing id** (a generation), handed
+out from one global counter at capture time and never reused, plus the
+**`curElementCount` it captured**. The index's `SnapshotRegistry` tracks the live
+set. The clone and reclaim decisions use two different keys:
+
+- **`newestLive()` = max(live ids)** is the **clone threshold**: a block/backbone
+  version stamped `g` must be preserved (cloned on write) iff `newestLive() >= g`
+  (some live snapshot predates, and so can still see, it).
+- **`maxVisibleCount()` = max captured `curElementCount` over live snapshots** is
+  the **reclaim horizon** for slot recycling. A snapshot only ever reads ids
+  `< its curElementCount`, so a SWAP that overwrites internal id `d` is invisible
+  to *every* live snapshot iff `d >= maxVisibleCount()` — then it recycles now;
+  below it, the slot is deferred. (This is the count-based form that shipped. An
+  earlier sketch keyed the horizon on `min(live ids)` + a per-slot
+  free-generation; the curElementCount form is simpler and exact — what a snapshot
+  can reach is defined by its count, not by id ordering.)
+
+`graphSnapshotActive()` is the **degenerate gate**: "is the live set non-empty?"
+(`newestLive() != 0`). It is what the boolean MVP used to defer *all* swaps; the
+`maxVisibleCount()` horizon refines the reclaim end so compaction keeps flowing
+under a long-lived cursor (high-id churn recycles immediately).
+
+The id and the count are both maintained from the first branch that adds the
+handle, so the clone decision (`newestLive`) and the horizon (`maxVisibleCount`)
+drop in without reworking the capture API. The clone decision is **generation-tag
+based, not `use_count`** (see the callout above); `use_count` is reserved purely
+for reclamation.
+
+## Slot reclamation — a separate problem COW does *not* solve
+
+A snapshot must give a consistent view of an internal `id` (a "slot"), and a
+slot's state is spread across **three parallel containers**, all indexed by the
+same id (`block = id/blockSize`, `offset = id%blockSize`):
+
+| state | container | COW'd by this design? |
+|---|---|---|
+| graph data (`ElementGraphData`: links, neighbors, mutex) | `graphDataBlocks` → the rooted `shared_ptr<vector<Block>>` | **yes** (above) |
+| raw vector (the embedding) | `this->vectors` (`DataBlocksContainer`) | **no** |
+| per-id metadata (`ElementMetaData`: label + flags incl. marked-deleted) | `idToMetaData` (flat vector) | **no** |
+
+The refcounted COW versions only the **graph-data** container. The vector and
+metadata containers are single-version and mutated **in place**. The dangerous
+operation is **SWAP** (`SwapLastIdWithDeletedId`): to compact a delete it copies
+the last element (`curElementCount-1`) into the deleted id's slot —
+
+```
+graph:    memcpy(getGraphDataByInternalIdForWrite(id), last_element, ...)  // COW-aware, safe
+vector:   memcpy(getDataByInternalId(id),              last_data, ...)     // IN PLACE
+metadata: idToMetaData[id] = idToMetaData[curElementCount]                 // IN PLACE
+```
+
+So a snapshot holding `id < snapshot.curElementCount` would read the *right*
+links but the *wrong embedding* and a *wrong deleted-flag* — a different element
+was moved into that slot underneath it. **Graph COW alone is insufficient for
+slot safety.**
+
+### Fix: defer the swap, don't COW the other two containers
+
+COW-ing the vector + metadata containers as well would version the **largest**
+data in the index (the embeddings) to protect against a comparatively rare event
+(SWAP) — the wrong trade. The clean fix is to **not recycle a slot while a
+snapshot references it**: leave the deleted slot as a tombstone and defer the
+SWAP. A deferred slot keeps all three containers intact for the snapshot with
+**zero extra COW**.
+
+This is the job of the **horizon** — distinct from, and complementary to, the
+refcount COW:
+
+| problem | mechanism |
+|---|---|
+| graph-data version lifetime (link rewrites that don't move a slot) | **`shared_ptr` refcount COW** — automatic |
+| slot/id recycling (SWAP overwrites vector + metadata in place) | **snapshot-gated SWAP deferral** — boolean now, horizon later |
+
+Two levels, shipped in order:
+
+1. **Boolean gate (MVP):** gate `removeAndSwap` / `executeReadySwapJobs` on
+   `graphSnapshotActive()` — while *any* snapshot is live, defer all swaps
+   (deletes become tombstones). Correct and trivial; its only cost is that one
+   long-lived cursor stalls *all* compaction.
+2. **Horizon refinement (when measured to matter):** stamp each freed slot with
+   the generation at which it became free; recycle it once the **oldest live
+   snapshot is newer than that stamp** (head of the owner list). This keeps
+   compaction flowing under long-lived cursors instead of blocking it globally.
+   Worth the owner-list + per-slot free-generation bookkeeping only if tombstone
+   buildup under open cursors proves to be a real problem. (For the clone
+   decision the *newest* live snapshot — the list tail — is the relevant end; for
+   slot reclamation it is the *oldest* — the head.)
+
+### Deleted-flag consistency (decide explicitly)
+
+The metadata deleted-flag is flipped **in place** by every `markDelete`, not just
+by SWAP. So even with swap deferral, a snapshot can observe an element that was
+live at capture as *deleted* (a concurrent delete flipped its shared flag). That
+is acceptable under a "no *new* vectors after T, crash-safe" contract, but it is
+**not** strict as-of-capture consistency. Choosing strict would require
+versioning/snapshotting the flag too. This is a deliberate contract decision —
+see Open questions.
+
+## Subsystems touched
+
+1. **Block storage & graph data.** *(implemented)*
+   - `graph_data.h`: `ElementGraphData` is made **deep-copyable** via
+     `ElementGraphData::copyTo` + `ElementLevelData::copyInto`: a field-wise deep
+     copy that re-initializes a fresh `neighborsGuard` mutex on the copy (lock
+     state never aliased), copies the flexible-array link lists by `recordSize`,
+     deep-copies each level's `incomingUnidirectionalEdges`, and walks the
+     separately-allocated `others` upper levels. The mutex **stays in the struct**;
+     the "side array keyed by id" idea was spiked and rejected (it fixes lock
+     identity but not the non-COW'd write — see design-note cost #2).
+   - `graph_data_blocks.h` *(new file)*: `RootedCowStore<T>` (the reusable
+     `shared_ptr` root + generation tag + `SnapshotRegistry`), `GraphBlockBuffer`
+     (the refcounted per-block buffer), `GraphDataBlocks` (the contiguous header
+     vector + COW write paths, with the atomic CAS-fork in `cowBlock`), and the
+     `HNSWGraphSnapshot` handle. `MetaDataStore` (in `hnsw.h`) is built on the
+     same `RootedCowStore<T>` so the per-id metadata grows by COW instead of
+     reallocating, and is read through an atomically-published pointer.
+   - `hnsw.h`: `getGraphDataByInternalId` resolves through the root;
+     `getGraphDataByInternalIdForWrite` COWs backbone+block before returning a
+     writable pointer. The delete/**repair** path is the subtle one:
+     `mutuallyUpdateForRepairedNode` **COWs every node it will touch up front,
+     before taking any per-node lock**, so a held lock's buffer can't be forked
+     out from under it (the mutex-identity hazard). This is what lets repair run
+     **while a snapshot is live** rather than being deferred.
+2. **Batch iterator.** *(implemented — plain + tiered, opt-in)* Rather than
+   branch the hot live iterator, a **dedicated `HNSWSnapshot_BatchIterator`**
+   (+ single/multi subclasses) reads entirely through the captured handle with no
+   per-node locks; `hnsw_batch_iterator.h` is left untouched (zero perf/risk to
+   the default path). `newBatchIterator` captures the snapshot (under the lock,
+   released immediately) and dispatches to it **when the opt-in
+   `useGraphSnapshotIterator` query param is set**; otherwise the live iterator is
+   unchanged.
+3. **Tiered index.** *(implemented)* The cursor takes `mainIndexGuard`
+   *exclusively* only to capture the backend snapshot, then **releases it and
+   iterates lock-free** — ingestion/GC are no longer blocked for the cursor's
+   life. SWAP slot reuse (`executeReadySwapJobs`) is gated by the
+   **`maxVisibleCount()` reclaim horizon** (the boolean `graphSnapshotActive()`
+   gate was the MVP; the horizon shipped): a freed id `>=` the horizon recycles
+   immediately, below it defers. Repair runs under the snapshot (COW-before-lock,
+   item 1), so the horizon is actually reached rather than stalled behind globally
+   deferred repairs. The vector + metadata containers are still overwritten in
+   place — the horizon, not COW, is what keeps a deferred slot valid.
+4. **RediSearch boundary.** *(not yet implemented)* `src/iterators/hybrid_reader.c`
+   (and the vecsim wrappers) would construct the iterator under the lock and
+   release it for iteration, and hold the snapshot across `FT.CURSOR READ`. Also
+   needs a string param-resolver token so `FT.SEARCH` can flip
+   `useGraphSnapshotIterator` (today it is only settable programmatically).
+5. **Config & info.** *(not yet implemented)* enable flag + snapshot
+   memory/lifetime cap, and `FT.INFO`/`FT.DEBUG` exposure of active-snapshot count
+   and retained memory.
+
+## Edge cases
+
+- **Index grows during iteration** (`growByBlock` appends a block): the snapshot
+  references the old header vector / `curElementCount` at T, so new ids are
+  simply outside its view. No new vectors appear in results — exactly the
+  intended consistency.
+- **Element deleted during iteration:** marked-deleted; the snapshot still holds
+  the old buffer, so the vector remains visible (consistent with T). Physical
+  removal (SWAP) is deferred until no snapshot references the slot.
+- **Long-lived snapshot (open cursor) pins memory:** retained versions
+  accumulate. Bounded by a configurable snapshot lifetime / memory cap; on
+  breach, either refuse new snapshots or fail the cursor with a clear error
+  (decision deferred to review — see Open questions).
+- **Snapshot outlives the index drop:** the `shared_ptr` graph keeps the
+  referenced blocks alive until the last iterator is freed; index teardown must
+  not assume blocks are gone.
+- **Multiple concurrent snapshots at different generations:** each holds its own
+  root; refcounts naturally keep every still-referenced version alive.
+
+## Alternatives considered
+
+- **Doc-id watermark / "traverse edges added before T".** Rejected: HNSW evicts
+  and rewrites old nodes' edges in place, so filtering the live graph cannot
+  reconstruct the topology at T.
+- **Append-only edge list + prefix read.** Rejected: HNSW is bounded-degree and
+  prunes; making edges append-only either breaks the algorithm or degenerates
+  into a versioned log with O(log-length) replay on the hot read path.
+- **Mutate-in-place + per-snapshot overlay.** Rejected: a lock-free reader that
+  misses the overlay and reads the live block can *race* a writer that
+  inserts-then-mutates that block, yielding a newer-than-T read. Only immutable
+  versions are cleanly lock-free.
+- **`shared_ptr<vector<shared_ptr<Block>>>` (per-block pointers).** Rejected as
+  the storage layout: +23% end-to-end read regression from heap scatter
+  (prototype B). Kept the per-block COW idea but moved to contiguous headers +
+  refcounted buffers.
+- **Eager full copy at construction / `fork()`.** O(N) per query, or
+  process-fork semantics. `fork()` remains the right tool for *batch*-scoped
+  snapshots (GC/RDB) and is explicitly out of scope here (this is per-query).
+- **Safety-only weak consistency** (gate reclamation, no COW): a viable, much
+  cheaper fallback if the trivially-copyable-block refactor proves too invasive.
+  It is lock-free and crash-safe but does not isolate edge drift among existing
+  nodes. Recorded as the fallback in the design note.
+
+## Testing strategy
+
+- **C++ unit (`tests/cpptests`):** COW correctness (snapshot invariant under
+  concurrent writes), automatic reclamation (no leak after snapshot drop),
+  backbone-growth-during-snapshot, delete-during-snapshot visibility.
+- **Concurrency / sanitizers:** the snapshot-vs-writer tests under
+  `SAN=address` and TSan to prove the lock-free read path is race-free.
+- **Python end-to-end (`tests/pytests`):** KNN / hybrid / `WITHCURSOR` over a
+  vector index returning consistent results while a writer adds/deletes
+  concurrently; writer progress is observable during a long read (lock released).
+- **Benchmarks (`microbenchmark` label):** read-path regression check (must stay
+  ~flat per prototype B) and mixed read/write throughput improvement.
+
+## Open questions (resolved during implementation, except where noted)
+
+1. **Opt-in vs default.** → **Resolved: opt-in.** A query param
+   (`useGraphSnapshotIterator`, default off) selects the snapshot iterator; the
+   default path is byte-for-byte unchanged.
+2. **Memory-cap policy on breach.** → **Still open (not implemented).** No cap
+   yet bounds a long-lived cursor's retained memory; a stuck cursor pins its
+   as-of-capture memory and defers below-horizon slots indefinitely. This is the
+   main "trust it in production" gap.
+3. **Granularity of COW.** → **Resolved: per-block buffer**, as proposed. No
+   block-copy churn observed in the unit/concurrency tests; revisit only if a
+   real workload shows it.
+4. **Deleted-flag consistency level.** → **Resolved: weak** — "no new vectors
+   after T, crash-safe, slot-stable." Strict deleted-flags (versioning the
+   metadata flag) was deliberately not built. See `spec-delta.md`.
+5. **Slot deferral granularity.** → **Resolved: horizon shipped.** The boolean
+   `graphSnapshotActive()` gate was the MVP; the shipped reclaim horizon is
+   `maxVisibleCount()` (max captured `curElementCount` over live snapshots), so
+   one open cursor no longer stalls all compaction. (Note the shipped horizon is
+   count-based, not the min-live-id + per-slot-free-generation form sketched
+   earlier — see "Snapshot identity".)
diff --git a/docs/design/hnsw-snapshot/proposal.md b/docs/design/hnsw-snapshot/proposal.md
new file mode 100644
index 000000000..fa87b4aac
--- /dev/null
+++ b/docs/design/hnsw-snapshot/proposal.md
@@ -0,0 +1,72 @@
+# Add a consistent, lock-free snapshot iterator for HNSW vector search
+
+> Status: **draft proposal** for maintainer review. No code yet. A standalone
+> design exploration and two working prototypes back this proposal — see
+> [`docs/design/vecsim-hnsw-snapshot.md`](design-note.md)
+> and `prototypes/`.
+
+## Why
+
+VecSim batch iteration over an HNSW index has two coupled problems in
+multi-threaded mode:
+
+1. **Long reads block writes.** A query holds the index read lock for its
+   *entire* duration (`VecSimTieredIndex::acquireSharedLocks`, held across the
+   whole `topKQuery`/batch iteration). For a long KNN scan — and especially a
+   cursor-paginated aggregation (`FT.AGGREGATE … WITHCURSOR`) over a vector
+   index, which can stay open across many `FT.CURSOR READ` calls — the
+   background indexer and concurrent writers are stalled the whole time. This is
+   a real throughput and tail-latency problem under mixed read/write load.
+
+2. **Results are not point-in-time consistent.** VecSim explicitly documents
+   that a query "may include vectors that were added after the query was
+   submitted, with no guarantees." Results are therefore non-reproducible under
+   concurrency, which complicates testing, debugging, and any read-your-writes
+   reasoning.
+
+The reason the lock is held for the whole query is that lock-free traversal
+during writes is currently unsafe: the block backbone can be reallocated by a
+concurrent insert, and slot reuse (SWAP) and in-place link rewrites can pull
+data out from under a reader.
+
+## What Changes
+
+Introduce a **snapshot iterator** for HNSW: an iterator that is **constructed
+under the read lock, then releases it**, and iterates **lock-free while
+insertions and deletions proceed concurrently**, returning results **consistent
+with the index state at construction time**.
+
+User-visible surface:
+
+- **Consistency guarantee strengthens.** A vector query / batch iterator (KNN,
+  hybrid, and cursor-paginated vector aggregations) reflects the graph as of
+  *query start*: vectors added after the query started are not included, and a
+  vector deleted after start keeps its slot (the slot is not recycled while the
+  snapshot is live), so a *different* element never appears in its place. One
+  nuance: the deleted/in-process **flags** are weakly consistent — a delete that
+  lands after start may or may not be observed (the same best-effort the live
+  index already has); the strengthening is about *which set of vectors is
+  visible* and *slot stability*, not strict deletion timing. The current "may see
+  newer vectors" wording no longer applies when snapshotting is enabled.
+- **Writers are no longer blocked for the duration of a vector read.** Inserts
+  and deletes proceed while a query/cursor iterates, improving write throughput
+  and tail latency under mixed load.
+- **New configuration** *(planned, not yet built)* (tiered-HNSW scoped, mirroring
+  `swapJobThreshold`) to enable snapshot iteration and to **bound snapshot memory
+  / lifetime**, since a live snapshot retains superseded graph versions. (The
+  vecsim layer currently exposes the opt-in as a per-query param; the config + cap
+  are the remaining production surface.)
+- **`FT.INFO` / debug exposure** *(planned, not yet built)* of snapshot state
+  (active snapshots, retained snapshot memory) for operability.
+
+### Non-goals
+
+- **No change to the flat (brute-force) index** — its batch iterator already
+  materializes scores at construction and is unaffected.
+- **No change to non-vector iterators** (inverted index, numeric, tag).
+- **No cross-shard snapshot coordination** in cluster mode beyond per-shard
+  consistency; each shard snapshots independently, as it already executes
+  independently.
+- **No exact-recall guarantee change** — results remain approximate; the change
+  is about *which set of vectors* is visible and *when the lock is held*, not
+  about HNSW recall.
diff --git a/docs/design/hnsw-snapshot/prototypes/block_deepcopy.cpp b/docs/design/hnsw-snapshot/prototypes/block_deepcopy.cpp
new file mode 100644
index 000000000..00f90ae56
--- /dev/null
+++ b/docs/design/hnsw-snapshot/prototypes/block_deepcopy.cpp
@@ -0,0 +1,148 @@
+// Prototype C: validate the block deep-copy algorithm that Phase 1 of the
+// snapshot work needs. The real ElementGraphData has three features that make a
+// copy non-trivial, and this models all three:
+//   1. a flexible-array-member link list (`idType links[]`) sized per level,
+//   2. an embedded std::mutex that must NOT be copied (fresh on the copy),
+//   3. a heap-allocated incoming-edges vector that must be deep-copied so the
+//      snapshot never shares a mutable vector with the live index.
+//
+// Build: g++ -O2 -std=c++20 -pthread block_deepcopy.cpp -o block_deepcopy
+//
+// This mirrors deps/VectorSimilarity/.../graph_data.h closely enough to prove
+// the copyTo logic before applying it to the real (FAM + allocator) header.
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <vector>
+
+using idType = uint32_t;
+using linkListSize = uint16_t;
+
+// --- mirrors ElementLevelData (FAM links + incoming-edges pointer) ---
+struct ElementLevelData {
+  std::vector<idType>* incomingUnidirectionalEdges;
+  linkListSize numLinks;
+  idType links[];  // flexible array member
+};
+
+// --- mirrors ElementGraphData (mutex + others + inline level0) ---
+struct ElementGraphData {
+  size_t toplevel;
+  std::mutex neighborsGuard;
+  ElementLevelData* others;
+  ElementLevelData level0;
+};
+
+static size_t levelDataSize(size_t M) {
+  return sizeof(ElementLevelData) + sizeof(idType) * M;
+}
+static size_t elementGraphDataSize(size_t M0) {
+  return sizeof(ElementGraphData) + sizeof(idType) * M0;
+}
+
+// Copy one level record of `recordSize` bytes (fixed part + links FAM), giving
+// the destination an independent incoming-edges vector.
+static void copyLevelInto(ElementLevelData* dst, const ElementLevelData* src,
+                          size_t recordSize) {
+  std::memcpy(dst, src, recordSize);  // numLinks + links FAM (+ stale ptr)
+  dst->incomingUnidirectionalEdges =
+      new std::vector<idType>(*src->incomingUnidirectionalEdges);  // deep copy
+}
+
+// Deep-copy `src` graph data into raw zeroed memory `dst`. Fresh mutex.
+static void copyTo(ElementGraphData* dst, const ElementGraphData* src, size_t M,
+                   size_t M0) {
+  const size_t lds = levelDataSize(M);
+  const size_t egds = elementGraphDataSize(M0);
+  dst->toplevel = src->toplevel;
+  new (&dst->neighborsGuard) std::mutex();  // fresh; never copy lock state
+  const size_t level0Size =
+      egds - (sizeof(ElementGraphData) - sizeof(ElementLevelData));
+  copyLevelInto(&dst->level0, &src->level0, level0Size);
+  if (src->toplevel > 0) {
+    dst->others = (ElementLevelData*)std::calloc(src->toplevel, lds);
+    for (size_t i = 0; i < src->toplevel; i++) {
+      auto* s = (const ElementLevelData*)((const char*)src->others + i * lds);
+      auto* d = (ElementLevelData*)((char*)dst->others + i * lds);
+      copyLevelInto(d, s, lds);
+    }
+  } else {
+    dst->others = nullptr;
+  }
+}
+
+// Helpers to build/destroy a node the way the real ctor/destroy do.
+static ElementGraphData* makeNode(size_t toplevel, size_t M, size_t M0) {
+  void* mem = std::calloc(1, elementGraphDataSize(M0));
+  auto* gd = (ElementGraphData*)mem;
+  gd->toplevel = toplevel;
+  new (&gd->neighborsGuard) std::mutex();
+  gd->level0.incomingUnidirectionalEdges = new std::vector<idType>();
+  gd->others = toplevel ? (ElementLevelData*)std::calloc(toplevel, levelDataSize(M))
+                        : nullptr;
+  for (size_t i = 0; i < toplevel; i++) {
+    auto* ld = (ElementLevelData*)((char*)gd->others + i * levelDataSize(M));
+    ld->incomingUnidirectionalEdges = new std::vector<idType>();
+  }
+  return gd;
+}
+static ElementLevelData& levelOf(ElementGraphData* gd, size_t lvl, size_t M) {
+  return lvl == 0 ? gd->level0
+                  : *(ElementLevelData*)((char*)gd->others +
+                                         (lvl - 1) * levelDataSize(M));
+}
+
+int main() {
+  const size_t M = 16, M0 = 32, toplevel = 3;  // node present on levels 0..3
+  auto* src = makeNode(toplevel, M, M0);
+
+  // Populate links + incoming edges on each level with recognizable data.
+  for (size_t lvl = 0; lvl <= toplevel; lvl++) {
+    auto& ld = levelOf(src, lvl, M);
+    size_t cap = (lvl == 0) ? M0 : M;
+    ld.numLinks = cap / 2;
+    for (size_t j = 0; j < ld.numLinks; j++) ld.links[j] = uint32_t(lvl * 1000 + j);
+    for (size_t j = 0; j < lvl + 2; j++)
+      ld.incomingUnidirectionalEdges->push_back(uint32_t(lvl * 100 + j));
+  }
+
+  // Deep copy.
+  void* mem = std::calloc(1, elementGraphDataSize(M0));
+  auto* dst = (ElementGraphData*)mem;
+  copyTo(dst, src, M, M0);
+
+  // Verify: identical contents, but INDEPENDENT storage.
+  bool ok = (dst->toplevel == src->toplevel);
+  for (size_t lvl = 0; lvl <= toplevel && ok; lvl++) {
+    auto& s = levelOf(src, lvl, M);
+    auto& d = levelOf(dst, lvl, M);
+    if (d.numLinks != s.numLinks) { ok = false; break; }
+    if (std::memcmp(d.links, s.links, s.numLinks * sizeof(idType))) { ok = false; break; }
+    if (*d.incomingUnidirectionalEdges != *s.incomingUnidirectionalEdges) { ok = false; break; }
+    // independent pointers (not shared)
+    if (d.incomingUnidirectionalEdges == s.incomingUnidirectionalEdges) { ok = false; break; }
+    if (lvl > 0 && d.links == s.links) { ok = false; break; }
+  }
+  std::printf("contents equal + storage independent: %s\n", ok ? "YES" : "NO");
+
+  // Mutate the SOURCE after copy; destination must be unaffected (isolation).
+  levelOf(src, 0, M).links[0] = 0xFFFFFFFF;
+  src->level0.incomingUnidirectionalEdges->push_back(0xABCD);
+  bool isolated = (levelOf(dst, 0, M).links[0] == 0) &&  // dst kept 0*1000+0 == 0
+                  (dst->level0.incomingUnidirectionalEdges->back() != 0xABCD);
+  std::printf("snapshot isolated from later source mutation: %s\n",
+              isolated ? "YES" : "NO");
+
+  // Both mutexes are usable (fresh, unlocked).
+  bool locks_ok = src->neighborsGuard.try_lock() && dst->neighborsGuard.try_lock();
+  if (locks_ok) { src->neighborsGuard.unlock(); dst->neighborsGuard.unlock(); }
+  std::printf("both mutexes fresh & lockable: %s\n", locks_ok ? "YES" : "NO");
+
+  bool pass = ok && isolated && locks_ok;
+  std::printf("\n==== PROTOTYPE C %s ====\n", pass ? "PASSED" : "FAILED");
+  return pass ? 0 : 1;
+}
diff --git a/docs/design/hnsw-snapshot/prototypes/cow_snapshot.cpp b/docs/design/hnsw-snapshot/prototypes/cow_snapshot.cpp
new file mode 100644
index 000000000..e5e5a61c0
--- /dev/null
+++ b/docs/design/hnsw-snapshot/prototypes/cow_snapshot.cpp
@@ -0,0 +1,197 @@
+// Prototype A: persistent rooted copy-on-write snapshot for an HNSW-like block
+// store. Proves:
+//   1. O(1) snapshot capture (clone the root shared_ptr).
+//   2. Lock-free reads from a snapshot stay CONSISTENT while a writer thread
+//      concurrently mutates edges, COWs blocks, and grows the backbone.
+//   3. Automatic refcount-driven cleanup: dropping the snapshot frees the old
+//      blocks/backbones with no remaining references.
+//
+// Build: g++ -O2 -std=c++20 -pthread cow_snapshot.cpp -o cow_snapshot
+//
+// This is a standalone model of the structure
+//   shared_ptr< vector< shared_ptr<Block> > >
+// described in docs/design/vecsim-hnsw-snapshot.md. It is NOT wired into VecSim;
+// it exists to validate the mechanism and its concurrency story in isolation.
+
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <shared_mutex>
+#include <thread>
+#include <vector>
+
+// ----- Block: models a DataBlock holding `blockSize` elements, each with an
+// edge list. We track live instances to prove cleanup.
+struct Block {
+  static inline std::atomic<int64_t> live{0};
+
+  // elems[e] is the link list (neighbors) of element e in this block.
+  std::vector<std::vector<uint32_t>> elems;
+
+  explicit Block(size_t blockSize) : elems(blockSize) { live.fetch_add(1); }
+  // Deep copy ctor — this is what block-COW invokes.
+  Block(const Block& o) : elems(o.elems) { live.fetch_add(1); }
+  ~Block() { live.fetch_sub(1); }
+};
+
+using Backbone = std::vector<std::shared_ptr<Block>>;
+using Root = std::shared_ptr<Backbone>;
+
+// ----- The index: a root pointer guarded by a shared_mutex, mirroring VecSim's
+// indexDataGuard. Writers take it exclusive; snapshots take it shared just long
+// enough to clone the root, then iterate lock-free.
+class CowIndex {
+ public:
+  CowIndex(size_t nBlocks, size_t blockSize) : blockSize_(blockSize) {
+    auto bb = std::make_shared<Backbone>();
+    for (size_t i = 0; i < nBlocks; i++) {
+      auto blk = std::make_shared<Block>(blockSize);
+      for (size_t e = 0; e < blockSize; e++)
+        blk->elems[e] = {uint32_t(i), uint32_t(e)};  // some initial edges
+      bb->push_back(std::move(blk));
+    }
+    root_ = std::move(bb);
+  }
+
+  // O(1) snapshot: clone the root under a shared lock, then it's detached.
+  Root snapshot() {
+    std::shared_lock lk(guard_);
+    return root_;  // shared_ptr copy: bumps backbone refcount, O(1)
+  }
+
+  size_t blockSize() const { return blockSize_; }
+
+  // A write: mutate the edge list of (block b, element e). Performs generational
+  // backbone-COW then per-block COW, all under the exclusive lock.
+  void mutateEdges(size_t b, size_t e, uint32_t tag) {
+    std::unique_lock lk(guard_);
+    cowBackboneIfShared();
+    auto& slot = (*root_)[b];
+    if (slot.use_count() > 1)               // shared with a live snapshot
+      slot = std::make_shared<Block>(*slot);  // deep-copy this block
+    slot->elems[e] = {tag, tag, tag};       // mutate the private copy
+  }
+
+  // Grow: append a new block (the rare backbone-membership change).
+  void addBlock() {
+    std::unique_lock lk(guard_);
+    cowBackboneIfShared();
+    auto blk = std::make_shared<Block>(blockSize_);
+    root_->push_back(std::move(blk));
+  }
+
+  size_t numBlocks() {
+    std::shared_lock lk(guard_);
+    return root_->size();
+  }
+
+ private:
+  // Path-copy the backbone once per generation: only if a snapshot shares it.
+  void cowBackboneIfShared() {
+    if (root_.use_count() > 1 || /* the backbone object itself */ false) {
+      // Copy the vector of block shared_ptrs (cheap: N/blockSize pointers).
+      root_ = std::make_shared<Backbone>(*root_);
+    }
+  }
+
+  std::shared_mutex guard_;
+  Root root_;
+  size_t blockSize_;
+};
+
+// Checksum over an entire snapshot's edges — used to assert consistency.
+static uint64_t checksum(const Root& snap) {
+  uint64_t h = 1469598103934665603ull;  // FNV-1a
+  for (auto& blk : *snap)
+    for (auto& edges : blk->elems)
+      for (uint32_t v : edges) {
+        h ^= v;
+        h *= 1099511628211ull;
+      }
+  return h;
+}
+
+int main() {
+  const size_t nBlocks = 64, blockSize = 1024;
+  CowIndex idx(nBlocks, blockSize);
+
+  std::printf("initial live blocks: %lld (expect %zu)\n",
+              (long long)Block::live.load(), nBlocks);
+
+  // Capture a snapshot and its baseline checksum (lock-free reads hereafter).
+  Root snap = idx.snapshot();
+  const uint64_t baseline = checksum(snap);
+  std::printf("snapshot captured, baseline checksum = %016llx\n",
+              (unsigned long long)baseline);
+
+  // Writer thread: hammer the index with edge mutations + block growth.
+  std::atomic<bool> stop{false};
+  std::atomic<uint64_t> writes{0};
+  std::thread writer([&] {
+    std::mt19937 rng(12345);
+    while (!stop.load(std::memory_order_relaxed)) {
+      size_t nb = idx.numBlocks();
+      size_t b = rng() % nb;
+      size_t e = rng() % blockSize;
+      idx.mutateEdges(b, e, 0xDEADBEEF);
+      if ((writes++ & 0x3FFF) == 0) idx.addBlock();  // occasionally grow
+    }
+  });
+
+  // Main thread: repeatedly re-read the SNAPSHOT lock-free and verify it never
+  // changes, while the live index is being mutated underneath.
+  bool consistent = true;
+  for (int i = 0; i < 2000; i++) {
+    if (checksum(snap) != baseline) {
+      consistent = false;
+      break;
+    }
+    std::this_thread::yield();
+  }
+
+  stop.store(true);
+  writer.join();
+
+  // The live index HAS changed (sanity: the snapshot diverges from live).
+  Root liveNow = idx.snapshot();
+  uint64_t liveChecksum = 0;
+  // Only compare the original blocks (live has more after growth).
+  {
+    uint64_t h = 1469598103934665603ull;
+    for (size_t i = 0; i < nBlocks; i++)
+      for (auto& edges : (*liveNow)[i]->elems)
+        for (uint32_t v : edges) { h ^= v; h *= 1099511628211ull; }
+    liveChecksum = h;
+  }
+
+  std::printf("writes performed: %llu\n", (unsigned long long)writes.load());
+  std::printf("snapshot consistent during writes: %s\n",
+              consistent ? "YES" : "NO");
+  std::printf("snapshot checksum still = %016llx\n",
+              (unsigned long long)checksum(snap));
+  std::printf("live   checksum now     = %016llx (differs: %s)\n",
+              (unsigned long long)liveChecksum,
+              liveChecksum != baseline ? "YES" : "no");
+
+  int64_t whileSnapHeld = Block::live.load();
+  std::printf("live blocks while snapshot held: %lld (>= %zu: COW copies + growth)\n",
+              (long long)whileSnapHeld, nBlocks);
+
+  // Drop the snapshot -> automatic cleanup of versions only it referenced.
+  snap.reset();
+  liveNow.reset();
+  int64_t afterDrop = Block::live.load();
+  std::printf("live blocks after dropping snapshot: %lld\n", (long long)afterDrop);
+  std::printf("(should equal current live backbone size = %zu)\n", idx.numBlocks());
+
+  bool cleanupOk = (afterDrop == (int64_t)idx.numBlocks());
+  std::printf("automatic cleanup correct: %s\n", cleanupOk ? "YES" : "NO");
+
+  bool pass = consistent && cleanupOk && (liveChecksum != baseline);
+  std::printf("\n==== PROTOTYPE A %s ====\n", pass ? "PASSED" : "FAILED");
+  return pass ? 0 : 1;
+}
diff --git a/docs/design/hnsw-snapshot/prototypes/read_indirection_bench.cpp b/docs/design/hnsw-snapshot/prototypes/read_indirection_bench.cpp
new file mode 100644
index 000000000..0939e2522
--- /dev/null
+++ b/docs/design/hnsw-snapshot/prototypes/read_indirection_bench.cpp
@@ -0,0 +1,205 @@
+// Prototype B: quantify the read-path cost of the rooted-COW layout vs today's
+// flat layout, in an HNSW-traversal-like dependent random walk.
+//
+//   FLAT     : vector<Block>                          (today: blocks[id/bs].data)
+//   INDIRECT : shared_ptr<vector<shared_ptr<Block>>>  (proposed rooted COW)
+//
+// Both store edges identically in a per-block heap buffer, so the ONLY
+// difference measured is the container indirection (extra pointer hop + the
+// block headers no longer being contiguous). Element-data locality is identical
+// in both, matching the real DataBlock (data lives in a separate heap buffer).
+//
+// Build: g++ -O2 -std=c++20 cow_snapshot.cpp ... no; standalone:
+//        g++ -O2 -std=c++20 read_indirection_bench.cpp -o read_indirection_bench
+
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <random>
+#include <vector>
+
+static constexpr size_t M = 16;            // links per element (HNSW default)
+static constexpr size_t BS = 1024;         // block size
+static constexpr size_t N = 1'000'000;     // elements
+static constexpr size_t NB = (N + BS - 1) / BS;
+static constexpr size_t STEPS = 20'000'000;  // walk length
+static constexpr size_t DIM = 128;         // vector dim for the distance calc
+
+struct Block {
+  uint32_t* data;  // BS * M link ids, like ElementLevelData::links[] per elem
+  Block() { data = new uint32_t[BS * M]; }
+  ~Block() { delete[] data; }
+  Block(const Block&) = delete;
+};
+
+int main() {
+  std::mt19937 rng(98765);
+
+  // ---- FLAT layout: vector<Block> (headers contiguous, data in heap buffers)
+  std::vector<Block> flat(NB);
+  // ---- INDIRECT layout: shared_ptr<vector<shared_ptr<Block>>>
+  auto bb = std::make_shared<std::vector<std::shared_ptr<Block>>>();
+  bb->reserve(NB);
+  for (size_t i = 0; i < NB; i++) bb->push_back(std::make_shared<Block>());
+  auto root = std::make_shared<decltype(bb)::element_type>(*bb);
+
+  // Same random edge data in both layouts.
+  for (size_t b = 0; b < NB; b++)
+    for (size_t k = 0; k < BS * M; k++) {
+      uint32_t v = rng() % N;
+      flat[b].data[k] = v;
+      (*root)[b]->data[k] = v;
+    }
+
+  // Shared vector data (one copy, used by both layouts) for a realistic
+  // distance computation per visit. This is the dominant per-visit cost in real
+  // HNSW and is identical between layouts; only link lookup differs.
+  std::vector<float> vecs(N * DIM);
+  for (auto& f : vecs) f = float(rng()) / float(rng.max());
+  std::vector<float> query(DIM);
+  for (auto& f : query) f = float(rng()) / float(rng.max());
+  auto l2 = [&](uint32_t id) {
+    const float* v = vecs.data() + size_t(id) * DIM;
+    float s = 0;
+    for (size_t d = 0; d < DIM; d++) {
+      float diff = v[d] - query[d];
+      s += diff * diff;
+    }
+    return s;
+  };
+
+  auto walk_flat = [&](uint32_t start) {
+    uint32_t cur = start;
+    uint64_t acc = 0;
+    for (size_t s = 0; s < STEPS; s++) {
+      size_t b = cur / BS, e = cur % BS;
+      const uint32_t* links = flat[b].data + e * M;
+      uint32_t nxt = links[cur % M];
+      acc += nxt;
+      cur = nxt;
+    }
+    return acc;
+  };
+  auto walk_indirect = [&](uint32_t start) {
+    const auto& R = *root;
+    uint32_t cur = start;
+    uint64_t acc = 0;
+    for (size_t s = 0; s < STEPS; s++) {
+      size_t b = cur / BS, e = cur % BS;
+      const uint32_t* links = R[b]->data + e * M;  // extra hop: R[b]-> ...
+      uint32_t nxt = links[cur % M];
+      acc += nxt;
+      cur = nxt;
+    }
+    return acc;
+  };
+
+  auto bench = [&](const char* name, auto fn) {
+    volatile uint64_t sink = 0;
+    double best = 1e30;
+    for (int trial = 0; trial < 3; trial++) {
+      auto t0 = std::chrono::steady_clock::now();
+      sink = fn(123457u);
+      auto t1 = std::chrono::steady_clock::now();
+      double ns = std::chrono::duration<double, std::nano>(t1 - t0).count();
+      best = std::min(best, ns / STEPS);
+    }
+    std::printf("%-10s : %.3f ns/visit\n", name, best);
+    (void)sink;
+    return best;
+  };
+
+  // Realistic variants: same walk, but compute a distance per visit.
+  auto walk_flat_dist = [&](uint32_t start) {
+    uint32_t cur = start;
+    double acc = 0;
+    for (size_t s = 0; s < STEPS; s++) {
+      size_t b = cur / BS, e = cur % BS;
+      const uint32_t* links = flat[b].data + e * M;
+      uint32_t nxt = links[cur % M];
+      acc += l2(nxt);
+      cur = nxt;
+    }
+    return uint64_t(acc);
+  };
+  auto walk_indirect_dist = [&](uint32_t start) {
+    const auto& R = *root;
+    uint32_t cur = start;
+    double acc = 0;
+    for (size_t s = 0; s < STEPS; s++) {
+      size_t b = cur / BS, e = cur % BS;
+      const uint32_t* links = R[b]->data + e * M;
+      uint32_t nxt = links[cur % M];
+      acc += l2(nxt);
+      cur = nxt;
+    }
+    return uint64_t(acc);
+  };
+
+  // Third layout: shared_ptr<vector<Block>> — blocks BY VALUE in the vector
+  // (the original "shared_ptr<vector<DataBlock>>" idea). Aliases `flat` (no-op
+  // deleter) so it reads the same data. Backbone access is a contiguous-array
+  // load like FLAT; isolates whether the cost is the per-block shared_ptr.
+  std::shared_ptr<std::vector<Block>> rootVal(&flat, [](std::vector<Block>*) {});
+  auto walk_val = [&](uint32_t start) {
+    const auto& R = *rootVal;
+    uint32_t cur = start;
+    uint64_t acc = 0;
+    for (size_t s = 0; s < STEPS; s++) {
+      size_t b = cur / BS, e = cur % BS;
+      const uint32_t* links = R[b].data + e * M;  // contiguous header, by value
+      uint32_t nxt = links[cur % M];
+      acc += nxt;
+      cur = nxt;
+    }
+    return acc;
+  };
+
+  // Fourth layout: shared_ptr<vector<BlockSP>> where each BlockSP holds a
+  // REFCOUNTED data buffer (shared_ptr<uint32_t[]>). Headers stay contiguous
+  // (fast reads) AND each block's buffer can be COW'd independently (cheap
+  // writes). This is the candidate "best of both" structure.
+  struct BlockSP {
+    std::shared_ptr<uint32_t[]> data;
+  };
+  auto rootSP = std::make_shared<std::vector<BlockSP>>(NB);
+  for (size_t b = 0; b < NB; b++) {
+    (*rootSP)[b].data = std::shared_ptr<uint32_t[]>(new uint32_t[BS * M]);
+    for (size_t k = 0; k < BS * M; k++) (*rootSP)[b].data[k] = flat[b].data[k];
+  }
+  auto walk_sp = [&](uint32_t start) {
+    const auto& R = *rootSP;
+    uint32_t cur = start;
+    uint64_t acc = 0;
+    for (size_t s = 0; s < STEPS; s++) {
+      size_t b = cur / BS, e = cur % BS;
+      const uint32_t* links = R[b].data.get() + e * M;  // contiguous hdr -> buf
+      uint32_t nxt = links[cur % M];
+      acc += nxt;
+      cur = nxt;
+    }
+    return acc;
+  };
+
+  std::printf("walk of %zu steps over %zu elements (%zu blocks), dim=%zu\n",
+              STEPS, N, NB, DIM);
+  std::printf("\n-- link lookup only (isolates the indirection) --\n");
+  double f = bench("FLAT", walk_flat);
+  double v = bench("VAL-vec", walk_val);
+  double sp = bench("VAL+spbuf", walk_sp);
+  double i = bench("INDIRECT", walk_indirect);
+  std::printf("VAL+spbuf (contiguous hdr + refcounted buffer) vs FLAT: %+.1f%%\n",
+              100.0 * (sp - f) / f);
+  std::printf("shared_ptr<vector<Block>>  overhead vs FLAT: %+.1f%%\n",
+              100.0 * (v - f) / f);
+  std::printf("shared_ptr<vec<shared_ptr>> overhead vs FLAT: %+.1f%% (%.3f ns)\n",
+              100.0 * (i - f) / f, i - f);
+
+  std::printf("\n-- with distance calc (realistic per-visit work) --\n");
+  double fd = bench("FLAT+dist", walk_flat_dist);
+  double id = bench("INDIR+dist", walk_indirect_dist);
+  std::printf("indirection overhead: %+.1f%% (%.3f ns/visit)\n",
+              100.0 * (id - fd) / fd, id - fd);
+  return 0;
+}
diff --git a/docs/design/hnsw-snapshot/prototypes/snapshot_poc.cpp b/docs/design/hnsw-snapshot/prototypes/snapshot_poc.cpp
new file mode 100644
index 000000000..de1d35e38
--- /dev/null
+++ b/docs/design/hnsw-snapshot/prototypes/snapshot_poc.cpp
@@ -0,0 +1,279 @@
+// POC: lock-free consistent snapshot iteration over a graph built from the REAL
+// VecSim node structures (`ElementGraphData`/`ElementLevelData` from
+// graph_data.h) and the REAL `copyTo` deep-copy primitive added in Phase 1.
+//
+// It is a minimal (level-0) graph index — not full HNSW search quality — whose
+// only purpose is to answer "is the snapshot mechanism implementable against
+// VecSim's actual node representation?" It demonstrates, end to end:
+//   * rooted COW block storage  shared_ptr<vector<Block{shared_ptr buf}>>
+//   * snapshot capture under a read lock, then release (O(1) shared_ptr copy)
+//   * a graph traversal run lock-free from the snapshot
+//   * a concurrent writer that adds nodes + rewires edges, COW-ing real blocks
+//     via ElementGraphData::copyTo
+//   * the snapshot's query result staying invariant while the live graph diverges
+//   * automatic refcount cleanup (allocator bytes return to the live baseline)
+//
+// Build (from this dir):
+//   g++ -O2 -std=c++20 -pthread -I ../../../../src \
+//       snapshot_poc.cpp ../../../../src/VecSim/memory/vecsim_malloc.cpp \
+//       -o snapshot_poc
+
+#include "VecSim/algorithms/hnsw/graph_data.h"
+#include "VecSim/memory/vecsim_malloc.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <queue>
+#include <random>
+#include <shared_mutex>
+#include <thread>
+#include <vector>
+
+using Alloc = std::shared_ptr<VecSimAllocator>;
+static constexpr idType INVALID = (idType)-1;
+
+struct Block {
+  std::shared_ptr<char[]> data;  // blockSize * elementGraphDataSize bytes
+};
+using Backbone = std::vector<Block>;
+using Root = std::shared_ptr<Backbone>;
+
+// Immutable snapshot handle: captured under the read lock, then used lock-free.
+struct Snapshot {
+  Root root;
+  idType entry;
+  size_t count, blockSize, egds, lds;
+  ElementGraphData *at(idType id) const {
+    return (ElementGraphData *)((*root)[id / blockSize].data.get() +
+                                (id % blockSize) * egds);
+  }
+};
+
+class SnapGraph {
+ public:
+  SnapGraph(size_t M = 16, size_t M0 = 32, size_t blockSize = 64)
+      : M_(M), M0_(M0), blockSize_(blockSize),
+        alloc_(VecSimAllocator::newVecsimAllocator()) {
+    lds_ = sizeof(ElementLevelData) + sizeof(idType) * M_;
+    egds_ = sizeof(ElementGraphData) + sizeof(idType) * M0_;
+    root_ = std::make_shared<Backbone>();
+  }
+
+  size_t allocatedBytes() { return alloc_->getAllocationSize(); }
+
+  Snapshot snapshot() {
+    std::shared_lock lk(guard_);
+    return Snapshot{root_, entry_, count_, blockSize_, egds_, lds_};
+  }
+
+  // Append a node (no links yet). Returns its id.
+  idType addNode() {
+    std::unique_lock lk(guard_);
+    idType id = (idType)count_;
+    if (id % blockSize_ == 0) {  // need a new block
+      cowBackbone();
+      root_->push_back(makeBlock());
+    }
+    count_++;
+    if (entry_ == INVALID) entry_ = id;
+    return id;
+  }
+
+  // Add directed edge a -> b at level 0 (COW-aware).
+  void connect(idType a, idType b) {
+    std::unique_lock lk(guard_);
+    mutateNode(a, [&](ElementLevelData &ld) {
+      if (ld.getNumLinks() < (linkListSize)M0_) ld.appendLink(b);
+    });
+  }
+
+  // Replace a's level-0 links with `neighbors` (COW-aware) — used to diverge.
+  void rewire(idType a, const std::vector<idType> &neighbors) {
+    std::unique_lock lk(guard_);
+    mutateNode(a, [&](ElementLevelData &ld) {
+      vecsim_stl::vector<idType> v(alloc_);
+      for (idType n : neighbors)
+        if (v.size() < M0_) v.push_back(n);
+      ld.setLinks(v);
+    });
+  }
+
+  size_t liveCount() {
+    std::shared_lock lk(guard_);
+    return count_;
+  }
+
+ private:
+  ElementGraphData *at(Root &r, idType id) {
+    return (ElementGraphData *)((*r)[id / blockSize_].data.get() +
+                                (id % blockSize_) * egds_);
+  }
+
+  // Backbone COW: copy the header vector once per generation (cheap; shares the
+  // block buffers by refcount). Only when a snapshot is sharing it.
+  void cowBackbone() {
+    if (root_.use_count() > 1) root_ = std::make_shared<Backbone>(*root_);
+  }
+
+  // Per-block COW via the REAL ElementGraphData::copyTo. Only when a snapshot
+  // shares this block's buffer.
+  void cowBlock(idType id) {
+    Block &blk = (*root_)[id / blockSize_];
+    if (blk.data.use_count() > 1) {
+      char *nraw = (char *)alloc_->allocate(blockSize_ * egds_);
+      for (size_t i = 0; i < blockSize_; i++) {
+        auto *src = (ElementGraphData *)(blk.data.get() + i * egds_);
+        auto *dst = (ElementGraphData *)(nraw + i * egds_);
+        src->copyTo(dst, lds_, egds_, alloc_);  // <-- exercises Phase 1 primitive
+      }
+      blk.data = makeBufferOwner(nraw);
+    }
+  }
+
+  template <typename F>
+  void mutateNode(idType id, F &&fn) {
+    cowBackbone();
+    cowBlock(id);
+    fn(at(root_, id)->getElementLevelData(0, lds_));
+  }
+
+  std::shared_ptr<char[]> makeBufferOwner(char *raw) {
+    Alloc a = alloc_;
+    size_t bs = blockSize_, e = egds_, l = lds_;
+    return std::shared_ptr<char[]>(raw, [a, bs, e, l](char *p) {
+      for (size_t i = 0; i < bs; i++)
+        ((ElementGraphData *)(p + i * e))->destroy(l, a);
+      a->free_allocation(p);
+    });
+  }
+
+  Block makeBlock() {
+    char *raw = (char *)alloc_->allocate(blockSize_ * egds_);
+    for (size_t i = 0; i < blockSize_; i++)
+      new (raw + i * egds_) ElementGraphData(0, lds_, alloc_);  // level-0 only
+    return Block{makeBufferOwner(raw)};
+  }
+
+  size_t M_, M0_, blockSize_, lds_, egds_;
+  Alloc alloc_;
+  std::shared_mutex guard_;
+  Root root_;
+  idType entry_ = INVALID;
+  size_t count_ = 0;
+};
+
+// Lock-free traversal of a snapshot: BFS from the entry over level-0 links,
+// collect reachable ids, return an order-independent hash of the closest `k` to
+// `q` (distance = |id - q|). Result depends only on the snapshot's topology.
+static uint64_t query(const Snapshot &s, idType q, size_t k) {
+  if (s.count == 0) return 0;
+  std::vector<char> seen(s.count, 0);
+  std::queue<idType> bfs;
+  std::vector<idType> reached;
+  bfs.push(s.entry);
+  seen[s.entry] = 1;
+  while (!bfs.empty()) {
+    idType cur = bfs.front();
+    bfs.pop();
+    reached.push_back(cur);
+    ElementLevelData &ld = s.at(cur)->getElementLevelData(0, s.lds);
+    for (linkListSize j = 0; j < ld.getNumLinks(); j++) {
+      idType nb = ld.getLinkAtPos(j);
+      if (nb < s.count && !seen[nb]) {
+        seen[nb] = 1;
+        bfs.push(nb);
+      }
+    }
+  }
+  auto dist = [&](idType id) {
+    return id > q ? (uint64_t)(id - q) : (uint64_t)(q - id);
+  };
+  std::sort(reached.begin(), reached.end(),
+            [&](idType a, idType b) { return dist(a) < dist(b); });
+  if (reached.size() > k) reached.resize(k);
+  std::sort(reached.begin(), reached.end());  // order-independent
+  uint64_t h = 1469598103934665603ull;
+  for (idType id : reached) {
+    h ^= id;
+    h *= 1099511628211ull;
+  }
+  return h;
+}
+
+int main() {
+  SnapGraph g;
+  const size_t N = 2000;
+  std::mt19937 rng(2024);
+
+  // Build a connected graph: ring + random forward edges so BFS reaches a lot.
+  for (size_t i = 0; i < N; i++) g.addNode();
+  for (idType i = 0; i < N; i++) {
+    g.connect(i, (i + 1) % N);
+    for (int e = 0; e < 6; e++) g.connect(i, rng() % N);
+  }
+
+  size_t baseBytes = g.allocatedBytes();
+  Snapshot snap = g.snapshot();
+  const idType q = 1000;
+  const uint64_t baseline = query(snap, q, 32);
+  std::printf("baseline snapshot query = %016llx\n", (unsigned long long)baseline);
+
+  // Concurrent writer: rewire random nodes (changes reachability) + add nodes.
+  std::atomic<bool> stop{false};
+  std::atomic<uint64_t> ops{0};
+  std::thread writer([&] {
+    std::mt19937 wr(777);
+    while (!stop.load(std::memory_order_relaxed)) {
+      size_t n = g.liveCount();
+      idType a = wr() % n;
+      std::vector<idType> nb;
+      for (int e = 0; e < 8; e++) nb.push_back(wr() % n);
+      g.rewire(a, nb);
+      if ((ops++ & 0xFFF) == 0) {
+        idType x = g.addNode();
+        g.connect(x, wr() % g.liveCount());
+      }
+    }
+  });
+
+  // Read the snapshot lock-free, repeatedly, while writes happen.
+  bool consistent = true;
+  for (int i = 0; i < 5000; i++) {
+    if (query(snap, q, 32) != baseline) { consistent = false; break; }
+    std::this_thread::yield();
+  }
+
+  stop.store(true);
+  writer.join();
+
+  // Deterministically diverge the LIVE graph: point the entry node only at
+  // itself, so a live BFS reaches just {entry}. The snapshot must be unaffected.
+  g.rewire(0, {0});
+  Snapshot liveNow = g.snapshot();
+  uint64_t liveResult = query(liveNow, q, 32);
+  uint64_t snapResult = query(snap, q, 32);
+
+  std::printf("writer ops: %llu\n", (unsigned long long)ops.load());
+  std::printf("snapshot stayed consistent during writes: %s\n",
+              consistent ? "YES" : "NO");
+  std::printf("snapshot query still = %016llx (== baseline: %s)\n",
+              (unsigned long long)snapResult, snapResult == baseline ? "YES" : "NO");
+  std::printf("live query after edit = %016llx (diverged: %s)\n",
+              (unsigned long long)liveResult, liveResult != baseline ? "YES" : "no");
+
+  size_t heldBytes = g.allocatedBytes();
+  snap.root.reset();
+  liveNow.root.reset();
+  size_t afterBytes = g.allocatedBytes();
+  std::printf("allocator bytes: baseline=%zu, while-held=%zu, after-drop=%zu\n",
+              baseBytes, heldBytes, afterBytes);
+  bool reclaimed = afterBytes < heldBytes;  // COW versions freed on snapshot drop
+
+  bool pass = consistent && (snapResult == baseline) && (liveResult != baseline) &&
+              reclaimed;
+  std::printf("\n==== SNAPSHOT POC %s ====\n", pass ? "PASSED" : "FAILED");
+  return pass ? 0 : 1;
+}
diff --git a/docs/design/hnsw-snapshot/spec-delta.md b/docs/design/hnsw-snapshot/spec-delta.md
new file mode 100644
index 000000000..d442f5673
--- /dev/null
+++ b/docs/design/hnsw-snapshot/spec-delta.md
@@ -0,0 +1,85 @@
+# vector-snapshot-iteration (delta)
+
+> On merge, the requirements below fold into
+> `openspec/specs/vector-snapshot-iteration/spec.md`.
+
+## ADDED Requirements
+
+### Requirement: Vector batch iteration is consistent as of construction
+When snapshot iteration is enabled, a vector batch iterator (KNN, hybrid, and
+cursor-paginated vector aggregation) over an HNSW index SHALL return results
+consistent with the index state at the time the iterator was constructed.
+
+#### Scenario: Vectors added after construction are not returned
+- **WHEN** a batch iterator is constructed over an HNSW index
+- **AND** new vectors are added to the index while the iterator is being consumed
+- **THEN** results returned by the iterator SHALL NOT include the vectors added
+  after construction
+
+#### Scenario: Vectors deleted after construction remain visible
+- **WHEN** a batch iterator is constructed over an HNSW index containing a
+  vector `v`
+- **AND** `v` is deleted while the iterator is being consumed
+- **THEN** `v`'s **slot is not recycled** while the snapshot can still read it —
+  its id is below the reclaim horizon (`maxVisibleCount`), so the SWAP is deferred
+  — and the snapshot continues to read `v`'s own embedding and identity, never a
+  different element wrongly occupying `v`'s slot. (Slots at/above the horizon —
+  ids no live snapshot can see — still recycle promptly.)
+
+#### Scenario: Deleted-flag consistency is weak (the chosen contract)
+> **Decision (task 4.8).** `markDelete` flips the deleted flag **in place** in the
+> single-version `idToMetaData` container; copy-on-write versions only the graph,
+> and SWAP deferral protects only against *slot recycling*, not against a flag
+> flip on the slot's own metadata.
+- **WHEN** a snapshot is captured over an index containing a live vector `v`
+- **AND** `v` is marked deleted while the snapshot is held
+- **THEN** the snapshot MAY observe `v` as deleted (and filter it out), even
+  though `v` was live at capture. This is the **accepted weak contract**: "no
+  *new* vectors after T, crash-safe, slot-stable." Strict as-of-capture
+  deleted-flags would require versioning the metadata flag and is tracked as
+  separate work (not built).
+
+#### Scenario: Repeated reads of the same snapshot are stable
+- **WHEN** a snapshot iterator is constructed
+- **AND** the index is modified concurrently by inserts and deletes
+- **THEN** the set of candidate elements the iterator traverses SHALL be the
+  point-in-time set captured at construction, unaffected by those modifications
+
+### Requirement: Iteration does not hold the index lock
+When snapshot iteration is enabled, a vector batch iterator SHALL acquire the
+index read lock only to construct (capture) the snapshot, and SHALL NOT hold any
+index lock while producing results.
+
+#### Scenario: Writers progress during a long read
+- **WHEN** a long-running vector iteration (e.g. a paginated cursor) is in
+  progress
+- **AND** a concurrent client inserts or deletes vectors
+- **THEN** the inserts and deletes SHALL complete without waiting for the
+  iteration to finish
+
+### Requirement: Snapshot memory is bounded
+> **Status: NOT yet implemented.** The reclaim horizon keeps compaction flowing
+> for above-horizon churn, but nothing yet caps a single long-lived cursor's
+> retained memory or its indefinitely-deferred below-horizon slots. This
+> requirement is the remaining production-safety gap (tasks.md 6.2).
+
+A configured limit SHALL bound the memory retained by live snapshots. When the
+limit would be exceeded, the system SHALL take a deterministic, documented
+action rather than growing retained memory without bound.
+
+#### Scenario: Limit is enforced for a long-lived cursor
+- **WHEN** snapshot iteration is enabled with a memory/lifetime limit
+- **AND** a cursor is held open while concurrent writes accumulate superseded
+  graph versions up to the limit
+- **THEN** the system SHALL enforce the limit by the configured on-breach action
+  (refuse new snapshots or terminate the offending cursor with a clear error)
+- **AND** retained snapshot memory SHALL NOT grow beyond the configured bound
+
+### Requirement: No regression when snapshots are inactive
+When no snapshot is active, vector indexing and query operations SHALL behave as
+before this change, with no additional copy-on-write or locking overhead.
+
+#### Scenario: In-place writes with no active snapshot
+- **WHEN** no snapshot iterator is live
+- **AND** vectors are inserted or deleted
+- **THEN** the index SHALL mutate its graph in place without copy-on-write
diff --git a/docs/design/hnsw-snapshot/tasks.md b/docs/design/hnsw-snapshot/tasks.md
new file mode 100644
index 000000000..f532c785b
--- /dev/null
+++ b/docs/design/hnsw-snapshot/tasks.md
@@ -0,0 +1,276 @@
+# Tasks: HNSW snapshot iterator
+
+> Each top-level task is roughly one reviewable PR/commit. Phases 1–2 are the
+> risky structural work; later phases depend on them. Test tasks are explicit: a
+> change is not done until new behavior is covered and the suites are green.
+
+## 1. Make the graph block deep-copyable (`deps/VectorSimilarity`)
+
+> **Decision (changed during implementation):** instead of moving the mutex out
+> to a side array (original 1.1), the block is made *deep-copyable* via an
+> explicit `copyTo` that re-initializes a fresh mutex on the copy and deep-copies
+> each level's incoming-edges vector. Rationale: the side-array refactor touches
+> every `lockNodeLinks`/`getElementLevelData` call site (large blast radius, high
+> miscompile risk), whereas value-semantic deep copy achieves the same goal —
+> a block buffer that can be COW'd without sharing mutable state — with **zero**
+> call-site churn. The mutex stays in `ElementGraphData`; the copy never aliases
+> its lock state (COW runs under the exclusive write lock, before any
+> per-node lock is taken). Algorithm validated in
+> `docs/design/prototypes/block_deepcopy.cpp`.
+
+- [x] 1.1 Add `ElementGraphData::copyTo` + `ElementLevelData::copyInto` deep-copy
+      primitives (fresh mutex, independent incoming-edges, FAM-aware) in
+      `graph_data.h`; compiles against the real headers (`-fsyntax-only`)
+- [x] 1.2 Handle upper levels (`others`) in `copyTo`; the
+      `incomingUnidirectionalEdges` copy uses the index's allocator
+- [x] 1.3 C++ unit test in `tests/unit` exercising `copyTo` on a
+      multi-level node (contents equal, storage independent, source-mutation
+      isolation)
+- [x] 1.4 Build VecSim unit suite green with the primitive in place
+
+## 2. Rooted copy-on-write block storage
+
+- [x] 2.1 Refcounted `GraphBlockBuffer` + the rooted `shared_ptr<vector<Block>>`
+      storage (`RootedCowStore<T>` + `GraphDataBlocks` in `graph_data_blocks.h`)
+- [x] 2.2 Resolve `getGraphDataByInternalId` / `getElement` through `root`
+- [x] 2.3 Backbone COW (forked once at capture; never re-forked under the shared
+      lock — see 4.6 / design.md "Operations")
+- [x] 2.4 Per-block buffer COW in the write paths (insert + repair), with the
+      atomic CAS-fork for concurrent inserts
+- [x] 2.5 C++ unit test: writes COW only when a snapshot shares the block;
+      in-place when unshared (`cowStorageSnapshotIsolation`,
+      `cowDecisionByGenerationNotUseCount`)
+- [~] 2.6 Read-path cost: the `blockData` raw-pointer return keeps the no-snapshot
+      path a plain deref (gated on `anySnapshotLive()`); informal microbench shows
+      ~flat at realistic dims. A formal `microbenchmark`-label run at production
+      dimensions is still outstanding.
+
+## 3. Snapshot handle + capture under the lock
+
+- [x] 3.1 `HNSWGraphSnapshot` handle (captured `root`, entry point, `maxLevel`,
+      `curElementCount`, captured vector-block pointers + metadata root)
+- [x] 3.2 `captureGraphSnapshot()` captures under the lock; the caller releases it
+- [x] 3.3 A **dedicated** `HNSWSnapshot_BatchIterator` resolves all node access
+      through the handle (the live `hnsw_batch_iterator.h` is left untouched)
+- [x] 3.4 C++ unit tests: capture + iteration touch only the snapshot
+      (`snapshotHandleCapture`, `snapshotIterator_*`)
+- [x] 3.5 **Give the snapshot a monotonic generation id** (+ its captured
+      `curElementCount`). Handed out from one counter at capture (never reused);
+      the `SnapshotRegistry` tracks the live set. Clone threshold = `newestLive()`
+      = max(live ids); reclaim horizon = `maxVisibleCount()` = max captured
+      `curElementCount` (NOT `min(live ids)` — see design.md "Snapshot identity").
+
+## 4. Lock-free iteration & reclamation
+
+> **Status: IMPLEMENTED (vecsim layer), across branches `snapshot/06`–`10`.**
+> Phases 1–3 landed the storage mechanism (deep-copy primitive, rooted COW block
+> storage, O(1) snapshot handle + `captureGraphSnapshot()`). Phase 4 then took the
+> production lock-free iterator the whole way: plain + tiered, opt-in, with
+> concurrent inserts/deletes/repair/swap all proceeding during a live cursor. The
+> concurrent insert+delete+repair+cursor repro
+> (`parallelBatchIteratorSearchSnapshot`) is clean under ASan.
+>
+> **The hard-won finding that shaped it.** An early attempt to make the production
+> iterator capture a snapshot deadlocked under `parallelBatchIteratorSearch`: the
+> tiered insert path mutates the live graph **in place under a *shared* lock**,
+> relying on stable node addresses + per-node `neighborsGuard` mutexes. With a
+> snapshot live, a concurrent insert COWs a block under that shared lock — moving
+> a node (and its mutex) to a new buffer while a holder is mid-critical-section,
+> so `lockNodeLinks(old)`/`unlock(new)` resolve to different mutexes (lock leak →
+> deadlock; torn reads → crash). The resolution had two parts, and notably it is
+> **NOT** "serialize all writers under the exclusive lock":
+> - **Capture runs under the *exclusive* lock** — a brief quiescence point where
+>   no writer is mid-insert — and **forks the backbone** there, so the live
+>   backbone is never re-forked under the shared lock (stays structurally stable
+>   for readers). Concurrent inserts keep running under the *shared* lock.
+> - **Per-block COW publishes via an atomic compare-exchange** (generation-tag
+>   idempotent), so two inserts racing to COW the same block are safe without a
+>   global writer serialization.
+> - **Repair COWs every touched node's block BEFORE taking any per-node lock**
+>   (`mutuallyUpdateForRepairedNode`), so a held lock's buffer can't be forked out
+>   from under it — this is what lets repair run *under* a live snapshot instead of
+>   being deferred.
+> - **Reader reads the immutable snapshot with NO per-node locks** via a dedicated
+>   `HNSWSnapshot_BatchIterator` (the live `hnsw_batch_iterator.h` is untouched).
+> - **SWAP slot reuse gated** by the `maxVisibleCount()` reclaim horizon, so a
+>   freed id a snapshot can still see stays valid (and the vectors container needs
+>   no COW).
+> - **All gated behind the opt-in query param** so the default path is unchanged.
+>
+> Validated under **ASan** (memory safety on the concurrent path). A fully clean
+> **TSan** run is not claimed: pre-existing relaxed flag-byte races (4.4a) remain.
+
+- [~] 4.1 Lock-free snapshot read path. **Done (core capability):**
+      `HNSWIndex::topKFromSnapshot(snap, query, k, params)` runs the full two-phase
+      HNSW KNN (greedy upper-level descent + ef-bounded level-0 best-first) reading
+      **all** graph access through the captured `HNSWGraphSnapshot` with **no
+      per-node locks** and a local visited set — it touches no live graph or mutex.
+      Proven by `HNSWTest.lockFreeSnapshotQuery`: on the same graph it returns
+      results identical to the live `topKQuery`, and run with no lock it stays
+      point-in-time consistent while a writer copy-on-writes the live graph
+      underneath (TSan-clean, 0 races). The snapshot also captures the **vectors**
+      container's per-block base pointers (`HNSWGraphSnapshot::getVectorData`), so
+      the query reads vector data without touching the live (reallocating) vectors
+      container — TSan confirmed this removes the vector-data race.
+      **Third per-id container now closed (Stage A):** `idToMetaData`
+      (deleted-flags + labels) used to be a flat vector that *fully* reallocs on
+      growth (`resizeIndexCommon`); the query read it via `isMarkedDeleted` /
+      `isInProcess` / `getExternalLabel`, so a query during a concurrent **insert**
+      raced there (TSan-confirmed: `isMarkedAs` vs `resizeIndexCommon`). It is now
+      wrapped in `MetaDataStore`, backed by the same rooted/generation-tagged COW
+      as the graph (see consolidation note below): capture is O(1), growth COWs
+      instead of reallocating, and `topKFromSnapshot` reads flags + labels from the
+      **captured** metadata (`snap.metaData`), not the live index. Proven by
+      `HNSWTest.lockFreeSnapshotQueryDuringInserts` — a writer doing real
+      `addVector`s (growing all three per-id containers) cannot perturb a
+      lock-free snapshot read; TSan-clean.
+      **Done:** wired into the production batch iterator via the dedicated
+      `HNSWSnapshot_BatchIterator` (plain, branch 06) and the tiered cursor that
+      captures-then-releases the main lock (branch 07), behind the opt-in
+      `useGraphSnapshotIterator` param.
+
+      **COW consolidation (Stage A).** The rooted-COW + generation-tag mechanism
+      that was specific to the graph backbone is now a single reusable
+      `RootedCowStore<T>` (in `graph_data_blocks.h`): `shared_ptr<T>` root +
+      generation tag + `SnapshotRegistry`, exposing `capture()` (O(1)),
+      `cowForWrite(cloneFn)` (clone iff `newestLive() >= gen`, stamp with
+      `currentGeneration()`), `initRoot`/`reset`/`mustClone`. Both the graph
+      backbone (`GraphDataBlocks`) and the metadata (`MetaDataStore`) are built on
+      it. The vectors container stays on the captured-per-block-pointer scheme
+      (append-only blocks keep stable addresses; gated SWAP keeps ids stable), which
+      `lockFreeSnapshotQueryDuringInserts` confirms is race-free — so it is not
+      migrated to `RootedCowStore<T>`.
+- [x] 4.2 SWAP-reuse gating primitive (`graphSnapshotActive()`), now backed by
+      the live-id registry (not `root.use_count()`, which goes stale after the
+      first COW) and **wired into `executeReadySwapJobs`** (see 4.7a).
+- [x] 4.3 Verify automatic reclamation: dropping the snapshot frees superseded
+      versions (covered by `cowStorageSnapshotIsolation` + `snapshotLockFree-
+      Consistency`)
+- [~] 4.4 Concurrency test under **TSan** (clang; g++ libtsan segfaults in this
+      sandbox). **Run and clean** for the snapshot paths — 0 data races in
+      `snapshotPreservedUnderConcurrentInserts` (real concurrent inserts + a
+      lock-free snapshot read), `snapshotLockFreeConsistency`,
+      `cowDecisionByGenerationNotUseCount`, `snapshotPinsUntraversedBlock`, and
+      `swapDeferredWhileSnapshotHeld`. Two findings, neither in snapshot code:
+      (a) **pre-existing**: `parallelBatchIteratorSearch` reports 26 data races in
+      the existing tiered concurrent insert+search path (e.g. a non-atomic read of
+      `ElementMetaData::flags` IN_PROCESS in `processCandidate` vs the atomic write
+      in `unmarkInProcess`) — all in `hnsw.h`/`hnsw_tiered.h`, none touching
+      `graph_data_blocks.h` / `SnapshotRegistry` / the gen tags; never TSan-checked
+      before. `swapJobBasic` (same delete/repair path, no snapshot) is TSan-clean.
+      (b) the delete/**repair** path was not COW-safe under a live snapshot — now
+      **fixed** (4.5, branch 10: COW-before-lock). The full production path
+      (concurrent insert + delete + repair + lock-free cursors,
+      `parallelBatchIteratorSearchSnapshot`) is now validated **clean under ASan**.
+      Still outstanding: a fully clean **TSan** run — blocked only by the
+      pre-existing (a) relaxed flag-byte races, not by snapshot code.
+- [x] 4.5 Convert all graph-mutation sites to COW-aware (COW-before-reference).
+      **Insert path** via `getElementLevelDataForWrite` /
+      `getGraphDataByInternalIdForWrite` in `mutuallyConnectNewElement`,
+      `revisitNeighborConnections`, `mutuallyRemoveNeighborAtPos` (proven by
+      `snapshotPreservedUnderConcurrentInserts`). **Delete/repair path** completed
+      in branch 10: `mutuallyUpdateForRepairedNode` COWs **every** node in
+      `nodes_to_update` **before** taking any per-node lock, so the held lock's
+      buffer can't be forked out from under it (the mutex-identity hazard the
+      earlier TSan run surfaced as "unlock of an unlocked mutex"). With that,
+      **repair runs under a live snapshot** (no longer deferred);
+      `removeVectorInPlace` captures the locked candidate id so lock/unlock can't
+      target different ids. Proven by `tieredSnapshotRepairRunsHorizonRecycles` +
+      `parallelBatchIteratorSearchSnapshot` (ASan-clean).
+- [x] 4.6 ~~Make COW-ing writes take the exclusive lock when a snapshot is
+      active.~~ **Superseded.** Instead of serializing all writers under the
+      exclusive lock, the implementation keeps concurrent inserts on the *shared*
+      lock and makes per-block COW publish via an **atomic compare-exchange**
+      (generation-tag idempotent); only **capture** takes the exclusive lock (a
+      brief quiescence point that forks the backbone). This preserves write
+      concurrency, which serializing would have killed.
+- [ ] 4.7 **Slot reclamation — the vector+metadata problem (distinct from graph
+      COW).** A slot spans three parallel containers; COW only versions the
+      graph. SWAP overwrites the **vector** (`getDataByInternalId`) and
+      **metadata** (`idToMetaData[id]`) **in place**, so a snapshot-referenced
+      `id` would read a *wrong embedding + wrong deleted-flag*. Fix = **defer the
+      SWAP**, do NOT COW the vector/metadata containers:
+      - [x] 4.7a Wired `graphSnapshotActive()` into `executeReadySwapJobs`: while
+        a snapshot is live the ready swap jobs are deferred (tombstoned) and
+        recycled by a later GC pass once no snapshot references them.
+      - [x] 4.7b Test `HNSWTieredIndexTest.swapDeferredWhileSnapshotHeld`:
+        single-writer delete while holding a snapshot — the deleted id's slot
+        keeps its **vector bytes** and **identity** (label), and shows its **own**
+        deleted-flag rather than a live element wrongly occupying the recycled
+        slot; releasing the snapshot lets the deferred swap proceed.
+      - [x] 4.7c Horizon refinement **shipped** (branch 09), as a *count-based*
+        horizon rather than the originally-sketched min-live-id + per-slot
+        free-generation: recycle a freed id `d` once `d >= maxVisibleCount()` (max
+        captured `curElementCount` over live snapshots) — no live snapshot can read
+        an id at/above its own count. So one long-lived cursor no longer stalls all
+        compaction (high-id churn recycles immediately). Proven by
+        `tieredSnapshotRepairRunsHorizonRecycles`.
+- [x] 4.8 **Deleted-flag consistency decision.** `markDelete` flips the flag in
+      place, so even with SWAP deferral a snapshot can observe an element that was
+      live at capture as *deleted*. Default contract: accept this weak
+      consistency ("no *new* vectors after T, crash-safe"). Strict as-of-capture
+      deleted-flags would require versioning the metadata flag (separate work).
+      Record the chosen contract in `spec-delta.md`.
+- [x] 4.9 **COW trigger = generation tag, not `use_count`.** Stamp the backbone
+      and each block buffer with the generation they were last written in; in the
+      backbone-COW check and `getGraphDataByInternalIdForWrite`, clone iff
+      `max_live_snapshot_id >= gen`, NOT `use_count() > 1`. Rationale:
+      container-level `root.use_count()` goes **stale after the first backbone
+      COW** (reports unshared while a snapshot still holds the old backbone →
+      in-place mutation corrupts the snapshot — the same staleness 4.2 hit for the
+      gate); per-buffer `use_count` is sound but refcount-coupled and
+      false-positive-prone. Keep `shared_ptr` purely for **reclamation**.
+      `max_live_snapshot_id` = tail of the 3.5 live-id set. **Done:** backbone +
+      `GraphBlockBuffer` carry `gen`; `SnapshotRegistry` exposes lock-free
+      `newestLive()` / `currentGeneration()` (atomics on the write hot path, mutex
+      only for the rare min() query); `cowBackbone`/`cowBlock` clone iff
+      `newestLive() >= gen` and stamp the clone with `currentGeneration()`;
+      `shared_ptr` retained purely for reclamation. Tests
+      `HNSWTest.cowDecisionByGenerationNotUseCount` (write block A → backbone
+      clones, then a write to block B still COWs) and
+      `HNSWTest.snapshotPinsUntraversedBlock` (a never-traversed block is pinned by
+      the captured backbone — read-order independence). NOTE: capturing a snapshot
+      now requires a registered generation (`captureGraphSnapshot()`); a bare
+      `captureRoot()` no longer triggers COW, so `cowStorageSnapshotIsolation` was
+      updated to capture via the handle.
+
+- [ ] 4.10 **Snapshot that outlives `VecSimIndex_Free` corrupts the heap.**
+      Discovered via 4.9's tests: if a snapshot handle is still alive when the
+      index is destroyed (its captured backbone/buffers then outlive the index),
+      teardown corrupts the heap (`malloc_consolidate: unaligned fastbin chunk`).
+      Dropping the handle before `VecSimIndex_Free` is clean, which is the normal
+      usage (a batch iterator is always freed before the index), and all snapshot
+      unit tests now release the handle first. But design.md "Edge cases" lists
+      "snapshot outlives the index drop" as a supported scenario, so this is a real
+      teardown/allocator-ordering bug to root-cause separately (likely the index's
+      allocator vs. the buffers' allocator refs during `~HNSWIndex` +
+      `operator delete`). Out of 4.9's scope (the gen-tag itself is correct — the
+      tests pass once the handle is released before the free).
+
+## 5. RediSearch integration
+
+- [ ] 5.1 `src/iterators/hybrid_reader.c` + vecsim wrappers: construct the
+      iterator under the lock, release it for iteration
+- [ ] 5.2 Ensure cursor-paginated vector aggregation holds the snapshot across
+      `FT.CURSOR READ` calls and frees it on cursor close/timeout
+- [ ] 5.3 Python end-to-end: KNN / hybrid / `WITHCURSOR` return consistent
+      results while a concurrent writer adds and deletes
+- [ ] 5.4 Python end-to-end: writer makes observable progress during a long
+      vector read (lock released)
+
+## 6. Configuration, limits & observability
+
+- [ ] 6.1 Add a tiered-HNSW config to enable snapshot iteration
+- [ ] 6.2 Add a snapshot memory / lifetime cap; define on-breach behavior
+      (resolve Open question #2)
+- [ ] 6.3 Expose active-snapshot count and retained snapshot memory via
+      `FT.INFO` and/or `FT.DEBUG`
+- [ ] 6.4 Python end-to-end: cap is enforced (long-lived cursor cannot grow
+      retained memory without bound)
+
+## 7. Docs & spec delta
+
+- [ ] 7.1 Update the delta spec under
+      `specs/vector-snapshot-iteration/spec.md` to match what shipped
+- [ ] 7.2 Document the new config and the strengthened consistency guarantee
+- [ ] 7.3 Confirm the PR's release-notes checkbox reflects the behavior change
diff --git a/src/VecSim/algorithms/hnsw/graph_data.h b/src/VecSim/algorithms/hnsw/graph_data.h
index b38f74986..b8efd9df0 100644
--- a/src/VecSim/algorithms/hnsw/graph_data.h
+++ b/src/VecSim/algorithms/hnsw/graph_data.h
@@ -95,6 +95,19 @@ struct ElementLevelData {
         assert(it != this->incomingUnidirectionalEdges->end());
         *it = id_after;
     }
+    // Deep-copy this level record into `dst`. `recordSize` is the full byte size
+    // of the record including the `links` flexible-array member (`levelDataSize`
+    // for upper levels; the level-0 region size for level 0). The destination
+    // gets an INDEPENDENT incoming-edges vector so a snapshot never shares a
+    // mutable vector with the live index. Used by block-level copy-on-write.
+    void copyInto(ElementLevelData *dst, size_t recordSize,
+                  const std::shared_ptr<VecSimAllocator> &allocator) const {
+        // Copies numLinks + the links FAM in one shot (also shallow-copies the
+        // incoming-edges pointer, which we immediately replace below).
+        memcpy(dst, this, recordSize);
+        dst->incomingUnidirectionalEdges =
+            new (allocator) vecsim_stl::vector<idType>(*this->incomingUnidirectionalEdges);
+    }
 };
 
 struct ElementGraphData {
@@ -136,4 +149,39 @@ struct ElementGraphData {
         return *reinterpret_cast<ElementLevelData *>(reinterpret_cast<char *>(this->others) +
                                                      (level - 1) * levelDataSize);
     }
+
+    // Deep-copy this element's graph data into raw, zeroed memory `dst` of size
+    // `elementGraphDataSize`. The copy gets a freshly-constructed mutex (lock
+    // state is never copied) and independent per-level incoming-edges vectors,
+    // so it shares no mutable state with the source. This is the foundational
+    // operation for block-level copy-on-write: it materializes an immutable
+    // snapshot version of a node that the live index can keep mutating.
+    void copyTo(ElementGraphData *dst, size_t levelDataSize, size_t elementGraphDataSize,
+                const std::shared_ptr<VecSimAllocator> &allocator) const {
+        dst->toplevel = this->toplevel;
+        // Fresh mutex; a snapshot reads immutable data and never locks, and the
+        // source's lock state must never be aliased into the copy.
+        new (&dst->neighborsGuard) std::mutex();
+        // Level 0 is inline; its links FAM occupies the tail of the element, so
+        // its record size is everything past the offset of `level0`.
+        const size_t level0Size =
+            elementGraphDataSize - (sizeof(ElementGraphData) - sizeof(ElementLevelData));
+        this->level0.copyInto(&dst->level0, level0Size, allocator);
+        if (this->toplevel > 0) {
+            dst->others =
+                (ElementLevelData *)allocator->callocate(levelDataSize * this->toplevel);
+            if (dst->others == nullptr) {
+                throw std::runtime_error("VecSim index low memory error");
+            }
+            for (size_t i = 0; i < this->toplevel; i++) {
+                auto *src_ld = reinterpret_cast<const ElementLevelData *>(
+                    reinterpret_cast<const char *>(this->others) + i * levelDataSize);
+                auto *dst_ld = reinterpret_cast<ElementLevelData *>(
+                    reinterpret_cast<char *>(dst->others) + i * levelDataSize);
+                src_ld->copyInto(dst_ld, levelDataSize, allocator);
+            }
+        } else {
+            dst->others = nullptr;
+        }
+    }
 };
diff --git a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
index cf289dffa..e74b0c129 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
@@ -15,6 +15,7 @@ INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_MultiBatchIteratorHeapLogic_Test)
 INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_testIncomingEdgesSet_Test)
 INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_test_hnsw_reclaim_memory_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_markDelete_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_copyToDeepCopy_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_markDelete_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_allMarkedDeletedLevel_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTestParallel)
diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp
index e308a5d6c..e0689223d 100644
--- a/tests/unit/test_hnsw.cpp
+++ b/tests/unit/test_hnsw.cpp
@@ -2248,3 +2248,93 @@ TYPED_TEST(HNSWTest, FitMemoryTest) {
 
     VecSimIndex_Free(index);
 }
+
+// Phase 1 of the HNSW snapshot work: validate the block deep-copy primitive
+// (ElementGraphData::copyTo / ElementLevelData::copyInto) against the real,
+// production node layout. Ported from
+// docs/design/hnsw-snapshot/prototypes/block_deepcopy.cpp, which proved the
+// algorithm on a stand-in struct; this runs it on a genuine multi-level node
+// built by the index (FAM links + allocator-backed `others` + incoming edges).
+// The copy is the foundation of block-level copy-on-write: it must produce a
+// node with identical contents but fully independent storage and a fresh mutex.
+TYPED_TEST(HNSWTest, copyToDeepCopy) {
+    size_t dim = 4;
+    size_t M = 8;
+    HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw = this->CastToHNSW(index);
+
+    // Insert vectors until at least one node is promoted above level 0, so the
+    // copy exercises the inline level 0 *and* the separately-allocated `others`.
+    size_t n = 0;
+    ElementGraphData *src = nullptr;
+    while (src == nullptr) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, n, n);
+        if (hnsw->getGraphDataByInternalId(n)->toplevel > 0) {
+            src = hnsw->getGraphDataByInternalId(n);
+        }
+        n++;
+        ASSERT_LT(n, 100000u) << "no multi-level node was created";
+    }
+
+    const size_t egds = hnsw->elementGraphDataSize;
+    const size_t lds = hnsw->levelDataSize;
+    auto allocator = index->getAllocator();
+
+    // Deep-copy into freshly zeroed memory, exactly as block-COW will.
+    auto dstMem = allocator->allocate_unique(egds);
+    memset(dstMem.get(), 0, egds);
+    auto *dst = reinterpret_cast<ElementGraphData *>(dstMem.get());
+    src->copyTo(dst, lds, egds, allocator);
+
+    // (1) Identical contents but independent storage, at every level.
+    ASSERT_EQ(dst->toplevel, src->toplevel);
+    for (size_t lvl = 0; lvl <= src->toplevel; lvl++) {
+        ElementLevelData &s = src->getElementLevelData(lvl, lds);
+        ElementLevelData &d = dst->getElementLevelData(lvl, lds);
+        ASSERT_EQ(d.getNumLinks(), s.getNumLinks()) << "level " << lvl;
+        for (size_t j = 0; j < s.getNumLinks(); j++) {
+            ASSERT_EQ(d.getLinkAtPos(j), s.getLinkAtPos(j)) << "level " << lvl << " pos " << j;
+        }
+        // Incoming edges: equal content, but a distinct (independent) vector.
+        const auto &se = s.getIncomingEdges();
+        const auto &de = d.getIncomingEdges();
+        ASSERT_EQ(de.size(), se.size()) << "level " << lvl;
+        for (size_t j = 0; j < se.size(); j++) {
+            ASSERT_EQ(de[j], se[j]) << "level " << lvl << " incoming " << j;
+        }
+        ASSERT_NE(d.incomingUnidirectionalEdges, s.incomingUnidirectionalEdges) << "level " << lvl;
+        // Upper-level link arrays live in `others`, so they must be at distinct
+        // addresses; level 0 is inline in each struct and naturally differs.
+        if (lvl > 0) {
+            ASSERT_NE(d.links, s.links) << "level " << lvl;
+        }
+    }
+
+    // (2) Mutating the source after the copy must not touch the snapshot.
+    ElementLevelData &s0 = src->getElementLevelData(0, lds);
+    ElementLevelData &d0 = dst->getElementLevelData(0, lds);
+    ASSERT_GT(s0.getNumLinks(), 0u);
+    idType orig_link = s0.getLinkAtPos(0);
+    size_t dst_incoming_before = d0.getIncomingEdges().size();
+    s0.setLinkAtPos(0, 0xDEADBEEF);
+    s0.newIncomingUnidirectionalEdge(0xCAFE);
+    ASSERT_EQ(d0.getLinkAtPos(0), orig_link);
+    ASSERT_EQ(d0.getIncomingEdges().size(), dst_incoming_before);
+    // Restore the source so the index stays consistent for teardown.
+    s0.setLinkAtPos(0, orig_link);
+    s0.removeIncomingUnidirectionalEdgeIfExists(0xCAFE);
+
+    // (3) Both mutexes are fresh and independently lockable (lock state never
+    // aliased from source to copy).
+    ASSERT_TRUE(src->neighborsGuard.try_lock());
+    src->neighborsGuard.unlock();
+    ASSERT_TRUE(dst->neighborsGuard.try_lock());
+    dst->neighborsGuard.unlock();
+
+    // Free the deep copy's heap allocations (incoming-edges vectors + `others`).
+    // The ElementGraphData block itself is freed by dstMem's unique_ptr.
+    dst->destroy(lds, allocator);
+
+    VecSimIndex_Free(index);
+}

From 0397d665c58ae2f51d8fdcb8f5e473cd9782e855 Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 13:29:45 +0300
Subject: [PATCH 02/10] HNSW snapshot 02: rooted copy-on-write block storage
 (RootedCowStore<T>)

Replace the flat vector<DataBlock> graph storage with a rooted, refcounted
copy-on-write container built on a single reusable RootedCowStore<T>
(shared_ptr<T> root + generation tag + SnapshotRegistry). The same store backs
the per-id metadata via MetaDataStore, so the COW machinery is written once and
shared by both per-id containers (the graph and idToMetaData).

This branch introduces the storage + COW primitives and routes the
read/grow/shrink/serialize paths through them. With no snapshot live every
version is unshared, so cowBackbone/cowBlock are no-ops and all mutations happen
in place: behavior and cost are identical to the previous flat storage. Snapshot
capture and COW-aware writes land in later branches.

Pure refactor: the full existing suite stays green (171 hnsw + 7 allocator,
including the updated memory-accounting test for the new block layout).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../algorithms/hnsw/graph_data_blocks.h       | 393 ++++++++++++++++++
 src/VecSim/algorithms/hnsw/hnsw.h             | 114 ++++-
 .../algorithms/hnsw/hnsw_serializer_impl.h    |  18 +-
 tests/unit/test_allocator.cpp                 |  25 +-
 4 files changed, 522 insertions(+), 28 deletions(-)
 create mode 100644 src/VecSim/algorithms/hnsw/graph_data_blocks.h

diff --git a/src/VecSim/algorithms/hnsw/graph_data_blocks.h b/src/VecSim/algorithms/hnsw/graph_data_blocks.h
new file mode 100644
index 000000000..932a62946
--- /dev/null
+++ b/src/VecSim/algorithms/hnsw/graph_data_blocks.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/memory/vecsim_malloc.h"
+#include "VecSim/memory/vecsim_base.h"
+#include "VecSim/utils/vecsim_stl.h"
+#include "VecSim/algorithms/hnsw/graph_data.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <stdexcept>
+#include <vector>
+
+// Rooted, refcounted copy-on-write storage for the HNSW graph blocks.
+//
+// This replaces the old `vecsim_stl::vector<DataBlock> graphDataBlocks` with the
+// structure the snapshot design settled on:
+//
+//     shared_ptr< vector< Block > >  root          // contiguous header vector
+//        where  Block { shared_ptr<GraphBlockBuffer> data; }
+//
+// The point is to make a point-in-time *snapshot* of the graph cheap to capture
+// (an O(1) `shared_ptr` copy of `root`) and safe to read lock-free while writers
+// keep mutating the live index: writers never touch a buffer a snapshot can see;
+// instead they copy-on-write a new version. Old versions free themselves once no
+// snapshot references them (refcount-driven reclamation), so there is no
+// hand-rolled epoch/retire machinery.
+//
+// When no snapshot is active every refcount is 1, so `cowBackbone`/`cowBlock` are
+// no-ops and all mutations happen in place: behavior and cost are identical to
+// the previous flat storage. Snapshot capture itself is added in a later phase;
+// this phase introduces the storage + COW primitives and routes the read/grow/
+// shrink/serialize paths through them.
+
+// Tracks the live snapshot generation ids for one index. Snapshots get a unique,
+// monotonically increasing id at capture (`acquire`) and are removed at release.
+// Held via shared_ptr by both the index and the snapshot handles, so it safely
+// outlives the index if a snapshot does.
+//
+// It drives the two generation-based decisions (see design.md "Operations"):
+//   - `newestLive()` = max(live ids) = the **clone threshold**: a backbone/block
+//     version stamped `gen` must be preserved (cloned on write) iff
+//     `newestLive() >= gen` — some live snapshot can still see it.
+// `currentGeneration()` = the next id to be handed out = strictly greater than
+// every already-captured id; freshly-written/cloned versions are stamped with it,
+// so writes that post-date all live snapshots stay in place.
+//
+// The hot path (clone decision, read on every COW-aware write) uses only the
+// lock-free atomics; the mutex guards the ordered set on capture/release.
+class SnapshotRegistry {
+public:
+    uint64_t acquire() {
+        uint64_t g = nextGeneration_.fetch_add(1);
+        std::lock_guard<std::mutex> lk(guard_);
+        live_.insert(g);
+        maxLive_.store(*live_.rbegin());
+        return g;
+    }
+    void release(uint64_t generation) {
+        std::lock_guard<std::mutex> lk(guard_);
+        live_.erase(generation);
+        maxLive_.store(live_.empty() ? 0 : *live_.rbegin());
+    }
+    // "is any snapshot live?" — the degenerate gate (used by the SWAP deferral).
+    bool anyLive() const { return maxLive_.load() != 0; }
+    // max(live ids) — the clone threshold; 0 when none live. Lock-free.
+    uint64_t newestLive() const { return maxLive_.load(); }
+    // next id to be handed out (> all captured ids). Lock-free.
+    uint64_t currentGeneration() const { return nextGeneration_.load(); }
+
+private:
+    mutable std::mutex guard_;
+    std::set<uint64_t> live_;            // unique, monotonic ids; begin=min, rbegin=max
+    std::atomic<uint64_t> nextGeneration_{1}; // 0 reserved for "no/invalid generation"
+    std::atomic<uint64_t> maxLive_{0};   // cached max(live_) for the lock-free hot path
+};
+
+// Generation-tagged, copy-on-write holder for a single `shared_ptr<T>` version
+// (the reusable core of the snapshot mechanism). A snapshot captures the current
+// version with capture() — an O(1) shared_ptr bump that pins it. Before a write,
+// cowForWrite() clones the version iff a live snapshot can still see it
+// (newestLive() >= gen), swaps in the clone, and stamps it with the current
+// generation; a version written after all live snapshots is mutated in place.
+// The clone operation is supplied by the caller (trivial copy for the backbone /
+// metadata; a deep copyTo for the per-block graph buffers), so the same gen-tag
+// machinery serves every per-id container. Reclamation stays refcount-driven:
+// the old version frees itself once the last snapshot referencing it drops.
+//
+// `gen` is intentionally NOT use_count: a container-level use_count goes stale
+// after the first clone (the live root is unshared again while a snapshot still
+// holds the old one), and a per-buffer use_count couples correctness to refcount
+// discipline. The generation tag is immune to both.
+template <typename T>
+class RootedCowStore {
+public:
+    void setRegistry(std::shared_ptr<SnapshotRegistry> registry) {
+        registry_ = std::move(registry);
+    }
+    bool hasRoot() const { return static_cast<bool>(root_); }
+    const std::shared_ptr<T> &root() const { return root_; }
+    std::shared_ptr<T> capture() const { return root_; } // O(1) pin for a snapshot
+
+    // Install the initial version, stamped with the current generation.
+    void initRoot(std::shared_ptr<T> root) {
+        root_ = std::move(root);
+        gen_ = currentGeneration();
+    }
+    // Drop the live version (a live snapshot keeps its own copy alive). The next
+    // initRoot re-stamps a fresh generation.
+    void reset() {
+        root_.reset();
+        gen_ = 0;
+    }
+    // Generation to stamp freshly-written/cloned versions with (> every live id).
+    uint64_t currentGeneration() const {
+        return registry_ ? registry_->currentGeneration() : 1;
+    }
+    // True iff a live snapshot can still see the current version (so a write must
+    // preserve it by cloning). Reusable for finer-grained (per-buffer) decisions.
+    bool mustClone(uint64_t gen) const {
+        return (registry_ ? registry_->newestLive() : 0) >= gen;
+    }
+    bool mustCloneRoot() const { return mustClone(gen_); }
+
+    // Copy-on-write: if a live snapshot can still see this version, replace it with
+    // `cloneFn(*root_)` and re-stamp; otherwise leave it (subsequent writes mutate
+    // in place). cloneFn(const T&) -> std::shared_ptr<T>.
+    template <class CloneFn> void cowForWrite(CloneFn &&cloneFn) {
+        if (mustCloneRoot()) {
+            root_ = cloneFn(static_cast<const T &>(*root_));
+            gen_ = currentGeneration();
+        }
+    }
+
+private:
+    std::shared_ptr<SnapshotRegistry> registry_;
+    std::shared_ptr<T> root_;
+    uint64_t gen_ = 0;
+};
+
+// One block's raw bytes: `block_size` ElementGraphData records laid out
+// contiguously, plus the count of live records. Held only behind a shared_ptr;
+// when the last reference drops, the destructor releases each live element's
+// owned resources (incoming-edges vectors + the `others` upper-level allocation)
+// and frees the raw buffer. A superseded (COW'd-away) buffer thus cleans itself
+// up the moment the last snapshot that pinned it is released.
+//
+// `gen` is the generation the buffer was last written in (stamped on creation and
+// on each copy-on-write clone); the clone decision compares it against
+// `SnapshotRegistry::newestLive()` rather than the shared_ptr use_count.
+class GraphBlockBuffer : public VecsimBaseObject {
+public:
+    GraphBlockBuffer(size_t blockSize, size_t elementBytes, size_t levelDataSize, uint64_t gen,
+                     std::shared_ptr<VecSimAllocator> allocator)
+        : VecsimBaseObject(allocator), length(0), element_bytes(elementBytes),
+          level_data_size(levelDataSize), block_size(blockSize), gen_(gen) {
+        data = static_cast<char *>(this->allocator->callocate(block_size * element_bytes));
+        if (data == nullptr) {
+            throw std::runtime_error("VecSim index low memory error");
+        }
+    }
+
+    // Generation this buffer's contents were last written in (set on creation and
+    // on each COW clone). Drives the clone decision via SnapshotRegistry.
+    uint64_t gen() const { return gen_; }
+
+    ~GraphBlockBuffer() noexcept override {
+        if (data == nullptr) {
+            return;
+        }
+        for (size_t i = 0; i < length; i++) {
+            reinterpret_cast<ElementGraphData *>(data + i * element_bytes)
+                ->destroy(level_data_size, allocator);
+        }
+        allocator->free_allocation(data);
+    }
+
+    // Held only via shared_ptr; copying is done explicitly through copyLiveInto.
+    GraphBlockBuffer(const GraphBlockBuffer &) = delete;
+    GraphBlockBuffer &operator=(const GraphBlockBuffer &) = delete;
+
+    char *getElement(size_t offset) const { return data + offset * element_bytes; }
+    size_t getLength() const { return length; }
+
+    // Append a bitwise copy of an already-constructed ElementGraphData record
+    // (its owned pointers are moved into this buffer's slot). Returns the slot.
+    char *addElement(const void *element_record) {
+        assert(length < block_size);
+        char *slot = data + length * element_bytes;
+        memcpy(slot, element_record, element_bytes);
+        length++;
+        return slot;
+    }
+
+    // Drop the last record from the live count and return it (the caller either
+    // moves it elsewhere or has already released its resources). Mirrors the old
+    // DataBlock::removeAndFetchLastElement.
+    char *removeAndFetchLastElement() {
+        assert(length > 0);
+        return data + (--length) * element_bytes;
+    }
+
+    // Deep-copy this buffer's live elements into a freshly-allocated buffer using
+    // the Phase-1 ElementGraphData::copyTo primitive, so the copy shares no
+    // mutable state (fresh mutex, independent incoming-edges, own `others`).
+    void copyLiveInto(GraphBlockBuffer &dst) const {
+        assert(dst.block_size == block_size && dst.element_bytes == element_bytes);
+        for (size_t i = 0; i < length; i++) {
+            reinterpret_cast<const ElementGraphData *>(data + i * element_bytes)
+                ->copyTo(reinterpret_cast<ElementGraphData *>(dst.data + i * element_bytes),
+                         level_data_size, element_bytes, allocator);
+        }
+        dst.length = length;
+    }
+
+private:
+    // `allocator` is provided by VecsimBaseObject.
+    size_t length;
+    size_t element_bytes;
+    size_t level_data_size;
+    size_t block_size;
+    char *data;
+    uint64_t gen_; // generation last written in (clone decision; not the use_count)
+};
+
+// A backbone slot: a refcounted handle to one block's buffer. Trivially/cheaply
+// copyable, which is what keeps backbone copy-on-write (path-copying the header
+// vector) cheap — only refcounts are bumped, buffers are shared.
+struct GraphBlock {
+    std::shared_ptr<GraphBlockBuffer> data;
+};
+
+class GraphDataBlocks {
+public:
+    using Backbone = vecsim_stl::vector<GraphBlock>;
+    using Root = std::shared_ptr<Backbone>;
+
+    explicit GraphDataBlocks(std::shared_ptr<VecSimAllocator> allocator)
+        : allocator(std::move(allocator)), element_bytes(0), level_data_size(0), block_size(0) {
+        // The backbone is created lazily on the first block, so an empty index
+        // allocates nothing here (matching the previous flat storage and the
+        // initial-size estimation).
+    }
+
+    // Set the per-element/per-block geometry and the shared snapshot registry.
+    // Must be called once, before any block is added, after the index has computed
+    // elementGraphDataSize / levelDataSize (which depend on M / M0). The registry
+    // supplies the live-snapshot generation bounds that drive the clone decision.
+    void configure(size_t blockSize, size_t elementBytes, size_t levelDataSize,
+                   std::shared_ptr<SnapshotRegistry> registry) {
+        block_size = blockSize;
+        element_bytes = elementBytes;
+        level_data_size = levelDataSize;
+        backbone_.setRegistry(std::move(registry));
+    }
+
+    // ---- sizing ----
+    size_t size() const { return backbone_.hasRoot() ? backbone_.root()->size() : 0; }
+    size_t capacity() const { return backbone_.hasRoot() ? backbone_.root()->capacity() : 0; }
+    void reserve(size_t n) {
+        ensureRoot();
+        backbone_.root()->reserve(n);
+    }
+    void shrink_to_fit() {
+        if (!backbone_.hasRoot()) {
+            return;
+        }
+        if (backbone_.root()->empty()) {
+            // Release the backbone entirely so an emptied index returns to its
+            // initial footprint (a live snapshot, if any, keeps its own copy
+            // alive via its captured root).
+            backbone_.reset();
+            return;
+        }
+        cowBackbone();
+        backbone_.root()->shrink_to_fit();
+    }
+
+    // ---- read path ----
+    char *getElement(size_t id) const {
+        return (*backbone_.root())[id / block_size].data->getElement(id % block_size);
+    }
+    size_t blockLength(size_t blockIdx) const {
+        return (*backbone_.root())[blockIdx].data->getLength();
+    }
+    char *getElementInBlock(size_t blockIdx, size_t offset) const {
+        return (*backbone_.root())[blockIdx].data->getElement(offset);
+    }
+
+    // ---- write path (copy-on-write aware; must run under the index write lock) ----
+
+    // Resolve an element for mutation: ensure neither the backbone nor the block
+    // holding `id` is shared with a snapshot, then return a writable pointer.
+    char *getElementForWrite(size_t id) {
+        cowBackbone();
+        cowBlock(id / block_size);
+        return (*backbone_.root())[id / block_size].data->getElement(id % block_size);
+    }
+
+    // Append an empty block (graph growth). Path-copies the backbone first if a
+    // snapshot is sharing it.
+    void addBlock() {
+        ensureRoot();
+        cowBackbone();
+        backbone_.root()->push_back(GraphBlock{makeBuffer()});
+    }
+
+    // Append a new element record to the last block. COWs the last block if it is
+    // shared. Returns a writable pointer to the stored record.
+    char *addElement(const void *element_record) {
+        cowBackbone();
+        cowBlock(backbone_.root()->size() - 1);
+        return backbone_.root()->back().data->addElement(element_record);
+    }
+
+    // Remove (decrement) the last record of the last block and return it. COWs
+    // the last block if shared.
+    char *removeAndFetchLastElement() {
+        cowBackbone();
+        cowBlock(backbone_.root()->size() - 1);
+        return backbone_.root()->back().data->removeAndFetchLastElement();
+    }
+
+    // Drop the last (empty) block. Path-copies the backbone first if shared.
+    void popLastBlock() {
+        cowBackbone();
+        backbone_.root()->pop_back();
+    }
+
+    // ---- snapshot support ----
+    // O(1) capture of the current immutable view. Callers take this under the
+    // index read lock; reads through the returned Root are then lock-free.
+    Root captureRoot() const { return backbone_.capture(); }
+
+private:
+    void ensureRoot() {
+        if (!backbone_.hasRoot()) {
+            // Allocate the backbone vector object through the index allocator (so
+            // it is accounted); the shared_ptr control block uses global new and
+            // is intentionally not routed through the VecSimAllocator, keeping the
+            // tracked footprint expressible via sizeof for the memory tests.
+            backbone_.initRoot(Root(new (allocator) Backbone(allocator)));
+        }
+    }
+
+    std::shared_ptr<GraphBlockBuffer> makeBuffer() const {
+        return std::shared_ptr<GraphBlockBuffer>(new (allocator) GraphBlockBuffer(
+            block_size, element_bytes, level_data_size, backbone_.currentGeneration(), allocator));
+    }
+
+    // Backbone COW via the shared generation-tag store: clone the header vector
+    // (copying the GraphBlock handles, sharing the buffers) iff a live snapshot can
+    // still see this version.
+    void cowBackbone() {
+        backbone_.cowForWrite([this](const Backbone &old) {
+            Root fresh(new (allocator) Backbone(allocator));
+            *fresh = old;
+            return fresh;
+        });
+    }
+
+    // Per-block COW (same generation-tag rule, finer granularity): clone the block
+    // buffer (deep-copy via copyTo) iff a live snapshot can still see this version.
+    // A per-buffer use_count would be misleading before the backbone is cloned (a
+    // shared backbone holds a single shared_ptr per block, so use_count == 1 even
+    // though a snapshot sees it) — the generation tag is reliable.
+    void cowBlock(size_t blockIdx) {
+        GraphBlock &blk = (*backbone_.root())[blockIdx];
+        if (backbone_.mustClone(blk.data->gen())) {
+            auto fresh = makeBuffer(); // carries currentGeneration()
+            blk.data->copyLiveInto(*fresh);
+            blk.data = std::move(fresh);
+        }
+    }
+
+    std::shared_ptr<VecSimAllocator> allocator;
+    RootedCowStore<Backbone> backbone_;
+    size_t element_bytes;
+    size_t level_data_size;
+    size_t block_size;
+};
diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h
index 1a3776fa2..2fb9e0fa8 100644
--- a/src/VecSim/algorithms/hnsw/hnsw.h
+++ b/src/VecSim/algorithms/hnsw/hnsw.h
@@ -14,6 +14,7 @@
 #include "VecSim/utils/vecsim_stl.h"
 #include "VecSim/utils/vec_utils.h"
 #include "VecSim/containers/data_block.h"
+#include "VecSim/algorithms/hnsw/graph_data_blocks.h"
 #include "VecSim/containers/raw_data_container_interface.h"
 #include "VecSim/containers/data_blocks_container.h"
 #include "VecSim/containers/vecsim_results_container.h"
@@ -79,6 +80,82 @@ struct ElementMetaData {
 };
 #pragma pack() // restore default packing
 
+// Rooted, copy-on-write store for the per-id metadata (label + flags) — the third
+// per-id container (after the graph and the vectors). It exposes the vector API
+// the index already uses (operator[], at, data, size, capacity, resize,
+// shrink_to_fit), so call sites are unchanged, but a *capacity change* (resize /
+// shrink) copies-on-write when a live snapshot can still see the current version
+// (driven by the shared RootedCowStore / SnapshotRegistry generation tag). A
+// snapshot therefore keeps reading its as-of-capture flags + labels even when a
+// concurrent insert reallocates the live metadata. In-place per-id writes
+// (markAs, setVectorId, swap) target ids the snapshot does not read (newly
+// inserted) or are the accepted weak deleted-flag semantics, so they are not
+// copied. Lazy: no allocation until the first resize (matching the previous flat
+// vector's footprint at capacity 0). Reclamation is refcount-driven.
+class MetaDataStore {
+public:
+    using Vector = vecsim_stl::vector<ElementMetaData>;
+    explicit MetaDataStore(std::shared_ptr<VecSimAllocator> allocator)
+        : allocator_(std::move(allocator)) {}
+
+    void configure(std::shared_ptr<SnapshotRegistry> registry) {
+        store_.setRegistry(std::move(registry));
+    }
+
+    ElementMetaData &operator[](size_t id) { return (*store_.root())[id]; }
+    const ElementMetaData &operator[](size_t id) const {
+        const Vector &v = *store_.root();
+        return v[id];
+    }
+    ElementMetaData &at(size_t id) { return store_.root()->at(id); }
+    const ElementMetaData &at(size_t id) const {
+        const Vector &v = *store_.root();
+        return v.at(id);
+    }
+    ElementMetaData *data() { return store_.hasRoot() ? store_.root()->data() : nullptr; }
+    const ElementMetaData *data() const {
+        return store_.hasRoot() ? store_.root()->data() : nullptr;
+    }
+    size_t size() const { return store_.hasRoot() ? store_.root()->size() : 0; }
+    size_t capacity() const { return store_.hasRoot() ? store_.root()->capacity() : 0; }
+
+    void resize(size_t n) {
+        ensureRoot();
+        cowForResize();
+        store_.root()->resize(n);
+    }
+    void shrink_to_fit() {
+        if (!store_.hasRoot()) {
+            return;
+        }
+        if (store_.root()->empty()) {
+            store_.reset(); // return to the initial (no-heap) footprint
+            return;
+        }
+        cowForResize();
+        store_.root()->shrink_to_fit();
+    }
+
+    // O(1) capture of the current metadata version (pins it for a snapshot).
+    std::shared_ptr<Vector> captureRoot() const { return store_.capture(); }
+
+private:
+    void ensureRoot() {
+        if (!store_.hasRoot()) {
+            store_.initRoot(std::shared_ptr<Vector>(new (allocator_) Vector(allocator_)));
+        }
+    }
+    void cowForResize() {
+        store_.cowForWrite([this](const Vector &old) {
+            auto fresh = std::shared_ptr<Vector>(new (allocator_) Vector(allocator_));
+            *fresh = old; // copy current contents; the resize/shrink mutates the copy
+            return fresh;
+        });
+    }
+    std::shared_ptr<VecSimAllocator> allocator_;
+    RootedCowStore<Vector> store_;
+};
+
 //////////////////////////////////// HNSW index implementation ////////////////////////////////////
 
 template <typename DataType, typename DistType>
@@ -115,8 +192,13 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
     size_t maxLevel; // this is the top level of the entry point's element
 
     // Index data
-    vecsim_stl::vector<DataBlock> graphDataBlocks;
-    vecsim_stl::vector<ElementMetaData> idToMetaData;
+    GraphDataBlocks graphDataBlocks;
+    MetaDataStore idToMetaData;
+
+    // Live-snapshot generation registry (shared so it outlives the index if a
+    // snapshot does). Hands out monotonic ids at capture and tracks the live set;
+    // drives the SWAP-deferral gate (and, later, the clone/reclaim horizons).
+    std::shared_ptr<SnapshotRegistry> snapshotRegistry_ = std::make_shared<SnapshotRegistry>();
 
     // Used for marking the visited nodes in graph scans (the pool supports parallel graph scans).
     // This is mutable since the object changes upon search operations as well (which are const).
@@ -393,8 +475,7 @@ const char *HNSWIndex<DataType, DistType>::getDataByInternalId(idType internal_i
 template <typename DataType, typename DistType>
 ElementGraphData *
 HNSWIndex<DataType, DistType>::getGraphDataByInternalId(idType internal_id) const {
-    return (ElementGraphData *)graphDataBlocks[internal_id / this->blockSize].getElement(
-        internal_id % this->blockSize);
+    return (ElementGraphData *)graphDataBlocks.getElement(internal_id);
 }
 
 template <typename DataType, typename DistType>
@@ -1090,10 +1171,10 @@ void HNSWIndex<DataType, DistType>::replaceEntryPoint() {
         // If there is no neighbors in the current level, check for any vector at
         // this level to be the new entry point.
         idType cur_id = 0;
-        for (DataBlock &graph_data_block : graphDataBlocks) {
-            size_t size = graph_data_block.getLength();
+        for (size_t b = 0; b < graphDataBlocks.size(); b++) {
+            size_t size = graphDataBlocks.blockLength(b);
             for (size_t i = 0; i < size; i++) {
-                auto cur_element = (ElementGraphData *)graph_data_block.getElement(i);
+                auto cur_element = (ElementGraphData *)graphDataBlocks.getElementInBlock(b, i);
                 if (cur_element->toplevel == maxLevel && cur_id != old_entry_point_id &&
                     !isMarkedDeleted(cur_id)) {
                     // Found a non element in the current max level.
@@ -1318,7 +1399,7 @@ void HNSWIndex<DataType, DistType>::growByBlock() {
               maxElements + this->blockSize);
     maxElements += this->blockSize;
 
-    graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize, this->allocator);
+    graphDataBlocks.addBlock();
 
     if (idToMetaData.capacity() == indexSize()) {
         resizeIndexCommon(maxElements);
@@ -1334,7 +1415,7 @@ void HNSWIndex<DataType, DistType>::shrinkByBlock() {
         this->log(VecSimCommonStrings::LOG_VERBOSE_STRING,
                   "Updating HNSW index capacity from %zu to %zu", maxElements,
                   maxElements - this->blockSize);
-        graphDataBlocks.pop_back();
+        graphDataBlocks.popLastBlock();
         assert(graphDataBlocks.size() == indexSize() / this->blockSize);
 
         // assuming idToMetaData reflects the capacity of the heavy reallocation containers.
@@ -1637,13 +1718,17 @@ HNSWIndex<DataType, DistType>::HNSWIndex(const HNSWParams *params,
 
     elementGraphDataSize = sizeof(ElementGraphData) + sizeof(idType) * M0;
     levelDataSize = sizeof(ElementLevelData) + sizeof(idType) * M;
+    graphDataBlocks.configure(this->blockSize, elementGraphDataSize, levelDataSize,
+                              snapshotRegistry_);
+    idToMetaData.configure(snapshotRegistry_);
 }
 
 template <typename DataType, typename DistType>
 HNSWIndex<DataType, DistType>::~HNSWIndex() {
-    for (idType id = 0; id < curElementCount; id++) {
-        getGraphDataByInternalId(id)->destroy(this->levelDataSize, this->allocator);
-    }
+    // Each block's GraphBlockBuffer owns its live elements' resources and frees
+    // them (and the raw buffer) when its last reference drops, which happens as
+    // graphDataBlocks' root is torn down here. No explicit per-element loop is
+    // needed; doing one would double-free the elements the buffer destroys.
 }
 
 /**
@@ -1684,8 +1769,7 @@ void HNSWIndex<DataType, DistType>::removeAndSwap(idType internalId) {
     // Get the last element's metadata and data.
     // If we are deleting the last element, we already destroyed it's metadata.
     auto *last_element_data = getDataByInternalId(curElementCount);
-    DataBlock &last_gd_block = graphDataBlocks.back();
-    auto last_element = (ElementGraphData *)last_gd_block.removeAndFetchLastElement();
+    auto last_element = (ElementGraphData *)graphDataBlocks.removeAndFetchLastElement();
 
     // Swap the last id with the deleted one, and invalidate the last id data.
     if (curElementCount != internalId) {
@@ -1803,7 +1887,7 @@ HNSWAddVectorState HNSWIndex<DataType, DistType>::storeNewElement(labelType labe
 
     // Insert the new element to the data block
     this->vectors->addElement(vector_data, state.newElementId);
-    this->graphDataBlocks.back().addElement(cur_egd);
+    this->graphDataBlocks.addElement(cur_egd);
     // We mark id as in process *before* we set it in the label lookup, so that IN_PROCESS flag is
     // set when checking if label .
     this->idToMetaData[state.newElementId] = ElementMetaData(label);
diff --git a/src/VecSim/algorithms/hnsw/hnsw_serializer_impl.h b/src/VecSim/algorithms/hnsw/hnsw_serializer_impl.h
index 5f9dd8cbf..5f0fd37e5 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_serializer_impl.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_serializer_impl.h
@@ -28,6 +28,12 @@ HNSWIndex<DataType, DistType>::HNSWIndex(std::ifstream &input, const HNSWParams
     // levels value than the loaded index.
     levelGenerator.seed(200);
 
+    // Configure the rooted COW stores with the registry before any sizing.
+    // restoreIndexFields (above) has set elementGraphDataSize / levelDataSize.
+    graphDataBlocks.configure(this->blockSize, this->elementGraphDataSize, this->levelDataSize,
+                              snapshotRegistry_);
+    idToMetaData.configure(snapshotRegistry_);
+
     // Set the initial capacity based on the number of elements in the loaded index.
     maxElements = RoundUpInitialCapacity(this->curElementCount, this->blockSize);
     this->idToMetaData.resize(maxElements);
@@ -190,8 +196,7 @@ void HNSWIndex<DataType, DistType>::restoreGraph(std::ifstream &input,
     size_t toplevel = 0;
     size_t num_blocks = dynamic_cast<DataBlocksContainer *>(this->vectors)->numBlocks();
     for (size_t i = 0; i < num_blocks; i++) {
-        this->graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize,
-                                           this->allocator);
+        this->graphDataBlocks.addBlock();
         unsigned int block_len = 0;
         readBinaryPOD(input, block_len);
         for (size_t j = 0; j < block_len; j++) {
@@ -209,8 +214,7 @@ void HNSWIndex<DataType, DistType>::restoreGraph(std::ifstream &input,
                 throw e;
             }
             // Add the current element to the current block, and update cur_egt to point to it.
-            this->graphDataBlocks.back().addElement(tmpData.get());
-            cur_egt = (ElementGraphData *)this->graphDataBlocks.back().getElement(j);
+            cur_egt = (ElementGraphData *)this->graphDataBlocks.addElement(tmpData.get());
 
             // Restore the current element's graph data
             for (size_t k = 0; k <= toplevel; k++) {
@@ -286,11 +290,11 @@ void HNSWIndex<DataType, DistType>::saveGraph(std::ofstream &output) const {
 
     // Save graph data blocks
     for (size_t i = 0; i < this->graphDataBlocks.size(); i++) {
-        auto &block = this->graphDataBlocks[i];
-        unsigned int block_len = block.getLength();
+        unsigned int block_len = this->graphDataBlocks.blockLength(i);
         writeBinaryPOD(output, block_len);
         for (size_t j = 0; j < block_len; j++) {
-            ElementGraphData *cur_element = (ElementGraphData *)block.getElement(j);
+            ElementGraphData *cur_element =
+                (ElementGraphData *)this->graphDataBlocks.getElementInBlock(i, j);
             writeBinaryPOD(output, cur_element->toplevel);
 
             // Save all the levels of the current element
diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp
index 6aa4a0d0b..ad3a92d9b 100644
--- a/tests/unit/test_allocator.cpp
+++ b/tests/unit/test_allocator.cpp
@@ -566,8 +566,16 @@ TYPED_TEST(IndexAllocatorTest, test_hnsw_reclaim_memory) {
     // Also account for all the memory allocation caused by the resizing that this vector triggered
     // except for the bucket count of the labels_lookup hash table that is calculated separately.
     // Calculate the expected memory delta for adding a block.
+    // The vectors container still uses a flat vector<DataBlock> (one backbone slot
+    // + one data-buffer allocation per block). The graph now uses the rooted
+    // GraphDataBlocks: a vector<GraphBlock> backbone slot, a separately-allocated
+    // GraphBlockBuffer object, and that buffer's data allocation per block.
     size_t data_containers_block_mem =
-        2 * (sizeof(DataBlock) + vecsimAllocationOverhead) + hnswIndex->getStorageAlignment();
+        (sizeof(DataBlock) + vecsimAllocationOverhead) + // vectors: backbone slot + data alloc overhead
+        sizeof(GraphBlock) +                             // graph: backbone slot growth
+        (sizeof(GraphBlockBuffer) + vecsimAllocationOverhead) + // graph: block buffer object
+        vecsimAllocationOverhead +                       // graph: data buffer alloc overhead
+        hnswIndex->getStorageAlignment();
     size_t size_total_data_per_element =
         hnswIndex->elementGraphDataSize + hnswIndex->getStoredDataSize();
     data_containers_block_mem += size_total_data_per_element * block_size;
@@ -596,9 +604,12 @@ TYPED_TEST(IndexAllocatorTest, test_hnsw_reclaim_memory) {
     // Free the buffer of the last block in both data containers.
     expected_allocation_size -=
         size_total_data_per_element * block_size + 2 * vecsimAllocationOverhead;
+    // The graph block also frees its GraphBlockBuffer object (the vectors block's
+    // DataBlock lives inline in the backbone vector and is covered below).
+    expected_allocation_size -= (sizeof(GraphBlockBuffer) + vecsimAllocationOverhead);
     expected_allocation_size -=
         (graph_data_blocks_capacity - hnswIndex->graphDataBlocks.capacity()) *
-        (sizeof(DataBlock) + vecsimAllocationOverhead);
+        (sizeof(GraphBlock) + vecsimAllocationOverhead);
     expected_allocation_size -= (vectors_blocks_capacity - vectors_blocks->capacity()) *
                                 (sizeof(DataBlock) + vecsimAllocationOverhead);
     ASSERT_EQ(allocator->getAllocationSize(), expected_allocation_size);
@@ -613,11 +624,13 @@ TYPED_TEST(IndexAllocatorTest, test_hnsw_reclaim_memory) {
     // All data structures' memory returns to as it was, with the exceptional of the labels_lookup
     // (STL unordered_map with hash table implementation), that leaves some empty buckets.
     size_t hash_table_memory = hnswIndex->labelLookup.bucket_count() * sizeof(size_t);
-    // Data block vectors do not shrink on resize so extra memory is expected.
+    // Data block vectors do not shrink on resize so extra memory is expected. The
+    // graph backbone additionally retains its heap-allocated vector<GraphBlock>
+    // object (the vectors container's vector lives inline in the index).
     size_t block_vectors_memory =
-        sizeof(DataBlock) * (hnswIndex->graphDataBlocks.capacity() +
-                             dynamic_cast<DataBlocksContainer *>(hnswIndex->vectors)->capacity()) +
-        2 * vecsimAllocationOverhead;
+        sizeof(DataBlock) * dynamic_cast<DataBlocksContainer *>(hnswIndex->vectors)->capacity() +
+        sizeof(GraphBlock) * hnswIndex->graphDataBlocks.capacity() +
+        sizeof(GraphDataBlocks::Backbone) + 3 * vecsimAllocationOverhead;
     // Current memory should be back as it was initially. The label_lookup hash table is an
     // exception, since in some platforms, empty buckets remain even when the capacity is set to
     // zero, while in others the entire capacity reduced to zero (including the header).

From dd12d5e2b95780f4e9ec2c235ec2f12848ee9229 Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 13:33:15 +0300
Subject: [PATCH 03/10] HNSW snapshot 03: immutable snapshot handle + capture
 API

Add HNSWGraphSnapshot (an immutable, point-in-time view: the captured backbone
root + scalar entry/level/count state + captured vector-block base pointers +
captured metadata root) and HNSWIndex::captureGraphSnapshot(), which builds one
under the index read lock in O(1) (a shared_ptr bump, no element allocation) and
registers a live generation id whose token deregisters on the last handle drop.

Test snapshotHandleCapture: capture is allocation-free, records the right
entry-point/level/count, and resolves node access through the captured backbone
to the same topology the live index holds. Isolation under concurrent writes
depends on the copy-on-write write path and is exercised in the next branch.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../algorithms/hnsw/graph_data_blocks.h       | 56 +++++++++++++++++++
 src/VecSim/algorithms/hnsw/hnsw.h             | 51 +++++++++++++++++
 .../algorithms/hnsw/hnsw_base_tests_friends.h |  1 +
 tests/unit/test_hnsw.cpp                      | 49 ++++++++++++++++
 4 files changed, 157 insertions(+)

diff --git a/src/VecSim/algorithms/hnsw/graph_data_blocks.h b/src/VecSim/algorithms/hnsw/graph_data_blocks.h
index 932a62946..8285095cd 100644
--- a/src/VecSim/algorithms/hnsw/graph_data_blocks.h
+++ b/src/VecSim/algorithms/hnsw/graph_data_blocks.h
@@ -391,3 +391,59 @@ class GraphDataBlocks {
     size_t level_data_size;
     size_t block_size;
 };
+
+// An immutable, point-in-time view of the HNSW graph topology. It is captured by
+// copying the backbone root (an O(1) shared_ptr bump that pins every block buffer
+// referenced at capture time) together with the scalar entry-point / level /
+// count state. Because writers copy-on-write rather than mutate shared buffers,
+// the snapshot keeps observing the graph exactly as it was at capture, even as
+// the live index diverges. Reads through it resolve node access without touching
+// the live `graphDataBlocks` member.
+//
+// `generation` is the snapshot's id from the index's SnapshotRegistry; `liveToken`
+// is an opaque handle whose deleter removes that id from the registry when the
+// last copy of this snapshot is destroyed (registration lifetime == snapshot
+// lifetime, independent of the copy-on-write `root` refcount).
+struct HNSWGraphSnapshot {
+    GraphDataBlocks::Root root;
+    idType entrypointNode;
+    size_t maxLevel;
+    size_t curElementCount;
+    size_t blockSize;
+    size_t levelDataSize;
+    size_t elementGraphDataSize;
+    uint64_t generation = 0;
+    std::shared_ptr<void> liveToken;
+
+    // Captured per-block base pointers of the raw vector data (one per block at
+    // capture time), so a lock-free query reads vector data WITHOUT touching the
+    // live vectors container (whose block-header vector reallocs on insert). The
+    // pointers stay valid while the snapshot is live: the underlying data buffers
+    // don't move when the index grows (only the header vector reallocs; the buffer
+    // pointer is preserved), and SWAP/shrink reuse is deferred while a snapshot is
+    // held. Shared so handle copies don't re-capture. (Capture is O(#blocks)
+    // rather than strictly O(1); rooting the vectors container would restore O(1).)
+    std::shared_ptr<std::vector<const char *>> vectorBlocks;
+    size_t vectorElementBytes = 0;
+
+    // Captured per-id metadata version (the flat label+flags vector), type-erased
+    // because this header is below ElementMetaData's definition. It pins the
+    // as-of-capture metadata so a lock-free query reads its flags + labels without
+    // racing a concurrent insert that reallocates the live metadata. The reader
+    // (HNSWIndex::topKFromSnapshot) casts it back to vecsim_stl::vector<ElementMetaData>.
+    std::shared_ptr<void> metaData;
+
+    bool valid() const { return root != nullptr; }
+
+    ElementGraphData *getGraphData(idType id) const {
+        return reinterpret_cast<ElementGraphData *>(
+            (*root)[id / blockSize].data->getElement(id % blockSize));
+    }
+    ElementLevelData &getLevelData(idType id, size_t level) const {
+        return getGraphData(id)->getElementLevelData(level, levelDataSize);
+    }
+    const char *getVectorData(idType id) const {
+        return (*vectorBlocks)[id / blockSize] +
+               static_cast<size_t>(id % blockSize) * vectorElementBytes;
+    }
+};
diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h
index 2fb9e0fa8..c11e9cdce 100644
--- a/src/VecSim/algorithms/hnsw/hnsw.h
+++ b/src/VecSim/algorithms/hnsw/hnsw.h
@@ -346,6 +346,57 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
     ElementGraphData *getGraphDataByInternalId(idType internal_id) const;
     ElementLevelData &getElementLevelData(idType internal_id, size_t level) const;
     ElementLevelData &getElementLevelData(ElementGraphData *element, size_t level) const;
+    // Capture an immutable, point-in-time view of the graph. Must be called under
+    // the index read lock; the returned handle is then usable without holding any
+    // lock, as writers copy-on-write rather than mutate the buffers it pins.
+    //
+    // Consistency contract (what a reader of the returned snapshot sees):
+    //   - STRICT point-in-time for graph TOPOLOGY and MEMBERSHIP: the set of ids
+    //     that existed at capture (ids < curElementCount) and their links/levels
+    //     are frozen — concurrent inserts copy-on-write, so they never perturb the
+    //     captured backbone/buffers.
+    //   - WEAK (best-effort) for per-element METADATA (deleted / in-process flags,
+    //     and a slot's label): metadata structural changes (resize) copy-on-write,
+    //     but per-element fields are written in place on the live metadata, so a
+    //     change that lands after capture (a delete-mark, or a label rewrite when a
+    //     freed slot is recycled) MAY or MAY NOT be observed by the snapshot reader.
+    //     This matches the live index's existing weak flag consistency; a snapshot
+    //     read is not a strict point-in-time view of deletion status. (A later phase
+    //     defers slot recycling for ids a live snapshot can still see, which makes
+    //     labels of visible ids effectively stable; flags remain weak.)
+    HNSWGraphSnapshot captureGraphSnapshot() const {
+        // Hand out a fresh generation id and a registration token whose deleter
+        // removes it from the live set when the last copy of the handle drops.
+        uint64_t generation = snapshotRegistry_->acquire();
+        auto registry = snapshotRegistry_;
+        auto liveToken = std::shared_ptr<void>(
+            static_cast<void *>(nullptr), [registry, generation](void *) {
+                registry->release(generation);
+            });
+        // Capture the per-block base pointers of the raw vector data so a
+        // lock-free query never reads the live (reallocating) vectors container.
+        // Called under the index read lock, so no concurrent writer is reallocating
+        // the block-header vector here.
+        auto *vectorsContainer = static_cast<const DataBlocksContainer *>(this->vectors);
+        auto vectorBlocks = std::make_shared<std::vector<const char *>>();
+        size_t numVectorBlocks = vectorsContainer->numBlocks();
+        vectorBlocks->reserve(numVectorBlocks);
+        for (size_t b = 0; b < numVectorBlocks; b++) {
+            vectorBlocks->push_back(vectorsContainer->getElement(b * this->blockSize));
+        }
+        return HNSWGraphSnapshot{graphDataBlocks.captureRoot(),
+                                 entrypointNode,
+                                 maxLevel,
+                                 curElementCount,
+                                 this->blockSize,
+                                 levelDataSize,
+                                 elementGraphDataSize,
+                                 generation,
+                                 std::move(liveToken),
+                                 std::move(vectorBlocks),
+                                 vectorsContainer->elementByteCount(),
+                                 std::static_pointer_cast<void>(idToMetaData.captureRoot())};
+    }
     idType searchBottomLayerEP(const void *query_data, void *timeoutCtx,
                                VecSimQueryReply_Code *rc) const;
 
diff --git a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
index e74b0c129..9d7d17e64 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
@@ -16,6 +16,7 @@ INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_testIncomingEdgesSet_Test)
 INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_test_hnsw_reclaim_memory_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_markDelete_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_copyToDeepCopy_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotHandleCapture_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_markDelete_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_allMarkedDeletedLevel_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTestParallel)
diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp
index e0689223d..c37ed3886 100644
--- a/tests/unit/test_hnsw.cpp
+++ b/tests/unit/test_hnsw.cpp
@@ -2338,3 +2338,52 @@ TYPED_TEST(HNSWTest, copyToDeepCopy) {
 
     VecSimIndex_Free(index);
 }
+
+// Phase 3 of the HNSW snapshot work: the immutable snapshot handle. Capturing it
+// is O(1) (a shared_ptr bump + scalar state, no element allocation), it records
+// the entry-point / level / count as of capture, and reads through it resolve the
+// captured graph topology without touching the live `graphDataBlocks` member.
+// (Isolation under concurrent writes is a separate guarantee that depends on the
+// copy-on-write write path; it is exercised in the next branch.)
+TYPED_TEST(HNSWTest, snapshotHandleCapture) {
+    size_t dim = 4;
+    size_t M = 8;
+    size_t bs = 64;
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .blockSize = bs, .M = M, .efConstruction = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw = this->CastToHNSW(index);
+    auto allocator = index->getAllocator();
+
+    size_t n = bs * 3;
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+
+    // Capture is O(1): no element memory is allocated, only refcounts/scalars.
+    size_t mem_before = allocator->getAllocationSize();
+    auto snap = hnsw->captureGraphSnapshot();
+    ASSERT_EQ(allocator->getAllocationSize(), mem_before);
+    ASSERT_TRUE(snap.valid());
+    ASSERT_EQ(snap.curElementCount, n);
+    ASSERT_EQ(snap.entrypointNode, hnsw->entrypointNode);
+    ASSERT_EQ(snap.maxLevel, hnsw->maxLevel);
+
+    // The snapshot resolves node access through the captured backbone and returns
+    // the same level-0 topology the live index holds (no writes have occurred).
+    const size_t lds = hnsw->levelDataSize;
+    ElementLevelData &snapLd0 = snap.getLevelData(0, 0);
+    ElementLevelData &liveLd0 = hnsw->getElementLevelData((idType)0, (size_t)0);
+    ASSERT_EQ(snapLd0.getNumLinks(), liveLd0.getNumLinks());
+    ASSERT_GT(snapLd0.getNumLinks(), 0u);
+    for (size_t j = 0; j < snapLd0.getNumLinks(); j++) {
+        ASSERT_EQ(snapLd0.getLinkAtPos(j), liveLd0.getLinkAtPos(j));
+    }
+    // The snapshot's node 0 resolves to the same (still-shared) buffer as the live
+    // index, since nothing has been copied-on-write yet.
+    ASSERT_EQ(snap.getGraphData(0), hnsw->getGraphDataByInternalId(0));
+    (void)lds;
+
+    snap = HNSWGraphSnapshot{};
+    VecSimIndex_Free(index);
+}

From 518b9e88794afeb81ba4dd2e767910a68d6ff4af Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 13:38:04 +0300
Subject: [PATCH 04/10] HNSW snapshot 04: COW-aware insert path (snapshots
 survive concurrent writes)

Add the copy-on-write write accessors getGraphDataByInternalIdForWrite /
getElementLevelDataForWrite, which copy the backbone and the touched block out of
any live snapshot before returning a writable pointer (no-op when no snapshot is
live, decided by the generation tag rather than use_count). Route the graph
mutation sites through them: mutuallyConnectNewElement, SwapLastIdWithDeletedId,
mutuallyRemoveNeighborAtPos. Also expose graphSnapshotActive() (live-id registry
gate) for the SWAP-deferral consumer in the next branch.

Tests: cowStorageSnapshotIsolation (a COW-aware write copies the shared block and
leaves the snapshot unchanged; dropping the snapshot reclaims it; an unshared
write stays in place with no allocation), snapshotHandleCapture (now asserts
isolation after a post-capture grow + write), snapshotLockFreeConsistency,
snapshotPreservedUnderConcurrentInserts (real concurrent inserts can't perturb a
captured snapshot), cowDecisionByGenerationNotUseCount, snapshotPinsUntraversedBlock.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/VecSim/algorithms/hnsw/hnsw.h             |  37 +-
 .../algorithms/hnsw/hnsw_base_tests_friends.h |   5 +
 tests/unit/test_hnsw.cpp                      | 367 +++++++++++++++++-
 3 files changed, 391 insertions(+), 18 deletions(-)

diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h
index c11e9cdce..aa9f155c0 100644
--- a/src/VecSim/algorithms/hnsw/hnsw.h
+++ b/src/VecSim/algorithms/hnsw/hnsw.h
@@ -344,8 +344,20 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
     bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override;
     const char *getDataByInternalId(idType internal_id) const;
     ElementGraphData *getGraphDataByInternalId(idType internal_id) const;
+    // Copy-on-write aware accessor for mutation: ensures the backbone and the
+    // block holding `internal_id` are not shared with a live snapshot before
+    // returning a writable pointer. Must be called under the index write lock.
+    ElementGraphData *getGraphDataByInternalIdForWrite(idType internal_id);
     ElementLevelData &getElementLevelData(idType internal_id, size_t level) const;
     ElementLevelData &getElementLevelData(ElementGraphData *element, size_t level) const;
+    // COW-aware level-data accessor for mutation: copies the block holding
+    // `internal_id` out of any shared snapshot (idempotent per block) and returns
+    // a writable level record. Use at every site that mutates a node's links or
+    // incoming edges, so a live snapshot's buffers are never modified in place.
+    ElementLevelData &getElementLevelDataForWrite(idType internal_id, size_t level) {
+        return getGraphDataByInternalIdForWrite(internal_id)->getElementLevelData(
+            level, this->levelDataSize);
+    }
     // Capture an immutable, point-in-time view of the graph. Must be called under
     // the index read lock; the returned handle is then usable without holding any
     // lock, as writers copy-on-write rather than mutate the buffers it pins.
@@ -397,6 +409,13 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
                                  vectorsContainer->elementByteCount(),
                                  std::static_pointer_cast<void>(idToMetaData.captureRoot())};
     }
+    // True while any captured graph snapshot is still live. SWAP slot reuse must
+    // be deferred while this holds, so a freed id referenced by a snapshot is not
+    // physically recycled. Backed by the live-id registry rather than the root's
+    // use_count: after the first copy-on-write the live root is unshared again
+    // (the snapshot holds the *old* root), so use_count would wrongly report no
+    // snapshot mid-stream — the registry tracks the snapshot's whole lifetime.
+    bool graphSnapshotActive() const { return snapshotRegistry_->anyLive(); }
     idType searchBottomLayerEP(const void *query_data, void *timeoutCtx,
                                VecSimQueryReply_Code *rc) const;
 
@@ -529,6 +548,12 @@ HNSWIndex<DataType, DistType>::getGraphDataByInternalId(idType internal_id) cons
     return (ElementGraphData *)graphDataBlocks.getElement(internal_id);
 }
 
+template <typename DataType, typename DistType>
+ElementGraphData *
+HNSWIndex<DataType, DistType>::getGraphDataByInternalIdForWrite(idType internal_id) {
+    return (ElementGraphData *)graphDataBlocks.getElementForWrite(internal_id);
+}
+
 template <typename DataType, typename DistType>
 size_t HNSWIndex<DataType, DistType>::getRandomLevel(double reverse_size) {
     std::uniform_real_distribution<double> distribution(0.0, 1.0);
@@ -1027,14 +1052,17 @@ idType HNSWIndex<DataType, DistType>::mutuallyConnectNewElement(
     assert(top_candidates_list.size() <= M &&
            "Should be not be more than M candidates returned by the heuristic");
 
-    auto *new_node_level = getGraphDataByInternalId(new_node_id);
+    // COW-aware: copy the new node's block out of any shared snapshot before
+    // mutating its links (no-op when no snapshot is live).
+    auto *new_node_level = getGraphDataByInternalIdForWrite(new_node_id);
     ElementLevelData &new_node_level_data = getElementLevelData(new_node_level, level);
     assert(new_node_level_data.getNumLinks() == 0 &&
            "The newly inserted element should have blank link list");
 
     for (auto &neighbor_data : top_candidates_list) {
         idType selected_neighbor = neighbor_data.second; // neighbor's id
-        auto *neighbor_graph_data = getGraphDataByInternalId(selected_neighbor);
+        // COW-aware: the neighbor's links are about to be updated.
+        auto *neighbor_graph_data = getGraphDataByInternalIdForWrite(selected_neighbor);
         if (new_node_id < selected_neighbor) {
             lockNodeLinks(new_node_level);
             lockNodeLinks(neighbor_graph_data);
@@ -1313,7 +1341,7 @@ void HNSWIndex<DataType, DistType>::SwapLastIdWithDeletedId(idType element_inter
     }
 
     // Move the last element's data to the deleted element's place
-    auto element = getGraphDataByInternalId(element_internal_id);
+    auto element = getGraphDataByInternalIdForWrite(element_internal_id);
     memcpy((void *)element, last_element, this->elementGraphDataSize);
 
     auto data = getDataByInternalId(element_internal_id);
@@ -1671,7 +1699,8 @@ void HNSWIndex<DataType, DistType>::mutuallyRemoveNeighborAtPos(ElementLevelData
                                                                 size_t pos) {
     // Now we know that we are looking at a neighbor that needs to be removed.
     auto removed_node = node_level.getLinkAtPos(pos);
-    ElementLevelData &removed_node_level = getElementLevelData(removed_node, level);
+    // COW-aware: the removed node's incoming edges may be updated below.
+    ElementLevelData &removed_node_level = getElementLevelDataForWrite(removed_node, level);
     // Perform the mutual update:
     // if the removed node id (the node's neighbour to be removed)
     // wasn't pointing to the node (i.e., the edge was uni-directional),
diff --git a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
index 9d7d17e64..b60eab73f 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
@@ -17,6 +17,11 @@ INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_test_hnsw_reclaim_memory_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_markDelete_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_copyToDeepCopy_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotHandleCapture_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_cowStorageSnapshotIsolation_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotLockFreeConsistency_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotPreservedUnderConcurrentInserts_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_cowDecisionByGenerationNotUseCount_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotPinsUntraversedBlock_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_markDelete_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_allMarkedDeletedLevel_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTestParallel)
diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp
index c37ed3886..8d8560668 100644
--- a/tests/unit/test_hnsw.cpp
+++ b/tests/unit/test_hnsw.cpp
@@ -2339,12 +2339,74 @@ TYPED_TEST(HNSWTest, copyToDeepCopy) {
     VecSimIndex_Free(index);
 }
 
+// Phase 2 of the HNSW snapshot work: the rooted copy-on-write block storage.
+// Capturing the backbone (root) yields an immutable view; a subsequent write to
+// a shared block must copy that block so the snapshot keeps seeing the old data,
+// and dropping the snapshot must reclaim the superseded buffer automatically.
+// With no snapshot live, writes mutate in place with no extra allocation.
+TYPED_TEST(HNSWTest, cowStorageSnapshotIsolation) {
+    size_t dim = 4;
+    size_t M = 8;
+    size_t bs = 64; // small blocks so a few hundred vectors span multiple blocks
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .blockSize = bs, .M = M, .efConstruction = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw = this->CastToHNSW(index);
+    auto allocator = index->getAllocator();
+
+    size_t n = bs * 3;
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    ASSERT_GE(hnsw->graphDataBlocks.size(), 3u);
+
+    const size_t lds = hnsw->levelDataSize;
+
+    // Capture a snapshot (registers a live generation — COW is now driven by the
+    // generation tag, so a bare backbone ref alone would not trigger a clone).
+    auto snap = hnsw->captureGraphSnapshot();
+    ASSERT_TRUE(snap.valid());
+
+    // Record node 0's level-0 links as the snapshot sees them.
+    auto *snapNode0 = snap.getGraphData(0);
+    ElementLevelData &snapLd0 = snapNode0->getElementLevelData(0, lds);
+    std::vector<idType> recorded;
+    for (size_t j = 0; j < snapLd0.getNumLinks(); j++) {
+        recorded.push_back(snapLd0.getLinkAtPos(j));
+    }
+    ASSERT_GT(recorded.size(), 0u);
+
+    // A write to node 0 via the COW-aware accessor must copy block 0: the live
+    // node ends up at different storage and the allocator grows by a block.
+    size_t mem_before = allocator->getAllocationSize();
+    ElementGraphData *liveNode0 = hnsw->getGraphDataByInternalIdForWrite(0);
+    ASSERT_NE(liveNode0, snapNode0);
+    ASSERT_GT(allocator->getAllocationSize(), mem_before);
+    size_t mem_after_cow = allocator->getAllocationSize();
+
+    // Mutate the live node; the snapshot must not observe the change.
+    liveNode0->getElementLevelData(0, lds).setLinkAtPos(0, (idType)0xABCDE);
+    ASSERT_EQ(snapLd0.getNumLinks(), recorded.size());
+    for (size_t j = 0; j < recorded.size(); j++) {
+        ASSERT_EQ(snapLd0.getLinkAtPos(j), recorded[j]);
+    }
+
+    // Dropping the snapshot reclaims the superseded buffer automatically.
+    snap = HNSWGraphSnapshot{};
+    ASSERT_LT(allocator->getAllocationSize(), mem_after_cow);
+
+    // With no snapshot live, a write mutates in place: no allocation.
+    size_t mem_steady = allocator->getAllocationSize();
+    (void)hnsw->getGraphDataByInternalIdForWrite(1);
+    ASSERT_EQ(allocator->getAllocationSize(), mem_steady);
+
+    VecSimIndex_Free(index);
+}
+
 // Phase 3 of the HNSW snapshot work: the immutable snapshot handle. Capturing it
 // is O(1) (a shared_ptr bump + scalar state, no element allocation), it records
-// the entry-point / level / count as of capture, and reads through it resolve the
-// captured graph topology without touching the live `graphDataBlocks` member.
-// (Isolation under concurrent writes is a separate guarantee that depends on the
-// copy-on-write write path; it is exercised in the next branch.)
+// the entry-point/level/count as of capture, and reads through it stay stable
+// while the live index inserts and copy-on-writes underneath.
 TYPED_TEST(HNSWTest, snapshotHandleCapture) {
     size_t dim = 4;
     size_t M = 8;
@@ -2369,20 +2431,297 @@ TYPED_TEST(HNSWTest, snapshotHandleCapture) {
     ASSERT_EQ(snap.entrypointNode, hnsw->entrypointNode);
     ASSERT_EQ(snap.maxLevel, hnsw->maxLevel);
 
-    // The snapshot resolves node access through the captured backbone and returns
-    // the same level-0 topology the live index holds (no writes have occurred).
+    // Record the snapshot's view of node 0.
     const size_t lds = hnsw->levelDataSize;
     ElementLevelData &snapLd0 = snap.getLevelData(0, 0);
-    ElementLevelData &liveLd0 = hnsw->getElementLevelData((idType)0, (size_t)0);
-    ASSERT_EQ(snapLd0.getNumLinks(), liveLd0.getNumLinks());
-    ASSERT_GT(snapLd0.getNumLinks(), 0u);
+    std::vector<idType> recorded;
     for (size_t j = 0; j < snapLd0.getNumLinks(); j++) {
-        ASSERT_EQ(snapLd0.getLinkAtPos(j), liveLd0.getLinkAtPos(j));
+        recorded.push_back(snapLd0.getLinkAtPos(j));
     }
-    // The snapshot's node 0 resolves to the same (still-shared) buffer as the live
-    // index, since nothing has been copied-on-write yet.
-    ASSERT_EQ(snap.getGraphData(0), hnsw->getGraphDataByInternalId(0));
-    (void)lds;
+    ASSERT_GT(recorded.size(), 0u);
+
+    // Mutate the live index after capture: grow it (new block) and rewrite a link
+    // on node 0 (forcing a block copy-on-write since the snapshot shares it).
+    for (size_t i = n; i < n + bs; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    ElementGraphData *live0 = hnsw->getGraphDataByInternalIdForWrite(0);
+    live0->getElementLevelData(0, lds).setLinkAtPos(0, (idType)0xBEEF);
+
+    // The snapshot is unaffected: it still sees the original count and links, and
+    // node 0 now resolves to a different (pre-COW) buffer than the live index.
+    ASSERT_EQ(snap.curElementCount, n);
+    ElementLevelData &snapLd0b = snap.getLevelData(0, 0);
+    ASSERT_EQ(snapLd0b.getNumLinks(), recorded.size());
+    for (size_t j = 0; j < recorded.size(); j++) {
+        ASSERT_EQ(snapLd0b.getLinkAtPos(j), recorded[j]);
+    }
+    ASSERT_NE(snap.getGraphData(0), live0);
+
+    snap = HNSWGraphSnapshot{};
+    VecSimIndex_Free(index);
+}
+
+// Phase 4 of the HNSW snapshot work: lock-free snapshot reads are consistent and
+// race-free while a concurrent writer copy-on-writes the live index. This is the
+// foundational guarantee behind "capture under the lock, then release it and
+// iterate lock-free while writers proceed". A reader thread repeatedly hashes the
+// captured snapshot's level-0 topology (which must never change), while a writer
+// thread rewrites links on the live index — each such write COWs the shared block
+// rather than mutating the buffer the snapshot pinned. Dropping the snapshot then
+// reclaims the superseded buffers automatically.
+TYPED_TEST(HNSWTest, snapshotLockFreeConsistency) {
+    size_t dim = 4;
+    size_t M = 8;
+    size_t bs = 64;
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .blockSize = bs, .M = M, .efConstruction = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw = this->CastToHNSW(index);
+    auto allocator = index->getAllocator();
+
+    const size_t n = bs * 5;
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    const size_t lds = hnsw->levelDataSize;
+
+    // Capture the snapshot (as a query would, under the read lock) and fingerprint
+    // its level-0 topology over the whole visible set.
+    auto snap = hnsw->captureGraphSnapshot();
+    ASSERT_TRUE(hnsw->graphSnapshotActive());
+    auto fingerprint = [&](const HNSWGraphSnapshot &s) {
+        uint64_t h = 1469598103934665603ull;
+        for (idType id = 0; id < (idType)s.curElementCount; id++) {
+            ElementLevelData &ld = s.getLevelData(id, 0);
+            h = (h ^ ld.getNumLinks()) * 1099511628211ull;
+            for (size_t j = 0; j < ld.getNumLinks(); j++) {
+                h = (h ^ ld.getLinkAtPos(j)) * 1099511628211ull;
+            }
+        }
+        return h;
+    };
+    const uint64_t baseline = fingerprint(snap);
+
+    // Single concurrent writer: rewrite links on the live index. Each write goes
+    // through the COW-aware accessor, so it copies the block the snapshot shares
+    // instead of mutating it. Only this thread touches the live graphDataBlocks;
+    // the reader below only touches its own captured `snap`.
+    std::atomic<bool> stop{false};
+    std::atomic<uint64_t> writes{0};
+    std::thread writer([&] {
+        std::mt19937 rng(1234);
+        while (!stop.load(std::memory_order_relaxed)) {
+            idType id = rng() % n;
+            ElementGraphData *gd = hnsw->getGraphDataByInternalIdForWrite(id);
+            ElementLevelData &ld = gd->getElementLevelData(0, lds);
+            if (ld.getNumLinks() > 0) {
+                ld.setLinkAtPos(0, (idType)(rng() % n));
+                writes.fetch_add(1, std::memory_order_relaxed);
+            }
+        }
+    });
+
+    // Reader: the snapshot fingerprint must stay invariant through many lock-free
+    // reads while the writer mutates the live index.
+    bool consistent = true;
+    for (int i = 0; i < 3000; i++) {
+        if (fingerprint(snap) != baseline) {
+            consistent = false;
+            break;
+        }
+        std::this_thread::yield();
+    }
+    stop.store(true);
+    writer.join();
+    ASSERT_TRUE(consistent);
+    ASSERT_GT(writes.load(), 0u);
+    ASSERT_EQ(fingerprint(snap), baseline);
+
+    // The live index has diverged (the writer COW'd blocks the snapshot pinned).
+    size_t mem_with_snapshot = allocator->getAllocationSize();
+    // Dropping the whole handle releases both the pinned root and the live-id
+    // registration, so the snapshot is no longer reported active.
+    snap = HNSWGraphSnapshot{};
+    ASSERT_FALSE(hnsw->graphSnapshotActive());
+    // Dropping the snapshot reclaims the superseded buffers automatically.
+    ASSERT_LT(allocator->getAllocationSize(), mem_with_snapshot);
+
+    VecSimIndex_Free(index);
+}
+
+// Phase 4 (continued): a captured snapshot is preserved through *real* HNSW
+// inserts running concurrently on the live index. This exercises the COW-aware
+// write-site conversion: mutuallyConnectNewElement / revisitNeighborConnections /
+// mutuallyRemoveNeighborAtPos now copy a block out of the shared snapshot before
+// rewriting any node's links, so a full insert (which rewires existing neighbors)
+// never mutates a buffer the snapshot holds. A single writer matches the design's
+// "writes are serialized under the exclusive lock" model; the reader is fully
+// lock-free over the immutable snapshot.
+TYPED_TEST(HNSWTest, snapshotPreservedUnderConcurrentInserts) {
+    size_t dim = 4;
+    size_t M = 8;
+    size_t bs = 64;
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .blockSize = bs, .M = M, .efConstruction = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw = this->CastToHNSW(index);
+    auto allocator = index->getAllocator();
+
+    const size_t n0 = bs * 4;
+    for (size_t i = 0; i < n0; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+
+    // Capture the snapshot, then fingerprint its level-0 topology over the whole
+    // visible set as of capture.
+    auto snap = hnsw->captureGraphSnapshot();
+    auto fingerprint = [&](const HNSWGraphSnapshot &s) {
+        uint64_t h = 1469598103934665603ull;
+        for (idType id = 0; id < (idType)s.curElementCount; id++) {
+            ElementLevelData &ld = s.getLevelData(id, 0);
+            h = (h ^ ld.getNumLinks()) * 1099511628211ull;
+            for (size_t j = 0; j < ld.getNumLinks(); j++) {
+                h = (h ^ ld.getLinkAtPos(j)) * 1099511628211ull;
+            }
+        }
+        return h;
+    };
+    const uint64_t baseline = fingerprint(snap);
+
+    // Single writer performing genuine HNSW insertions (which rewire existing
+    // nodes' neighbor lists) on the live index.
+    std::atomic<bool> stop{false};
+    std::atomic<size_t> inserted{0};
+    std::thread writer([&] {
+        for (size_t i = n0; i < n0 + bs * 4; i++) {
+            if (stop.load(std::memory_order_relaxed)) {
+                break;
+            }
+            GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+            inserted.fetch_add(1, std::memory_order_relaxed);
+        }
+    });
+
+    // The snapshot fingerprint must stay invariant through lock-free reads while
+    // real inserts copy-on-write the live graph underneath.
+    bool consistent = true;
+    for (int i = 0; i < 4000; i++) {
+        if (fingerprint(snap) != baseline) {
+            consistent = false;
+            break;
+        }
+        std::this_thread::yield();
+    }
+    stop.store(true);
+    writer.join();
+    ASSERT_TRUE(consistent);
+    ASSERT_GT(inserted.load(), 0u);
+    ASSERT_EQ(fingerprint(snap), baseline);
+    // The live index grew/diverged; the snapshot still sees exactly n0 elements.
+    ASSERT_EQ(snap.curElementCount, n0);
+    ASSERT_GT(hnsw->indexSize(), n0);
+
+    size_t mem_with_snapshot = allocator->getAllocationSize();
+    snap = HNSWGraphSnapshot{};
+    ASSERT_LT(allocator->getAllocationSize(), mem_with_snapshot);
+
+    VecSimIndex_Free(index);
+}
+
+// Task 4.9: the COW clone decision is driven by the GENERATION TAG, not use_count.
+// The case that motivated it: write block A (which clones the backbone once), then
+// write a DIFFERENT block B — B must still be copy-on-written for the snapshot.
+// A container-level use_count check goes stale after the first backbone clone; the
+// generation tag (clone iff max_live_snapshot_id >= block.gen) stays correct.
+TYPED_TEST(HNSWTest, cowDecisionByGenerationNotUseCount) {
+    size_t dim = 4;
+    size_t M = 8;
+    size_t bs = 64;
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .blockSize = bs, .M = M, .efConstruction = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw = this->CastToHNSW(index);
+    const size_t lds = hnsw->levelDataSize;
+
+    for (size_t i = 0; i < bs * 3; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    ASSERT_GE(hnsw->graphDataBlocks.size(), 3u);
+
+    auto snap = hnsw->captureGraphSnapshot();
+
+    const idType idA = 0;      // block 0
+    const idType idB = bs + 1; // a different block (block 1)
+    ASSERT_NE(idA / bs, idB / bs);
+
+    ElementLevelData &snapA = snap.getLevelData(idA, 0);
+    ElementLevelData &snapB = snap.getLevelData(idB, 0);
+    ASSERT_GT(snapA.getNumLinks(), 0u);
+    ASSERT_GT(snapB.getNumLinks(), 0u);
+    const idType snapA0 = snapA.getLinkAtPos(0);
+    const idType snapB0 = snapB.getLinkAtPos(0);
+    const ElementGraphData *snapAptr = snap.getGraphData(idA);
+    const ElementGraphData *snapBptr = snap.getGraphData(idB);
+
+    // Write A: clones the (snapshot-shared) backbone once, plus block 0.
+    ElementGraphData *liveA = hnsw->getGraphDataByInternalIdForWrite(idA);
+    liveA->getElementLevelData(0, lds).setLinkAtPos(0, (idType)0xA11);
+    ASSERT_NE(liveA, snapAptr);
+
+    // Write B in a DIFFERENT block: backbone is already cloned, but the gen tag
+    // must still force a COW of block B for the snapshot.
+    ElementGraphData *liveB = hnsw->getGraphDataByInternalIdForWrite(idB);
+    liveB->getElementLevelData(0, lds).setLinkAtPos(0, (idType)0xB22);
+    ASSERT_NE(liveB, snapBptr) << "block B must be COW'd even after the backbone was cloned";
+
+    // The snapshot is intact for both blocks.
+    ASSERT_EQ(snap.getLevelData(idA, 0).getLinkAtPos(0), snapA0);
+    ASSERT_EQ(snap.getLevelData(idB, 0).getLinkAtPos(0), snapB0);
+
+    snap = HNSWGraphSnapshot{};
+    VecSimIndex_Free(index);
+}
+
+// Task 4.9 / design.md "What pins a block version": a snapshot pins EVERY block
+// via the captured backbone, not via traversal order. A block the snapshot has
+// not yet read is still preserved when the writer COWs it — because capture is
+// eager and whole-backbone, and backbone COW clones the header vector before
+// repointing any slot (so the captured backbone keeps pointing at the old buffer).
+TYPED_TEST(HNSWTest, snapshotPinsUntraversedBlock) {
+    size_t dim = 4;
+    size_t M = 8;
+    size_t bs = 64;
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .blockSize = bs, .M = M, .efConstruction = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw = this->CastToHNSW(index);
+    const size_t lds = hnsw->levelDataSize;
+
+    for (size_t i = 0; i < bs * 3; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    ASSERT_GE(hnsw->graphDataBlocks.size(), 3u);
+
+    auto snap = hnsw->captureGraphSnapshot();
+
+    // A block the snapshot has not read yet. Record its as-of-capture pointer + link.
+    const idType idFar = bs + 1; // block 1
+    ElementLevelData &snapFar = snap.getLevelData(idFar, 0);
+    ASSERT_GT(snapFar.getNumLinks(), 0u);
+    const idType snapFar0 = snapFar.getLinkAtPos(0);
+    const ElementGraphData *snapFarPtr = snap.getGraphData(idFar);
+
+    // Writer COWs that block *before* the snapshot ever traverses to it.
+    ElementGraphData *liveFar = hnsw->getGraphDataByInternalIdForWrite(idFar);
+    liveFar->getElementLevelData(0, lds).setLinkAtPos(0, (idType)0xFA4);
+
+    // Reading it from the snapshot now must return the as-of-capture buffer: the
+    // captured backbone still pins it (not freed, not the live version), regardless
+    // of read order.
+    ASSERT_EQ(snap.getGraphData(idFar), snapFarPtr);
+    ASSERT_NE(snap.getGraphData(idFar), liveFar);
+    ASSERT_EQ(snap.getLevelData(idFar, 0).getLinkAtPos(0), snapFar0);
 
     snap = HNSWGraphSnapshot{};
     VecSimIndex_Free(index);

From b4dbf41605f175c11147b4938c53a46ea6e8731e Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 19:47:25 +0300
Subject: [PATCH 05/10] HNSW snapshot 05: lock-free KNN over a captured
 snapshot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add HNSWIndex::topKFromSnapshot(snap, query, k, params): the full two-phase HNSW
KNN (greedy upper-level descent + ef-bounded level-0 best-first) reading ALL
state through the immutable snapshot — graph topology via the captured backbone,
vector data via the captured per-block pointers, and deleted/in-process flags +
labels via the captured (frozen) metadata root — with no per-node locks and a
local visited set. It touches no live graph, vectors, metadata, or mutex, so it
runs after the index read lock is released and stays point-in-time consistent
while writers proceed.

Tests: lockFreeSnapshotQuery (same results as the live topKQuery on the same
graph; stays consistent while a writer COWs links underneath) and
lockFreeSnapshotQueryDuringInserts (writer does real addVectors, growing all
three per-id containers, yet the lock-free reader's results never change).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/VecSim/algorithms/hnsw/hnsw.h             | 133 ++++++++++++++
 .../algorithms/hnsw/hnsw_base_tests_friends.h |   4 +-
 tests/unit/test_hnsw.cpp                      | 167 ++++++++++++++++++
 3 files changed, 303 insertions(+), 1 deletion(-)

diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h
index aa9f155c0..17998e54d 100644
--- a/src/VecSim/algorithms/hnsw/hnsw.h
+++ b/src/VecSim/algorithms/hnsw/hnsw.h
@@ -423,6 +423,13 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
                      const HNSWAddVectorState &state);
     VecSimQueryReply *topKQuery(const void *query_data, size_t k,
                                 VecSimQueryParams *queryParams) const override;
+    // Lock-free KNN over an immutable graph snapshot. Resolves all graph access
+    // through `snap` (no per-node locks — the snapshot's buffers are immutable),
+    // so it can run after the index read lock has been released and stays
+    // consistent with the index state at capture time while writers proceed. On
+    // the same graph and ef/k as the live path it returns the same results.
+    VecSimQueryReply *topKFromSnapshot(const HNSWGraphSnapshot &snap, const void *query_data,
+                                       size_t k, VecSimQueryParams *queryParams) const;
     VecSimQueryReply *rangeQuery(const void *query_data, double radius,
                                  VecSimQueryParams *queryParams) const override;
 
@@ -2159,6 +2166,132 @@ VecSimQueryReply *HNSWIndex<DataType, DistType>::topKQuery(const void *query_dat
     return rep;
 }
 
+template <typename DataType, typename DistType>
+VecSimQueryReply *
+HNSWIndex<DataType, DistType>::topKFromSnapshot(const HNSWGraphSnapshot &snap,
+                                                const void *query_data, size_t k,
+                                                VecSimQueryParams *queryParams) const {
+    auto rep = new VecSimQueryReply(this->allocator);
+    if (!snap.valid() || snap.curElementCount == 0 || k == 0 ||
+        snap.entrypointNode == INVALID_ID) {
+        return rep;
+    }
+
+    auto processed_query_ptr = this->preprocessQuery(query_data);
+    const void *query = processed_query_ptr.get();
+    void *timeoutCtx = nullptr;
+    size_t ef = this->ef;
+    if (queryParams) {
+        timeoutCtx = queryParams->timeoutCtx;
+        if (queryParams->hnswRuntimeParams.efRuntime != 0) {
+            ef = queryParams->hnswRuntimeParams.efRuntime;
+        }
+    }
+    ef = std::max(ef, k);
+
+    // A link can only reference an id that existed at capture; guard defensively.
+    const idType visibleCount = (idType)snap.curElementCount;
+    auto visible = [&](idType id) { return id < visibleCount; };
+
+    // Read flags + labels from the captured (frozen) metadata, not the live index,
+    // so a concurrent insert that reallocates idToMetaData can't race this scan.
+    const auto *meta = static_cast<const vecsim_stl::vector<ElementMetaData> *>(snap.metaData.get());
+    auto snapDeleted = [&](idType id) { return ((*meta)[id].flags & DELETE_MARK) != 0; };
+    auto snapInProcess = [&](idType id) { return ((*meta)[id].flags & IN_PROCESS) != 0; };
+    auto snapLabel = [&](idType id) { return (*meta)[id].label; };
+
+    // ---- Phase 1: greedy descent through the upper levels (no locks). ----
+    idType cur = snap.entrypointNode;
+    DistType curDist = this->calcDistance(query, snap.getVectorData(cur));
+    for (size_t level = snap.maxLevel; level > 0; level--) {
+        bool changed = true;
+        while (changed) {
+            if (VECSIM_TIMEOUT(timeoutCtx)) {
+                rep->code = VecSim_QueryReply_TimedOut;
+                return rep;
+            }
+            changed = false;
+            ElementLevelData &node_level = snap.getLevelData(cur, level);
+            for (linkListSize j = 0; j < node_level.getNumLinks(); j++) {
+                idType cand = node_level.getLinkAtPos(j);
+                if (!visible(cand) || snapInProcess(cand)) {
+                    continue;
+                }
+                DistType d = this->calcDistance(query, snap.getVectorData(cand));
+                if (d < curDist) {
+                    curDist = d;
+                    cur = cand;
+                    changed = true;
+                }
+            }
+        }
+    }
+
+    // ---- Phase 2: ef-bounded best-first search at level 0 (no locks). ----
+    // Local visited set so the scan touches no shared/live state.
+    vecsim_stl::vector<bool> visited(snap.curElementCount, false, this->allocator);
+    candidatesLabelsMaxHeap<DistType> *top_candidates = getNewMaxPriorityQueue();
+    candidatesMaxHeap<DistType> candidate_set(this->allocator);
+
+    DistType lowerBound;
+    if (!snapDeleted(cur)) {
+        DistType dist = this->calcDistance(query, snap.getVectorData(cur));
+        lowerBound = dist;
+        top_candidates->emplace(dist, snapLabel(cur));
+        candidate_set.emplace(-dist, cur);
+    } else {
+        lowerBound = std::numeric_limits<DistType>::max();
+        candidate_set.emplace(-lowerBound, cur);
+    }
+    visited[cur] = true;
+
+    while (!candidate_set.empty()) {
+        auto curr_pair = candidate_set.top();
+        if ((-curr_pair.first) > lowerBound && top_candidates->size() >= ef) {
+            break;
+        }
+        if (VECSIM_TIMEOUT(timeoutCtx)) {
+            rep->code = VecSim_QueryReply_TimedOut;
+            delete top_candidates;
+            return rep;
+        }
+        candidate_set.pop();
+
+        ElementLevelData &node_level = snap.getLevelData(curr_pair.second, 0);
+        for (linkListSize j = 0; j < node_level.getNumLinks(); j++) {
+            idType cand = node_level.getLinkAtPos(j);
+            if (!visible(cand) || visited[cand] || snapInProcess(cand)) {
+                continue;
+            }
+            visited[cand] = true;
+            DistType d = this->calcDistance(query, snap.getVectorData(cand));
+            if (lowerBound > d || top_candidates->size() < ef) {
+                candidate_set.emplace(-d, cand);
+                if (!snapDeleted(cand)) {
+                    top_candidates->emplace(d, snapLabel(cand));
+                }
+                if (top_candidates->size() > ef) {
+                    top_candidates->pop();
+                }
+                if (!top_candidates->empty()) {
+                    lowerBound = top_candidates->top().first;
+                }
+            }
+        }
+    }
+
+    while (top_candidates->size() > k) {
+        top_candidates->pop();
+    }
+    rep->results.resize(top_candidates->size());
+    for (auto result = rep->results.rbegin(); result != rep->results.rend(); result++) {
+        std::tie(result->score, result->id) = top_candidates->top();
+        top_candidates->pop();
+    }
+    delete top_candidates;
+    return rep;
+}
+
 template <typename DataType, typename DistType>
 VecSimQueryResultContainer HNSWIndex<DataType, DistType>::searchRangeBottomLayer_WithTimeout(
     idType ep_id, const void *data_point, double epsilon, DistType radius, void *timeoutCtx,
diff --git a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
index b60eab73f..e9cab0ec1 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
@@ -16,12 +16,14 @@ INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_testIncomingEdgesSet_Test)
 INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_test_hnsw_reclaim_memory_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_markDelete_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_copyToDeepCopy_Test)
-INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotHandleCapture_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_cowStorageSnapshotIsolation_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotHandleCapture_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotLockFreeConsistency_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotPreservedUnderConcurrentInserts_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_cowDecisionByGenerationNotUseCount_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_snapshotPinsUntraversedBlock_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_lockFreeSnapshotQuery_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_lockFreeSnapshotQueryDuringInserts_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_markDelete_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_allMarkedDeletedLevel_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTestParallel)
diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp
index 8d8560668..cb25ba204 100644
--- a/tests/unit/test_hnsw.cpp
+++ b/tests/unit/test_hnsw.cpp
@@ -2726,3 +2726,170 @@ TYPED_TEST(HNSWTest, snapshotPinsUntraversedBlock) {
     snap = HNSWGraphSnapshot{};
     VecSimIndex_Free(index);
 }
+
+// Task 4.1 (read path): a lock-free KNN over a captured snapshot. (1) On the same
+// graph it returns the same results as the live search; (2) run with no lock, it
+// stays point-in-time consistent while a writer copy-on-writes the live graph
+// underneath. (Concurrent *inserts* would also grow the vectors container, which
+// is not yet snapshot-safe — gap 4-vector-data — so the writer here rewrites
+// existing links via the COW-aware path, which does not grow the index.)
+TYPED_TEST(HNSWTest, lockFreeSnapshotQuery) {
+    size_t dim = 4;
+    size_t M = 16;
+    size_t n = 1000;
+    size_t k = 10;
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .M = M,
+                         .efConstruction = 200,
+                         .efRuntime = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw = this->CastToHNSW(index);
+    const size_t lds = hnsw->levelDataSize;
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+
+    // Tie-free query (0.3 keeps all |i - 0.3| distinct), so the result order is
+    // unambiguous and the two search paths must agree exactly.
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, 0.3);
+
+    auto snap = hnsw->captureGraphSnapshot();
+
+    auto fingerprint = [](VecSimQueryReply *r) {
+        uint64_t h = 1469598103934665603ull;
+        size_t len = VecSimQueryReply_Len(r);
+        for (size_t i = 0; i < len; i++) {
+            h = (h ^ (uint64_t)VecSimQueryResult_GetId(r->results.data() + i)) * 1099511628211ull;
+        }
+        return std::make_pair(h, len);
+    };
+
+    // (1) Correctness: the lock-free snapshot search == the live search on the
+    // same graph.
+    VecSimQueryReply *liveRep = hnsw->topKQuery(query, k, nullptr);
+    VecSimQueryReply *snapRep = hnsw->topKFromSnapshot(snap, query, k, nullptr);
+    ASSERT_EQ(VecSimQueryReply_Len(liveRep), k);
+    ASSERT_EQ(VecSimQueryReply_Len(snapRep), VecSimQueryReply_Len(liveRep));
+    for (size_t i = 0; i < k; i++) {
+        ASSERT_EQ(VecSimQueryResult_GetId(liveRep->results.data() + i),
+                  VecSimQueryResult_GetId(snapRep->results.data() + i))
+            << "rank " << i;
+        ASSERT_EQ(VecSimQueryResult_GetScore(liveRep->results.data() + i),
+                  VecSimQueryResult_GetScore(snapRep->results.data() + i));
+    }
+    auto baseline = fingerprint(snapRep);
+    VecSimQueryReply_Free(liveRep);
+    VecSimQueryReply_Free(snapRep);
+
+    // (2) Lock-free + point-in-time consistent: repeatedly run the snapshot query
+    // with no lock while a writer COWs the live graph; the results must not change.
+    std::atomic<bool> stop{false};
+    std::atomic<size_t> writes{0};
+    std::thread writer([&] {
+        std::mt19937 rng(99);
+        while (!stop.load(std::memory_order_relaxed)) {
+            idType id = rng() % n;
+            ElementGraphData *gd = hnsw->getGraphDataByInternalIdForWrite(id);
+            ElementLevelData &ld = gd->getElementLevelData(0, lds);
+            if (ld.getNumLinks() > 0) {
+                ld.setLinkAtPos(0, (idType)(rng() % n));
+                writes.fetch_add(1, std::memory_order_relaxed);
+            }
+        }
+    });
+
+    bool consistent = true;
+    for (int it = 0; it < 300; it++) {
+        VecSimQueryReply *r = hnsw->topKFromSnapshot(snap, query, k, nullptr);
+        auto f = fingerprint(r);
+        VecSimQueryReply_Free(r);
+        if (f != baseline) {
+            consistent = false;
+            break;
+        }
+        std::this_thread::yield();
+    }
+    stop.store(true);
+    writer.join();
+    ASSERT_TRUE(consistent);
+    ASSERT_GT(writes.load(), 0u);
+
+    snap = HNSWGraphSnapshot{};
+    VecSimIndex_Free(index);
+}
+
+// Like lockFreeSnapshotQuery, but the writer performs *real inserts* (addVector),
+// which grow and reallocate all three per-id containers — the graph backbone, the
+// raw vectors, and idToMetaData (labels + flags). A snapshot pins as-of-capture
+// copies of all three, so the lock-free reader must keep returning identical
+// results and must be race-free (this is the test that must stay TSan-clean).
+TYPED_TEST(HNSWTest, lockFreeSnapshotQueryDuringInserts) {
+    size_t dim = 4;
+    size_t M = 16;
+    size_t n = 1000;   // captured population
+    size_t k = 10;
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .M = M,
+                         .efConstruction = 200,
+                         .efRuntime = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw = this->CastToHNSW(index);
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, 0.3);
+
+    auto snap = hnsw->captureGraphSnapshot();
+
+    auto fingerprint = [](VecSimQueryReply *r) {
+        uint64_t h = 1469598103934665603ull;
+        size_t len = VecSimQueryReply_Len(r);
+        for (size_t i = 0; i < len; i++) {
+            h = (h ^ (uint64_t)VecSimQueryResult_GetId(r->results.data() + i)) * 1099511628211ull;
+        }
+        return std::make_pair(h, len);
+    };
+
+    VecSimQueryReply *snapRep = hnsw->topKFromSnapshot(snap, query, k, nullptr);
+    ASSERT_EQ(VecSimQueryReply_Len(snapRep), k);
+    auto baseline = fingerprint(snapRep);
+    VecSimQueryReply_Free(snapRep);
+
+    // Writer keeps inserting new ids past the captured population, forcing the
+    // live containers to grow/realloc while the snapshot is read lock-free.
+    std::atomic<bool> stop{false};
+    std::atomic<size_t> writes{0};
+    std::thread writer([&] {
+        std::mt19937 rng(99);
+        size_t next = n;
+        while (!stop.load(std::memory_order_relaxed)) {
+            GenerateAndAddVector<TEST_DATA_T>(index, dim, next, (float)(rng() % 4096));
+            next++;
+            writes.fetch_add(1, std::memory_order_relaxed);
+        }
+    });
+
+    bool consistent = true;
+    for (int it = 0; it < 300; it++) {
+        VecSimQueryReply *r = hnsw->topKFromSnapshot(snap, query, k, nullptr);
+        auto f = fingerprint(r);
+        VecSimQueryReply_Free(r);
+        if (f != baseline) {
+            consistent = false;
+            break;
+        }
+        std::this_thread::yield();
+    }
+    stop.store(true);
+    writer.join();
+    ASSERT_TRUE(consistent);
+    ASSERT_GT(writes.load(), 0u);
+
+    snap = HNSWGraphSnapshot{};
+    VecSimIndex_Free(index);
+}

From 64ecbabe9494c29630e6800a42a2d696889f8606 Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 19:47:25 +0300
Subject: [PATCH 06/10] HNSW snapshot 06: snapshot-backed batch iterator (plain
 HNSW, opt-in)

Add HNSWSnapshot_BatchIterator (+ single/multi subclasses): a batch iterator that
captures an immutable graph snapshot once under the read lock in newBatchIterator,
then iterates completely lock-free. It mirrors the resumable two-phase search of
HNSW_BatchIterator (greedy descent to a level-0 entry point, then ef-bounded
best-first expansion carrying candidates/extras/lower_bound across batches), but
reads exactly like topKFromSnapshot: links via snap.getLevelData, vectors via
snap.getVectorData, deleted/in-process flags + labels from the captured frozen
metadata, with a private vector<bool> visited set and NO per-node locks. The live
HNSW_BatchIterator and its hot scanGraphInternal are left untouched (zero perf/risk
to the default path).

Opt-in via HNSWRuntimeParams.useGraphSnapshotIterator (default false, zero-init
preserves the live path). HNSWIndex_Single/Multi::newBatchIterator capture the
snapshot under lockSharedIndexDataGuard() then dispatch to the snapshot iterator.

Tests (plain single + multi): snapshotIterator_EquivalenceWithLive (flag on == off,
ids + scores in order), snapshotIterator_InsertDuringIterationInvisible
(point-in-time: post-capture inserts never appear; growth + COW mid-iteration is
safe), snapshotIterator_Reset.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/VecSim/algorithms/hnsw/hnsw_multi.h       |  12 +
 src/VecSim/algorithms/hnsw/hnsw_single.h      |  12 +
 .../hnsw/hnsw_snapshot_batch_iterator.h       | 393 ++++++++++++++++++
 src/VecSim/vec_sim_common.h                   |   5 +
 tests/unit/test_hnsw.cpp                      | 134 ++++++
 tests/unit/test_hnsw_multi.cpp                |  47 +++
 6 files changed, 603 insertions(+)
 create mode 100644 src/VecSim/algorithms/hnsw/hnsw_snapshot_batch_iterator.h

diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi.h b/src/VecSim/algorithms/hnsw/hnsw_multi.h
index 50ff1a37d..7224cef14 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_multi.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_multi.h
@@ -10,6 +10,7 @@
 
 #include "hnsw.h"
 #include "hnsw_multi_batch_iterator.h"
+#include "hnsw_snapshot_batch_iterator.h"
 #include "VecSim/utils/updatable_heap.h"
 
 template <typename DataType, typename DistType>
@@ -225,6 +226,17 @@ HNSWIndex_Multi<DataType, DistType>::newBatchIterator(const void *queryBlob,
 
     // take ownership of the blob copy and pass it to the batch iterator.
     auto *queryBlobCopyPtr = queryBlobCopy.release();
+
+    // Opt-in lock-free path: capture an immutable graph snapshot under the read
+    // lock, then iterate it without holding any lock (writers proceed meanwhile).
+    if (queryParams && queryParams->hnswRuntimeParams.useGraphSnapshotIterator) {
+        this->lockSharedIndexDataGuard();
+        auto snapshot = this->captureGraphSnapshot();
+        this->unlockSharedIndexDataGuard();
+        return new (this->allocator) HNSWSnapshotMulti_BatchIterator<DataType, DistType>(
+            queryBlobCopyPtr, this, std::move(snapshot), queryParams, this->allocator);
+    }
+
     // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end.
     return new (this->allocator) HNSWMulti_BatchIterator<DataType, DistType>(
         queryBlobCopyPtr, this, queryParams, this->allocator);
diff --git a/src/VecSim/algorithms/hnsw/hnsw_single.h b/src/VecSim/algorithms/hnsw/hnsw_single.h
index 61899a142..f9f6e46d7 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_single.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_single.h
@@ -10,6 +10,7 @@
 
 #include "hnsw.h"
 #include "hnsw_single_batch_iterator.h"
+#include "hnsw_snapshot_batch_iterator.h"
 
 template <typename DataType, typename DistType>
 class HNSWIndex_Single : public HNSWIndex<DataType, DistType> {
@@ -182,6 +183,17 @@ HNSWIndex_Single<DataType, DistType>::newBatchIterator(const void *queryBlob,
 
     // take ownership of the blob copy and pass it to the batch iterator.
     auto *queryBlobCopyPtr = queryBlobCopy.release();
+
+    // Opt-in lock-free path: capture an immutable graph snapshot under the read
+    // lock, then iterate it without holding any lock (writers proceed meanwhile).
+    if (queryParams && queryParams->hnswRuntimeParams.useGraphSnapshotIterator) {
+        this->lockSharedIndexDataGuard();
+        auto snapshot = this->captureGraphSnapshot();
+        this->unlockSharedIndexDataGuard();
+        return new (this->allocator) HNSWSnapshotSingle_BatchIterator<DataType, DistType>(
+            queryBlobCopyPtr, this, std::move(snapshot), queryParams, this->allocator);
+    }
+
     // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end.
     return new (this->allocator) HNSWSingle_BatchIterator<DataType, DistType>(
         queryBlobCopyPtr, this, queryParams, this->allocator);
diff --git a/src/VecSim/algorithms/hnsw/hnsw_snapshot_batch_iterator.h b/src/VecSim/algorithms/hnsw/hnsw_snapshot_batch_iterator.h
new file mode 100644
index 000000000..0c8caf4a3
--- /dev/null
+++ b/src/VecSim/algorithms/hnsw/hnsw_snapshot_batch_iterator.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/batch_iterator.h"
+#include "hnsw.h"
+#include "VecSim/query_result_definitions.h"
+#include "VecSim/utils/vec_utils.h"
+#include <limits>
+
+// Batch iterator that serves results from an immutable, point-in-time graph
+// snapshot (captured once under the read lock by newBatchIterator), then iterates
+// completely lock-free: every graph / vector / metadata access resolves through
+// the captured `HNSWGraphSnapshot`, so concurrent writers proceed (copy-on-write)
+// without perturbing this scan, and the iterator never touches a per-node mutex.
+//
+// It mirrors the resumable two-phase search of HNSW_BatchIterator (greedy descent
+// to a level-0 entry point on the first batch, then ef-bounded best-first
+// expansion that carries `candidates` / `top_candidates_extras` / `lower_bound`
+// across batches), but reads exactly like HNSWIndex::topKFromSnapshot: links via
+// snap.getLevelData, vectors via snap.getVectorData, and deleted/in-process flags
+// + labels from the captured (frozen) metadata. The visited set is private to the
+// iterator (a vector<bool> sized to the captured element count) rather than the
+// shared visited-nodes pool, so it needs no synchronization with the live index.
+//
+// Internal ids stay valid for the iterator's whole life because SWAP slot reuse is
+// deferred while the snapshot's liveToken is held (HNSWIndex::graphSnapshotActive).
+template <typename DataType, typename DistType>
+class HNSWSnapshot_BatchIterator : public VecSimBatchIterator {
+protected:
+    const HNSWIndex<DataType, DistType> *index;
+    HNSWGraphSnapshot snap;
+    // Frozen metadata captured with the snapshot (label + flags as of capture).
+    // Null only for an empty-at-capture index, where no id is ever visible.
+    const vecsim_stl::vector<ElementMetaData> *meta;
+    vecsim_stl::vector<bool> visited; // private visited set, sized snap.curElementCount
+    idType entry_point;               // level-0 entry point (resolved on first batch)
+    bool depleted;
+    bool entry_resolved;
+    size_t ef;
+
+    template <typename Identifier>
+    using candidatesMinHeap = vecsim_stl::min_priority_queue<DistType, Identifier>;
+
+    DistType lower_bound;
+    candidatesMinHeap<labelType> top_candidates_extras;
+    candidatesMinHeap<idType> candidates;
+
+    // A link can only reference an id that existed at capture; guard defensively.
+    inline bool visible(idType id) const { return id < (idType)snap.curElementCount; }
+    inline bool snapDeleted(idType id) const { return ((*meta)[id].flags & DELETE_MARK) != 0; }
+    inline bool snapInProcess(idType id) const { return ((*meta)[id].flags & IN_PROCESS) != 0; }
+    inline labelType snapLabel(idType id) const { return (*meta)[id].label; }
+
+    inline void visitNode(idType id) { visited[id] = true; }
+    inline bool hasVisitedNode(idType id) const { return visited[id]; }
+
+    // Greedy descent through the captured upper levels to the level-0 entry point
+    // (analog of searchBottomLayerEP, reading the snapshot). Returns INVALID_ID if
+    // the snapshot is empty / has no entry point.
+    idType snapshotBottomLayerEP(VecSimQueryReply_Code *rc);
+    VecSimQueryReply_Code scanSnapshotInternal(candidatesLabelsMaxHeap<DistType> *top_candidates);
+    candidatesLabelsMaxHeap<DistType> *scanSnapshot(VecSimQueryReply_Code *rc);
+
+    virtual inline void prepareResults(VecSimQueryReply *rep,
+                                       candidatesLabelsMaxHeap<DistType> *top_candidates,
+                                       size_t n_res) = 0;
+    virtual inline void fillFromExtras(candidatesLabelsMaxHeap<DistType> *top_candidates) = 0;
+    virtual inline void updateHeaps(candidatesLabelsMaxHeap<DistType> *top_candidates, DistType dist,
+                                    idType id) = 0;
+
+public:
+    HNSWSnapshot_BatchIterator(void *query_vector, const HNSWIndex<DataType, DistType> *index,
+                               HNSWGraphSnapshot &&snapshot, VecSimQueryParams *queryParams,
+                               std::shared_ptr<VecSimAllocator> allocator);
+
+    VecSimQueryReply *getNextResults(size_t n_res, VecSimQueryReply_Order order) override;
+    bool isDepleted() override;
+    void reset() override;
+    ~HNSWSnapshot_BatchIterator() override = default;
+};
+
+/******************** Ctor **************/
+
+template <typename DataType, typename DistType>
+HNSWSnapshot_BatchIterator<DataType, DistType>::HNSWSnapshot_BatchIterator(
+    void *query_vector, const HNSWIndex<DataType, DistType> *index, HNSWGraphSnapshot &&snapshot,
+    VecSimQueryParams *queryParams, std::shared_ptr<VecSimAllocator> allocator)
+    : VecSimBatchIterator(query_vector, queryParams ? queryParams->timeoutCtx : nullptr,
+                          std::move(allocator)),
+      index(index), snap(std::move(snapshot)),
+      meta(static_cast<const vecsim_stl::vector<ElementMetaData> *>(snap.metaData.get())),
+      visited(snap.curElementCount, false, this->allocator), entry_point(INVALID_ID),
+      depleted(false), entry_resolved(false), top_candidates_extras(this->allocator),
+      candidates(this->allocator) {
+
+    if (queryParams && queryParams->hnswRuntimeParams.efRuntime > 0) {
+        this->ef = queryParams->hnswRuntimeParams.efRuntime;
+    } else {
+        this->ef = this->index->getEf();
+    }
+}
+
+/******************** Implementation **************/
+
+template <typename DataType, typename DistType>
+idType HNSWSnapshot_BatchIterator<DataType, DistType>::snapshotBottomLayerEP(
+    VecSimQueryReply_Code *rc) {
+    *rc = VecSim_QueryReply_OK;
+    if (!snap.valid() || snap.curElementCount == 0 || snap.entrypointNode == INVALID_ID) {
+        return INVALID_ID;
+    }
+    const void *query = this->getQueryBlob();
+    idType cur = snap.entrypointNode;
+    DistType curDist = this->index->calcDistance(query, snap.getVectorData(cur));
+    for (size_t level = snap.maxLevel; level > 0; level--) {
+        bool changed = true;
+        while (changed) {
+            if (VECSIM_TIMEOUT(this->getTimeoutCtx())) {
+                *rc = VecSim_QueryReply_TimedOut;
+                return cur;
+            }
+            changed = false;
+            ElementLevelData &node_level = snap.getLevelData(cur, level);
+            for (linkListSize j = 0; j < node_level.getNumLinks(); j++) {
+                idType cand = node_level.getLinkAtPos(j);
+                if (!visible(cand) || snapInProcess(cand)) {
+                    continue;
+                }
+                DistType d = this->index->calcDistance(query, snap.getVectorData(cand));
+                if (d < curDist) {
+                    curDist = d;
+                    cur = cand;
+                    changed = true;
+                }
+            }
+        }
+    }
+    return cur;
+}
+
+template <typename DataType, typename DistType>
+VecSimQueryReply_Code HNSWSnapshot_BatchIterator<DataType, DistType>::scanSnapshotInternal(
+    candidatesLabelsMaxHeap<DistType> *top_candidates) {
+    const void *query = this->getQueryBlob();
+    while (!candidates.empty()) {
+        DistType curr_node_dist = candidates.top().first;
+        idType curr_node_id = candidates.top().second;
+
+        // If the closest candidate is further than the furthest kept result and we
+        // already have enough, the search frontier is exhausted for this batch.
+        if (curr_node_dist > this->lower_bound && top_candidates->size() >= this->ef) {
+            break;
+        }
+        if (VECSIM_TIMEOUT(this->getTimeoutCtx())) {
+            return VecSim_QueryReply_TimedOut;
+        }
+        if (!snapDeleted(curr_node_id)) {
+            updateHeaps(top_candidates, curr_node_dist, curr_node_id);
+        }
+
+        candidates.pop();
+        ElementLevelData &node_level = snap.getLevelData(curr_node_id, 0);
+        for (linkListSize j = 0; j < node_level.getNumLinks(); j++) {
+            idType candidate_id = node_level.getLinkAtPos(j);
+            if (!visible(candidate_id) || hasVisitedNode(candidate_id) ||
+                snapInProcess(candidate_id)) {
+                continue;
+            }
+            visitNode(candidate_id);
+            DistType candidate_dist =
+                this->index->calcDistance(query, snap.getVectorData(candidate_id));
+            candidates.emplace(candidate_dist, candidate_id);
+        }
+    }
+    return VecSim_QueryReply_OK;
+}
+
+template <typename DataType, typename DistType>
+candidatesLabelsMaxHeap<DistType> *
+HNSWSnapshot_BatchIterator<DataType, DistType>::scanSnapshot(VecSimQueryReply_Code *rc) {
+    candidatesLabelsMaxHeap<DistType> *top_candidates = this->index->getNewMaxPriorityQueue();
+    if (this->entry_point == INVALID_ID) {
+        this->depleted = true;
+        return top_candidates;
+    }
+
+    // First iteration: seed the candidate frontier with the entry point.
+    if (this->getResultsCount() == 0 && this->top_candidates_extras.empty() &&
+        this->candidates.empty()) {
+        if (!snapDeleted(this->entry_point)) {
+            this->lower_bound = this->index->calcDistance(this->getQueryBlob(),
+                                                          snap.getVectorData(this->entry_point));
+        } else {
+            this->lower_bound = std::numeric_limits<DistType>::max();
+        }
+        this->visitNode(this->entry_point);
+        candidates.emplace(this->lower_bound, this->entry_point);
+    }
+    if (VECSIM_TIMEOUT(this->getTimeoutCtx())) {
+        *rc = VecSim_QueryReply_TimedOut;
+        return top_candidates;
+    }
+
+    // Carry the spare results from the previous batch into this one.
+    fillFromExtras(top_candidates);
+    if (top_candidates->size() == this->ef) {
+        return top_candidates;
+    }
+    *rc = this->scanSnapshotInternal(top_candidates);
+
+    if (top_candidates->size() < this->ef) {
+        this->depleted = true;
+    }
+    return top_candidates;
+}
+
+template <typename DataType, typename DistType>
+VecSimQueryReply *
+HNSWSnapshot_BatchIterator<DataType, DistType>::getNextResults(size_t n_res,
+                                                               VecSimQueryReply_Order order) {
+    auto batch = new VecSimQueryReply(this->allocator);
+    size_t orig_ef = this->ef;
+    if (orig_ef < n_res) {
+        this->ef = n_res;
+    }
+
+    // On the first batch, descend the captured upper levels to the level-0 entry
+    // point (lock-free, reading the snapshot).
+    if (!this->entry_resolved) {
+        this->entry_point = snapshotBottomLayerEP(&batch->code);
+        this->entry_resolved = true;
+        if (VecSim_OK != batch->code) {
+            this->ef = orig_ef;
+            return batch;
+        }
+    }
+
+    auto *top_candidates = this->scanSnapshot(&batch->code);
+    if (VecSim_OK != batch->code) {
+        delete top_candidates;
+        this->ef = orig_ef;
+        return batch;
+    }
+    this->prepareResults(batch, top_candidates, n_res);
+    delete top_candidates;
+
+    this->updateResultsCount(VecSimQueryReply_Len(batch));
+    if (order == BY_ID) {
+        sort_results_by_id(batch);
+    }
+    this->ef = orig_ef;
+    return batch;
+}
+
+template <typename DataType, typename DistType>
+bool HNSWSnapshot_BatchIterator<DataType, DistType>::isDepleted() {
+    return this->depleted && this->top_candidates_extras.empty();
+}
+
+// reset() restarts the scan over the SAME captured snapshot — it does NOT
+// re-capture. The snapshot was taken once at construction and is owned for the
+// iterator's whole life, so a reset cursor yields the identical point-in-time
+// view it had before. (The tiered snapshot cursor differs: it re-captures a fresh
+// snapshot on reset — see TieredHNSW_BatchIterator::reset.)
+template <typename DataType, typename DistType>
+void HNSWSnapshot_BatchIterator<DataType, DistType>::reset() {
+    this->resetResultsCount();
+    this->depleted = false;
+    this->entry_resolved = false;
+    this->entry_point = INVALID_ID;
+    std::fill(this->visited.begin(), this->visited.end(), false);
+    this->lower_bound = std::numeric_limits<DistType>::infinity();
+    this->candidates = candidatesMinHeap<idType>(this->allocator);
+    this->top_candidates_extras = candidatesMinHeap<labelType>(this->allocator);
+}
+
+/******************** Single-value variant **************/
+
+template <typename DataType, typename DistType>
+class HNSWSnapshotSingle_BatchIterator : public HNSWSnapshot_BatchIterator<DataType, DistType> {
+private:
+    inline void prepareResults(VecSimQueryReply *rep,
+                               candidatesLabelsMaxHeap<DistType> *top_candidates,
+                               size_t n_res) override {
+        while (top_candidates->size() > n_res) {
+            this->top_candidates_extras.emplace(top_candidates->top());
+            top_candidates->pop();
+        }
+        rep->results.resize(top_candidates->size());
+        for (auto result = rep->results.rbegin(); result != rep->results.rend(); ++result) {
+            std::tie(result->score, result->id) = top_candidates->top();
+            top_candidates->pop();
+        }
+    }
+    inline void fillFromExtras(candidatesLabelsMaxHeap<DistType> *top_candidates) override {
+        while (top_candidates->size() < this->ef && !this->top_candidates_extras.empty()) {
+            top_candidates->emplace(this->top_candidates_extras.top().first,
+                                    this->top_candidates_extras.top().second);
+            this->top_candidates_extras.pop();
+        }
+    }
+    inline void updateHeaps(candidatesLabelsMaxHeap<DistType> *top_candidates, DistType dist,
+                            idType id) override {
+        if (top_candidates->size() < this->ef) {
+            top_candidates->emplace(dist, this->snapLabel(id));
+            this->lower_bound = top_candidates->top().first;
+        } else if (this->lower_bound > dist) {
+            top_candidates->emplace(dist, this->snapLabel(id));
+            this->top_candidates_extras.emplace(top_candidates->top().first,
+                                                top_candidates->top().second);
+            top_candidates->pop();
+            this->lower_bound = top_candidates->top().first;
+        }
+    }
+
+public:
+    HNSWSnapshotSingle_BatchIterator(void *query_vector,
+                                     const HNSWIndex<DataType, DistType> *index,
+                                     HNSWGraphSnapshot &&snapshot, VecSimQueryParams *queryParams,
+                                     std::shared_ptr<VecSimAllocator> allocator)
+        : HNSWSnapshot_BatchIterator<DataType, DistType>(
+              query_vector, index, std::move(snapshot), queryParams, std::move(allocator)) {}
+    ~HNSWSnapshotSingle_BatchIterator() override = default;
+};
+
+/******************** Multi-value variant **************/
+
+template <typename DataType, typename DistType>
+class HNSWSnapshotMulti_BatchIterator : public HNSWSnapshot_BatchIterator<DataType, DistType> {
+private:
+    vecsim_stl::unordered_set<labelType> returned;
+
+    inline void prepareResults(VecSimQueryReply *rep,
+                               candidatesLabelsMaxHeap<DistType> *top_candidates,
+                               size_t n_res) override {
+        while (top_candidates->size() > n_res) {
+            this->top_candidates_extras.emplace(top_candidates->top().first,
+                                                top_candidates->top().second);
+            top_candidates->pop();
+        }
+        rep->results.resize(top_candidates->size());
+        for (auto result = rep->results.rbegin(); result != rep->results.rend(); ++result) {
+            std::tie(result->score, result->id) = top_candidates->top();
+            this->returned.insert(result->id);
+            top_candidates->pop();
+        }
+    }
+    inline void fillFromExtras(candidatesLabelsMaxHeap<DistType> *top_candidates) override {
+        while (top_candidates->size() < this->ef && !this->top_candidates_extras.empty()) {
+            if (returned.find(this->top_candidates_extras.top().second) == returned.end()) {
+                top_candidates->emplace(this->top_candidates_extras.top().first,
+                                        this->top_candidates_extras.top().second);
+            }
+            this->top_candidates_extras.pop();
+        }
+    }
+    inline void updateHeaps(candidatesLabelsMaxHeap<DistType> *top_candidates, DistType dist,
+                            idType id) override {
+        if (this->lower_bound > dist || top_candidates->size() < this->ef) {
+            labelType label = this->snapLabel(id);
+            if (returned.find(label) == returned.end()) {
+                top_candidates->emplace(dist, label);
+                if (top_candidates->size() > this->ef) {
+                    this->top_candidates_extras.emplace(top_candidates->top().first,
+                                                        top_candidates->top().second);
+                    top_candidates->pop();
+                }
+                this->lower_bound = top_candidates->top().first;
+            }
+        }
+    }
+
+public:
+    HNSWSnapshotMulti_BatchIterator(void *query_vector, const HNSWIndex<DataType, DistType> *index,
+                                    HNSWGraphSnapshot &&snapshot, VecSimQueryParams *queryParams,
+                                    std::shared_ptr<VecSimAllocator> allocator)
+        : HNSWSnapshot_BatchIterator<DataType, DistType>(query_vector, index, std::move(snapshot),
+                                                         queryParams, std::move(allocator)),
+          returned(index->indexSize(), this->allocator) {}
+    ~HNSWSnapshotMulti_BatchIterator() override = default;
+
+    void reset() override {
+        this->returned.clear();
+        HNSWSnapshot_BatchIterator<DataType, DistType>::reset();
+    }
+};
diff --git a/src/VecSim/vec_sim_common.h b/src/VecSim/vec_sim_common.h
index fe10a5a0c..a35b2daf7 100644
--- a/src/VecSim/vec_sim_common.h
+++ b/src/VecSim/vec_sim_common.h
@@ -282,6 +282,11 @@ typedef enum {
 typedef struct {
     size_t efRuntime; // EF parameter for HNSW graph accuracy/latency for search.
     double epsilon;   // Epsilon parameter for HNSW graph accuracy/latency for range search.
+    // Opt-in: serve a batch iterator from an immutable point-in-time graph snapshot
+    // captured under the read lock, then iterate lock-free (writers proceed during
+    // iteration). Default false preserves the live-graph iterator. See the HNSW
+    // snapshot design (docs/design/hnsw-snapshot).
+    bool useGraphSnapshotIterator;
 } HNSWRuntimeParams;
 
 typedef struct {
diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp
index cb25ba204..700f130ce 100644
--- a/tests/unit/test_hnsw.cpp
+++ b/tests/unit/test_hnsw.cpp
@@ -2893,3 +2893,137 @@ TYPED_TEST(HNSWTest, lockFreeSnapshotQueryDuringInserts) {
     snap = HNSWGraphSnapshot{};
     VecSimIndex_Free(index);
 }
+
+// Drain a whole batch-iterator run into a flat (id, score) sequence.
+static std::vector<std::pair<size_t, double>>
+drainBatchIterator(VecSimIndex *index, const void *query, bool useSnapshot, size_t batch) {
+    VecSimQueryParams qp = {};
+    qp.hnswRuntimeParams.useGraphSnapshotIterator = useSnapshot;
+    VecSimBatchIterator *it = VecSimBatchIterator_New(index, query, &qp);
+    std::vector<std::pair<size_t, double>> out;
+    while (VecSimBatchIterator_HasNext(it)) {
+        VecSimQueryReply *r = VecSimBatchIterator_Next(it, batch, BY_SCORE);
+        size_t len = VecSimQueryReply_Len(r);
+        for (size_t i = 0; i < len; i++) {
+            out.emplace_back(VecSimQueryResult_GetId(r->results.data() + i),
+                             VecSimQueryResult_GetScore(r->results.data() + i));
+        }
+        VecSimQueryReply_Free(r);
+    }
+    VecSimBatchIterator_Free(it);
+    return out;
+}
+
+// The opt-in snapshot-backed batch iterator returns exactly the same paginated
+// results (ids + scores, in order) as the default live iterator on a static index.
+TYPED_TEST(HNSWTest, snapshotIterator_EquivalenceWithLive) {
+    size_t dim = 4, M = 16, n = 1000;
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .M = M,
+                         .efConstruction = 200,
+                         .efRuntime = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    // Query past the largest id so all distances are distinct (unambiguous order).
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, (TEST_DATA_T)(n + 5));
+
+    auto live = drainBatchIterator(index, query, /*useSnapshot=*/false, 7);
+    auto snap = drainBatchIterator(index, query, /*useSnapshot=*/true, 7);
+    ASSERT_EQ(live.size(), n);
+    ASSERT_EQ(snap.size(), live.size());
+    for (size_t i = 0; i < live.size(); i++) {
+        ASSERT_EQ(live[i].first, snap[i].first) << "rank " << i;
+        ASSERT_EQ(live[i].second, snap[i].second) << "rank " << i;
+    }
+    VecSimIndex_Free(index);
+}
+
+// A snapshot iterator is point-in-time: vectors inserted after it was created (the
+// capture) are never returned, and iterating while the live index grows + COWs is
+// safe.
+TYPED_TEST(HNSWTest, snapshotIterator_InsertDuringIterationInvisible) {
+    size_t dim = 4, M = 16, n = 1000;
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .M = M,
+                         .efConstruction = 200,
+                         .efRuntime = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, (TEST_DATA_T)(n + 5));
+
+    VecSimQueryParams qp = {};
+    qp.hnswRuntimeParams.useGraphSnapshotIterator = true;
+    VecSimBatchIterator *it = VecSimBatchIterator_New(index, query, &qp);
+
+    size_t returned = 0;
+    bool first = true;
+    while (VecSimBatchIterator_HasNext(it)) {
+        VecSimQueryReply *r = VecSimBatchIterator_Next(it, 25, BY_SCORE);
+        size_t len = VecSimQueryReply_Len(r);
+        for (size_t i = 0; i < len; i++) {
+            // No id captured after T (>= n) may ever appear.
+            ASSERT_LT(VecSimQueryResult_GetId(r->results.data() + i), n);
+        }
+        returned += len;
+        VecSimQueryReply_Free(r);
+        if (first) {
+            // Grow the live index by a full extra population mid-iteration.
+            for (size_t i = n; i < 2 * n; i++) {
+                GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+            }
+            first = false;
+        }
+    }
+    VecSimBatchIterator_Free(it);
+    ASSERT_EQ(returned, n); // exactly the as-of-capture population
+    ASSERT_EQ(VecSimIndex_IndexSize(index), 2 * n);
+    VecSimIndex_Free(index);
+}
+
+// Reset re-iterates the same captured snapshot and yields the same sequence.
+TYPED_TEST(HNSWTest, snapshotIterator_Reset) {
+    size_t dim = 4, M = 16, n = 500;
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .M = M,
+                         .efConstruction = 200,
+                         .efRuntime = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, (TEST_DATA_T)(n + 5));
+
+    VecSimQueryParams qp = {};
+    qp.hnswRuntimeParams.useGraphSnapshotIterator = true;
+    VecSimBatchIterator *it = VecSimBatchIterator_New(index, query, &qp);
+
+    auto collect = [&]() {
+        std::vector<size_t> ids;
+        while (VecSimBatchIterator_HasNext(it)) {
+            VecSimQueryReply *r = VecSimBatchIterator_Next(it, 10, BY_SCORE);
+            size_t len = VecSimQueryReply_Len(r);
+            for (size_t i = 0; i < len; i++) {
+                ids.push_back(VecSimQueryResult_GetId(r->results.data() + i));
+            }
+            VecSimQueryReply_Free(r);
+        }
+        return ids;
+    };
+    auto firstRun = collect();
+    VecSimBatchIterator_Reset(it);
+    auto secondRun = collect();
+    ASSERT_EQ(firstRun.size(), n);
+    ASSERT_EQ(firstRun, secondRun);
+    VecSimBatchIterator_Free(it);
+    VecSimIndex_Free(index);
+}
diff --git a/tests/unit/test_hnsw_multi.cpp b/tests/unit/test_hnsw_multi.cpp
index 9b6b2b974..e66aaf90f 100644
--- a/tests/unit/test_hnsw_multi.cpp
+++ b/tests/unit/test_hnsw_multi.cpp
@@ -1626,3 +1626,50 @@ TYPED_TEST(HNSWMultiTest, markDelete) {
     VecSimBatchIterator_Free(batchIterator);
     VecSimIndex_Free(index);
 }
+
+// The opt-in snapshot-backed batch iterator returns the same paginated (deduped by
+// label) results as the default live iterator on a static multi-value index.
+TYPED_TEST(HNSWMultiTest, snapshotIterator_EquivalenceWithLive) {
+    size_t dim = 4, M = 16, n_labels = 500, perLabel = 5;
+    size_t n = n_labels * perLabel;
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .M = M,
+                         .efConstruction = 200,
+                         .efRuntime = 200};
+    VecSimIndex *index = this->CreateNewIndex(params);
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i / perLabel, i);
+    }
+    ASSERT_EQ(index->indexLabelCount(), n_labels);
+
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, (TEST_DATA_T)(n + 5));
+
+    auto drain = [&](bool useSnapshot) {
+        VecSimQueryParams qp = {};
+        qp.hnswRuntimeParams.useGraphSnapshotIterator = useSnapshot;
+        VecSimBatchIterator *it = VecSimBatchIterator_New(index, query, &qp);
+        std::vector<std::pair<size_t, double>> out;
+        while (VecSimBatchIterator_HasNext(it)) {
+            VecSimQueryReply *r = VecSimBatchIterator_Next(it, 9, BY_SCORE);
+            size_t len = VecSimQueryReply_Len(r);
+            for (size_t i = 0; i < len; i++) {
+                out.emplace_back(VecSimQueryResult_GetId(r->results.data() + i),
+                                 VecSimQueryResult_GetScore(r->results.data() + i));
+            }
+            VecSimQueryReply_Free(r);
+        }
+        VecSimBatchIterator_Free(it);
+        return out;
+    };
+    auto live = drain(false);
+    auto snap = drain(true);
+    ASSERT_EQ(live.size(), n_labels);
+    ASSERT_EQ(snap.size(), live.size());
+    for (size_t i = 0; i < live.size(); i++) {
+        ASSERT_EQ(live[i].first, snap[i].first) << "rank " << i;
+        ASSERT_EQ(live[i].second, snap[i].second) << "rank " << i;
+    }
+    VecSimIndex_Free(index);
+}

From 07992d56141a56a1c2fc86fccd81e2ed46c4a6ba Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 20:23:31 +0300
Subject: [PATCH 07/10] HNSW snapshot 07: tiered cursor captures-then-releases
 the main lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wire the snapshot-backed backend iterator into TieredHNSW_BatchIterator. When the
useGraphSnapshotIterator flag is set, the cursor takes the main index lock only
long enough for the backend newBatchIterator to capture its immutable snapshot,
then releases it immediately and iterates lock-free. The backend iterator's
liveToken keeps the graph snapshot live (graphSnapshotActive) for the cursor's
whole life, so ingestion (flat->HNSW) and GC are no longer blocked by the cursor.
The depletion / destructor / reset paths skip the now-redundant unlock in snapshot
mode (the lock was released right after capture). The flat (frontend) side keeps
its short per-batch flatIndexGuard locking.

Tests: tieredSnapshotIteratorEquivalence (flag on == off on a fully-ingested
index, single + multi) and tieredSnapshotIngestionProgressesWhileCursorOpen — a
single-threaded driver that ingests flat->HNSW while a snapshot cursor is open and
asserts the backend grows (this would self-deadlock under the live iterator, which
holds the shared lock); the cursor stays point-in-time (only the as-of-capture
population is returned).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/VecSim/algorithms/hnsw/hnsw_tiered.h      |  50 ++++++--
 .../hnsw/hnsw_tiered_tests_friends.h          |   1 +
 tests/unit/test_hnsw_tiered.cpp               | 115 ++++++++++++++++++
 3 files changed, 159 insertions(+), 7 deletions(-)

diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
index a4d5e08e4..2197eb970 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
@@ -171,6 +171,13 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
         // because of the approximate nature of the algorithm.
         vecsim_stl::unordered_set<labelType> returned_results_set;
 
+        // When the backend (HNSW) iterator is snapshot-backed, we hold the main
+        // index read lock only long enough to capture the snapshot, then release it
+        // and iterate lock-free — so a long-lived cursor no longer blocks ingestion
+        // or GC/swap. In that mode the depletion/destructor paths must NOT release
+        // mainIndexGuard again (it was already released right after capture).
+        bool snapshot_mode;
+
     private:
         template <bool isMultiValue>
         inline VecSimQueryReply *compute_current_batch(size_t n_res);
@@ -940,7 +947,8 @@ TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::TieredHNSW_BatchI
                           std::move(allocator)),
       index(index), flat_results(this->allocator), hnsw_results(this->allocator),
       flat_iterator(this->index->frontendIndex->newBatchIterator(query_vector, queryParams)),
-      hnsw_iterator(UNINITIALIZED), returned_results_set(this->allocator) {
+      hnsw_iterator(UNINITIALIZED), returned_results_set(this->allocator),
+      snapshot_mode(queryParams && queryParams->hnswRuntimeParams.useGraphSnapshotIterator) {
     // Save a copy of the query params to initialize the HNSW iterator with (on first batch and
     // first batch after reset).
     if (queryParams) {
@@ -958,7 +966,11 @@ TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::~TieredHNSW_Batch
 
     if (this->hnsw_iterator != UNINITIALIZED && this->hnsw_iterator != DEPLETED) {
         delete this->hnsw_iterator;
-        this->index->mainIndexGuard.unlock_shared();
+        // In snapshot mode the lock was released right after capture (see
+        // getNextResults); only the live-iterator mode still holds it here.
+        if (!this->snapshot_mode) {
+            this->index->mainIndexGuard.unlock_shared();
+        }
     }
 
     this->allocator->free_allocation(this->queryParams);
@@ -985,11 +997,19 @@ VecSimQueryReply *TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator:
         }
         this->flat_results.swap(cur_flat_results->results);
         VecSimQueryReply_Free(cur_flat_results);
-        // We also take the lock on the main index on the first call to getNextResults, and we hold
-        // it until the iterator is depleted or freed.
+        // Take the main index read lock to create the backend iterator. In the
+        // default (live) mode we hold it until the iterator is depleted or freed.
+        // In snapshot mode the backend iterator captures an immutable snapshot at
+        // creation, so we release the lock immediately afterwards and iterate
+        // lock-free — the snapshot's liveToken keeps graphSnapshotActive() true for
+        // the cursor's whole life, while ingestion and GC are no longer blocked by
+        // this cursor.
         this->index->mainIndexGuard.lock_shared();
         this->hnsw_iterator = this->index->backendIndex->newBatchIterator(
             this->flat_iterator->getQueryBlob(), queryParams);
+        if (this->snapshot_mode) {
+            this->index->mainIndexGuard.unlock_shared();
+        }
         auto cur_hnsw_results = this->hnsw_iterator->getNextResults(n_res, BY_SCORE_THEN_ID);
         hnsw_code = cur_hnsw_results->code;
         this->hnsw_results.swap(cur_hnsw_results->results);
@@ -997,7 +1017,9 @@ VecSimQueryReply *TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator:
         if (this->hnsw_iterator->isDepleted()) {
             delete this->hnsw_iterator;
             this->hnsw_iterator = DEPLETED;
-            this->index->mainIndexGuard.unlock_shared();
+            if (!this->snapshot_mode) {
+                this->index->mainIndexGuard.unlock_shared();
+            }
         }
     } else {
         while (this->flat_results.size() < n_res && !this->flat_iterator->isDepleted()) {
@@ -1036,7 +1058,9 @@ VecSimQueryReply *TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator:
             if (this->hnsw_iterator->isDepleted()) {
                 delete this->hnsw_iterator;
                 this->hnsw_iterator = DEPLETED;
-                this->index->mainIndexGuard.unlock_shared();
+                if (!this->snapshot_mode) {
+                    this->index->mainIndexGuard.unlock_shared();
+                }
             }
         }
     }
@@ -1072,11 +1096,23 @@ bool TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::isDepleted()
            this->hnsw_results.empty() && this->hnsw_iterator == DEPLETED;
 }
 
+// reset() tears down the backend iterator and rebuilds it on the next
+// getNextResults — the tiered iterator's established convention. In snapshot mode
+// that means a reset cursor RE-CAPTURES a fresh snapshot (a new point-in-time),
+// rather than replaying the original one. This is deliberate and differs from the
+// plain HNSWSnapshot_BatchIterator (which owns one snapshot for life and replays
+// it on reset); the tiered cursor mirrors how its live-mode counterpart re-reads
+// the index on reset. Callers needing a stable point-in-time across reset should
+// not rely on the tiered cursor preserving it.
 template <typename DataType, typename DistType>
 void TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::reset() {
     if (this->hnsw_iterator != UNINITIALIZED && this->hnsw_iterator != DEPLETED) {
         delete this->hnsw_iterator;
-        this->index->mainIndexGuard.unlock_shared();
+        // Snapshot mode already released the lock right after capture (a fresh
+        // snapshot is captured on the next getNextResults); only live mode holds it.
+        if (!this->snapshot_mode) {
+            this->index->mainIndexGuard.unlock_shared();
+        }
     }
     this->resetResultsCount();
     this->flat_iterator->reset();
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
index 21f99f8f5..175c1f771 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
@@ -23,6 +23,7 @@ INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVector_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVectorAndRepairAsync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_alternateInsertDeleteAsync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_tieredSnapshotIngestionProgressesWhileCursorOpen_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic2_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVectorsAndSwapSync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_BatchIterator_Test)
diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp
index b64885b97..eca6b644f 100644
--- a/tests/unit/test_hnsw_tiered.cpp
+++ b/tests/unit/test_hnsw_tiered.cpp
@@ -4530,3 +4530,118 @@ TYPED_TEST(HNSWTieredIndexTestBasic, HNSWResize) {
               hnsw_index->indexMetaDataCapacity() +
                   tiered_index->frontendIndex->indexMetaDataCapacity());
 }
+
+// The tiered batch iterator with the snapshot flag returns the same paginated
+// results as the default live iterator on a stable (fully-ingested) index.
+TYPED_TEST(HNSWTieredIndexTest, tieredSnapshotIteratorEquivalence) {
+    size_t dim = 4, n = 1000;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti(),
+                         .efRuntime = 200};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto mock_thread_pool = tieredIndexMock();
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, mock_thread_pool);
+
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i, i);
+    }
+    // Drain all insert jobs so the population is stable (all in the backend graph).
+    while (mock_thread_pool.jobQ.size() > 0) {
+        mock_thread_pool.thread_iteration();
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(tiered_index), n);
+
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, (TEST_DATA_T)(n + 5));
+    auto drain = [&](bool useSnapshot) {
+        VecSimQueryParams qp = {};
+        qp.hnswRuntimeParams.useGraphSnapshotIterator = useSnapshot;
+        VecSimBatchIterator *it = VecSimBatchIterator_New(tiered_index, query, &qp);
+        std::vector<size_t> ids;
+        while (VecSimBatchIterator_HasNext(it)) {
+            VecSimQueryReply *r = VecSimBatchIterator_Next(it, 11, BY_SCORE);
+            for (size_t i = 0; i < VecSimQueryReply_Len(r); i++) {
+                ids.push_back(VecSimQueryResult_GetId(r->results.data() + i));
+            }
+            VecSimQueryReply_Free(r);
+        }
+        VecSimBatchIterator_Free(it);
+        return ids;
+    };
+    auto live = drain(false);
+    auto snap = drain(true);
+    ASSERT_EQ(live.size(), n);
+    ASSERT_EQ(live, snap);
+}
+
+// The core win: a snapshot-backed cursor does NOT hold the main index read lock,
+// so ingestion (flat -> HNSW) proceeds while the cursor is open. This is driven
+// single-threaded: with the old live iterator the cursor would hold mainIndexGuard
+// shared and this in-thread ingestion (which takes the exclusive lock) would
+// self-deadlock; in snapshot mode the lock is released right after capture, so it
+// completes — and the cursor stays point-in-time (only the as-of-capture population
+// is ever returned).
+TYPED_TEST(HNSWTieredIndexTest, tieredSnapshotIngestionProgressesWhileCursorOpen) {
+    size_t dim = 4, base = 500, extra = 500;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti(),
+                         .efRuntime = 200};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto mock_thread_pool = tieredIndexMock();
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, mock_thread_pool);
+
+    // Base population, fully ingested into the backend graph.
+    for (size_t i = 0; i < base; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i, i);
+    }
+    while (mock_thread_pool.jobQ.size() > 0) {
+        mock_thread_pool.thread_iteration();
+    }
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), base);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+
+    // Open a snapshot cursor and pull the first batch (captures the backend snapshot
+    // and releases mainIndexGuard immediately).
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, (TEST_DATA_T)(base + extra + 5));
+    VecSimQueryParams qp = {};
+    qp.hnswRuntimeParams.useGraphSnapshotIterator = true;
+    VecSimBatchIterator *it = VecSimBatchIterator_New(tiered_index, query, &qp);
+    ASSERT_TRUE(VecSimBatchIterator_HasNext(it));
+    size_t total = 0;
+    {
+        VecSimQueryReply *r = VecSimBatchIterator_Next(it, 10, BY_SCORE);
+        total += VecSimQueryReply_Len(r);
+        VecSimQueryReply_Free(r);
+    }
+    ASSERT_TRUE(tiered_index->getHNSWIndex()->graphSnapshotActive());
+
+    // While the cursor is open, insert MORE vectors and ingest them in-thread. This
+    // would self-deadlock under the live iterator (shared lock held); here it runs.
+    for (size_t i = base; i < base + extra; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i, i);
+    }
+    while (mock_thread_pool.jobQ.size() > 0) {
+        mock_thread_pool.thread_iteration();
+    }
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), base + extra)
+        << "ingestion must progress while a snapshot cursor is open";
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+
+    // The cursor stayed point-in-time: drain it; every returned id is from the
+    // as-of-capture population and the total equals it.
+    while (VecSimBatchIterator_HasNext(it)) {
+        VecSimQueryReply *r = VecSimBatchIterator_Next(it, 50, BY_SCORE);
+        for (size_t i = 0; i < VecSimQueryReply_Len(r); i++) {
+            ASSERT_LT(VecSimQueryResult_GetId(r->results.data() + i), base);
+        }
+        total += VecSimQueryReply_Len(r);
+        VecSimQueryReply_Free(r);
+    }
+    VecSimBatchIterator_Free(it);
+    ASSERT_EQ(total, base);
+}

From e7356002f6d68977ac6657a0e30d460d6002529f Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 20:23:31 +0300
Subject: [PATCH 08/10] HNSW snapshot 08: concurrent-writer safety (lock-free
 block COW + atomic metadata)

Make a live snapshot safe against *concurrent* writers, not just a single
writer holding the lock. Two independent tears are closed:

- Graph blocks: cowBlock now forks the live buffer with an atomic
  compare-exchange (generation-tag idempotent), so two inserts COW'ing the
  same block can't free a buffer a lock-free reader is mid-scan on. blockData
  returns a raw pointer (gated on a snapshot being live) to avoid shared_ptr
  refcount churn on the hot read path. This is the fix for the use-after-free
  the concurrent insert + snapshot repro hit.

- Metadata: MetaDataStore is read through an atomic published pointer, so a
  reader never observes a torn 16-byte shared_ptr while a writer republishes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../algorithms/hnsw/graph_data_blocks.h       | 108 +++++++++++++++---
 src/VecSim/algorithms/hnsw/hnsw.h             |  80 ++++++++++---
 src/VecSim/algorithms/hnsw/hnsw_multi.h       |   7 +-
 src/VecSim/algorithms/hnsw/hnsw_single.h      |   7 +-
 src/VecSim/algorithms/hnsw/hnsw_tiered.h      |  30 +++--
 tests/unit/test_hnsw.cpp                      |   9 +-
 6 files changed, 191 insertions(+), 50 deletions(-)

diff --git a/src/VecSim/algorithms/hnsw/graph_data_blocks.h b/src/VecSim/algorithms/hnsw/graph_data_blocks.h
index 8285095cd..8288808e0 100644
--- a/src/VecSim/algorithms/hnsw/graph_data_blocks.h
+++ b/src/VecSim/algorithms/hnsw/graph_data_blocks.h
@@ -133,6 +133,11 @@ class RootedCowStore {
         return (registry_ ? registry_->newestLive() : 0) >= gen;
     }
     bool mustCloneRoot() const { return mustClone(gen_); }
+    // Cheap relaxed check: is any snapshot live (so a block COW / fork can happen)?
+    // Lets the read path skip the (spinlock-backed) atomic shared_ptr load when no
+    // snapshot exists. Safe for a reader holding at least the shared index lock:
+    // capture runs under the exclusive lock, so this value can't flip mid-read.
+    bool anySnapshotLive() const { return registry_ && registry_->anyLive(); }
 
     // Copy-on-write: if a live snapshot can still see this version, replace it with
     // `cloneFn(*root_)` and re-stamp; otherwise leave it (subsequent writes mutate
@@ -144,6 +149,21 @@ class RootedCowStore {
         }
     }
 
+    // Return the current version (pinned for a snapshot) AND install a private fork
+    // — `cloneFn(*root_)` — as the new live version, stamped with the current
+    // generation. Must run under the exclusive lock (it reassigns root_). Used at
+    // capture so the live index keeps mutating a private copy while the snapshot
+    // owns the captured one; the current-generation stamp means later writes COW at
+    // a finer grain off the fork instead of re-forking the root.
+    template <class CloneFn> std::shared_ptr<T> captureAndFork(CloneFn &&cloneFn) {
+        std::shared_ptr<T> captured = root_;
+        if (captured) {
+            root_ = cloneFn(static_cast<const T &>(*root_));
+            gen_ = currentGeneration();
+        }
+        return captured;
+    }
+
 private:
     std::shared_ptr<SnapshotRegistry> registry_;
     std::shared_ptr<T> root_;
@@ -235,9 +255,15 @@ class GraphBlockBuffer : public VecsimBaseObject {
     uint64_t gen_; // generation last written in (clone decision; not the use_count)
 };
 
-// A backbone slot: a refcounted handle to one block's buffer. Trivially/cheaply
-// copyable, which is what keeps backbone copy-on-write (path-copying the header
-// vector) cheap — only refcounts are bumped, buffers are shared.
+// A backbone slot: a refcounted handle to one block's buffer. While a snapshot is
+// live, concurrent tiered inserts copy-on-write a block's buffer (a CAS swap in
+// cowBlock) while other inserts/readers load it; to avoid tearing the 16-byte
+// shared_ptr or racing reclamation, the LIVE backbone's handles are accessed with
+// the std::atomic_* free functions (load/compare_exchange). The handle stays a
+// plain shared_ptr (not std::atomic<shared_ptr>, which libstdc++ only provides
+// from GCC 12) so the slot remains trivially copyable — important because the
+// backbone vector is value-copied at capture-fork, which always runs under the
+// exclusive lock (no concurrent CAS), where a plain copy is safe.
 struct GraphBlock {
     std::shared_ptr<GraphBlockBuffer> data;
 };
@@ -289,14 +315,15 @@ class GraphDataBlocks {
     }
 
     // ---- read path ----
+    // The backbone vector is structurally stable under the shared lock (it only
+    // grows / is replaced under the exclusive lock), so indexing it is safe; the
+    // per-block handle is loaded atomically (a concurrent insert may CAS-fork it).
     char *getElement(size_t id) const {
-        return (*backbone_.root())[id / block_size].data->getElement(id % block_size);
-    }
-    size_t blockLength(size_t blockIdx) const {
-        return (*backbone_.root())[blockIdx].data->getLength();
+        return blockData(id / block_size)->getElement(id % block_size);
     }
+    size_t blockLength(size_t blockIdx) const { return blockData(blockIdx)->getLength(); }
     char *getElementInBlock(size_t blockIdx, size_t offset) const {
-        return (*backbone_.root())[blockIdx].data->getElement(offset);
+        return blockData(blockIdx)->getElement(offset);
     }
 
     // ---- write path (copy-on-write aware; must run under the index write lock) ----
@@ -306,7 +333,7 @@ class GraphDataBlocks {
     char *getElementForWrite(size_t id) {
         cowBackbone();
         cowBlock(id / block_size);
-        return (*backbone_.root())[id / block_size].data->getElement(id % block_size);
+        return blockData(id / block_size)->getElement(id % block_size);
     }
 
     // Append an empty block (graph growth). Path-copies the backbone first if a
@@ -322,7 +349,7 @@ class GraphDataBlocks {
     char *addElement(const void *element_record) {
         cowBackbone();
         cowBlock(backbone_.root()->size() - 1);
-        return backbone_.root()->back().data->addElement(element_record);
+        return blockData(backbone_.root()->size() - 1)->addElement(element_record);
     }
 
     // Remove (decrement) the last record of the last block and return it. COWs
@@ -330,7 +357,7 @@ class GraphDataBlocks {
     char *removeAndFetchLastElement() {
         cowBackbone();
         cowBlock(backbone_.root()->size() - 1);
-        return backbone_.root()->back().data->removeAndFetchLastElement();
+        return blockData(backbone_.root()->size() - 1)->removeAndFetchLastElement();
     }
 
     // Drop the last (empty) block. Path-copies the backbone first if shared.
@@ -344,7 +371,42 @@ class GraphDataBlocks {
     // index read lock; reads through the returned Root are then lock-free.
     Root captureRoot() const { return backbone_.capture(); }
 
+    // Capture the current backbone for a snapshot AND install a private fork as the
+    // new live backbone (stamped with the current generation). Must run under the
+    // exclusive index lock. Effect: the snapshot owns the captured (now immutable)
+    // backbone; the live index keeps mutating the fork. Because the fork carries the
+    // current generation, a concurrent insert's cowBackbone is a no-op (the backbone
+    // is never re-forked under the shared lock — so the backbone vector stays
+    // structurally stable for lock-free readers), and only per-block buffers are
+    // copy-on-written (cowBlock, via an atomic CAS) as they are first written.
+    Root captureRootAndFork() {
+        return backbone_.captureAndFork([this](const Backbone &old) {
+            Root fresh(new (allocator) Backbone(allocator));
+            *fresh = old; // copies the per-block handles (atomic load/store); buffers shared
+            return fresh;
+        });
+    }
+
 private:
+    // Load a block's buffer handle. A concurrent insert may CAS-fork it ONLY while a
+    // snapshot is live, so the (spinlock-backed) atomic load is used only then; with
+    // no snapshot the handle is immutable and a plain read avoids the overhead. The
+    // gate is correct for callers holding at least the shared index lock (capture,
+    // which is the only thing that can make a snapshot live, runs under the
+    // exclusive lock and so cannot flip the gate mid-read).
+    // Returns a raw buffer pointer (no refcount traffic — matching the original
+    // in-place deref). With no snapshot the handle is immutable, so a plain read is
+    // safe. With a snapshot live a concurrent insert may CAS-fork the handle, so it
+    // is loaded atomically; the buffer it points at stays alive for this locked read
+    // because any fork pins the old buffer in the snapshot that forced it.
+    GraphBlockBuffer *blockData(size_t blockIdx) const {
+        const GraphBlock &blk = (*backbone_.root())[blockIdx];
+        if (!backbone_.anySnapshotLive()) {
+            return blk.data.get();
+        }
+        return std::atomic_load_explicit(&blk.data, std::memory_order_acquire).get();
+    }
+
     void ensureRoot() {
         if (!backbone_.hasRoot()) {
             // Allocate the backbone vector object through the index allocator (so
@@ -376,12 +438,30 @@ class GraphDataBlocks {
     // A per-buffer use_count would be misleading before the backbone is cloned (a
     // shared backbone holds a single shared_ptr per block, so use_count == 1 even
     // though a snapshot sees it) — the generation tag is reliable.
+    // Safe under concurrent tiered inserts (shared lock + per-node mutexes): the
+    // fork is published with a compare-exchange so that if two writers race to COW
+    // the same block, exactly one fork wins and the loser adopts it. The generation
+    // tag makes the CAS idempotent: the winner stamps the fresh buffer with the
+    // current generation, so the loser — after its CAS fails and reloads the now
+    // current-gen buffer — sees mustClone()==false and uses the winner's fork
+    // instead of forking again. No lost updates: both `fresh` copies are pure
+    // copies of the committed pre-write state, so discarding the loser's is safe;
+    // the actual link writes happen afterwards, in place, on the single shared fork.
     void cowBlock(size_t blockIdx) {
         GraphBlock &blk = (*backbone_.root())[blockIdx];
-        if (backbone_.mustClone(blk.data->gen())) {
+        std::shared_ptr<GraphBlockBuffer> cur =
+            std::atomic_load_explicit(&blk.data, std::memory_order_acquire);
+        while (backbone_.mustClone(cur->gen())) {
             auto fresh = makeBuffer(); // carries currentGeneration()
-            blk.data->copyLiveInto(*fresh);
-            blk.data = std::move(fresh);
+            cur->copyLiveInto(*fresh);
+            // On success, publish our fork. On failure, `cur` is updated to the
+            // value another writer published; the loop re-checks mustClone (false
+            // once that value carries the current generation) and we adopt it.
+            if (std::atomic_compare_exchange_strong_explicit(&blk.data, &cur, fresh,
+                                                             std::memory_order_acq_rel,
+                                                             std::memory_order_acquire)) {
+                break;
+            }
         }
     }
 
diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h
index 17998e54d..d1b0a9a71 100644
--- a/src/VecSim/algorithms/hnsw/hnsw.h
+++ b/src/VecSim/algorithms/hnsw/hnsw.h
@@ -102,27 +102,50 @@ class MetaDataStore {
         store_.setRegistry(std::move(registry));
     }
 
-    ElementMetaData &operator[](size_t id) { return (*store_.root())[id]; }
-    const ElementMetaData &operator[](size_t id) const {
-        const Vector &v = *store_.root();
-        return v[id];
-    }
-    ElementMetaData &at(size_t id) { return store_.root()->at(id); }
-    const ElementMetaData &at(size_t id) const {
-        const Vector &v = *store_.root();
-        return v.at(id);
+    // Reads go through an atomically-published raw pointer to the current version's
+    // vector, NOT the shared_ptr root. Rationale: the per-id flags are read on some
+    // paths without the index lock (the weak deleted/in-process consistency), and a
+    // copy-on-write under a live snapshot reassigns the shared_ptr root. Reading a
+    // 16-byte shared_ptr there can tear -> crash; an 8-byte atomic pointer load
+    // cannot. The pointed-to vector stays alive across a COW because the COW only
+    // happens while a snapshot pins the old version, so a stale read still hits live
+    // memory (and yields the accepted as-of-capture flag value). No refcount traffic
+    // on the read path. Capacity changes (resize/shrink) republish under the write
+    // lock.
+    //
+    // NOTE: this published-pointer scheme removes the shared_ptr TEAR, but the
+    // per-element flag byte itself is still written in place and read without a lock
+    // (deliberate weak deleted/in-process consistency — pre-existing, see task 4.4a).
+    // That is a genuine data race ThreadSanitizer will report on the flag field; it
+    // is accepted by design (the read tolerates either the pre- or post-write value)
+    // and is NOT introduced by the snapshot work — a lock-free snapshot reader is
+    // just another participant in the same long-standing relaxed flag access.
+    ElementMetaData &operator[](size_t id) { return (*live())[id]; }
+    const ElementMetaData &operator[](size_t id) const { return (*live())[id]; }
+    ElementMetaData &at(size_t id) { return live()->at(id); }
+    const ElementMetaData &at(size_t id) const { return live()->at(id); }
+    ElementMetaData *data() {
+        Vector *v = live();
+        return v ? v->data() : nullptr;
     }
-    ElementMetaData *data() { return store_.hasRoot() ? store_.root()->data() : nullptr; }
     const ElementMetaData *data() const {
-        return store_.hasRoot() ? store_.root()->data() : nullptr;
+        Vector *v = live();
+        return v ? v->data() : nullptr;
+    }
+    size_t size() const {
+        Vector *v = live();
+        return v ? v->size() : 0;
+    }
+    size_t capacity() const {
+        Vector *v = live();
+        return v ? v->capacity() : 0;
     }
-    size_t size() const { return store_.hasRoot() ? store_.root()->size() : 0; }
-    size_t capacity() const { return store_.hasRoot() ? store_.root()->capacity() : 0; }
 
     void resize(size_t n) {
         ensureRoot();
         cowForResize();
         store_.root()->resize(n);
+        publish();
     }
     void shrink_to_fit() {
         if (!store_.hasRoot()) {
@@ -130,16 +153,26 @@ class MetaDataStore {
         }
         if (store_.root()->empty()) {
             store_.reset(); // return to the initial (no-heap) footprint
+            publish();
             return;
         }
         cowForResize();
         store_.root()->shrink_to_fit();
+        publish();
     }
 
     // O(1) capture of the current metadata version (pins it for a snapshot).
     std::shared_ptr<Vector> captureRoot() const { return store_.capture(); }
 
 private:
+    // Current version's vector, read with an acquire load (tear-free, lock-free).
+    Vector *live() const { return live_.load(std::memory_order_acquire); }
+    // Publish the current root's vector pointer; call after any change to the root
+    // (init / COW / resize / reset). Must run under the index write lock.
+    void publish() {
+        live_.store(store_.hasRoot() ? store_.root().get() : nullptr,
+                    std::memory_order_release);
+    }
     void ensureRoot() {
         if (!store_.hasRoot()) {
             store_.initRoot(std::shared_ptr<Vector>(new (allocator_) Vector(allocator_)));
@@ -154,6 +187,8 @@ class MetaDataStore {
     }
     std::shared_ptr<VecSimAllocator> allocator_;
     RootedCowStore<Vector> store_;
+    // Atomically-published pointer to store_'s current vector (see live()/publish()).
+    std::atomic<Vector *> live_{nullptr};
 };
 
 //////////////////////////////////// HNSW index implementation ////////////////////////////////////
@@ -358,9 +393,14 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
         return getGraphDataByInternalIdForWrite(internal_id)->getElementLevelData(
             level, this->levelDataSize);
     }
-    // Capture an immutable, point-in-time view of the graph. Must be called under
-    // the index read lock; the returned handle is then usable without holding any
-    // lock, as writers copy-on-write rather than mutate the buffers it pins.
+    // Capture an immutable, point-in-time view of the graph. **Must be called under
+    // the index *write* (exclusive) lock**, held briefly: capture is the quiescence
+    // point at which no writer is active, so it can (a) hand out a generation, (b)
+    // fork the backbone so the live index keeps mutating a private copy while this
+    // snapshot owns the captured one, and (c) snapshot the vector-block base
+    // pointers — all without racing an in-flight insert. The work is O(#blocks)
+    // (pointer copies; no element/vector data is copied); iteration afterward holds
+    // no lock, as writers copy-on-write per block rather than mutating pinned data.
     //
     // Consistency contract (what a reader of the returned snapshot sees):
     //   - STRICT point-in-time for graph TOPOLOGY and MEMBERSHIP: the set of ids
@@ -387,8 +427,7 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
             });
         // Capture the per-block base pointers of the raw vector data so a
         // lock-free query never reads the live (reallocating) vectors container.
-        // Called under the index read lock, so no concurrent writer is reallocating
-        // the block-header vector here.
+        // Under the exclusive lock, so no concurrent writer is reallocating here.
         auto *vectorsContainer = static_cast<const DataBlocksContainer *>(this->vectors);
         auto vectorBlocks = std::make_shared<std::vector<const char *>>();
         size_t numVectorBlocks = vectorsContainer->numBlocks();
@@ -396,7 +435,10 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
         for (size_t b = 0; b < numVectorBlocks; b++) {
             vectorBlocks->push_back(vectorsContainer->getElement(b * this->blockSize));
         }
-        return HNSWGraphSnapshot{graphDataBlocks.captureRoot(),
+        // Fork the live backbone (capture is const at the API level but legitimately
+        // mutates the live storage under the exclusive lock; the snapshot receives
+        // the pre-fork backbone). const_cast keeps newBatchIterator's const contract.
+        return HNSWGraphSnapshot{const_cast<GraphDataBlocks &>(graphDataBlocks).captureRootAndFork(),
                                  entrypointNode,
                                  maxLevel,
                                  curElementCount,
diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi.h b/src/VecSim/algorithms/hnsw/hnsw_multi.h
index 7224cef14..6116bb8b6 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_multi.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_multi.h
@@ -230,9 +230,12 @@ HNSWIndex_Multi<DataType, DistType>::newBatchIterator(const void *queryBlob,
     // Opt-in lock-free path: capture an immutable graph snapshot under the read
     // lock, then iterate it without holding any lock (writers proceed meanwhile).
     if (queryParams && queryParams->hnswRuntimeParams.useGraphSnapshotIterator) {
-        this->lockSharedIndexDataGuard();
+        // Exclusive (write) lock: capture is the brief quiescence point where no
+        // writer is active, so the backbone fork + generation handout can't race an
+        // in-flight insert (see captureGraphSnapshot).
+        this->lockIndexDataGuard();
         auto snapshot = this->captureGraphSnapshot();
-        this->unlockSharedIndexDataGuard();
+        this->unlockIndexDataGuard();
         return new (this->allocator) HNSWSnapshotMulti_BatchIterator<DataType, DistType>(
             queryBlobCopyPtr, this, std::move(snapshot), queryParams, this->allocator);
     }
diff --git a/src/VecSim/algorithms/hnsw/hnsw_single.h b/src/VecSim/algorithms/hnsw/hnsw_single.h
index f9f6e46d7..56a789122 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_single.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_single.h
@@ -187,9 +187,12 @@ HNSWIndex_Single<DataType, DistType>::newBatchIterator(const void *queryBlob,
     // Opt-in lock-free path: capture an immutable graph snapshot under the read
     // lock, then iterate it without holding any lock (writers proceed meanwhile).
     if (queryParams && queryParams->hnswRuntimeParams.useGraphSnapshotIterator) {
-        this->lockSharedIndexDataGuard();
+        // Exclusive (write) lock: capture is the brief quiescence point where no
+        // writer is active, so the backbone fork + generation handout can't race an
+        // in-flight insert (see captureGraphSnapshot).
+        this->lockIndexDataGuard();
         auto snapshot = this->captureGraphSnapshot();
-        this->unlockSharedIndexDataGuard();
+        this->unlockIndexDataGuard();
         return new (this->allocator) HNSWSnapshotSingle_BatchIterator<DataType, DistType>(
             queryBlobCopyPtr, this, std::move(snapshot), queryParams, this->allocator);
     }
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
index 2197eb970..e04f68692 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
@@ -997,18 +997,26 @@ VecSimQueryReply *TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator:
         }
         this->flat_results.swap(cur_flat_results->results);
         VecSimQueryReply_Free(cur_flat_results);
-        // Take the main index read lock to create the backend iterator. In the
-        // default (live) mode we hold it until the iterator is depleted or freed.
-        // In snapshot mode the backend iterator captures an immutable snapshot at
-        // creation, so we release the lock immediately afterwards and iterate
-        // lock-free — the snapshot's liveToken keeps graphSnapshotActive() true for
-        // the cursor's whole life, while ingestion and GC are no longer blocked by
-        // this cursor.
-        this->index->mainIndexGuard.lock_shared();
-        this->hnsw_iterator = this->index->backendIndex->newBatchIterator(
-            this->flat_iterator->getQueryBlob(), queryParams);
+        // Create the backend iterator. In the default (live) mode we take the main
+        // index *read* lock and hold it until the iterator is depleted or freed.
+        // In snapshot mode we instead take the *write* (exclusive) lock briefly so
+        // capture is a quiescence point with no in-flight writer (tiered inserts
+        // wire the graph holding mainIndexGuard *shared*, so only the exclusive lock
+        // drains them). The backend captures + forks under it, then we release
+        // immediately and iterate lock-free — the snapshot's liveToken keeps
+        // graphSnapshotActive() true for the cursor's whole life, while ingestion
+        // and GC are no longer blocked by this cursor. (How concurrent SWAP / repair
+        // stay safe against the live snapshot is handled GC-side: see
+        // executeReadySwapJobs and executeRepairJob.)
         if (this->snapshot_mode) {
-            this->index->mainIndexGuard.unlock_shared();
+            this->index->lockMainIndexGuard();
+            this->hnsw_iterator = this->index->backendIndex->newBatchIterator(
+                this->flat_iterator->getQueryBlob(), queryParams);
+            this->index->unlockMainIndexGuard();
+        } else {
+            this->index->mainIndexGuard.lock_shared();
+            this->hnsw_iterator = this->index->backendIndex->newBatchIterator(
+                this->flat_iterator->getQueryBlob(), queryParams);
         }
         auto cur_hnsw_results = this->hnsw_iterator->getNextResults(n_res, BY_SCORE_THEN_ID);
         hnsw_code = cur_hnsw_results->code;
diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp
index 700f130ce..a3061f8fc 100644
--- a/tests/unit/test_hnsw.cpp
+++ b/tests/unit/test_hnsw.cpp
@@ -2422,10 +2422,15 @@ TYPED_TEST(HNSWTest, snapshotHandleCapture) {
         GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
     }
 
-    // Capture is O(1): no element memory is allocated, only refcounts/scalars.
+    // Capture forks the backbone (so concurrent writers mutate a private copy while
+    // this snapshot keeps the captured one). That allocates a new backbone vector +
+    // per-block handles — O(#blocks) — but NOT element/vector data: the growth must
+    // be far below one block's worth of elements.
     size_t mem_before = allocator->getAllocationSize();
     auto snap = hnsw->captureGraphSnapshot();
-    ASSERT_EQ(allocator->getAllocationSize(), mem_before);
+    size_t capture_growth = allocator->getAllocationSize() - mem_before;
+    ASSERT_LT(capture_growth, bs * hnsw->elementGraphDataSize)
+        << "capture must copy only block handles, not element data";
     ASSERT_TRUE(snap.valid());
     ASSERT_EQ(snap.curElementCount, n);
     ASSERT_EQ(snap.entrypointNode, hnsw->entrypointNode);

From 1d8121cc3a2dd48549ef94eebdf7d6004ce2a784 Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 20:23:31 +0300
Subject: [PATCH 09/10] HNSW snapshot 09: swap-side reclaim horizon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A SWAP recycles a deleted slot by overwriting its vector + metadata in place,
which copy-on-write does not version — so a live snapshot must not let a slot
it can still read be recycled. A snapshot only ever reads ids below its
captured curElementCount, so the reclaim *horizon* is the max curElementCount
over all live snapshots: a swap whose freed id is >= the horizon is invisible
to every snapshot and runs immediately; one below it stays a tombstone until
the cursors that could see it are released. horizon == 0 (no snapshot) ⇒
everything recyclable. This keeps compaction flowing under a long-lived cursor
instead of stalling all reclamation globally.

The horizon is tracked by SnapshotRegistry (max captured count over live
snapshots) and gated in executeReadySwapJobs under the exclusive main lock.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../algorithms/hnsw/graph_data_blocks.h       | 28 +++++++-
 src/VecSim/algorithms/hnsw/hnsw.h             |  9 ++-
 .../algorithms/hnsw/hnsw_base_tests_friends.h |  1 +
 .../hnsw/hnsw_multi_tests_friends.h           |  1 +
 .../hnsw/hnsw_single_tests_friends.h          |  1 +
 src/VecSim/algorithms/hnsw/hnsw_tiered.h      | 22 +++++-
 .../hnsw/hnsw_tiered_tests_friends.h          |  1 +
 tests/unit/test_hnsw_tiered.cpp               | 69 +++++++++++++++++++
 8 files changed, 128 insertions(+), 4 deletions(-)

diff --git a/src/VecSim/algorithms/hnsw/graph_data_blocks.h b/src/VecSim/algorithms/hnsw/graph_data_blocks.h
index 8288808e0..4e318985e 100644
--- a/src/VecSim/algorithms/hnsw/graph_data_blocks.h
+++ b/src/VecSim/algorithms/hnsw/graph_data_blocks.h
@@ -17,6 +17,7 @@
 #include <cassert>
 #include <cstdint>
 #include <cstring>
+#include <map>
 #include <memory>
 #include <mutex>
 #include <set>
@@ -53,6 +54,9 @@
 //   - `newestLive()` = max(live ids) = the **clone threshold**: a backbone/block
 //     version stamped `gen` must be preserved (cloned on write) iff
 //     `newestLive() >= gen` — some live snapshot can still see it.
+//   - `maxVisibleCount()` = max captured curElementCount = the **reclaim horizon**:
+//     a freed slot `d` may be physically recycled (SWAP) only once
+//     `d >= maxVisibleCount()` — no live snapshot can still read it.
 // `currentGeneration()` = the next id to be handed out = strictly greater than
 // every already-captured id; freshly-written/cloned versions are stamped with it,
 // so writes that post-date all live snapshots stay in place.
@@ -61,17 +65,24 @@
 // lock-free atomics; the mutex guards the ordered set on capture/release.
 class SnapshotRegistry {
 public:
-    uint64_t acquire() {
+    // `visibleCount` is the snapshot's curElementCount at capture: it only ever
+    // reads ids < visibleCount, so it is the per-snapshot reclaim horizon (a SWAP
+    // of an id >= visibleCount can never be observed by it).
+    uint64_t acquire(size_t visibleCount = 0) {
         uint64_t g = nextGeneration_.fetch_add(1);
         std::lock_guard<std::mutex> lk(guard_);
         live_.insert(g);
+        genVisibleCount_[g] = visibleCount;
         maxLive_.store(*live_.rbegin());
+        maxVisibleCount_.store(recomputeMaxVisibleCount());
         return g;
     }
     void release(uint64_t generation) {
         std::lock_guard<std::mutex> lk(guard_);
         live_.erase(generation);
+        genVisibleCount_.erase(generation);
         maxLive_.store(live_.empty() ? 0 : *live_.rbegin());
+        maxVisibleCount_.store(recomputeMaxVisibleCount());
     }
     // "is any snapshot live?" — the degenerate gate (used by the SWAP deferral).
     bool anyLive() const { return maxLive_.load() != 0; }
@@ -79,12 +90,27 @@ class SnapshotRegistry {
     uint64_t newestLive() const { return maxLive_.load(); }
     // next id to be handed out (> all captured ids). Lock-free.
     uint64_t currentGeneration() const { return nextGeneration_.load(); }
+    // Reclaim horizon for slot recycling: max curElementCount over live snapshots.
+    // A SWAP that overwrites internal id `d` is observable by a live snapshot iff
+    // `d < that snapshot's visibleCount`; so the swap is safe for ALL live snapshots
+    // iff `d >= maxVisibleCount()`. 0 when none live (everything recyclable).
+    // Lock-free.
+    size_t maxVisibleCount() const { return maxVisibleCount_.load(); }
 
 private:
+    size_t recomputeMaxVisibleCount() const { // caller holds guard_
+        size_t m = 0;
+        for (auto &kv : genVisibleCount_) {
+            m = std::max(m, kv.second);
+        }
+        return m;
+    }
     mutable std::mutex guard_;
     std::set<uint64_t> live_;            // unique, monotonic ids; begin=min, rbegin=max
+    std::map<uint64_t, size_t> genVisibleCount_; // gen -> curElementCount at capture
     std::atomic<uint64_t> nextGeneration_{1}; // 0 reserved for "no/invalid generation"
     std::atomic<uint64_t> maxLive_{0};   // cached max(live_) for the lock-free hot path
+    std::atomic<size_t> maxVisibleCount_{0}; // cached max(visibleCount) over live snapshots
 };
 
 // Generation-tagged, copy-on-write holder for a single `shared_ptr<T>` version
diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h
index d1b0a9a71..546604282 100644
--- a/src/VecSim/algorithms/hnsw/hnsw.h
+++ b/src/VecSim/algorithms/hnsw/hnsw.h
@@ -419,7 +419,9 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
     HNSWGraphSnapshot captureGraphSnapshot() const {
         // Hand out a fresh generation id and a registration token whose deleter
         // removes it from the live set when the last copy of the handle drops.
-        uint64_t generation = snapshotRegistry_->acquire();
+        // Pass curElementCount: this snapshot only reads ids < it, so it is the
+        // snapshot's slot-reclaim horizon (a SWAP of a higher id is unobservable).
+        uint64_t generation = snapshotRegistry_->acquire(curElementCount);
         auto registry = snapshotRegistry_;
         auto liveToken = std::shared_ptr<void>(
             static_cast<void *>(nullptr), [registry, generation](void *) {
@@ -458,6 +460,11 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
     // (the snapshot holds the *old* root), so use_count would wrongly report no
     // snapshot mid-stream — the registry tracks the snapshot's whole lifetime.
     bool graphSnapshotActive() const { return snapshotRegistry_->anyLive(); }
+    // Reclaim horizon for SWAP slot recycling: the max curElementCount over live
+    // snapshots. A swap that overwrites an internal id >= this is invisible to
+    // every live snapshot (each only reads ids < its captured curElementCount) and
+    // is safe to run; one below it must be deferred. 0 when no snapshot is live.
+    size_t snapshotReclaimHorizon() const { return snapshotRegistry_->maxVisibleCount(); }
     idType searchBottomLayerEP(const void *query_data, void *timeoutCtx,
                                VecSimQueryReply_Code *rc) const;
 
diff --git a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
index e9cab0ec1..8f69a74cf 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
@@ -32,4 +32,5 @@ INDEX_TEST_FRIEND_CLASS(HNSWTestParallel_parallelSearchCombined_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteFromHNSWMultiLevels_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteFromHNSWWithRepairJobExec_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapDeferredWhileSnapshotHeld_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteInplaceAvoidUpdatedMarkedDeleted_Test)
diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_multi_tests_friends.h
index dcc87d434..c471c95ef 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_multi_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_multi_tests_friends.h
@@ -17,3 +17,4 @@ INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_markDelete_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_testSizeEstimation_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_removeVectorWithSwaps_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapDeferredWhileSnapshotHeld_Test)
diff --git a/src/VecSim/algorithms/hnsw/hnsw_single_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_single_tests_friends.h
index 9c27adead..ac79676d7 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_single_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_single_tests_friends.h
@@ -15,6 +15,7 @@ INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_test_hnsw_reclaim_memory_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTestParallel_parallelInsertSearch_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_testSizeEstimation_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapDeferredWhileSnapshotHeld_Test)
 friend class BF16HNSWTest_testSizeEstimation_Test;
 friend class BF16TieredTest_testSizeEstimation_Test;
 friend class FP16HNSWTest_testSizeEstimation_Test;
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
index e04f68692..97069389d 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
@@ -341,14 +341,32 @@ void TieredHNSWIndex<DataType, DistType>::executeReadySwapJobs(size_t maxJobsToR
 
     // Execute swap jobs - acquire hnsw write lock.
     this->lockMainIndexGuard();
+
+    // SWAP recycles a slot by overwriting its vector + metadata in place
+    // (SwapLastIdWithDeletedId), which copy-on-write does NOT version. A live
+    // snapshot only ever reads ids < its captured curElementCount, so it can only
+    // observe a swap that recycles an id below that. The reclaim *horizon* is the
+    // max curElementCount over all live snapshots: a swap whose freed id is >= the
+    // horizon is invisible to every live snapshot and is safe to run now; one below
+    // it stays a tombstone until the cursors that could see it are released. This
+    // keeps compaction flowing under a long-lived cursor (new, high-id churn is
+    // reclaimed) instead of stalling globally. The swap runs under the exclusive
+    // main lock, so its COWs can't race a concurrent reader. horizon == 0 ⇒ no
+    // snapshot ⇒ everything recyclable.
+    const size_t horizon = this->getHNSWIndex()->snapshotReclaimHorizon();
+
     TIERED_LOG(VecSimCommonStrings::LOG_VERBOSE_STRING,
-               "Tiered HNSW index GC: there are %zu ready swap jobs. Start executing %zu swap jobs",
-               readySwapJobs, std::min(readySwapJobs, maxJobsToRun));
+               "Tiered HNSW index GC: there are %zu ready swap jobs (reclaim horizon %zu)",
+               readySwapJobs, horizon);
 
     vecsim_stl::vector<idType> idsToRemove(this->allocator);
     idsToRemove.reserve(idToSwapJob.size());
     for (auto &it : idToSwapJob) {
         auto *swap_job = it.second;
+        // Below the horizon: a live snapshot may still read this slot — defer.
+        if (swap_job->deleted_id < horizon) {
+            continue;
+        }
         if (swap_job->pending_repair_jobs_counter.load() == 0) {
             // Swap job is ready for execution - execute and delete it.
             this->getHNSWIndex()->removeAndSwapMarkDeletedElement(swap_job->deleted_id);
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
index 175c1f771..9284cda47 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
@@ -24,6 +24,7 @@ INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVectorAndRepairAsync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_alternateInsertDeleteAsync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_tieredSnapshotIngestionProgressesWhileCursorOpen_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapDeferredWhileSnapshotHeld_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic2_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVectorsAndSwapSync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_BatchIterator_Test)
diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp
index eca6b644f..34a473892 100644
--- a/tests/unit/test_hnsw_tiered.cpp
+++ b/tests/unit/test_hnsw_tiered.cpp
@@ -1860,6 +1860,75 @@ TYPED_TEST(HNSWTieredIndexTest, swapJobBasic) {
     EXPECT_EQ(allocator->getAllocationSize(), sizeof(size_t));
 }
 
+// Task 4.7b: slot reclamation is gated by a live snapshot. A "slot" (internal id)
+// spans three parallel containers, and SWAP recycles it by overwriting the vector
+// and metadata IN PLACE (copy-on-write only versions the graph). So while a
+// snapshot is live, executeReadySwapJobs must DEFER the swap (tombstone the slot)
+// rather than recycle it — otherwise the snapshot would read a different
+// element's embedding/identity in that slot.
+TYPED_TEST(HNSWTieredIndexTest, swapDeferredWhileSnapshotHeld) {
+    size_t dim = 4;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto mock_thread_pool = tieredIndexMock();
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, mock_thread_pool);
+    auto *hnsw = tiered_index->getHNSWIndex();
+
+    // Three vectors directly in the backend graph (label == id, distinct values).
+    for (size_t i = 0; i < 3; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, i, i);
+    }
+    ASSERT_EQ(tiered_index->indexSize(), 3);
+
+    // Delete label 0 and drain its repair jobs so the swap job becomes ready.
+    // We do this BEFORE capturing the snapshot: the graph-repair path is not yet
+    // copy-on-write-safe under a live snapshot (task 4.5 remainder), and this test
+    // targets the SWAP-deferral gate (4.7a), not repair-under-snapshot. Capturing
+    // after the repair still exercises the gate (it defers ALL swaps while any
+    // snapshot is live) without running repair while a snapshot is held.
+    ASSERT_EQ(tiered_index->deleteVector(0), 1);
+    while (mock_thread_pool.jobQ.size() > 0) {
+        mock_thread_pool.thread_iteration();
+    }
+    ASSERT_EQ(tiered_index->idToSwapJob.count(0), 1u);
+
+    // Record slot 0's identity (the swap has not run yet, so it still holds it).
+    const size_t data_sz = hnsw->getStoredDataSize();
+    std::vector<char> vec0_before(data_sz);
+    memcpy(vec0_before.data(), hnsw->getDataByInternalId(0), data_sz);
+    labelType label0_before = hnsw->getExternalLabel(0);
+    ASSERT_TRUE(hnsw->isMarkedDeleted(0));
+
+    // Now capture a snapshot, then try to run the ready swap: it must be deferred
+    // because a snapshot is live (the slot must not be physically recycled).
+    auto snap = hnsw->captureGraphSnapshot();
+    ASSERT_TRUE(hnsw->graphSnapshotActive());
+    tiered_index->executeReadySwapJobs();
+    ASSERT_EQ(tiered_index->idToSwapJob.count(0), 1u)
+        << "swap must be deferred while a snapshot is live";
+
+    // Slot 0 is intact: same embedding and same identity (NOT the swap source's),
+    // showing element 0's own (now-deleted) flag rather than a live element wrongly
+    // occupying the recycled slot.
+    ASSERT_EQ(memcmp(hnsw->getDataByInternalId(0), vec0_before.data(), data_sz), 0)
+        << "snapshot-referenced slot's vector must not be overwritten by a deferred SWAP";
+    ASSERT_EQ(hnsw->getExternalLabel(0), label0_before);
+    ASSERT_TRUE(hnsw->isMarkedDeleted(0));
+
+    // Releasing the snapshot lets the deferred swap recycle the slot.
+    snap = HNSWGraphSnapshot{};
+    ASSERT_FALSE(hnsw->graphSnapshotActive());
+    tiered_index->executeReadySwapJobs();
+    ASSERT_EQ(tiered_index->idToSwapJob.count(0), 0u)
+        << "swap must proceed once no snapshot is live";
+
+    // The mock thread pool owns the index (index_strong_ref) and frees it on
+    // destruction — no explicit free here (matches swapJobBasic).
+}
+
 TYPED_TEST(HNSWTieredIndexTest, swapJobBasic2) {
     // Create TieredHNSW index instance with a mock queue.
     size_t dim = 4;

From 71bd317b2559f93aecaf7edd155577f2cbab2b01 Mon Sep 17 00:00:00 2001
From: jonathan keinan <jonathan.keinan@redis.com>
Date: Thu, 11 Jun 2026 20:23:31 +0300
Subject: [PATCH 10/10] =?UTF-8?q?HNSW=20snapshot=2010:=20repair=20COW-safe?=
 =?UTF-8?q?ty=20=E2=80=94=20run=20graph=20repair=20under=20live=20snapshot?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Graph repair (repairNodeConnections) rewrites a node's links in place. Under a
live snapshot that is unsafe: the COW-aware link update would move a node's
neighbor-guard mutex out from under a held lock and mutate a buffer the snapshot
still shares. The fix makes repair copy-on-write-safe: mutuallyUpdateForRepairedNode
COWs every node it will touch BEFORE taking any link lock, so the snapshot keeps
its frozen view and the live rewrite is consistent. removeVectorInPlace captures
the locked candidate id so lock/unlock can't target different ids when the best
candidate is reassigned mid-loop.

With repair COW-safe, repairs run even while a snapshot is live — they are no
longer deferred — so the reclaim horizon (snapshot 09) is no longer inert: deletes'
repairs complete and their swaps become ready, then the horizon decides which
recycle now (id >= horizon) and which wait for the cursor to close (id < horizon).

Adds tieredSnapshotRepairRunsHorizonRecycles and the concurrent
parallelBatchIteratorSearchSnapshot repro (insert + delete + repair + lock-free
cursors), validated under ASan.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/VecSim/algorithms/hnsw/hnsw.h             |  14 +++
 src/VecSim/algorithms/hnsw/hnsw_tiered.h      |   9 ++
 .../hnsw/hnsw_tiered_tests_friends.h          |   4 +-
 tests/unit/test_hnsw_tiered.cpp               | 119 ++++++++++++++++++
 4 files changed, 145 insertions(+), 1 deletion(-)

diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h
index 546604282..54d84b03d 100644
--- a/src/VecSim/algorithms/hnsw/hnsw.h
+++ b/src/VecSim/algorithms/hnsw/hnsw.h
@@ -1576,6 +1576,20 @@ void HNSWIndex<DataType, DistType>::mutuallyUpdateForRepairedNode(
     nodes_to_update.push_back(node_id);
     std::sort(nodes_to_update.begin(), nodes_to_update.end());
     size_t nodes_to_update_count = nodes_to_update.size();
+
+    // Copy-on-write ALL touched nodes' blocks up front, BEFORE taking any per-node
+    // lock. Every node we mutate or lock below is in nodes_to_update, so after this
+    // loop their buffers are final (stamped the current generation). That matters
+    // under a live snapshot: a COW-aware write inside the locked region would
+    // otherwise fork a block and move a node (and its mutex) to a new buffer out
+    // from under the lock we hold — so lock(id) and unlock(id) would resolve to
+    // different mutexes. Forking first makes getGraphDataByInternalId(id) stable for
+    // the whole critical section (no re-fork: a new snapshot can't be captured here
+    // — capture takes the exclusive lock, repair holds it shared). No-op when no
+    // snapshot is live (cowBlock doesn't clone), so the default path is unchanged.
+    for (size_t i = 0; i < nodes_to_update_count; i++) {
+        getGraphDataByInternalIdForWrite(nodes_to_update[i]);
+    }
     for (size_t i = 0; i < nodes_to_update_count; i++) {
         lockNodeLinks(nodes_to_update[i]);
     }
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
index 97069389d..366e85e4e 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
@@ -355,6 +355,9 @@ void TieredHNSWIndex<DataType, DistType>::executeReadySwapJobs(size_t maxJobsToR
     // snapshot ⇒ everything recyclable.
     const size_t horizon = this->getHNSWIndex()->snapshotReclaimHorizon();
 
+    // (Repairs are no longer deferred under a snapshot — repairNodeConnections is
+    // now COW-safe — so there is no deferred-repair queue to drain here.)
+
     TIERED_LOG(VecSimCommonStrings::LOG_VERBOSE_STRING,
                "Tiered HNSW index GC: there are %zu ready swap jobs (reclaim horizon %zu)",
                readySwapJobs, horizon);
@@ -645,6 +648,12 @@ void TieredHNSWIndex<DataType, DistType>::executeRepairJob(HNSWRepairJob *job) {
     }
     HNSWIndex<DataType, DistType> *hnsw_index = this->getHNSWIndex();
 
+    // Repair runs even while a snapshot is live: mutuallyUpdateForRepairedNode
+    // copy-on-writes every touched node's block before locking, so a snapshot keeps
+    // its frozen view (COW) and the live link rewrite is consistent (no mutex moves
+    // out from under a held lock). Slot recycling (SWAP) is still gated separately
+    // by the reclaim horizon.
+
     // Remove this job pointer from the repair jobs lookup BEFORE it has been executed. Had we done
     // it after executing the repair job, we might have see that there is a pending repair job for
     // this node id upon deleting another neighbor of this node, and we may avoid creating another
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
index 9284cda47..4a00138e9 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
@@ -23,8 +23,10 @@ INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVector_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVectorAndRepairAsync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_alternateInsertDeleteAsync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)
-INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_tieredSnapshotIngestionProgressesWhileCursorOpen_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapDeferredWhileSnapshotHeld_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_tieredSnapshotIngestionProgressesWhileCursorOpen_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_tieredSnapshotRepairRunsHorizonRecycles_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_parallelBatchIteratorSearchSnapshot_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic2_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVectorsAndSwapSync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_BatchIterator_Test)
diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp
index 34a473892..a5a6f41a5 100644
--- a/tests/unit/test_hnsw_tiered.cpp
+++ b/tests/unit/test_hnsw_tiered.cpp
@@ -4714,3 +4714,122 @@ TYPED_TEST(HNSWTieredIndexTest, tieredSnapshotIngestionProgressesWhileCursorOpen
     VecSimBatchIterator_Free(it);
     ASSERT_EQ(total, base);
 }
+
+// Repair rewrites links (not COW-safe under a live snapshot), so while a snapshot is
+// held the repair jobs are deferred (kept pending), and the associated swap stays
+// pending. Once the snapshot is released, a GC pass re-submits the deferred repairs;
+// after they run the swap becomes ready and recycles the slot.
+// Repair runs even while a snapshot is live: mutuallyUpdateForRepairedNode
+// copy-on-writes every touched node's block before locking, so the snapshot keeps
+// its frozen view and the link rewrite is consistent. Slot recycling (SWAP) is
+// gated by the reclaim horizon (= max curElementCount over live snapshots): a
+// deleted id below the horizon is deferred (the snapshot can still read that slot),
+// one at/above it recycles immediately.
+TYPED_TEST(HNSWTieredIndexTest, tieredSnapshotRepairRunsHorizonRecycles) {
+    size_t dim = 4;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto mock_thread_pool = tieredIndexMock();
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, mock_thread_pool);
+    auto *hnsw = tiered_index->getHNSWIndex();
+
+    size_t n = 16;
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, i, i);
+    }
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), n);
+
+    // Capture a snapshot at curElementCount == n, so the reclaim horizon is n.
+    auto snap = hnsw->captureGraphSnapshot();
+    ASSERT_TRUE(hnsw->graphSnapshotActive());
+
+    // Delete a LOW id (< horizon). Its repair jobs RUN under the snapshot (no longer
+    // deferred): the swap becomes ready (pending repairs == 0). But the SWAP is
+    // deferred because the snapshot can still read slot 2.
+    ASSERT_EQ(tiered_index->deleteVector(2), 1);
+    while (mock_thread_pool.jobQ.size() > 0) {
+        mock_thread_pool.thread_iteration();
+    }
+    ASSERT_EQ(tiered_index->idToSwapJob.count(2), 1u);
+    ASSERT_EQ(tiered_index->idToSwapJob.at(2)->pending_repair_jobs_counter.load(), 0)
+        << "repairs ran under the snapshot — the swap is ready, not parked";
+    tiered_index->executeReadySwapJobs();
+    ASSERT_EQ(tiered_index->idToSwapJob.count(2), 1u)
+        << "a swap below the reclaim horizon stays deferred while the snapshot is live";
+
+    // Grow past the snapshot's horizon, then delete a HIGH id (>= horizon = n). The
+    // snapshot cannot see that slot, so its swap recycles immediately even though the
+    // snapshot is still live.
+    for (size_t i = n; i < 2 * n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, i, i);
+    }
+    const idType high = (idType)(n + 3);
+    ASSERT_EQ(tiered_index->deleteVector(high), 1);
+    while (mock_thread_pool.jobQ.size() > 0) {
+        mock_thread_pool.thread_iteration();
+    }
+    tiered_index->executeReadySwapJobs();
+    ASSERT_EQ(tiered_index->idToSwapJob.count(high), 0u)
+        << "a swap at/above the reclaim horizon recycles under a live snapshot";
+
+    // Releasing the snapshot drops the horizon to 0, so the deferred low-id swap runs.
+    snap = HNSWGraphSnapshot{};
+    ASSERT_FALSE(hnsw->graphSnapshotActive());
+    tiered_index->executeReadySwapJobs();
+    ASSERT_EQ(tiered_index->idToSwapJob.count(2), 0u);
+}
+
+// Concurrency stress: many snapshot-backed cursors iterate lock-free while workers
+// ingest (flat->HNSW) and the main thread deletes (whose repairs now run under the
+// live cursors). Guards against the COW/use-after-free hazards in the concurrent
+// insert + delete + repair + snapshot-read path; run under ASan/TSan. Asserts
+// everyone makes progress (all cursors complete, ingestion finishes).
+TYPED_TEST(HNSWTieredIndexTest, parallelBatchIteratorSearchSnapshot) {
+    size_t dim = 4, ef = 200, n = 1000;
+    bool isMulti = TypeParam::isMulti();
+    size_t per_label = isMulti ? 5 : 1;
+    size_t n_labels = n / per_label;
+    HNSWParams params = {.type = TypeParam::get_index_type(), .dim = dim,
+                         .metric = VecSimMetric_L2, .multi = isMulti, .efRuntime = ef};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto mock_thread_pool = tieredIndexMock();
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, mock_thread_pool);
+    auto allocator = tiered_index->getAllocator();
+    std::atomic_int successful_searches(0);
+    auto snapshot_search = [](AsyncJob *job) {
+        auto *sj = reinterpret_cast<tieredIndexMock::SearchJobMock *>(job);
+        VecSimQueryParams qp = {};
+        qp.hnswRuntimeParams.useGraphSnapshotIterator = true;
+        VecSimBatchIterator *it = VecSimBatchIterator_New(sj->index, sj->query, &qp);
+        size_t iteration = 0;
+        while (iteration < 10 && VecSimBatchIterator_HasNext(it)) {
+            VecSimQueryReply *r = VecSimBatchIterator_Next(it, sj->k, BY_SCORE);
+            VecSimQueryReply_Free(r);
+            iteration++;
+        }
+        VecSimBatchIterator_Free(it);
+        (*sj->successful_searches)++;
+        delete job;
+    };
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i % n_labels, i);
+        auto query = (TEST_DATA_T *)allocator->allocate(dim * sizeof(TEST_DATA_T));
+        GenerateVector<TEST_DATA_T>(query, dim, i % n_labels);
+        auto *sj = new (allocator) tieredIndexMock::SearchJobMock(
+            allocator, snapshot_search, tiered_index, 10, query, n, dim, &successful_searches);
+        tiered_index->submitSingleJob(sj);
+    }
+    mock_thread_pool.init_threads();
+    for (size_t label = 0; label < n_labels / 4; label++) {
+        tiered_index->deleteVector(label);
+    }
+    mock_thread_pool.thread_pool_join();
+    for (int round = 0; round < 4; round++) {
+        while (mock_thread_pool.jobQ.size() > 0) mock_thread_pool.thread_iteration();
+        tiered_index->executeReadySwapJobs();
+    }
+    EXPECT_EQ(successful_searches, n);
+}