From 87160070304e257920e163b4bd705ebdd7e54b3b Mon Sep 17 00:00:00 2001
From: zhangt <zhangt@local>
Date: Wed, 20 May 2026 03:16:16 +0000
Subject: [PATCH 01/12] Replay distributed work onto
 users/qiazh/pre-merge-tikv-bugfix

Branch users/zhangt/merge-onto-qiazh ports our shared remote/local pool +
per-layer routing changes from users/zhangt/merge-distributed-to-tikv on
top of qianxi's TiKV bugfix branch (lock ordering, splitAsync, version
check, etc.). Avoids the 21-block ExtraDynamicSearcher.h merge conflict
on the merged_spfresh side by replaying instead of merging.

Pragmatic approach for heavy files (ExtraDynamicSearcher.h, SPFreshTest.cpp):
take our HEAD versions wholesale (which already contain our distributed +
MultiChunk logic), and patch only the compile-breaking deltas caused by
qianxi's refactors:
  - PostingCountCache moved from ExtraDynamicSearcher.h to ExtraTiKVController.h
  - KeyValueIO grew MultiMerge + LogAsyncWaitStatsAndReset virtuals
    (qianxi version kept; our MultiPut/MultiDelete virtuals re-added on top)
  - Options/ParameterDefinitionList: kept qianxi version (adds m_globalIDPath)
  - ThreadPool: kept our add_high + added addfront alias for qianxi callers

Index.h / IExtraSearcher.h / SPANNIndex.cpp: applied small additive hooks
on top of qianxi (forward-decl WorkerNode, SetWorker/GetSharedSplitPool
accessors, BuildIndexInternalLayer + AddIndex worker loop). qianxi
bugfixes preserved in those files.

Build system:
  - CMakeLists updated for absl_cord + cordz family (kvproto 25.3 uses
    absl 2308, anaconda's grpc bundles 2111; explicit linkage avoids
    DSO-missing-from-command-line)
  - cmake invoked with gRPC_DIR/Protobuf_DIR/absl_DIR pointing at
    /usr/local so generated kvproto + libabsl 2308 versions align

Verified: SPTAGTest links cleanly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .gitignore                                    |    3 +-
 AnnService/CMakeLists.txt                     |    8 +-
 AnnService/inc/Core/Common/FineGrainedLock.h  |   25 +-
 AnnService/inc/Core/Common/IVersionMap.h      |   12 +
 AnnService/inc/Core/Common/TiKVVersionMap.h   |   52 +
 .../SPANN/Distributed/ConsistentHashRing.h    |   93 ++
 .../SPANN/Distributed/DispatchCoordinator.h   |  364 +++++
 .../Core/SPANN/Distributed/DispatcherNode.h   |  293 ++++
 .../SPANN/Distributed/DistributedProtocol.h   |  651 ++++++++
 .../inc/Core/SPANN/Distributed/NetworkNode.h  |  319 ++++
 .../Core/SPANN/Distributed/RemotePostingOps.h | 1325 ++++++++++++++++
 .../inc/Core/SPANN/Distributed/WorkerNode.h   |  616 ++++++++
 .../inc/Core/SPANN/ExtraDynamicSearcher.h     |  620 +++++++-
 .../inc/Core/SPANN/ExtraTiKVController.h      |    1 +
 AnnService/inc/Core/SPANN/IExtraSearcher.h    |   17 +
 AnnService/inc/Core/SPANN/Index.h             |   40 +
 AnnService/inc/Core/VectorIndex.h             |    9 +
 AnnService/inc/Helper/KeyValueIO.h            |   14 +
 AnnService/inc/Helper/ThreadPool.h            |   33 +-
 AnnService/inc/Socket/ConnectionManager.h     |    6 +-
 AnnService/inc/Socket/Packet.h                |   36 +-
 AnnService/inc/Socket/SimpleSerialization.h   |   52 +
 .../src/Core/SPANN/ExtraFileController.cpp    |    2 +-
 AnnService/src/Core/SPANN/SPANNIndex.cpp      |   78 +-
 AnnService/src/Core/VectorIndex.cpp           |   25 +
 AnnService/src/Socket/Connection.cpp          |   30 +-
 AnnService/src/Socket/Server.cpp              |    2 +-
 Test/CMakeLists.txt                           |    2 +-
 Test/inc/TestDataGenerator.h                  |   15 +-
 Test/src/SPFreshTest.cpp                      | 1071 +++++++++++--
 Test/src/TestDataGenerator.cpp                |   12 +-
 Test/src/main.cpp                             |    7 +-
 benchmark.ini                                 |   19 +
 evaluation/distributed/README.md              |  294 ++++
 .../configs/benchmark_100m_1node.ini          |   71 +
 .../configs/benchmark_100m_2node.ini          |   71 +
 .../configs/benchmark_100m_template.ini       |   71 +
 .../configs/benchmark_10m_1node.ini           |   62 +
 .../configs/benchmark_10m_2node.ini           |   62 +
 .../configs/benchmark_10m_template.ini        |   62 +
 .../benchmark_insert_dominant_1node.ini       |   58 +
 .../benchmark_insert_dominant_2node.ini       |   58 +
 .../benchmark_insert_dominant_3node.ini       |   59 +
 .../benchmark_insert_dominant_template.ini    |   58 +
 .../distributed/configs/cluster_2node.conf    |   31 +
 .../distributed/configs/cluster_3node.conf    |   34 +
 evaluation/distributed/configs/tikv.toml      |   74 +
 evaluation/distributed/run_distributed.sh     | 1364 +++++++++++++++++
 48 files changed, 8050 insertions(+), 231 deletions(-)
 create mode 100644 AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h
 create mode 100644 AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h
 create mode 100644 AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h
 create mode 100644 AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h
 create mode 100644 AnnService/inc/Core/SPANN/Distributed/NetworkNode.h
 create mode 100644 AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h
 create mode 100644 AnnService/inc/Core/SPANN/Distributed/WorkerNode.h
 create mode 100644 benchmark.ini
 create mode 100644 evaluation/distributed/README.md
 create mode 100644 evaluation/distributed/configs/benchmark_100m_1node.ini
 create mode 100644 evaluation/distributed/configs/benchmark_100m_2node.ini
 create mode 100644 evaluation/distributed/configs/benchmark_100m_template.ini
 create mode 100644 evaluation/distributed/configs/benchmark_10m_1node.ini
 create mode 100644 evaluation/distributed/configs/benchmark_10m_2node.ini
 create mode 100644 evaluation/distributed/configs/benchmark_10m_template.ini
 create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_1node.ini
 create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_2node.ini
 create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_3node.ini
 create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_template.ini
 create mode 100644 evaluation/distributed/configs/cluster_2node.conf
 create mode 100644 evaluation/distributed/configs/cluster_3node.conf
 create mode 100755 evaluation/distributed/configs/tikv.toml
 create mode 100755 evaluation/distributed/run_distributed.sh

diff --git a/.gitignore b/.gitignore
index 190ca29d3..e3dc9796a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -464,4 +464,5 @@ FodyWeavers.xsd
 *.sln.iml
 
 # SPTAG benchmark generated artifacts
-*perftest_*
+/perftest_*
+/evaluation/2026-04-23/output_distributed_hostname_*.json
diff --git a/AnnService/CMakeLists.txt b/AnnService/CMakeLists.txt
index cd23345fd..299faf3ed 100644
--- a/AnnService/CMakeLists.txt
+++ b/AnnService/CMakeLists.txt
@@ -10,6 +10,12 @@ include_directories(${Zstd}/lib)
 file(GLOB_RECURSE HDR_FILES ${AnnService}/inc/Core/*.h  ${AnnService}/inc/Helper/*.h)
 file(GLOB_RECURSE SRC_FILES ${AnnService}/src/Core/*.cpp ${AnnService}/src/Helper/*.cpp)
 
+# Include Socket sources in core lib for PostingRouter
+file(GLOB SOCKET_HDR_FILES ${AnnService}/inc/Socket/*.h)
+file(GLOB SOCKET_SRC_FILES ${AnnService}/src/Socket/*.cpp)
+list(APPEND HDR_FILES ${SOCKET_HDR_FILES})
+list(APPEND SRC_FILES ${SOCKET_SRC_FILES})
+
 set(SPDK_LIBRARIES "")
 if (SPDK)
     set(Spdk ${PROJECT_SOURCE_DIR}/ThirdParty/spdk/build)
@@ -73,7 +79,7 @@ endif()
 add_library (SPTAGLib SHARED ${SRC_FILES} ${HDR_FILES} ${TiKV_PROTO_SOURCES})
 target_link_libraries (SPTAGLib DistanceUtils ${RocksDB_LIBRARIES} ${uring_LIBRARIES} libzstd_shared ${NUMA_LIBRARY} ${TBB_LIBRARIES} ${SPDK_LIBRARIES} ${TiKV_LIBRARIES})
 add_library (SPTAGLibStatic STATIC ${SRC_FILES} ${HDR_FILES} ${TiKV_PROTO_SOURCES})
-target_link_libraries (SPTAGLibStatic DistanceUtils ${RocksDB_LIBRARIES} ${uring_LIBRARIES} libzstd_static ${NUMA_LIBRARY_STATIC} ${TBB_LIBRARIES} ${SPDK_LIBRARIES} ${TiKV_LIBRARIES})
+target_link_libraries (SPTAGLibStatic DistanceUtils ${RocksDB_LIBRARIES} ${uring_LIBRARIES} libzstd_static ${NUMA_LIBRARY_STATIC} ${TBB_LIBRARIES} ${SPDK_LIBRARIES} ${TiKV_LIBRARIES} ${Boost_LIBRARIES})
 
 if (MSVC)
     # SPANNIndex.cpp can exceed COFF section limits in Debug without /bigobj.
diff --git a/AnnService/inc/Core/Common/FineGrainedLock.h b/AnnService/inc/Core/Common/FineGrainedLock.h
index 06c8f44d1..5cfad7ac6 100644
--- a/AnnService/inc/Core/Common/FineGrainedLock.h
+++ b/AnnService/inc/Core/Common/FineGrainedLock.h
@@ -56,10 +56,27 @@ namespace SPTAG
                 return GetLock(idx);
             }
 
+            // Per-posting lock identity. Two indices share a lock iff they are
+            // the same posting, so external callers can use `hash_func(a) ==
+            // hash_func(b)` as a self-lock guard (e.g. in Split, to skip
+            // re-locking the same head VID).
             static inline unsigned hash_func(unsigned idx)
             {
                 return idx;
             }
+
+            // Maps a posting index to its slot in the mutex-sharded bucket
+            // array that backs the per-posting lock map. Public so callers can
+            // keep a BucketCount-sized array indexed at the same granularity as
+            // the lock pool (e.g. ExtraDynamicSearcher::m_remoteBucketLocked).
+            static inline unsigned BucketIndex(SizeType idx)
+            {
+                const unsigned k = static_cast<unsigned>(idx);
+                return (k * 99991u + _rotl(k, 2) + 101u) & static_cast<unsigned>(BucketMask);
+            }
+
+            static const int BucketMask = 32767;
+            static const int BucketCount = BucketMask + 1;
         private:
             struct Bucket {
                 std::mutex mutex;
@@ -76,14 +93,6 @@ namespace SPTAG
                 return *iter->second;
             }
 
-            static inline unsigned BucketIndex(SizeType idx)
-            {
-                unsigned key = static_cast<unsigned>(idx);
-                return ((unsigned)(key * 99991) + _rotl(key, 2) + 101) & BucketMask;
-            }
-
-            static const int BucketMask = 32767;
-            static const int BucketCount = BucketMask + 1;
             mutable std::unique_ptr<Bucket[]> m_buckets;
         };
     }
diff --git a/AnnService/inc/Core/Common/IVersionMap.h b/AnnService/inc/Core/Common/IVersionMap.h
index b939bd534..05d638cd9 100644
--- a/AnnService/inc/Core/Common/IVersionMap.h
+++ b/AnnService/inc/Core/Common/IVersionMap.h
@@ -43,6 +43,18 @@ namespace SPTAG
             virtual uint8_t GetVersion(const SizeType& key) = 0;
             virtual uint8_t GetVersion(const SizeType& key, VersionReadPolicy policy) { return GetVersion(key); }
             virtual void SetVersion(const SizeType& key, const uint8_t& version) = 0;
+
+            /// Batch SetVersion: applies versions[i] to vids[i]. When the two
+            /// vectors differ in length only the common prefix is processed.
+            /// This default issues one SetVersion call per pair; TiKV-backed
+            /// maps override it to group the writes by chunk.
+            virtual void SetVersionBatch(const std::vector<SizeType>& vids, const std::vector<uint8_t>& versions)
+            {
+                const size_t limit = vids.size() < versions.size() ? vids.size() : versions.size();
+                for (size_t pos = 0; pos != limit; ++pos) {
+                    SetVersion(vids[pos], versions[pos]);
+                }
+            }
             /// Increment the version of a VID.
             /// @param expectedOld If not 0xff, the caller asserts the current version should be this value.
             ///   If TiKV already holds (expectedOld+1)&0x7f, treat as success (another node did the same increment).
diff --git a/AnnService/inc/Core/Common/TiKVVersionMap.h b/AnnService/inc/Core/Common/TiKVVersionMap.h
index 0dce69ce8..69191fe1b 100644
--- a/AnnService/inc/Core/Common/TiKVVersionMap.h
+++ b/AnnService/inc/Core/Common/TiKVVersionMap.h
@@ -385,6 +385,58 @@ namespace SPTAG
                 else if (oldVal != 0xfe && version == 0xfe) m_deleted++;
             }
 
+            // Group writes by chunk: 1 ReadChunk + N byte-modifications + 1 WriteChunk
+            // per chunk. m_deleted is only adjusted for chunks whose WriteChunk succeeded.
+            void SetVersionBatch(const std::vector<SizeType>& vids, const std::vector<uint8_t>& versions) override
+            {
+                size_t n = std::min(vids.size(), versions.size());
+                if (n == 0) return;
+                const SizeType localCount = m_count.load();
+
+                // Group (idx into vids/versions) by chunk id.
+                std::unordered_map<SizeType, std::vector<size_t>> byChunk;
+                byChunk.reserve(n);
+                for (size_t i = 0; i < n; i++) {
+                    SizeType vid = vids[i];
+                    if (vid < 0 || vid >= localCount) continue;  // out-of-range VIDs are skipped
+                    byChunk[ChunkId(vid)].push_back(i);
+                }
+
+                long deletedDelta = 0;
+                for (auto& kv : byChunk) {
+                    SizeType cid = kv.first;
+                    auto& idxs = kv.second;
+                    std::lock_guard<std::mutex> lock(ChunkMutex(cid));
+                    std::string chunk = ReadChunkCached(cid);
+                    if (chunk.empty()) {
+                        chunk.assign(m_chunkSize, static_cast<char>(0xff));
+                    }
+                    bool dirty = false;
+                    long chunkDelta = 0;  // deleted-count change pending on this chunk's write
+                    for (size_t i : idxs) {
+                        SizeType vid = vids[i];
+                        uint8_t newVal = versions[i];
+                        int offset = ChunkOffset(vid);
+                        if (offset < 0 || offset >= (int)chunk.size()) continue;
+                        uint8_t oldVal = static_cast<uint8_t>(chunk[offset]);
+                        if (oldVal == newVal) continue;
+                        if (oldVal == 0xfe && newVal != 0xfe) chunkDelta--;
+                        else if (oldVal != 0xfe && newVal == 0xfe) chunkDelta++;
+                        chunk[offset] = static_cast<char>(newVal);
+                        dirty = true;
+                    }
+                    if (!dirty) continue;
+                    if (WriteChunk(cid, chunk) != ErrorCode::Success) {
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                            "TiKVVersionMap::SetVersionBatch: WriteChunk failed chunk=%d layer=%d\n",
+                            cid, m_layer);
+                        continue;  // write failed, so the deleted count is left unchanged
+                    }
+                    deletedDelta += chunkDelta;
+                }
+                if (deletedDelta != 0) m_deleted += deletedDelta;
+            }
+
             bool IncVersion(const SizeType& key, uint8_t* newVersion, uint8_t expectedOld = 0xff) override
             {
                 if (key < 0 || key >= m_count.load()) {
diff --git a/AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h b/AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h
new file mode 100644
index 000000000..ec5c7855c
--- /dev/null
+++ b/AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h
@@ -0,0 +1,98 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "inc/Core/Common.h"
+#include <cstdint>
+#include <map>
+#include <set>
+
+namespace SPTAG::SPANN {
+
+    /// Consistent hash ring for distributing headIDs across compute nodes.
+    /// Uses virtual nodes (vnodes) for balanced distribution.
+    /// When nodes are added/removed, only ~1/N of keys are remapped.
+    class ConsistentHashRing {
+    public:
+        /// @param vnodeCount ring positions created for each physical node.
+        explicit ConsistentHashRing(int vnodeCount = 150)
+            : m_vnodeCount(vnodeCount) {}
+
+        /// Add a physical node to the ring with its virtual nodes.
+        /// If a vnode hash lands on a position that is already taken, the
+        /// position is reassigned to nodeIndex.
+        void AddNode(int nodeIndex) {
+            for (int i = 0; i < m_vnodeCount; i++) {
+                m_ring[HashVNode(nodeIndex, i)] = nodeIndex;
+            }
+            m_nodes.insert(nodeIndex);
+        }
+
+        /// Remove a physical node and the ring positions it still owns.
+        /// A position that another node took over through a hash collision
+        /// is kept, so removing one node never strips a vnode from a node
+        /// that is still on the ring.
+        void RemoveNode(int nodeIndex) {
+            for (int i = 0; i < m_vnodeCount; i++) {
+                auto it = m_ring.find(HashVNode(nodeIndex, i));
+                if (it != m_ring.end() && it->second == nodeIndex) {
+                    m_ring.erase(it);
+                }
+            }
+            m_nodes.erase(nodeIndex);
+        }
+
+        /// Find the owner node for a given key (headID): the first ring
+        /// position at or after the key's hash, wrapping around to the
+        /// lowest position. Returns -1 if the ring is empty.
+        int GetOwner(SizeType headID) const {
+            if (m_ring.empty()) return -1;
+            auto it = m_ring.lower_bound(HashKey(headID));
+            if (it == m_ring.end()) it = m_ring.begin();
+            return it->second;
+        }
+
+        bool Empty() const { return m_ring.empty(); }
+        size_t NodeCount() const { return m_nodes.size(); }
+        bool HasNode(int nodeIndex) const { return m_nodes.count(nodeIndex) > 0; }
+        const std::set<int>& GetNodes() const { return m_nodes; }
+        int GetVNodeCount() const { return m_vnodeCount; }
+
+    private:
+        /// Fold the four bytes of v, lowest byte first, into an FNV-1a state.
+        static uint32_t MixWord(uint32_t hash, uint32_t v) {
+            for (int i = 0; i < 4; i++) {
+                hash ^= (v >> (i * 8)) & 0xFF;
+                hash *= 16777619u; // FNV prime
+            }
+            return hash;
+        }
+
+        /// Ring position of a key: FNV-1a over headID cast to uint32_t.
+        static uint32_t HashKey(SizeType headID) {
+            const uint32_t offsetBasis = 2166136261u; // FNV-1a offset basis
+            return MixWord(offsetBasis, static_cast<uint32_t>(headID));
+        }
+
+        /// Ring position of virtual node vnodeIdx of physical node nodeIndex.
+        static uint32_t HashVNode(int nodeIndex, int vnodeIdx) {
+            // Raw FNV-1a on tiny nodeIndex (1, 2, 3) produces a
+            // pathologically biased ring (71.9% vs 28.1% for nodes 1/2 with
+            // 150 vnodes). Pre-mix nodeIndex through Knuth's golden-ratio
+            // multiplier so small node IDs become full-spectrum uint32 values
+            // before they hit FNV's accumulator. Validated to give ≈50/50
+            // for K=2 and stay within ±15% of even split for K up to 8.
+            uint32_t saltedVnode =
+                static_cast<uint32_t>(vnodeIdx) ^
+                (static_cast<uint32_t>(nodeIndex) * 2654435761u);
+            uint32_t hash = MixWord(2166136261u, saltedVnode);
+            return MixWord(hash, static_cast<uint32_t>(nodeIndex));
+        }
+
+        int m_vnodeCount;
+        std::map<uint32_t, int> m_ring;  // hash position → nodeIndex
+        std::set<int> m_nodes;           // active physical node indices
+    };
+
+} // namespace SPTAG::SPANN
diff --git a/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h b/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h
new file mode 100644
index 000000000..8bb32a7eb
--- /dev/null
+++ b/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h
@@ -0,0 +1,389 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "inc/Core/SPANN/Distributed/DistributedProtocol.h"
+#include "inc/Socket/Client.h"
+#include "inc/Socket/Packet.h"
+#include "inc/Socket/SimpleSerialization.h"
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+namespace SPTAG::SPANN {
+
+    /// Coordinates driver↔worker dispatch for distributed benchmarks.
+    ///
+    /// The driver broadcasts Insert/Search/Stop commands to all workers and
+    /// collects their results.  Workers execute commands via a callback and
+    /// report results back.
+    ///
+    /// This class is independent of posting routing — it only needs a way to
+    /// send packets to peer nodes (provided via PeerNetwork interface).
+    class DispatchCoordinator {
+    public:
+        /// Abstract interface for sending packets to peer nodes.
+        /// NetworkNode implements this so DispatchCoordinator doesn't
+        /// depend on the full node class.
+        class PeerNetwork {
+        public:
+            virtual ~PeerNetwork() = default;
+            /// Get connection to a peer node (reconnecting if needed).
+            virtual Socket::ConnectionID GetPeerConnection(int nodeIndex) = 0;
+            /// Total number of nodes in the cluster.
+            virtual int GetNumNodes() const = 0;
+            /// Index of this node.
+            virtual int GetLocalNodeIndex() const = 0;
+            /// Send a packet via the client socket.
+            virtual void SendPacket(Socket::ConnectionID connID, Socket::Packet&& pkt,
+                                    std::function<void(bool)> callback) = 0;
+        };
+
+        using DispatchCallback = std::function<DispatchResult(const DispatchCommand&)>;
+
+        DispatchCoordinator() = default;
+
+        /// Blocks until every in-flight dispatch thread has finished.
+        ~DispatchCoordinator() {
+            ClearDispatchCallback();
+        }
+
+        /// Attach to a peer network (must outlive this coordinator).
+        void SetNetwork(PeerNetwork* network) {
+            m_network = network;
+        }
+
+        /// Mark a worker node as "local" — its work is done inline by the
+        /// driver so it should be skipped during broadcast/result collection.
+        void SetLocalWorkerIndex(int idx) { m_localWorkerIndex = idx; }
+
+        /// Set the callback for executing dispatch commands (worker side).
+        /// Takes m_activeDispatchMutex so a concurrent HandleDispatchCommand
+        /// never copies the std::function while it is being replaced.
+        void SetDispatchCallback(DispatchCallback cb) {
+            std::lock_guard<std::mutex> lock(m_activeDispatchMutex);
+            m_dispatchCallback = std::move(cb);
+        }
+
+        /// Clear the dispatch callback and wait for in-flight dispatch
+        /// threads to complete. Call before destroying callback state.
+        /// The callback is cleared under the same mutex that guards the
+        /// in-flight counter, so a command is either counted before the wait
+        /// starts or sees the cleared callback and is dropped.
+        void ClearDispatchCallback() {
+            std::unique_lock<std::mutex> lock(m_activeDispatchMutex);
+            m_dispatchCallback = nullptr;
+            m_activeDispatchCV.wait(lock, [this]() {
+                return m_activeDispatchCount == 0;
+            });
+        }
+
+        // ---- Driver side ----
+
+        /// Broadcast a dispatch command to all worker nodes.
+        /// Nodes that cannot be reached are counted as errors and treated as
+        /// already answered, so WaitForAllResults does not wait for them.
+        /// Returns the dispatchId assigned to this command.
+        std::uint64_t BroadcastDispatchCommand(DispatchCommand::Type type, std::uint32_t round) {
+            std::uint64_t dispatchId = m_nextDispatchId.fetch_add(1);
+
+            DispatchCommand cmd;
+            cmd.m_type = type;
+            cmd.m_dispatchId = dispatchId;
+            cmd.m_round = round;
+
+            int numNodes = m_network->GetNumNodes();
+            int localIdx = m_network->GetLocalNodeIndex();
+
+            // Build list of nodes to skip (dispatcher + local worker if set)
+            auto shouldSkip = [&](int i) {
+                return i == localIdx || i == m_localWorkerIndex;
+            };
+
+            // Count remote workers (nodes we will actually dispatch to)
+            int remoteWorkers = 0;
+            for (int i = 0; i < numNodes; i++) {
+                if (!shouldSkip(i)) remoteWorkers++;
+            }
+
+            // Set up pending state for collecting results (not for Stop / Heartbeat).
+            // `pending` stays null when no results are expected.
+            std::shared_ptr<PendingDispatch> pending;
+            if (type != DispatchCommand::Type::Stop &&
+                type != DispatchCommand::Type::Heartbeat &&
+                remoteWorkers > 0) {
+                pending = std::make_shared<PendingDispatch>();
+                pending->remaining.store(remoteWorkers);
+                for (int i = 0; i < numNodes; i++) {
+                    if (!shouldSkip(i)) pending->pendingNodes.insert(i);
+                }
+                {
+                    std::lock_guard<std::mutex> lock(m_dispatchMutex);
+                    m_pendingDispatches[dispatchId] = pending;
+                }
+            }
+
+            auto bodySize = static_cast<std::uint32_t>(cmd.EstimateBufferSize());
+
+            for (int i = 0; i < numNodes; i++) {
+                if (shouldSkip(i)) continue;
+
+                Socket::ConnectionID connID = m_network->GetPeerConnection(i);
+                if (connID == Socket::c_invalidConnectionID) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "DispatchCoordinator: Cannot dispatch to node %d (no connection)\n", i);
+                    if (pending) {
+                        pending->errors++;
+                        {
+                            // Drop the node from the outstanding set so a later
+                            // timeout report only names nodes that were sent to.
+                            std::lock_guard<std::mutex> lock(pending->mutex);
+                            pending->pendingNodes.erase(i);
+                        }
+                        if (pending->remaining.fetch_sub(1) == 1) {
+                            pending->done.set_value();
+                        }
+                    }
+                    continue;
+                }
+
+                Socket::Packet pkt;
+                pkt.Header().m_packetType = Socket::PacketType::DispatchCommand;
+                pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+                pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+                pkt.Header().m_resourceID = 0;
+                pkt.Header().m_bodyLength = bodySize;
+                pkt.AllocateBuffer(bodySize);
+                cmd.Write(pkt.Body());
+                pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+                m_network->SendPacket(connID, std::move(pkt), nullptr);
+            }
+
+            // Heartbeats fire every interval seconds — keep logs clean.
+            if (type != DispatchCommand::Type::Heartbeat) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "DispatchCoordinator: Dispatched %s (id=%llu round=%u) to %d workers\n",
+                    type == DispatchCommand::Type::Search ? "Search" :
+                    type == DispatchCommand::Type::Insert ? "Insert" : "Stop",
+                    (unsigned long long)dispatchId, round, remoteWorkers);
+            }
+
+            return dispatchId;
+        }
+
+        /// Wait for all workers to report results for a dispatch.
+        /// Returns collected wall times from workers. Empty on timeout.
+        /// Also returns empty when dispatchId has no pending entry. The entry
+        /// is removed before returning, so each dispatchId can be waited on once.
+        std::vector<double> WaitForAllResults(std::uint64_t dispatchId, int timeoutSec = 300) {
+            std::shared_ptr<PendingDispatch> state;
+            {
+                std::lock_guard<std::mutex> lock(m_dispatchMutex);
+                auto it = m_pendingDispatches.find(dispatchId);
+                if (it == m_pendingDispatches.end()) return {};
+                state = it->second;
+            }
+
+            auto future = state->done.get_future();
+            auto status = future.wait_for(std::chrono::seconds(timeoutSec));
+
+            {
+                std::lock_guard<std::mutex> lock(m_dispatchMutex);
+                m_pendingDispatches.erase(dispatchId);
+            }
+
+            if (status == std::future_status::timeout) {
+                std::string nodeList;
+                {
+                    std::lock_guard<std::mutex> lock(state->mutex);
+                    for (int n : state->pendingNodes) {
+                        if (!nodeList.empty()) nodeList += ",";
+                        nodeList += std::to_string(n);
+                    }
+                }
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "DispatchCoordinator: Timeout waiting for results (id=%llu, %d remaining, nodes=[%s])\n",
+                    (unsigned long long)dispatchId, state->remaining.load(), nodeList.c_str());
+                return {};
+            }
+
+            if (state->errors > 0) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "DispatchCoordinator: Dispatch %llu completed with %d errors\n",
+                    (unsigned long long)dispatchId, (int)state->errors);
+            }
+
+            std::lock_guard<std::mutex> lock(state->mutex);
+            return state->wallTimes;
+        }
+
+        // ---- Worker side ----
+
+        /// Send a dispatch result back to the driver (worker side).
+        /// The driver is taken to be node 0; nothing is sent when this
+        /// node is itself node 0.
+        void SendDispatchResult(const DispatchResult& result) {
+            int driverNode = 0;
+            if (driverNode == m_network->GetLocalNodeIndex()) return;
+
+            Socket::ConnectionID connID = m_network->GetPeerConnection(driverNode);
+            if (connID == Socket::c_invalidConnectionID) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "DispatchCoordinator: Cannot send result to driver\n");
+                return;
+            }
+
+            Socket::Packet pkt;
+            auto bodySize = static_cast<std::uint32_t>(result.EstimateBufferSize());
+            pkt.Header().m_packetType = Socket::PacketType::DispatchResult;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = 0;
+            pkt.Header().m_bodyLength = bodySize;
+            pkt.AllocateBuffer(bodySize);
+            result.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            m_network->SendPacket(connID, std::move(pkt), nullptr);
+        }
+
+        // ---- Packet handlers (called by NetworkNode's server/client) ----
+
+        /// Handle an incoming dispatch command from the driver (worker side).
+        /// Runs the callback on a detached thread; a result is sent back for
+        /// every command type except Stop and Heartbeat.
+        void HandleDispatchCommand(Socket::ConnectionID connID, Socket::Packet packet) {
+            if (packet.Header().m_bodyLength == 0) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "DispatchCoordinator: Empty DispatchCommand received\n");
+                return;
+            }
+
+            DispatchCommand cmd;
+            if (cmd.Read(packet.Body()) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "DispatchCoordinator: DispatchCommand parse failed\n");
+                return;
+            }
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "DispatchCoordinator: Received command type=%d id=%llu round=%u\n",
+                (int)cmd.m_type, (unsigned long long)cmd.m_dispatchId, cmd.m_round);
+
+            // Copy the callback and register the in-flight dispatch in one
+            // critical section; ClearDispatchCallback uses the same mutex, so
+            // it cannot finish between the copy and the increment.
+            DispatchCallback callback;
+            {
+                std::lock_guard<std::mutex> lock(m_activeDispatchMutex);
+                callback = m_dispatchCallback;
+                if (callback) m_activeDispatchCount++;
+            }
+            if (!callback) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "DispatchCoordinator: No callback set, ignoring command\n");
+                return;
+            }
+
+            auto self = this;
+            int localIdx = m_network->GetLocalNodeIndex();
+            std::thread([self, callback, cmd, localIdx]() {
+                DispatchResult result = callback(cmd);
+                result.m_nodeIndex = localIdx;
+                result.m_dispatchId = cmd.m_dispatchId;
+                result.m_round = cmd.m_round;
+
+                if (cmd.m_type != DispatchCommand::Type::Stop &&
+                    cmd.m_type != DispatchCommand::Type::Heartbeat) {
+                    self->SendDispatchResult(result);
+                }
+
+                // Notify while still holding the mutex: once it is released a
+                // waiter in ClearDispatchCallback (possibly the destructor) may
+                // return, after which *self must not be touched again.
+                std::lock_guard<std::mutex> lock(self->m_activeDispatchMutex);
+                self->m_activeDispatchCount--;
+                self->m_activeDispatchCV.notify_all();
+            }).detach();
+        }
+
+        /// Handle an incoming dispatch result from a worker (driver side).
+        void HandleDispatchResult(Socket::ConnectionID connID, Socket::Packet packet) {
+            if (packet.Header().m_bodyLength == 0) return;
+
+            DispatchResult result;
+            if (result.Read(packet.Body()) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "DispatchCoordinator: DispatchResult parse failed\n");
+                return;
+            }
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "DispatchCoordinator: Result id=%llu round=%u node=%d status=%d wallTime=%.3f\n",
+                (unsigned long long)result.m_dispatchId, result.m_round,
+                result.m_nodeIndex, (int)result.m_status, result.m_wallTime);
+
+            std::shared_ptr<PendingDispatch> state;
+            {
+                std::lock_guard<std::mutex> lock(m_dispatchMutex);
+                auto it = m_pendingDispatches.find(result.m_dispatchId);
+                if (it == m_pendingDispatches.end()) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "DispatchCoordinator: Result for unknown dispatch %llu (late/expired)\n",
+                        (unsigned long long)result.m_dispatchId);
+                    return;
+                }
+                state = it->second;
+            }
+
+            if (result.m_status != DispatchResult::Status::Success) {
+                state->errors++;
+            }
+
+            {
+                std::lock_guard<std::mutex> lock(state->mutex);
+                state->wallTimes.push_back(result.m_wallTime);
+                if (result.m_nodeIndex >= 0)
+                    state->pendingNodes.erase(result.m_nodeIndex);
+            }
+
+            if (state->remaining.fetch_sub(1) == 1) {
+                state->done.set_value();
+            }
+        }
+
+    private:
+        /// Bookkeeping for one broadcast that expects results.
+        struct PendingDispatch {
+            std::atomic<int> remaining{0};  // results still outstanding
+            std::atomic<int> errors{0};     // unreachable nodes + non-success results
+            std::promise<void> done;        // fulfilled when remaining reaches 0
+            std::mutex mutex;               // guards wallTimes and pendingNodes
+            std::vector<double> wallTimes;
+            std::set<int> pendingNodes;  // nodes that haven't responded yet
+        };
+
+        PeerNetwork* m_network = nullptr;
+        int m_localWorkerIndex = -1;  // driver's worker node to skip in broadcasts
+        DispatchCallback m_dispatchCallback;  // guarded by m_activeDispatchMutex
+        std::atomic<std::uint64_t> m_nextDispatchId{1};
+        std::mutex m_dispatchMutex;  // guards m_pendingDispatches
+        std::unordered_map<std::uint64_t, std::shared_ptr<PendingDispatch>> m_pendingDispatches;
+
+        std::mutex m_activeDispatchMutex;  // guards m_dispatchCallback and m_activeDispatchCount
+        std::condition_variable m_activeDispatchCV;
+        int m_activeDispatchCount{0};
+    };
+
+} // namespace SPTAG::SPANN
diff --git a/AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h b/AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h
new file mode 100644
index 000000000..00b7bbdb6
--- /dev/null
+++ b/AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h
@@ -0,0 +1,293 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "inc/Core/SPANN/Distributed/NetworkNode.h"
+
+namespace SPTAG::SPANN {
+
+    /// Dispatcher node: manages the consistent hash ring and coordinates
+    /// external dispatch commands (Insert/Search/Stop) to worker nodes.
+    ///
+    /// The dispatcher does NOT perform search or posting operations.
+    /// It is a lightweight coordination point that:
+    ///   - Accepts NodeRegister requests from workers
+    ///   - Maintains the authoritative hash ring and broadcasts updates
+    ///   - Tracks per-worker ACK status with retry
+    ///   - Delegates BroadcastDispatchCommand / WaitForAllResults
+    class DispatcherNode : public NetworkNode {
+    public:
+        using DispatchCallback = DispatchCoordinator::DispatchCallback;
+
+        /// Initialize the dispatcher with separate addresses.
+        /// Builds the full hash ring at startup (workers 1..N).
+        bool Initialize(
+            const std::pair<std::string, std::string>& dispatcherAddr,
+            const std::vector<std::pair<std::string, std::string>>& workerAddrs,
+            int vnodeCount = 150)
+        {
+            const int workerCount = static_cast<int>(workerAddrs.size());
+
+            // Address list layout: entry 0 is the dispatcher, workers follow in the given order.
+            std::vector<std::pair<std::string, std::string>> peers(1, dispatcherAddr);
+            peers.insert(peers.end(), workerAddrs.begin(), workerAddrs.end());
+            if (!InitializeNetwork(0, peers, vnodeCount)) {
+                return false;
+            }
+
+            // [Bug 30] The dispatcher holds no local data shard, hence worker index -1.
+            m_numDispatchNodes = 1;
+            m_numWorkerNodes = workerCount;
+            m_workerNodeIndex = -1;
+
+            // The ring is complete from the start: internal node indices 1..N.
+            auto builtRing = std::make_shared<ConsistentHashRing>(vnodeCount);
+            for (int node = 1; node <= workerCount; ++node) {
+                builtRing->AddNode(node);
+            }
+            std::shared_ptr<const ConsistentHashRing> published(std::move(builtRing));
+            std::atomic_store(&m_hashRing, published);
+            m_currentRingVersion.store(1);
+            m_dispatch.SetNetwork(this);
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "DispatcherNode: initialized with %d workers, ring v1\n", workerCount);
+            return true;
+        }
+
+        bool Start() { return StartNetwork(); }  // thin wrapper: returns whatever StartNetwork() reports
+
+        // ---- Dispatch protocol ----
+
+        /// Mark the driver's local worker node so broadcasts skip it.
+        void SetLocalWorkerIndex(int idx) { m_dispatch.SetLocalWorkerIndex(idx); }
+
+        std::uint64_t BroadcastDispatchCommand(DispatchCommand::Type type, std::uint32_t round) {
+            return m_dispatch.BroadcastDispatchCommand(type, round);  // presumably the new dispatch id — confirm in DispatchCoordinator
+        }
+
+        std::vector<double> WaitForAllResults(std::uint64_t dispatchId, int timeoutSec = 300) {
+            return m_dispatch.WaitForAllResults(dispatchId, timeoutSec);  // forwarded unchanged; timeoutSec defaults to 300
+        }
+
+        void SetDispatchCallback(DispatchCallback cb) {
+            m_dispatch.SetDispatchCallback(std::move(cb));  // cb is taken by value and moved into the coordinator
+        }
+
+        void ClearDispatchCallback() {
+            m_dispatch.ClearDispatchCallback();  // pure delegation to the coordinator
+        }
+
+        // ---- Heartbeat pump ----
+        //
+        // Periodically broadcasts a Heartbeat dispatch to every remote worker.
+        // Workers use the heartbeat to detect driver failure / network
+        // partition and exit cleanly rather than relying on a fixed
+        // wall-clock receiver timeout.
+        //
+        // Idempotent: a repeated Start without StopHeartbeat is a no-op. Start/Stop do not lock
+        // m_heartbeatThread, so serialize them. StopHeartbeat joins the thread; destructor calls it.
+
+        void StartHeartbeat(int intervalSec) {
+            if (intervalSec <= 0) return;               // a non-positive interval disables the pump
+            if (m_heartbeatThread.joinable()) return;   // a pump thread already exists
+            m_heartbeatStop.store(false);
+            m_heartbeatThread = std::thread([this, intervalSec]() {
+                std::uint32_t round = 0;
+                while (!m_heartbeatStop.load()) {
+                    BroadcastDispatchCommand(DispatchCommand::Type::Heartbeat, round++);
+                    // Sleep in 100 ms slices so a stop request is seen quickly; slice count is 64-bit because int `intervalSec * 10` can overflow.
+                    for (long long i = 0, slices = 10LL * intervalSec; i < slices && !m_heartbeatStop.load(); i++) {
+                        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                    }
+                }
+            });
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "DispatcherNode: heartbeat pump started (interval=%ds)\n", intervalSec);
+        }
+
+        void StopHeartbeat() {
+            if (m_heartbeatThread.joinable()) {   // otherwise no pump thread is owned: nothing to do
+                m_heartbeatStop.store(true);      // the pump loop polls this flag
+                m_heartbeatThread.join();
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "DispatcherNode: heartbeat pump stopped\n");
+            }
+        }
+
+        ~DispatcherNode() {
+            StopHeartbeat();  // join the pump thread (it captures `this`) before any member is destroyed
+        }
+
+        // ---- Ring management ----
+
+        bool AllWorkersAcked() const {
+            const std::uint32_t wantVersion = m_currentRingVersion.load();
+            if (wantVersion == 0) return false;   // no ring version published yet
+            std::lock_guard<std::mutex> guard(m_ackMutex);
+            for (int peer = 0, peerCount = static_cast<int>(m_nodeAddrs.size()); peer < peerCount; ++peer) {
+                if (peer == m_localNodeIndex) continue;   // skip this node's own slot
+                const auto acked = m_workerAckedVersion.find(peer);
+                const bool upToDate = acked != m_workerAckedVersion.end() && acked->second >= wantVersion;
+                if (!upToDate) return false;
+            }
+            return true;
+        }
+
+    protected:
+        void RegisterServerHandlers(Socket::PacketHandlerMapPtr& handlers) override {
+            // Four packet types are served. Node registration and ring ACKs are handled by
+            // this class; dispatch commands and dispatch results are passed straight to the
+            // embedded DispatchCoordinator.
+            auto& table = *handlers;
+            table.emplace(Socket::PacketType::NodeRegisterRequest, [this](Socket::ConnectionID conn, Socket::Packet pkt) { HandleNodeRegisterRequest(conn, std::move(pkt)); });
+            table.emplace(Socket::PacketType::RingUpdateACK, [this](Socket::ConnectionID conn, Socket::Packet pkt) { HandleRingUpdateACK(conn, std::move(pkt)); });
+            table.emplace(Socket::PacketType::DispatchCommand, [this](Socket::ConnectionID conn, Socket::Packet pkt) { m_dispatch.HandleDispatchCommand(conn, std::move(pkt)); });
+            table.emplace(Socket::PacketType::DispatchResult, [this](Socket::ConnectionID conn, Socket::Packet pkt) { m_dispatch.HandleDispatchResult(conn, std::move(pkt)); });
+        }
+
+        void RegisterClientHandlers(Socket::PacketHandlerMapPtr& handlers) override {
+            // The client-side handler map only needs DispatchResult; it uses the same coordinator entry point as the server map.
+            handlers->emplace(Socket::PacketType::DispatchResult, [this](Socket::ConnectionID conn, Socket::Packet pkt) { m_dispatch.HandleDispatchResult(conn, std::move(pkt)); });
+        }
+
+        void BgProtocolStep() override {
+            // Ring version 0 means nothing has been published yet, so there is nothing to re-send.
+            if (m_currentRingVersion.load() == 0) return;
+            RetryUnackedRingUpdates();
+        }
+
+        bool IsRingSettled() const override {
+            return AllWorkersAcked();  // settled == every other node has ACKed the current ring version
+        }
+
+    private:
+        void HandleNodeRegisterRequest(Socket::ConnectionID connID, Socket::Packet packet) {  // connID is not used in the body
+            NodeRegisterMsg msg;
+            if (!msg.Read(packet.Body())) {  // unparsable request: log and drop without replying
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "DispatcherNode: Failed to parse NodeRegisterRequest\n");
+                return;
+            }
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "DispatcherNode: NodeRegister from node %d (%s:%s, store=%s)\n",
+                msg.m_nodeIndex, msg.m_host.c_str(), msg.m_port.c_str(), msg.m_store.c_str());
+            // Ring is pre-built at startup, so a registration does not change it; the current ring is
+            // just re-sent. BroadcastRingUpdate() goes to every connected peer, not only the new one.
+            BroadcastRingUpdate();
+        }
+
+        void HandleRingUpdateACK(Socket::ConnectionID connID, Socket::Packet packet) {
+            RingUpdateACKMsg msg;
+            if (!msg.Read(packet.Body())) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "DispatcherNode: Failed to parse RingUpdateACK\n");
+                return;
+            }
+            // Only indices inside m_nodeAddrs are ever consulted; reject others so corrupt or forged ACKs cannot grow the table.
+            if (msg.m_nodeIndex < 0 || msg.m_nodeIndex >= static_cast<int>(m_nodeAddrs.size())) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "DispatcherNode: RingUpdateACK from out-of-range node %d ignored\n", msg.m_nodeIndex);
+                return;
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "DispatcherNode: RingUpdateACK from node %d (v%u)\n", msg.m_nodeIndex, msg.m_ringVersion);
+            std::lock_guard<std::mutex> lock(m_ackMutex);
+            auto& ver = m_workerAckedVersion[msg.m_nodeIndex];
+            if (msg.m_ringVersion > ver) ver = msg.m_ringVersion;   // an ACK never moves the recorded version backwards
+        }
+
+        void BroadcastRingUpdate() {
+            const auto ringSnapshot = std::atomic_load(&m_hashRing);
+            if (!ringSnapshot) return;
+
+            // Describe the ring once; the identical body is written into every peer's packet.
+            const std::uint32_t version = m_currentRingVersion.load();
+            RingUpdateMsg update;
+            update.m_ringVersion = version;
+            update.m_vnodeCount = ringSnapshot->GetVNodeCount();
+            for (int member : ringSnapshot->GetNodes()) {
+                update.m_nodeIndices.push_back(member);
+            }
+            const auto bodyBytes = static_cast<std::uint32_t>(update.EstimateBufferSize());
+
+            const int peerCount = static_cast<int>(m_nodeAddrs.size());
+            for (int peer = 0; peer < peerCount; ++peer) {
+                if (peer == m_localNodeIndex) continue;
+                const auto conn = GetPeerConnection(peer);
+                if (conn == Socket::c_invalidConnectionID) continue;  // no connection to this peer
+
+                Socket::Packet out;
+                out.Header().m_packetType = Socket::PacketType::RingUpdate;
+                out.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+                out.Header().m_connectionID = Socket::c_invalidConnectionID;
+                out.Header().m_resourceID = 0;
+                out.Header().m_bodyLength = bodyBytes;
+                out.AllocateBuffer(bodyBytes);
+                update.Write(out.Body());
+                out.Header().WriteBuffer(out.HeaderBuffer());
+
+                m_client->SendPacket(conn, std::move(out), nullptr);
+            }
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "DispatcherNode: Broadcast RingUpdate v%u (%d nodes)\n",
+                version, (int)update.m_nodeIndices.size());
+        }
+
+        void RetryUnackedRingUpdates() {
+            const auto ringSnapshot = std::atomic_load(&m_hashRing);
+            if (!ringSnapshot) return;
+            const std::uint32_t wantVersion = m_currentRingVersion.load();
+            if (wantVersion == 0) return;
+
+            // Collect the laggards while holding the ACK lock; all sending happens after it is released.
+            std::vector<int> laggards;
+            {
+                std::lock_guard<std::mutex> guard(m_ackMutex);
+                const int peerCount = static_cast<int>(m_nodeAddrs.size());
+                for (int peer = 0; peer < peerCount; ++peer) {
+                    if (peer == m_localNodeIndex) continue;
+                    const auto acked = m_workerAckedVersion.find(peer);
+                    const bool current = acked != m_workerAckedVersion.end() && acked->second >= wantVersion;
+                    if (!current) laggards.push_back(peer);
+                }
+            }
+            if (laggards.empty()) return;
+
+            RingUpdateMsg update;
+            update.m_ringVersion = wantVersion;
+            update.m_vnodeCount = ringSnapshot->GetVNodeCount();
+            for (int member : ringSnapshot->GetNodes()) update.m_nodeIndices.push_back(member);
+            const auto bodyBytes = static_cast<std::uint32_t>(update.EstimateBufferSize());
+
+            for (int target : laggards) {
+                const auto conn = GetPeerConnection(target);
+                if (conn == Socket::c_invalidConnectionID) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                        "DispatcherNode: RetryUnackedRingUpdates skip node %d (no peer conn)\n", target);
+                    continue;
+                }
+
+                Socket::Packet out;
+                out.Header().m_packetType = Socket::PacketType::RingUpdate;
+                out.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+                out.Header().m_connectionID = Socket::c_invalidConnectionID;
+                out.Header().m_resourceID = 0;
+                out.Header().m_bodyLength = bodyBytes;
+                out.AllocateBuffer(bodyBytes);
+                update.Write(out.Body());
+                out.Header().WriteBuffer(out.HeaderBuffer());
+                m_client->SendPacket(conn, std::move(out), nullptr);
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "DispatcherNode: Retried RingUpdate to node %d (connID=%u)\n", target, conn);
+            }
+        }
+
+        DispatchCoordinator m_dispatch;                               // receives all dispatch-protocol calls forwarded by this class
+        std::atomic<std::uint32_t> m_currentRingVersion{0};           // 0 until Initialize() publishes the ring, then 1
+        mutable std::mutex m_ackMutex;                                // guards m_workerAckedVersion
+        std::unordered_map<int, std::uint32_t> m_workerAckedVersion;  // node index -> highest ring version that node ACKed
+
+        std::thread m_heartbeatThread;                                // pump thread created by StartHeartbeat(), joined by StopHeartbeat()
+        std::atomic<bool> m_heartbeatStop{false};                     // set to true by StopHeartbeat() to end the pump loop
+    };
+
+} // namespace SPTAG::SPANN
diff --git a/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h
new file mode 100644
index 000000000..b4da82fcc
--- /dev/null
+++ b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h
@@ -0,0 +1,651 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "inc/Core/Common.h"
+#include "inc/Socket/SimpleSerialization.h"
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <vector>
+
+namespace SPTAG::SPANN {
+
+    /// Serializable request for remote Append operations sent between compute nodes.
+    /// MirrorVersion 1 added m_layer to disambiguate which ExtraDynamicSearcher on
+    /// the receiver side handles the request. Version 0 packets default m_layer=0.
+    struct RemoteAppendRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 1; }
+
+        SizeType m_headID = 0;
+        std::string m_headVec;        // raw head vector bytes
+        std::int32_t m_appendNum = 0;
+        std::string m_appendPosting;  // serialized posting data
+        std::int32_t m_layer = 0;     // originating ExtraDynamicSearcher layer
+
+        /// Number of bytes Write() emits for this request. Each string is counted as a
+        /// 32-bit length prefix plus its payload.
+        std::size_t EstimateBufferSize() const {
+            return sizeof(std::uint16_t) * 2                       // major + mirror version
+                 + sizeof(SizeType)                                // m_headID
+                 + sizeof(std::uint32_t) + m_headVec.size()        // m_headVec
+                 + sizeof(std::int32_t)                            // m_appendNum
+                 + sizeof(std::uint32_t) + m_appendPosting.size()  // m_appendPosting
+                 + sizeof(std::int32_t);                           // m_layer (mirror version >= 1)
+        }
+
+        /// Serializes the two version fields, then the data fields in declaration order,
+        /// starting at p_buffer. Returns the position just past the written data.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* cursor = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            cursor = SimpleWriteBuffer(MirrorVersion(), cursor);
+            cursor = SimpleWriteBuffer(m_headID, cursor);
+            cursor = SimpleWriteBuffer(m_headVec, cursor);
+            cursor = SimpleWriteBuffer(m_appendNum, cursor);
+            cursor = SimpleWriteBuffer(m_appendPosting, cursor);
+            return SimpleWriteBuffer(m_layer, cursor);
+        }
+
+        /// Convenience overload: same as the two-argument Read with nullptr as the end pointer.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            return Read(p_buffer, nullptr);
+        }
+
+        /// Parses a request, handing p_bufEnd to every field read. Yields nullptr when the version
+        /// header cannot be read or the major version differs; else the last field read's result.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t wireMajor = 0, wireMirror = 0;
+            const std::uint8_t* cursor = SafeSimpleReadBuffer(p_buffer, p_bufEnd, wireMajor);
+            cursor = SafeSimpleReadBuffer(cursor, p_bufEnd, wireMirror);
+            if (cursor == nullptr || wireMajor != MajorVersion()) return nullptr;
+            cursor = SafeSimpleReadBuffer(cursor, p_bufEnd, m_headID);
+            cursor = SafeSimpleReadBuffer(cursor, p_bufEnd, m_headVec);
+            cursor = SafeSimpleReadBuffer(cursor, p_bufEnd, m_appendNum);
+            cursor = SafeSimpleReadBuffer(cursor, p_bufEnd, m_appendPosting);
+            if (wireMirror < 1) { m_layer = 0; return cursor; }   // sender predates the layer field
+            return SafeSimpleReadBuffer(cursor, p_bufEnd, m_layer);
+        }
+    };
+
+    /// Response for remote Append operations.
+    struct RemoteAppendResponse {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        enum class Status : std::uint8_t { Success = 0, Failed = 1 };
+        Status m_status = Status::Success;
+
+        /// Wire size: two 16-bit version fields followed by the one-byte status.
+        std::size_t EstimateBufferSize() const { return 2 * sizeof(std::uint16_t) + sizeof(std::uint8_t); }
+
+        /// Writes the versions and then the status at p_buffer; returns the position after them.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* cursor = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            cursor = SimpleWriteBuffer(MirrorVersion(), cursor);
+            return SimpleWriteBuffer(m_status, cursor);
+        }
+
+        /// Reads a response; no buffer end is supplied to the readers. Returns nullptr on a
+        /// major-version mismatch (m_status is not touched), else the position after the status.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t wireMajor = 0, wireMirror = 0;
+            const std::uint8_t* cursor = SimpleReadBuffer(p_buffer, wireMajor);
+            cursor = SimpleReadBuffer(cursor, wireMirror);
+            if (wireMajor != MajorVersion()) return nullptr;
+            return SimpleReadBuffer(cursor, m_status);
+        }
+    };
+
+    /// Identifies a compute node target for routing decisions.
+    struct RouteTarget {
+        int nodeIndex = -1;   // defaults to -1; presumably "no node chosen yet" — confirm against the routing code
+        bool isLocal = true;  // defaults to true; presumably "handled by this node" — confirm against the routing code
+    };
+
+    /// Batch of remote append requests sent to a single node in one round-trip.
+    struct BatchRemoteAppendRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        std::uint32_t m_count = 0;                 // item count parsed by Read(); Write() uses m_items.size()
+        std::vector<RemoteAppendRequest> m_items;
+
+        /// Bytes Write() produces: version pair, 32-bit count, then every item.
+        std::size_t EstimateBufferSize() const {
+            std::size_t size = sizeof(std::uint16_t) * 2;  // version
+            size += sizeof(std::uint32_t);  // count
+            for (const auto& item : m_items) size += item.EstimateBufferSize();
+            return size;
+        }
+
+        /// Serializes the batch at p_buffer and returns the position after the last byte.
+        /// The count on the wire is m_items.size(), not m_count: the loop below always writes
+        /// every item, so a stale m_count would otherwise describe a different number of items.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint32_t>(m_items.size()), p_buffer);
+            for (const auto& item : m_items) p_buffer = item.Write(p_buffer);
+            return p_buffer;
+        }
+
+        /// Parses a batch. With bodyLength > 0 an end pointer (p_buffer + bodyLength) is handed to
+        /// every read; with bodyLength == 0 no end pointer is supplied. Returns the position after
+        /// the last item, or nullptr on a version mismatch, a failed read, or an implausible
+        /// count. On failure m_items is emptied and m_count reset so no partial batch is visible.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer, std::uint32_t bodyLength = 0) {
+            using namespace Socket::SimpleSerialization;
+            const std::uint8_t* bufEnd = (bodyLength > 0) ? (p_buffer + bodyLength) : nullptr;
+
+            // Common failure path: leave the object empty and report nullptr.
+            auto fail = [this]() -> const std::uint8_t* {
+                m_items.clear();
+                m_count = 0;
+                return nullptr;
+            };
+
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, majorVer);
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, mirrorVer);
+            if (p_buffer == nullptr || majorVer != MajorVersion()) return fail();
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, m_count);
+            if (p_buffer == nullptr) return fail();
+            // Reject obviously corrupt counts before allocating: each serialized item is
+            // larger than 8 bytes, so a valid body cannot hold more than bodyLength / 8 of them.
+            if (bodyLength > 0 && m_count > bodyLength / 8) return fail();
+
+            m_items.resize(m_count);
+            for (std::uint32_t i = 0; i < m_count; i++) {
+                if (bufEnd && p_buffer >= bufEnd) return fail();  // body exhausted before item i
+                p_buffer = m_items[i].Read(p_buffer, bufEnd);
+                if (!p_buffer) return fail();
+                if (bufEnd && p_buffer > bufEnd) return fail();   // item ran past the declared body
+            }
+            return p_buffer;
+        }
+    };
+
+    /// Response for batch remote append.
+    struct BatchRemoteAppendResponse {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        std::uint32_t m_successCount = 0;
+        std::uint32_t m_failCount = 0;
+
+        /// Wire size: two 16-bit version fields followed by the two 32-bit counters.
+        std::size_t EstimateBufferSize() const { return 2 * sizeof(std::uint16_t) + 2 * sizeof(std::uint32_t); }
+
+        /// Writes versions, success count, fail count at p_buffer; returns the position after them.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* cursor = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            cursor = SimpleWriteBuffer(MirrorVersion(), cursor);
+            cursor = SimpleWriteBuffer(m_successCount, cursor);
+            return SimpleWriteBuffer(m_failCount, cursor);
+        }
+
+        /// Reads a response; no buffer end is supplied to the readers. Returns nullptr on a
+        /// major-version mismatch (counters untouched), else the position after the fail count.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t wireMajor = 0, wireMirror = 0;
+            const std::uint8_t* cursor = SimpleReadBuffer(p_buffer, wireMajor);
+            cursor = SimpleReadBuffer(cursor, wireMirror);
+            if (wireMajor != MajorVersion()) return nullptr;
+            cursor = SimpleReadBuffer(cursor, m_successCount);
+            return SimpleReadBuffer(cursor, m_failCount);
+        }
+    };
+
+    /// Cross-node merge hint. Search-side trigger on node X observed that
+    /// posting `m_headID` (owned by the target node based on consistent-hash
+    /// ownership) is below the merge threshold. The receiver enqueues a
+    /// local MergeAsync; the local MergePostings logic decides whether the
+    /// posting really needs merging at execution time. Fire-and-forget: no
+    /// response packet, no retry queue. Multiple notifications for the same
+    /// head are dedup'd by m_mergeList on the receiver.
+    struct RemoteMergeRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        SizeType m_headID = 0;
+        std::int32_t m_layer = 0;
+
+        /// Wire size: two 16-bit version fields, the head id, and the 32-bit layer.
+        std::size_t EstimateBufferSize() const { return 2 * sizeof(std::uint16_t) + sizeof(SizeType) + sizeof(std::int32_t); }
+
+        /// Writes versions, head id, layer at p_buffer; returns the position after them.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* cursor = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            cursor = SimpleWriteBuffer(MirrorVersion(), cursor);
+            cursor = SimpleWriteBuffer(m_headID, cursor);
+            return SimpleWriteBuffer(m_layer, cursor);
+        }
+
+        /// Parses a hint, handing p_bufEnd to every read. Returns nullptr when the version header
+        /// cannot be read or the major version differs; otherwise the result of the layer read.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t wireMajor = 0, wireMirror = 0;
+            const std::uint8_t* cursor = SafeSimpleReadBuffer(p_buffer, p_bufEnd, wireMajor);
+            cursor = SafeSimpleReadBuffer(cursor, p_bufEnd, wireMirror);
+            if (cursor == nullptr || wireMajor != MajorVersion()) return nullptr;
+            cursor = SafeSimpleReadBuffer(cursor, p_bufEnd, m_headID);
+            return SafeSimpleReadBuffer(cursor, p_bufEnd, m_layer);
+        }
+    };
+
+    /// Batch of cross-node merge hints sent to a single owner node in one
+    /// fire-and-forget packet. Sender-side dedups by (layer, headID) so
+    /// each entry appears at most once per flush window.
+    struct BatchRemoteMergeRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        std::uint32_t m_count = 0;                // item count parsed by Read(); Write() uses m_items.size()
+        std::vector<RemoteMergeRequest> m_items;
+
+        std::size_t EstimateBufferSize() const {
+            std::size_t size = sizeof(std::uint16_t) * 2;  // version pair
+            size += sizeof(std::uint32_t);                 // count
+            for (const auto& item : m_items) size += item.EstimateBufferSize();
+            return size;
+        }
+
+        /// Serializes the batch at p_buffer; returns the position after the last byte. The wire
+        /// count is m_items.size() so it always matches the items the loop actually writes.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint32_t>(m_items.size()), p_buffer);
+            for (const auto& item : m_items) p_buffer = item.Write(p_buffer);
+            return p_buffer;
+        }
+
+        /// Parses a batch; bodyLength > 0 supplies an end pointer to every read, 0 supplies none.
+        /// Returns nullptr (with m_items emptied and m_count reset) on any failure.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer, std::uint32_t bodyLength = 0) {
+            using namespace Socket::SimpleSerialization;
+            const std::uint8_t* bufEnd = (bodyLength > 0) ? (p_buffer + bodyLength) : nullptr;
+            auto fail = [this]() -> const std::uint8_t* { m_items.clear(); m_count = 0; return nullptr; };
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, majorVer);
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, mirrorVer);
+            if (p_buffer == nullptr || majorVer != MajorVersion()) return fail();
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, m_count);
+            if (p_buffer == nullptr) return fail();
+            // Each serialized item is larger than 8 bytes, so more than bodyLength / 8 items is corrupt.
+            if (bodyLength > 0 && m_count > bodyLength / 8) return fail();
+            m_items.resize(m_count);
+            for (std::uint32_t i = 0; i < m_count; i++) {
+                if (bufEnd && p_buffer >= bufEnd) return fail();  // body exhausted before item i
+                p_buffer = m_items[i].Read(p_buffer, bufEnd);
+                if (!p_buffer) return fail();
+                if (bufEnd && p_buffer > bufEnd) return fail();   // item ran past the declared body
+            }
+            return p_buffer;
+        }
+    };
+
+    /// Entry in a head sync broadcast: one add or delete of a head node.
+    /// `m_layer` identifies the originating ExtraDynamicSearcher so the
+    /// receiver applies the entry to the matching layer's head index
+    /// (with multi-layer SPANN, layer 0 and layer 1 both broadcast head
+    /// add/delete; without the layer field every entry would be misrouted
+    /// to a single shared callback).
+    struct HeadSyncEntry {
+        enum class Op : std::uint8_t { Add = 0, Delete = 1 };
+        Op op = Op::Add;              // defaulted so a default-constructed entry never serializes an indeterminate value
+        SizeType headVID = 0;         // defaulted for the same reason
+        std::string headVector;       // only for Add; empty for Delete
+        std::int32_t m_layer = 0;     // originating ExtraDynamicSearcher layer
+
+        /// Bytes Write() emits: op, headVID, 32-bit vector length, vector bytes, layer.
+        size_t EstimateBufferSize() const {
+            return sizeof(std::uint8_t)                          // op
+                 + sizeof(SizeType)                              // headVID
+                 + sizeof(std::uint32_t) + headVector.size()     // headVector length prefix + bytes
+                 + sizeof(std::int32_t);                         // layer
+        }
+
+        /// Serializes the entry at p_buffer and returns the position after the last byte written.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint8_t>(op), p_buffer);
+            p_buffer = SimpleWriteBuffer(headVID, p_buffer);
+            std::uint32_t vecLen = static_cast<std::uint32_t>(headVector.size());
+            p_buffer = SimpleWriteBuffer(vecLen, p_buffer);
+            if (vecLen > 0) {
+                memcpy(p_buffer, headVector.data(), vecLen);
+                p_buffer += vecLen;
+            }
+            return SimpleWriteBuffer(m_layer, p_buffer);
+        }
+
+        /// Parses one entry and returns the position after it. No buffer end is supplied here.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t rawOp = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, rawOp);
+            op = static_cast<Op>(rawOp);   // NOTE(review): rawOp is not checked against Add/Delete
+            p_buffer = SimpleReadBuffer(p_buffer, headVID);
+            std::uint32_t vecLen = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, vecLen);
+            if (vecLen > 0) {
+                headVector.assign(reinterpret_cast<const char*>(p_buffer), vecLen);  // NOTE(review): vecLen comes off the wire unchecked — caller must bound the buffer
+                p_buffer += vecLen;
+            } else {
+                headVector.clear();
+            }
+            return SimpleReadBuffer(p_buffer, m_layer);
+        }
+    };
+
+    /// Dispatch command from driver to workers (replaces file-based barriers).
+    struct DispatchCommand {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        enum class Type : std::uint8_t { Search = 0, Insert = 1, Stop = 2, Heartbeat = 3 };
+        Type m_type = Type::Search;
+        std::uint64_t m_dispatchId = 0;   // unique ID from driver
+        std::uint32_t m_round = 0;        // search round or insert batch index
+
+        /// Wire size: version pair, one-byte type, 64-bit dispatch id, 32-bit round.
+        std::size_t EstimateBufferSize() const {
+            return 2 * sizeof(std::uint16_t) + sizeof(std::uint8_t) + sizeof(std::uint64_t) + sizeof(std::uint32_t);
+        }
+
+        /// Writes versions, type (as a byte), dispatch id, round; returns the position after them.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* cursor = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            cursor = SimpleWriteBuffer(MirrorVersion(), cursor);
+            cursor = SimpleWriteBuffer(static_cast<std::uint8_t>(m_type), cursor);
+            cursor = SimpleWriteBuffer(m_dispatchId, cursor);
+            return SimpleWriteBuffer(m_round, cursor);
+        }
+
+        /// Reads a command (no buffer end supplied); nullptr on a major-version mismatch.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t wireMajor = 0, wireMirror = 0;
+            const std::uint8_t* cursor = SimpleReadBuffer(p_buffer, wireMajor);
+            cursor = SimpleReadBuffer(cursor, wireMirror);
+            if (wireMajor != MajorVersion()) return nullptr;
+            std::uint8_t typeByte = 0;
+            cursor = SimpleReadBuffer(cursor, typeByte);
+            m_type = static_cast<Type>(typeByte);
+            cursor = SimpleReadBuffer(cursor, m_dispatchId);
+            return SimpleReadBuffer(cursor, m_round);
+        }
+    };
+
+    /// Result from worker back to driver after executing a dispatch command.
+    struct DispatchResult {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 1; }
+
+        enum class Status : std::uint8_t { Success = 0, Failed = 1 };
+        Status m_status = Status::Success;
+        std::uint64_t m_dispatchId = 0;   // id of the DispatchCommand this result answers
+        std::uint32_t m_round = 0;
+        double m_wallTime = 0.0;
+        std::int32_t m_nodeIndex = -1;  // which worker sent this result
+
+        std::size_t EstimateBufferSize() const {   // bytes Write() emits, field by field
+            return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t)
+                 + sizeof(std::uint64_t) + sizeof(std::uint32_t) + sizeof(double)
+                 + sizeof(std::int32_t);
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {   // returns the position after the last byte
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint8_t>(m_status), p_buffer);
+            p_buffer = SimpleWriteBuffer(m_dispatchId, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_round, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_wallTime, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_nodeIndex, p_buffer);
+            return p_buffer;
+        }
+
+        /// Reads a result (no buffer end supplied); returns nullptr on a major-version mismatch.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);
+            if (majorVer != MajorVersion()) return nullptr;
+            std::uint8_t rawStatus = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, rawStatus);
+            m_status = static_cast<Status>(rawStatus);
+            p_buffer = SimpleReadBuffer(p_buffer, m_dispatchId);
+            p_buffer = SimpleReadBuffer(p_buffer, m_round);
+            p_buffer = SimpleReadBuffer(p_buffer, m_wallTime);
+            m_nodeIndex = -1;   // mirror-v0 senders omit the index; never keep one from an earlier Read
+            if (mirrorVer >= 1) p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndex);
+            return p_buffer;
+        }
+    };
+
+    /// Request to lock/unlock a headID on its owner node (for cross-node Merge).
+    /// MirrorVersion 1 added m_layer so multi-layer setups dispatch to the
+    /// correct lock pool (each ExtraDynamicSearcher owns its own bucket flags).
+    struct RemoteLockRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }  // Read() rejects any other major version
+        static constexpr std::uint16_t MirrorVersion() { return 1; }  // >= 1 means m_layer is on the wire
+
+        enum class Op : std::uint8_t { Lock = 0, Unlock = 1 };
+        Op m_op = Op::Lock;
+        SizeType m_headID = 0;
+        std::int32_t m_layer = 0;
+
+        std::size_t EstimateBufferSize() const {  // one term per field emitted by Write()
+            return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t)
+                 + sizeof(SizeType) + sizeof(std::int32_t);
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {  // returns the cursor just past the last field
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint8_t>(m_op), p_buffer);  // enum travels as one byte
+            p_buffer = SimpleWriteBuffer(m_headID, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_layer, p_buffer);
+            return p_buffer;
+        }
+
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {  // returns the advanced cursor, or nullptr on a major-version mismatch
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);
+            if (majorVer != MajorVersion()) return nullptr;
+            std::uint8_t rawOp = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, rawOp);
+            m_op = static_cast<Op>(rawOp);  // NOTE(review): value is not range-checked against the enum
+            p_buffer = SimpleReadBuffer(p_buffer, m_headID);
+            if (mirrorVer >= 1) {  // the layer field only exists from mirror version 1 on
+                p_buffer = SimpleReadBuffer(p_buffer, m_layer);
+            } else {
+                m_layer = 0;  // older sender: fall back to layer 0
+            }
+            return p_buffer;
+        }
+    };
+
+    /// Reply to a remote lock operation: carries a single Granted/Denied status.
+    struct RemoteLockResponse {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        enum class Status : std::uint8_t { Granted = 0, Denied = 1 };
+        Status m_status = Status::Granted;
+
+        std::size_t EstimateBufferSize() const {
+            return 2 * sizeof(std::uint16_t) + sizeof(std::uint8_t);  // two version words + status byte
+        }
+
+        /// Writes major version, mirror version, status byte; returns the cursor past the last byte.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* cursor = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            cursor = SimpleWriteBuffer(MirrorVersion(), cursor);
+            return SimpleWriteBuffer(static_cast<std::uint8_t>(m_status), cursor);
+        }
+
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {  // inverse of Write(); nullptr on a major-version mismatch
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t wireMajor = 0, wireMirror = 0;
+            const std::uint8_t* cursor = SimpleReadBuffer(p_buffer, wireMajor);
+            cursor = SimpleReadBuffer(cursor, wireMirror);
+            if (wireMajor != MajorVersion()) return nullptr;
+            std::uint8_t rawStatus = 0;
+            cursor = SimpleReadBuffer(cursor, rawStatus);
+            m_status = static_cast<Status>(rawStatus);
+            return cursor;
+        }
+    };
+
+    /// Worker → dispatcher registration message.
+    struct NodeRegisterMsg {
+        static constexpr std::uint16_t MajorVersion() { return 1; }  // Read() rejects any other major version
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        std::int32_t m_nodeIndex = 0;
+        std::string m_host;
+        std::string m_port;
+        std::string m_store;
+
+        std::size_t EstimateBufferSize() const {  // assumes each string is encoded as a 32-bit length plus its bytes — TODO confirm against SimpleWriteBuffer
+            std::size_t size = 0;
+            size += sizeof(std::uint16_t) * 2;  // major + mirror version
+            size += sizeof(std::int32_t);  // m_nodeIndex
+            size += sizeof(std::uint32_t) + m_host.size();
+            size += sizeof(std::uint32_t) + m_port.size();
+            size += sizeof(std::uint32_t) + m_store.size();
+            return size;
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {  // returns the cursor just past the last field
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(m_nodeIndex, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_host, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_port, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_store, p_buffer);
+            return p_buffer;
+        }
+
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {  // returns the advanced cursor, or nullptr on a major-version mismatch
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);  // consumed to advance the cursor; not used otherwise here
+            if (majorVer != MajorVersion()) return nullptr;
+            p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndex);
+            p_buffer = SimpleReadBuffer(p_buffer, m_host);
+            p_buffer = SimpleReadBuffer(p_buffer, m_port);
+            p_buffer = SimpleReadBuffer(p_buffer, m_store);
+            return p_buffer;
+        }
+    };
+
+    /// Dispatcher → worker ring update (full node list, versioned).
+    struct RingUpdateMsg {
+        static constexpr std::uint16_t MajorVersion() { return 1; }  // Read() rejects any other major version
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        std::uint32_t m_ringVersion = 0;
+        std::int32_t m_vnodeCount = 150;
+        std::vector<std::int32_t> m_nodeIndices;
+
+        std::size_t EstimateBufferSize() const {  // fixed header plus one int32 per node index
+            std::size_t size = 0;
+            size += sizeof(std::uint16_t) * 2;
+            size += sizeof(std::uint32_t);      // ringVersion
+            size += sizeof(std::int32_t);       // vnodeCount
+            size += sizeof(std::uint32_t);      // numNodes
+            size += sizeof(std::int32_t) * m_nodeIndices.size();
+            return size;
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {  // returns the cursor just past the last field
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(m_ringVersion, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_vnodeCount, p_buffer);
+            std::uint32_t count = static_cast<std::uint32_t>(m_nodeIndices.size());  // element count is narrowed to 32 bits on the wire
+            p_buffer = SimpleWriteBuffer(count, p_buffer);
+            for (auto idx : m_nodeIndices) {
+                p_buffer = SimpleWriteBuffer(idx, p_buffer);
+            }
+            return p_buffer;
+        }
+
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {  // returns the advanced cursor, or nullptr on a major-version mismatch
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);  // consumed to advance the cursor; not used otherwise here
+            if (majorVer != MajorVersion()) return nullptr;
+            p_buffer = SimpleReadBuffer(p_buffer, m_ringVersion);
+            p_buffer = SimpleReadBuffer(p_buffer, m_vnodeCount);
+            std::uint32_t count = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, count);
+            m_nodeIndices.resize(count);  // NOTE(review): count comes straight from the wire and no buffer length is known here — confirm callers validate packet size
+            for (std::uint32_t i = 0; i < count; i++) {
+                p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndices[i]);
+            }
+            return p_buffer;
+        }
+    };
+
+    /// Worker → dispatcher ACK for a ring update.
+    struct RingUpdateACKMsg {
+        static constexpr std::uint16_t MajorVersion() { return 1; }  // Read() rejects any other major version
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        std::int32_t m_nodeIndex = -1;
+        std::uint32_t m_ringVersion = 0;
+
+        std::size_t EstimateBufferSize() const {  // one term per field emitted by Write()
+            return sizeof(std::uint16_t) * 2 + sizeof(std::int32_t) + sizeof(std::uint32_t);
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {  // returns the cursor just past the last field
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(m_nodeIndex, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_ringVersion, p_buffer);
+            return p_buffer;
+        }
+
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {  // returns the advanced cursor, or nullptr on a major-version mismatch
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);  // consumed to advance the cursor; not used otherwise here
+            if (majorVer != MajorVersion()) return nullptr;
+            p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndex);
+            p_buffer = SimpleReadBuffer(p_buffer, m_ringVersion);
+            return p_buffer;
+        }
+    };
+
+} // namespace SPTAG::SPANN
diff --git a/AnnService/inc/Core/SPANN/Distributed/NetworkNode.h b/AnnService/inc/Core/SPANN/Distributed/NetworkNode.h
new file mode 100644
index 000000000..4e11a4b08
--- /dev/null
+++ b/AnnService/inc/Core/SPANN/Distributed/NetworkNode.h
@@ -0,0 +1,319 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#ifndef _SPTAG_SPANN_NETWORKNODE_H_
+#define _SPTAG_SPANN_NETWORKNODE_H_
+
+#include "inc/Core/SPANN/Distributed/DistributedProtocol.h"
+#include "inc/Core/SPANN/Distributed/ConsistentHashRing.h"
+#include "inc/Core/SPANN/Distributed/DispatchCoordinator.h"
+#include "inc/Core/SPANN/Distributed/RemotePostingOps.h"
+#include "inc/Socket/Client.h"
+#include "inc/Socket/Server.h"
+#include "inc/Socket/Packet.h"
+#include <string>
+#include <mutex>
+#include <condition_variable>
+#include <memory>
+#include <vector>
+#include <atomic>
+#include <thread>
+
+namespace SPTAG::SPANN {
+
+    /// Base class providing shared networking infrastructure for all
+    /// distributed node roles. Manages server/client sockets, peer
+    /// connections, consistent hash ring storage, and a background
+    /// connection maintenance thread.
+    ///
+    /// Subclasses implement RegisterServerHandlers() / RegisterClientHandlers()
+    /// to wire up their packet handlers, and may override BgProtocolStep() /
+    /// IsRingSettled() for role-specific background work.
+    class NetworkNode : public DispatchCoordinator::PeerNetwork,
+                        public RemotePostingOps::NetworkAccess {
+    public:
+        NetworkNode()  // starts disabled with local slot -1; InitializeNetwork() sets both on success
+            : m_enabled(false), m_localNodeIndex(-1) {}
+
+        virtual ~NetworkNode() {  // NOTE(review): the bg thread calls the virtual BgProtocolStep()/IsRingSettled(); derived parts are already destroyed when this runs
+            m_bgConnectStop.store(true);  // ask the background connect loop to exit
+            if (m_bgConnectThread.joinable()) m_bgConnectThread.join();  // and wait until it has
+        }
+
+        /// Initialize shared networking state.
+        bool InitializeNetwork(
+            int localNodeIdx,
+            const std::vector<std::pair<std::string, std::string>>& nodeAddrs,
+            int vnodeCount = 150)
+        {
+            // Validate first: the address table must be non-empty and contain the local slot.
+            const int tableSize = static_cast<int>(nodeAddrs.size());
+            const bool localSlotInTable = localNodeIdx >= 0 && localNodeIdx < tableSize;
+            if (tableSize == 0 || !localSlotInTable) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "NetworkNode::Initialize invalid config: %d nodes, localIdx=%d\n",
+                    tableSize, localNodeIdx);
+                return false;
+            }
+
+            m_localNodeIndex = localNodeIdx;
+            m_nodeAddrs = nodeAddrs;
+            m_vnodeCount = vnodeCount;
+            // Publish a fresh ring built with the requested vnode count; no nodes are added here.
+            std::shared_ptr<const ConsistentHashRing> emptyRing =
+                std::make_shared<ConsistentHashRing>(vnodeCount);
+            std::atomic_store(&m_hashRing, std::move(emptyRing));
+            m_enabled = true;  // flipped last, once the state above is in place
+            return true;
+        }
+
+        /// Start server + client + background connection thread.
+        /// Subclasses must have called InitializeNetwork() first.
+        /// Each node listens on its own address from the combined address list.
+        bool StartNetwork() {
+            if (!m_enabled) return false;
+
+            // A second call would replace a live server/client and assign over a
+            // joinable std::thread (which calls std::terminate). The network is
+            // already up, so report success without touching anything.
+            if (m_server || m_bgConnectThread.joinable()) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "NetworkNode::StartNetwork called more than once; ignoring\n");
+                return true;
+            }
+
+            // Size the connection table BEFORE any handler can run: inbound handlers
+            // (e.g. HandleRingUpdate -> SendRingUpdateACK) index m_peerConnections
+            // through GetPeerConnection as soon as the server accepts a peer.
+            m_peerConnections.resize(m_nodeAddrs.size(), Socket::c_invalidConnectionID);
+
+            // --- Client side ---
+            // Built BEFORE the server for the same reason: server handlers may call
+            // ConnectToPeer -> m_client->ConnectToServer right after the listening
+            // socket accepts a peer, and a null m_client would crash there.
+            Socket::PacketHandlerMapPtr clientHandlers(new Socket::PacketHandlerMap);
+            RegisterClientHandlers(clientHandlers);
+            m_client.reset(new Socket::Client(clientHandlers, 8, 30));
+
+            // --- Server side ---
+            {
+                Socket::PacketHandlerMapPtr serverHandlers(new Socket::PacketHandlerMap);
+                RegisterServerHandlers(serverHandlers);
+
+                const auto& localAddr = m_nodeAddrs[m_localNodeIndex];
+                m_server.reset(new Socket::Server(
+                    localAddr.first, localAddr.second, serverHandlers, 8));
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "NetworkNode server listening on %s:%s\n",
+                    localAddr.first.c_str(), localAddr.second.c_str());
+            }
+
+            // --- Background thread ---
+            // Retries missing peers (sleep grows by 500 ms per pass, capped at 5 s) and
+            // exits once every peer is connected and IsRingSettled() reports true.
+            m_bgConnectStop.store(false);
+            m_bgConnectThread = std::thread([this]() {
+                int numNodes = static_cast<int>(m_nodeAddrs.size());
+                int delayMs = 500;
+                while (!m_bgConnectStop.load()) {
+                    bool allConnected = true;
+                    for (int i = 0; i < numNodes; i++) {
+                        if (i == m_localNodeIndex) continue;
+                        {
+                            std::lock_guard<std::mutex> lock(m_connMutex);
+                            if (m_peerConnections[i] != Socket::c_invalidConnectionID)
+                                continue;
+                        }
+                        allConnected = false;
+                        ConnectToPeer(i, 1, 0);
+                    }
+
+                    BgProtocolStep();
+
+                    if (allConnected && IsRingSettled()) {
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                            "NetworkNode: All peers connected and ring synchronized\n");
+                        break;
+                    }
+                    std::this_thread::sleep_for(std::chrono::milliseconds(delayMs));
+                    delayMs = std::min(delayMs + 500, 5000);
+                }
+            });
+
+            return true;
+        }
+
+        // ---- PeerNetwork + NetworkAccess interface ----
+        //
+        // GetLocalNodeIndex() / GetNumNodes() use NETWORK-SLOT semantics:
+        // m_nodeAddrs is the flat address table indexed by internal slot
+        // (slot 0 = dispatcher, slots 1..N = workers). These are the
+        // values used for raw socket connections and dispatch routing.
+        //
+        // For COMPUTE-WORKER semantics (VID interleaving, version-map
+        // sizing, hash-ring partitioning), use GetNumWorkerNodes() /
+        // GetWorkerNodeIndex() instead — those exclude the dispatcher
+        // and use 0-indexed worker shard numbering. Mixing the two
+        // produces off-by-one shard math
+        // (AllocateGlobalVID maps to the wrong globalVID range).
+
+        int GetLocalNodeIndex() const override { return m_localNodeIndex; }  // network slot of this node; the constructor initialises it to -1
+
+        int GetNumNodes() const override {  // number of entries in the flat address table
+            return static_cast<int>(m_nodeAddrs.size());
+        }
+
+        // ---- Compute-role accessors ----
+        //
+        // These describe the LOGICAL cluster composition independent of
+        // the network slot layout. Subclasses populate the m_num*Nodes /
+        // m_workerNodeIndex fields during Initialize().
+        //
+        // Use these (NOT GetNumNodes / GetLocalNodeIndex) for:
+        //  * AllocateGlobalVID interleaving math
+        //  * Version-map cross-node bound sizing
+        //  * AddIDCapacity growth multiplier
+        //  * Any "how many shards are storing user data?" question
+
+        int GetNumWorkerNodes() const { return m_numWorkerNodes; }  // 0 until a subclass fills it in
+        int GetNumDispatchNodes() const { return m_numDispatchNodes; }  // 0 until a subclass fills it in
+
+        /// 0-indexed compute-shard position for this node, or -1 if this
+        /// node is dispatcher-only (has no local data shard).
+        int GetWorkerNodeIndex() const { return m_workerNodeIndex; }  // member default is -1
+
+        Socket::ConnectionID GetPeerConnection(int nodeIndex) override {
+            {   // Cached ID if present. Slots outside the table (e.g. a -1 "unknown node") fail instead of indexing out of bounds.
+                std::lock_guard<std::mutex> lock(m_connMutex);
+                if (nodeIndex < 0 || nodeIndex >= static_cast<int>(m_peerConnections.size()))
+                    return Socket::c_invalidConnectionID;
+                if (m_peerConnections[nodeIndex] != Socket::c_invalidConnectionID)
+                    return m_peerConnections[nodeIndex];
+            }
+            if (!ConnectToPeer(nodeIndex, 5, 1000)) return Socket::c_invalidConnectionID;  // dial on demand with the lock released
+            std::lock_guard<std::mutex> lock(m_connMutex);
+            return m_peerConnections[nodeIndex];
+        }
+
+        void SendPacket(Socket::ConnectionID connID, Socket::Packet&& pkt,
+                        std::function<void(bool)> callback) override {
+            m_client->SendPacket(connID, std::move(pkt), std::move(callback));  // forwarded as-is; m_client is created in StartNetwork(), so this must not run before it
+        }
+
+        void InvalidatePeerConnection(int nodeIndex) override {  // marks the slot as disconnected; indices outside the table are ignored
+            std::lock_guard<std::mutex> lock(m_connMutex);
+            if (nodeIndex >= 0 && nodeIndex < static_cast<int>(m_peerConnections.size())) m_peerConnections[nodeIndex] = Socket::c_invalidConnectionID;
+        }
+
+        Socket::Client* GetClient() override { return m_client.get(); }  // non-owning; null until StartNetwork() creates the client
+        Socket::Server* GetServer() override { return m_server.get(); }  // non-owning; null until StartNetwork() creates the server
+
+        // ---- Shared accessors ----
+
+        bool IsEnabled() const { return m_enabled; }  // true once InitializeNetwork() has succeeded
+
+        std::shared_ptr<const ConsistentHashRing> GetHashRing() const {  // atomic snapshot of the current ring pointer
+            return std::atomic_load(&m_hashRing);
+        }
+
+        void SetHashRing(std::shared_ptr<const ConsistentHashRing> ring) {  // NOTE(review): does not take m_ringWriteMutex — presumably copy-on-write callers hold it; confirm
+            std::atomic_store(&m_hashRing, std::move(ring));
+        }
+
+        bool WaitForAllPeersConnected(int timeoutSec = 120) {
+            if (!m_enabled) return true;  // networking disabled: nothing to wait for
+            const int numNodes = static_cast<int>(m_nodeAddrs.size());
+            const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeoutSec);
+            // Poll every 500 ms until each non-local slot holds a valid connection ID.
+            while (std::chrono::steady_clock::now() < deadline) {
+                bool allConnected = true;
+                for (int i = 0; i < numNodes && allConnected; i++) {
+                    if (i == m_localNodeIndex) continue;
+                    std::lock_guard<std::mutex> lock(m_connMutex);
+                    // Before StartNetwork() the table is empty: a missing slot counts as "not connected" (no out-of-bounds read).
+                    allConnected = i < static_cast<int>(m_peerConnections.size()) &&
+                                   m_peerConnections[i] != Socket::c_invalidConnectionID;
+                }
+                if (allConnected) return true;
+                std::this_thread::sleep_for(std::chrono::milliseconds(500));
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                "NetworkNode: Timed out waiting for peer connections (%ds)\n", timeoutSec);
+            return false;
+        }
+
+        bool ConnectToPeer(int nodeIndex, int maxRetries = 10, int initialDelayMs = 500) {
+            if (nodeIndex == m_localNodeIndex) return true;  // never dial ourselves
+            std::pair<std::string, std::string> addr;
+            {   // Fail cleanly for slots outside either table, or when StartNetwork() has not created the client yet.
+                std::lock_guard<std::mutex> lock(m_connMutex);
+                if (!m_client || nodeIndex < 0 || nodeIndex >= static_cast<int>(m_nodeAddrs.size()) ||
+                    nodeIndex >= static_cast<int>(m_peerConnections.size())) return false;
+                addr = m_nodeAddrs[nodeIndex];
+            }
+            int delayMs = initialDelayMs;  // doubles after each failed attempt, capped at 5 s
+            for (int attempt = 1; attempt <= maxRetries; attempt++) {
+                ErrorCode ec;
+                auto connID = m_client->ConnectToServer(addr.first, addr.second, ec);
+                if (ec == ErrorCode::Success) {
+                    std::lock_guard<std::mutex> lock(m_connMutex);
+                    m_peerConnections[nodeIndex] = connID;
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                        "NetworkNode[local=%d]: Connected to node %d (%s:%s), connID=%u (attempt %d)\n",
+                        m_localNodeIndex, nodeIndex, addr.first.c_str(), addr.second.c_str(), connID, attempt);
+                    return true;
+                }
+                if (attempt < maxRetries) {  // no sleep after the final attempt
+                    std::this_thread::sleep_for(std::chrono::milliseconds(delayMs));
+                    delayMs = std::min(delayMs * 2, 5000);
+                }
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                "NetworkNode: All %d connection attempts to node %d failed\n", maxRetries, nodeIndex);
+            return false;
+        }
+
+    protected:
+        /// Subclasses register their packet handlers here.
+        virtual void RegisterServerHandlers(Socket::PacketHandlerMapPtr& handlers) = 0;
+        virtual void RegisterClientHandlers(Socket::PacketHandlerMapPtr& handlers) = 0;
+
+        /// Role-specific protocol work; invoked on the background connect thread once per loop iteration.
+        virtual void BgProtocolStep() {}
+
+        /// Return true when ring is fully synchronized for this node's role; the background loop exits once this holds and all peers are connected.
+        virtual bool IsRingSettled() const { return true; }
+
+        bool m_enabled;
+        int m_localNodeIndex;
+        int m_vnodeCount = 150;
+
+        // Compute-role accounting. Set by subclass Initialize().
+        // m_workerNodeIndex == -1 means this node has no local data shard
+        // (dispatcher-only role). See GetNumWorkerNodes() / GetWorkerNodeIndex()
+        // for the rationale on why these are separate from m_nodeAddrs.size().
+        int m_numWorkerNodes = 0;
+        int m_numDispatchNodes = 0;
+        int m_workerNodeIndex = -1;
+
+        // Consistent hash ring (lock-free RCU: atomic_load to read, copy-on-write to modify)
+        std::shared_ptr<const ConsistentHashRing> m_hashRing;
+        std::mutex m_ringWriteMutex;
+
+        // Node addresses
+        std::vector<std::pair<std::string, std::string>> m_nodeAddrs;
+
+        // Networking
+        std::unique_ptr<Socket::Server> m_server;
+        std::unique_ptr<Socket::Client> m_client;
+        std::mutex m_connMutex;
+        std::vector<Socket::ConnectionID> m_peerConnections;
+
+        // Background thread
+        std::thread m_bgConnectThread;
+        std::atomic<bool> m_bgConnectStop{false};
+    };
+
+} // namespace SPTAG::SPANN
+
+#endif // _SPTAG_SPANN_NETWORKNODE_H_
diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h
new file mode 100644
index 000000000..577b91876
--- /dev/null
+++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h
@@ -0,0 +1,1325 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "inc/Core/SPANN/Distributed/DistributedProtocol.h"
+#include "inc/Helper/ThreadPool.h"
+#include "inc/Socket/Client.h"
+#include "inc/Socket/Server.h"
+#include "inc/Socket/Packet.h"
+#include "inc/Socket/SimpleSerialization.h"
+#include <atomic>
+#include <condition_variable>
+#include <cstdlib>
+#include <deque>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <shared_mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+namespace SPTAG::SPANN {
+
+    // Per-thread hook so the SPDKThreadPool's pre-allocated ExtraWorkSpace
+    // (initialised once per worker thread, see SPDKThreadPool::initSPDK) can
+    // be reached from inside the AppendCallback lambda without changing the
+    // callback signature. BatchAppendItemJob::exec(workspace*, abort*) sets
+    // this before invoking the callback so the callback skips the per-item
+    // InitWorkSpace allocation / m_freeWorkSpaceIds churn that otherwise
+    // serialises 10k-item batches into ~130s on the receiver.
+    inline thread_local void* tls_preallocAppendWorkSpace = nullptr;  // starts as nullptr on every thread; type-erased so this header does not need the workspace type
+
+    /// Handles all node-to-node RPC mechanics for internal posting operations:
+    ///   - Append / BatchAppend (forward writes to the correct owner node)
+    ///   - HeadSync (broadcast head index changes to peers)
+    ///   - RemoteLock (cross-node locking for merge/split)
+    ///
+    /// This class owns the request/response matching state and serialization
+    /// logic. It is independent of routing decisions — WorkerNode decides
+    /// *where* to send, RemotePostingOps handles *how*.
+    class RemotePostingOps {
+    public:
+        using AppendCallback = std::function<ErrorCode(
+            SizeType headID,
+            std::shared_ptr<std::string> headVec,
+            int appendNum,
+            std::string& appendPosting)>;
+
+        using HeadSyncCallback = std::function<void(const HeadSyncEntry& entry)>;
+        using RemoteLockCallback = std::function<bool(SizeType headID, bool lock)>;
+
+        /// Callback for cross-node merge: search on a peer node observed
+        /// that posting `headID` (which we own) looks underfull. The peer
+        /// sent a fire-and-forget MergeRequest to us; we just schedule the
+        /// local MergeAsync. Returns nothing; receiver-side m_mergeList
+        /// already dedupes repeated triggers, so dropped notifications
+        /// are recoverable on the next observation.
+        using MergeCallback = std::function<void(SizeType headID)>;
+
+        /// Abstract interface for network access (implemented by NetworkNode).
+        class NetworkAccess {
+        public:
+            virtual ~NetworkAccess() = default;
+            virtual Socket::ConnectionID GetPeerConnection(int nodeIndex) = 0;  // connection ID for a node slot
+            virtual void InvalidatePeerConnection(int nodeIndex) = 0;  // forget the connection recorded for a node slot
+            virtual int GetLocalNodeIndex() const = 0;  // slot of the local node
+            virtual int GetNumNodes() const = 0;  // number of node slots
+            virtual Socket::Client* GetClient() = 0;  // raw pointer to the outbound client
+            virtual Socket::Server* GetServer() = 0;  // raw pointer to the inbound server
+        };
+
+        RemotePostingOps() {  // construction starts the head-sync retry thread
+            StartHeadSyncRetryThread();
+        }
+
+        ~RemotePostingOps() {  // destruction stops the head-sync retry thread
+            StopHeadSyncRetryThread();
+        }
+
+        RemotePostingOps(const RemotePostingOps&) = delete;  // non-copyable
+        RemotePostingOps& operator=(const RemotePostingOps&) = delete;
+
+        void SetNetwork(NetworkAccess* net) { m_net = net; }  // stores the raw pointer only; NOTE(review): lifetime presumably managed by the caller — confirm
+
+        // Inject the searcher's shared compute pool. Receiver-side BatchAppend
+        // work runs as Jobs on this pool so it shares a single bounded-
+        // concurrency budget with local Append/Split/Merge/Reassign (instead
+        // of a separate bg executor + transient std::threads which over-
+        // subscribed TiKV). Per-layer: each layer's ExtraDynamicSearcher owns
+        // its own m_splitThreadPool, so BatchAppend items dispatch by the
+        // request's m_layer to the matching pool. A single submitter would
+        // pile both layers' remote appends into whichever pool wired last.
+        using JobSubmitter = std::function<void(Helper::ThreadPool::Job*, bool /*high*/)>;
+        void SetJobSubmitter(int layer, JobSubmitter submitter) {
+            // Negative layer: the size_t cast would wrap, resize(0) the registry and then write out of bounds.
+            if (layer < 0) return;
+            std::unique_lock<std::shared_timed_mutex> lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);
+            if (m_jobSubmitters.size() <= static_cast<size_t>(layer)) m_jobSubmitters.resize(static_cast<size_t>(layer) + 1);
+            m_jobSubmitters[layer] = std::move(submitter);
+        }
+
+        // Helper: ensure the per-layer registries are wide enough for `layer`.
+        // Caller must hold m_callbackLifetimeMutex in exclusive mode.
+        void EnsureLayerSlot_NoLock(int layer) {
+            if (layer < 0) return;  // negative layers never get a slot
+            const size_t slots = static_cast<size_t>(layer) + 1;
+            if (slots > m_appendCallbacks.size()) m_appendCallbacks.resize(slots);
+            if (slots > m_headSyncCallbacks.size()) m_headSyncCallbacks.resize(slots);
+            if (slots > m_remoteLockCallbacks.size()) m_remoteLockCallbacks.resize(slots);
+            if (slots > m_mergeCallbacks.size()) m_mergeCallbacks.resize(slots);
+            if (slots <= m_callbackOwners.size()) return;
+            // std::atomic is neither copyable nor movable, so the owner table cannot be
+            // resize()d: build a wider one and carry each owner token across by hand.
+            std::vector<std::atomic<const void*>> wider(slots);
+            for (size_t slot = 0; slot < m_callbackOwners.size(); ++slot) {
+                const void* owner = m_callbackOwners[slot].load(std::memory_order_acquire);
+                wider[slot].store(owner, std::memory_order_release);
+            }
+            m_callbackOwners = std::move(wider);
+        }
+
+        void SetAppendCallback(int layer, AppendCallback cb) {
+            std::unique_lock<std::shared_timed_mutex> lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);  // allocates nothing for layer < 0 ...
+            if (layer >= 0) m_appendCallbacks[layer] = std::move(cb);  // ... so skip the write instead of indexing out of bounds
+        }
+        void SetHeadSyncCallback(int layer, HeadSyncCallback cb) {
+            std::unique_lock<std::shared_timed_mutex> lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);  // allocates nothing for layer < 0 ...
+            if (layer >= 0) m_headSyncCallbacks[layer] = std::move(cb);  // ... so skip the write instead of indexing out of bounds
+        }
+        void SetRemoteLockCallback(int layer, RemoteLockCallback cb) {
+            std::unique_lock<std::shared_timed_mutex> lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);  // allocates nothing for layer < 0 ...
+            if (layer >= 0) m_remoteLockCallbacks[layer] = std::move(cb);  // ... so skip the write instead of indexing out of bounds
+        }
+        void SetMergeCallback(int layer, MergeCallback cb) {  // Registers (or replaces) the merge callback of `layer`.
+            std::unique_lock<std::shared_timed_mutex> lk(m_callbackLifetimeMutex);  // exclusive access to the callback registries
+            EnsureLayerSlot_NoLock(layer);  // grows the registries; does nothing for layer < 0
+            if (layer >= 0) m_mergeCallbacks[layer] = std::move(cb);  // a negative layer has no slot: ignore it instead of indexing out of bounds
+        }
+
+        /// Drops every registered callback and ownership token, for all layers,
+        /// while holding the lifetime mutex exclusively. Handlers that invoke a
+        /// callback hold that mutex in shared mode, so acquiring it here also
+        /// waits for them. Call this before destroying whatever object the
+        /// registered lambdas captured (e.g. ExtraDynamicSearcher).
+        void ClearCallbacks() {
+            std::unique_lock<std::shared_timed_mutex> exclusive(m_callbackLifetimeMutex);
+            m_appendCallbacks.clear();
+            m_headSyncCallbacks.clear();
+            m_remoteLockCallbacks.clear();
+            m_mergeCallbacks.clear();
+            std::vector<std::atomic<const void*>>().swap(m_callbackOwners);
+        }
+
+        /// Record `owner` as the ownership token of ONE layer's callbacks,
+        /// growing the registries if that layer has no slot yet. Ownership is
+        /// per layer so that one layer's destructor cannot wipe another
+        /// layer's still-valid callbacks (see ClearCallbacksIfOwner).
+        /// A negative `layer` has no slot and is ignored.
+        void ClaimCallbackOwnership(int layer, const void* owner) {
+            std::unique_lock<std::shared_timed_mutex> lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);
+            if (layer >= 0) m_callbackOwners[layer].store(owner, std::memory_order_release);
+        }
+
+        /// Releases the callbacks registered for `layer`, but only when `owner`
+        /// matches the ownership token currently stored for that layer. Lets each
+        /// ExtraDynamicSearcher destructor drop its own slot without touching the
+        /// slots of other layers. Returns true if the slot was cleared.
+        bool ClearCallbacksIfOwner(int layer, const void* owner) {
+            std::unique_lock<std::shared_timed_mutex> lk(m_callbackLifetimeMutex);
+            const bool inRange =
+                layer >= 0 && static_cast<size_t>(layer) < m_callbackOwners.size();
+            if (!inRange) return false;
+            auto& ownerSlot = m_callbackOwners[layer];
+            if (ownerSlot.load(std::memory_order_acquire) != owner) return false;
+
+            m_appendCallbacks[layer] = nullptr;
+            m_headSyncCallbacks[layer] = nullptr;
+            m_remoteLockCallbacks[layer] = nullptr;
+            // The merge registry is bounds-checked on its own before it is reset.
+            if (static_cast<size_t>(layer) < m_mergeCallbacks.size()) m_mergeCallbacks[layer] = nullptr;
+            ownerSlot.store(nullptr, std::memory_order_release);
+            return true;
+        }
+
+        // ----- internal callback lookup helpers (caller holds shared lock) -----
+        const AppendCallback* LookupAppendCallback_Locked(int layer) const {
+            const bool inRange = layer >= 0 && static_cast<size_t>(layer) < m_appendCallbacks.size();
+            if (!inRange || !m_appendCallbacks[layer]) return nullptr;  // unknown layer or empty slot
+            return &m_appendCallbacks[layer];
+        }
+        const HeadSyncCallback* LookupHeadSyncCallback_Locked(int layer) const {
+            const bool inRange = layer >= 0 && static_cast<size_t>(layer) < m_headSyncCallbacks.size();
+            if (!inRange || !m_headSyncCallbacks[layer]) return nullptr;  // unknown layer or empty slot
+            return &m_headSyncCallbacks[layer];
+        }
+        const RemoteLockCallback* LookupRemoteLockCallback_Locked(int layer) const {
+            const bool inRange = layer >= 0 && static_cast<size_t>(layer) < m_remoteLockCallbacks.size();
+            if (!inRange || !m_remoteLockCallbacks[layer]) return nullptr;  // unknown layer or empty slot
+            return &m_remoteLockCallbacks[layer];
+        }
+        // PutPosting/FetchPosting/DeletePosting RPCs lived here historically.
+        // With shared TiKV every node reads and writes the posting store
+        // directly (PD routes the key), so the cross-node scatter-gather
+        // and owner-callback round-trips are unnecessary.
+        const MergeCallback* LookupMergeCallback_Locked(int layer) const {
+            const bool inRange = layer >= 0 && static_cast<size_t>(layer) < m_mergeCallbacks.size();
+            if (!inRange || !m_mergeCallbacks[layer]) return nullptr;  // unknown layer or empty slot
+            return &m_mergeCallbacks[layer];
+        }
+
+        // ==================================================================
+        //  Append — single item, synchronous (waits for response)
+        // ==================================================================
+
+        ErrorCode SendRemoteAppend(
+            int targetNodeIndex,
+            int layer,
+            SizeType headID,
+            const std::shared_ptr<std::string>& headVec,
+            int appendNum,
+            std::string& appendPosting)
+        {
+            // Sends a single append to node `targetNodeIndex` and blocks (30s cap) for its reply.
+            Socket::ConnectionID peerConn = m_net->GetPeerConnection(targetNodeIndex);
+            if (peerConn == Socket::c_invalidConnectionID) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: Cannot connect to node %d for headID %lld\n",
+                    targetNodeIndex, (std::int64_t)headID);
+                return ErrorCode::Fail;
+            }
+
+            RemoteAppendRequest request;
+            request.m_layer = layer;
+            request.m_headID = headID;
+            request.m_headVec = *headVec;
+            request.m_appendNum = appendNum;
+            request.m_appendPosting = appendPosting;
+            // HandleAppendResponse resolves the pending entry by this resource id.
+            Socket::ResourceID requestId = m_nextResourceId.fetch_add(1);
+            auto [reply, unused] = CreatePendingResponse(requestId);
+            (void)unused;
+            Socket::Packet pkt;
+            pkt.Header().m_packetType = Socket::PacketType::AppendRequest;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = requestId;
+
+            auto payloadBytes = static_cast<std::uint32_t>(request.EstimateBufferSize());
+            pkt.Header().m_bodyLength = payloadBytes;
+            pkt.AllocateBuffer(payloadBytes);
+            request.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            m_net->GetClient()->SendPacket(peerConn, std::move(pkt),
+                MakeSendFailHandler(requestId));
+
+            auto waitResult = reply.wait_for(std::chrono::seconds(30));
+            if (waitResult != std::future_status::timeout) {
+                return reply.get();
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                "RemotePostingOps: Timeout waiting for append response for headID %lld from node %d\n",
+                (std::int64_t)headID, targetNodeIndex);
+            ErasePending(requestId);
+            return ErrorCode::Fail;
+        }
+
+        // ==================================================================
+        //  Append — batch, synchronous with retry
+        // ==================================================================
+
+        ErrorCode SendBatchRemoteAppend(
+            int targetNodeIndex,
+            std::vector<RemoteAppendRequest>& items)
+        {
+            const size_t total = items.size();
+            if (total == 0) return ErrorCode::Success;
+
+            // Split the batch so that a single RPC never carries more than
+            // kChunkSize items. Chunks are sent one after another; the first
+            // chunk that fails aborts the call and its error code is returned.
+            // Items are MOVED out of `items` while chunking, so the caller's
+            // vector is left holding moved-from elements for every chunk that
+            // was attempted, successful or not.
+            //
+            // Tuning history (from earlier revisions of this comment):
+            //   [v38] 50000 -> 10000: shorter end-of-batch drain tail, and
+            //         several chunks can pipeline on the receiver pool.
+            //   [v43] back to 50000: ~22% less throughput than 10k but better
+            //         during-insert p50 and post-insert QPS.
+            //   [v44] 100000 was rejected: one chunk took 116s on the receiver
+            //         and the end-of-batch drain ran 40+ min.
+            //   [v47] with the shared-pool receiver, 50k chunks occasionally
+            //         exceeded the 180s wait_for window, so the size was cut
+            //         to 10k.
+            // NOTE(review): the constant below is 3000, not the 10k that the
+            // [v47] note describes — confirm which value is intended.
+            constexpr size_t kChunkSize = 3000;
+            std::vector<RemoteAppendRequest> chunk;
+            chunk.reserve(std::min(kChunkSize, total));
+
+            for (size_t begin = 0; begin < total; ) {
+                const size_t end = std::min(begin + kChunkSize, total);
+                const size_t count = end - begin;
+                chunk.clear();
+                chunk.reserve(count);
+                for (size_t i = begin; i < end; ++i) {
+                    chunk.push_back(std::move(items[i]));
+                }
+
+                const ErrorCode chunkRet = SendBatchRemoteAppendChunk(targetNodeIndex, chunk);
+                if (chunkRet != ErrorCode::Success) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: Chunk send failed to node %d (offset=%zu/%zu, chunk=%zu items)\n",
+                        targetNodeIndex, begin, total, count);
+                    return chunkRet;
+                }
+                begin = end;
+            }
+            return ErrorCode::Success;
+        }
+
+    private:
+        ErrorCode SendBatchRemoteAppendChunk(
+            int targetNodeIndex,
+            std::vector<RemoteAppendRequest>& items)
+        {
+            if (items.empty()) return ErrorCode::Success;
+            // Sends `items` as one BatchAppendRequest and blocks for the reply; makes up to 3 attempts.
+            for (int attempt = 0; attempt < 3; attempt++) {
+                Socket::ConnectionID connID = m_net->GetPeerConnection(targetNodeIndex);
+                if (connID == Socket::c_invalidConnectionID) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: Cannot connect to node %d for batch (%d items, attempt %d)\n",
+                        targetNodeIndex, (int)items.size(), attempt + 1);
+                    if (attempt < 2) continue;  // the next attempt follows immediately, without any delay
+                    return ErrorCode::Fail;
+                }
+                // The request borrows the caller's vector only while it is being serialized.
+                BatchRemoteAppendRequest batchReq;
+                batchReq.m_count = static_cast<std::uint32_t>(items.size());
+                batchReq.m_items = std::move(items);
+
+                Socket::ResourceID resID = m_nextResourceId.fetch_add(1);
+                auto [future, _] = CreatePendingResponse(resID);
+                (void)_;
+
+                Socket::Packet packet;
+                packet.Header().m_packetType = Socket::PacketType::BatchAppendRequest;
+                packet.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+                packet.Header().m_connectionID = Socket::c_invalidConnectionID;
+                packet.Header().m_resourceID = resID;
+
+                auto bodySize = static_cast<std::uint32_t>(batchReq.EstimateBufferSize());
+                packet.Header().m_bodyLength = bodySize;
+                packet.AllocateBuffer(bodySize);
+                batchReq.Write(packet.Body());
+                items = std::move(batchReq.m_items); // hand the items back so a later attempt can re-send them
+
+                packet.Header().WriteBuffer(packet.HeaderBuffer());
+
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Debug,
+                    "RemotePostingOps: Sending batch of %u appends to node %d (resID=%u, attempt=%d)\n",
+                    batchReq.m_count, targetNodeIndex, resID, attempt + 1);
+
+                auto waitStart = std::chrono::steady_clock::now();
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "RemotePostingOps: BatchAppendChunk -> node %d (resID=%u, attempt=%d, items=%u) wait_start\n",
+                    targetNodeIndex, resID, attempt + 1, batchReq.m_count);
+
+                m_net->GetClient()->SendPacket(connID, std::move(packet),
+                    MakeSendFailHandler(resID));
+
+                // Block for the reply, for at most 180s per attempt.
+                // NOTE(review): the 180s cap was sized for ~50k-item chunks; SendBatchRemoteAppend now sends 3000 — revisit.
+                auto status = future.wait_for(std::chrono::seconds(180));
+                auto waitMs = std::chrono::duration_cast<std::chrono::milliseconds>(
+                    std::chrono::steady_clock::now() - waitStart).count();
+                if (status == std::future_status::timeout) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: Timeout waiting for batch response from node %d (chunk=%u items, attempt=%d, waited=%lldms)\n",
+                        targetNodeIndex, batchReq.m_count, attempt + 1, (long long)waitMs);
+                    ErasePending(resID);
+                    // The connection is deliberately NOT invalidated here: a slow
+                    // reply is not a broken connection, and reconnecting floods the
+                    // worker's accept loop. Send errors arrive via MakeSendFailHandler
+                    // and take the "result != Success" path below instead.
+                    // NOTE(review): the retry re-sends the whole chunk; if the receiver
+                    // did process the timed-out attempt, items may be appended twice.
+                    if (attempt < 2) continue;
+                    return ErrorCode::Fail;
+                }
+
+                ErrorCode result = future.get();
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "RemotePostingOps: BatchAppendChunk <- node %d (resID=%u, attempt=%d, items=%u, waited=%lldms, result=%d)\n",
+                    targetNodeIndex, resID, attempt + 1, batchReq.m_count, (long long)waitMs, (int)result);
+                if (result == ErrorCode::Success) return ErrorCode::Success;
+
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: Batch to node %d failed (attempt %d), reconnecting...\n",
+                    targetNodeIndex, attempt + 1);
+                m_net->InvalidatePeerConnection(targetNodeIndex);  // drop the cached connection before the next attempt
+            }
+            return ErrorCode::Fail;
+        }
+
+    public:
+
+        // ==================================================================
+        //  HeadSync — fire-and-forget broadcast
+        // ==================================================================
+
+        void BroadcastHeadSync(const std::vector<HeadSyncEntry>& entries) {
+            if (entries.empty()) return;
+
+            const int nodeCount = m_net->GetNumNodes();
+            const int selfIdx = m_net->GetLocalNodeIndex();
+
+            // Count the peers (every node index except ours) for the sent-entry total.
+            std::uint64_t peerCount = 0;
+            for (int node = 0; node < nodeCount; ++node) {
+                peerCount += (node != selfIdx) ? 1 : 0;
+            }
+            m_headSyncBroadcastEntries.fetch_add(entries.size() * peerCount,
+                                                  std::memory_order_relaxed);
+
+            for (int node = 0; node < nodeCount; ++node) {
+                if (node == selfIdx) continue;
+                // Each peer gets its own copy so that a failed send can park
+                // that copy in the peer's retry backlog independently.
+                SendOneHeadSync(node, std::vector<HeadSyncEntry>(entries),
+                                /*isRetry=*/false);
+            }
+        }
+
+        // Serialize `entries` into one HeadSyncRequest packet and send it to peer
+        // `nodeIdx`. If the peer has no connection, or the send completion reports
+        // failure, the entries are handed to EnqueueHeadSyncRetry for that peer (a
+        // failed send also invalidates the peer connection). Delivery counters are
+        // bumped in the SendPacket completion lambda; `isRetry` only affects them and the logs.
+        void SendOneHeadSync(int nodeIdx,
+                             std::vector<HeadSyncEntry> entries,
+                             bool isRetry)
+        {
+            if (entries.empty()) return;
+
+            Socket::ConnectionID connID = m_net->GetPeerConnection(nodeIdx);
+            if (connID == Socket::c_invalidConnectionID) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: HeadSync no connection to node %d (count=%zu, isRetry=%d)\n",
+                    nodeIdx, entries.size(), isRetry ? 1 : 0);
+                EnqueueHeadSyncRetry(nodeIdx, std::move(entries));
+                return;
+            }
+            // Body layout: a uint32 entry count followed by each entry's serialized bytes.
+            size_t bodySize = sizeof(std::uint32_t);
+            for (const auto& e : entries) bodySize += e.EstimateBufferSize();
+
+            Socket::Packet pkt;
+            pkt.Header().m_packetType = Socket::PacketType::HeadSyncRequest;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = 0;  // no pending response is registered for this send
+            pkt.Header().m_bodyLength = static_cast<std::uint32_t>(bodySize);
+            pkt.AllocateBuffer(static_cast<std::uint32_t>(bodySize));
+
+            std::uint8_t* buf = pkt.Body();
+            buf = Socket::SimpleSerialization::SimpleWriteBuffer(
+                static_cast<std::uint32_t>(entries.size()), buf);
+            for (const auto& e : entries) buf = e.Write(buf);
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+            // Move the entries into a shared_ptr so the completion lambda can re-enqueue them on failure.
+            const std::uint64_t sentCount = entries.size();
+            std::shared_ptr<std::vector<HeadSyncEntry>> entriesShared =
+                std::make_shared<std::vector<HeadSyncEntry>>(std::move(entries));
+            const bool wasRetry = isRetry;
+
+            m_net->GetClient()->SendPacket(connID, std::move(pkt),
+                [this, nodeIdx, entriesShared, sentCount, wasRetry](bool success) {
+                    if (success) {
+                        m_headSyncBroadcastSendOK.fetch_add(sentCount,
+                            std::memory_order_relaxed);
+                        if (wasRetry) {
+                            m_headSyncRetrySucceeded.fetch_add(sentCount,
+                                std::memory_order_relaxed);
+                        }
+                    } else {
+                        m_headSyncBroadcastSendFail.fetch_add(sentCount,
+                            std::memory_order_relaxed);
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                            "RemotePostingOps: HeadSync send to node %d FAILED "
+                            "(count=%llu, isRetry=%d) — enqueueing for retry\n",
+                            nodeIdx,
+                            (unsigned long long)sentCount,
+                            wasRetry ? 1 : 0);
+                        m_net->InvalidatePeerConnection(nodeIdx);
+                        EnqueueHeadSyncRetry(nodeIdx, std::move(*entriesShared));  // park the unsent entries in this peer's backlog
+                    }
+                });
+        }
+
+        void EnqueueHeadSyncRetry(int nodeIdx, std::vector<HeadSyncEntry> entries) {
+            const size_t incoming = entries.size();
+            if (incoming == 0) return;
+            auto backlog = GetOrCreateBacklog(nodeIdx);
+            std::lock_guard<std::mutex> guard(backlog->mu);
+            // All-or-nothing: a batch that would exceed kMaxEntries is dropped whole (and counted).
+            if (backlog->queue.size() + incoming <= HeadSyncBacklog::kMaxEntries) {
+                for (auto& entry : entries) backlog->queue.push_back(std::move(entry));
+                m_headSyncRetryEnqueued.fetch_add(incoming, std::memory_order_relaxed);
+                return;
+            }
+            m_headSyncRetryDropped.fetch_add(incoming, std::memory_order_relaxed);
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                "RemotePostingOps: HeadSync retry queue full for node %d "
+                "(queue=%zu, dropping=%llu) — index will diverge!\n",
+                nodeIdx, backlog->queue.size(),
+                (unsigned long long)incoming);
+        }
+
+        // Re-sends queued HeadSync entries: for every peer that has a backlog,
+        // pops up to maxBatch entries and hands them to SendOneHeadSync as a
+        // retry. Returns the number of entries handed over across all peers.
+        size_t DrainHeadSyncBacklog(size_t maxBatch = 1024) {
+            if (!m_net) return 0;
+            // Snapshot the peer ids first so the map lock is not held while sending.
+            std::vector<int> peers;
+            {
+                std::shared_lock<std::shared_timed_mutex> mapLock(m_headSyncBacklogsMu);
+                peers.reserve(m_headSyncBacklogs.size());
+                for (auto& kv : m_headSyncBacklogs) peers.push_back(kv.first);
+            }
+            size_t handedOver = 0;
+            for (int peer : peers) {
+                auto backlog = GetOrCreateBacklog(peer);
+                std::vector<HeadSyncEntry> batch;
+                {
+                    std::lock_guard<std::mutex> queueLock(backlog->mu);
+                    if (backlog->queue.empty()) continue;
+                    const size_t take = std::min<size_t>(backlog->queue.size(), maxBatch);
+                    batch.reserve(take);
+                    while (batch.size() < take) {
+                        batch.push_back(std::move(backlog->queue.front()));
+                        backlog->queue.pop_front();
+                    }
+                }
+                handedOver += batch.size();
+                SendOneHeadSync(peer, std::move(batch), /*isRetry=*/true);
+            }
+            return handedOver;
+        }
+
+        size_t GetHeadSyncBacklogSize() const {  // total entries queued across every peer's retry backlog
+            std::vector<std::shared_ptr<HeadSyncBacklog>> backlogs;
+            {   // copy the backlog handles while holding the map lock in shared mode
+                std::shared_lock<std::shared_timed_mutex> mapLock(m_headSyncBacklogsMu);
+                backlogs.reserve(m_headSyncBacklogs.size());
+                for (auto& kv : m_headSyncBacklogs) backlogs.push_back(kv.second);
+            }
+            size_t queued = 0;
+            for (auto& backlog : backlogs) {
+                std::lock_guard<std::mutex> queueLock(backlog->mu);  // each queue is read under its own mutex
+                queued += backlog->queue.size();
+            }
+            return queued;
+        }
+
+        // Logs a best-effort, one-line snapshot of the HeadSync delivery counters
+        // plus the current retry backlog size; `label` (may be null) tags the line.
+        void DumpHeadSyncStats(const char* label) const {
+            const auto rd = [](const auto& c) { return (unsigned long long)c.load(std::memory_order_relaxed); };
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "[HeadSync stats %s] broadcast_entries=%llu send_ok=%llu send_fail=%llu "
+                "recv_entries=%llu apply_add=%llu apply_del=%llu "
+                "retry_enqueued=%llu retry_succeeded=%llu retry_dropped=%llu "
+                "backlog_now=%zu\n",
+                label ? label : "",
+                rd(m_headSyncBroadcastEntries),
+                rd(m_headSyncBroadcastSendOK),
+                rd(m_headSyncBroadcastSendFail),
+                rd(m_headSyncRecvEntries),
+                rd(m_headSyncApplyAdd),
+                rd(m_headSyncApplyDelete),
+                rd(m_headSyncRetryEnqueued),
+                rd(m_headSyncRetrySucceeded),
+                rd(m_headSyncRetryDropped),
+                GetHeadSyncBacklogSize());
+        }
+
+        // Increment-by-one hooks for the apply_add / apply_del counters that
+        // DumpHeadSyncStats reports. Per the original note they are public so the
+        // ExtraDynamicSearcher HeadSyncCallback lambda can bump them per applied entry.
+        void NoteHeadSyncApplyAdd() {
+            m_headSyncApplyAdd.fetch_add(1, std::memory_order_relaxed);  // logged as apply_add by DumpHeadSyncStats
+        }
+        void NoteHeadSyncApplyDelete() {
+            m_headSyncApplyDelete.fetch_add(1, std::memory_order_relaxed);  // logged as apply_del by DumpHeadSyncStats
+        }
+
+        // Logs a best-effort, one-line snapshot of the cross-node merge-hint
+        // counters: hints sent (send_ok / send_fail) and hints received
+        // (recv_hints / recv_dropped). `label` (may be null) tags the line.
+        void DumpMergeRequestStats(const char* label) const {
+            const auto rd = [](const auto& c) { return (unsigned long long)c.load(std::memory_order_relaxed); };
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "[MergeHint stats %s] send_ok=%llu send_fail=%llu "
+                "recv_hints=%llu recv_dropped=%llu\n",
+                label ? label : "",
+                rd(m_mergeBroadcastSendOK),
+                rd(m_mergeBroadcastSendFail),
+                rd(m_mergeRecvHints),
+                rd(m_mergeRecvDropped));
+        }
+
+        // ==================================================================
+        //  RemoteLock — synchronous request/response
+        // ==================================================================
+
+        bool SendRemoteLock(int nodeIndex, int layer, SizeType headID, bool lock) {
+            Socket::ConnectionID peerConn = m_net->GetPeerConnection(nodeIndex);
+            if (peerConn == Socket::c_invalidConnectionID) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: Cannot send remote lock to node %d\n", nodeIndex);
+                return false;
+            }
+            // Ask the peer to lock (lock=true) or unlock `headID` on `layer`; wait up to 5s for the answer.
+            RemoteLockRequest request;
+            request.m_layer = layer;
+            request.m_headID = headID;
+            request.m_op = lock ? RemoteLockRequest::Op::Lock : RemoteLockRequest::Op::Unlock;
+
+            Socket::ResourceID rid = m_nextResourceId.fetch_add(1);
+            auto [reply, unused] = CreatePendingResponse(rid);
+            (void)unused;
+
+            const auto payloadBytes = static_cast<std::uint32_t>(request.EstimateBufferSize());
+            Socket::Packet pkt;
+            pkt.Header().m_packetType = Socket::PacketType::RemoteLockRequest;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = rid;
+            pkt.Header().m_bodyLength = payloadBytes;
+            pkt.AllocateBuffer(payloadBytes);
+            request.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            m_net->GetClient()->SendPacket(peerConn, std::move(pkt),
+                MakeSendFailHandler(rid));
+
+            auto waitResult = reply.wait_for(std::chrono::milliseconds(5000));
+            if (waitResult == std::future_status::ready) {
+                return reply.get() == ErrorCode::Success;
+            }
+            ErasePending(rid);
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                "RemotePostingOps: Lock timeout for headID %lld on node %d\n",
+                (std::int64_t)headID, nodeIndex);
+            return false;
+        }
+
+        // ==================================================================
+        //  Inbound packet handlers (called by WorkerNode's server/client)
+        // ==================================================================
+
+        void HandleAppendRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            // Decode one AppendRequest, run the layer's append callback and answer with
+            // Success/Failed. An empty body is only logged: no response is sent for it.
+            const auto bodyLength = packet.Header().m_bodyLength;
+            if (bodyLength == 0) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: Empty AppendRequest\n");
+                return;
+            }
+
+            if (packet.Header().m_connectionID == Socket::c_invalidConnectionID) {
+                packet.Header().m_connectionID = connID;
+            }
+
+            RemoteAppendRequest req;
+            const std::uint8_t* body = packet.Body();
+            const std::uint8_t* bodyEnd = body + bodyLength;
+            if (req.Read(body, bodyEnd) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: AppendRequest version mismatch\n");
+                SendAppendResponse(packet, RemoteAppendResponse::Status::Failed);
+                return;
+            }
+
+            auto status = RemoteAppendResponse::Status::Failed;
+            {
+                std::shared_lock<std::shared_timed_mutex> cbLock(m_callbackLifetimeMutex);
+                const auto* cb = LookupAppendCallback_Locked(req.m_layer);
+                if (cb == nullptr) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "RemotePostingOps: AppendRequest layer=%d has no callback registered\n",
+                        req.m_layer);
+                } else {
+                    auto headVec = std::make_shared<std::string>(std::move(req.m_headVec));
+                    const ErrorCode result = (*cb)(req.m_headID, headVec, req.m_appendNum, req.m_appendPosting);
+                    if (result == ErrorCode::Success) status = RemoteAppendResponse::Status::Success;
+                }
+            }
+            SendAppendResponse(packet, status);
+        }
+
+        void HandleAppendResponse(Socket::ConnectionID connID, Socket::Packet packet) {
+            // Completes the pending append identified by the packet's resource id.
+            // A reply for which TakePendingResponse returns nothing is dropped
+            // without further processing.
+            auto pending = TakePendingResponse(packet.Header().m_resourceID);
+            if (!pending) return;
+
+            // Only a packet with Ok process status whose body parses and reports
+            // Success resolves the waiter with Success; everything else is Fail.
+            ErrorCode outcome = ErrorCode::Fail;
+            if (packet.Header().m_processStatus == Socket::PacketProcessStatus::Ok) {
+                RemoteAppendResponse resp;
+                const bool parsed = resp.Read(packet.Body()) != nullptr;
+                if (parsed && resp.m_status == RemoteAppendResponse::Status::Success) {
+                    outcome = ErrorCode::Success;
+                }
+            }
+
+            pending->set_value(outcome);
+        }
+
+        void HandleBatchAppendRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            // Receiver side of a batched remote append: parse here, run each item
+            // as a pool Job, and ACK the sender exactly once with the tallies.
+            if (Socket::c_invalidConnectionID == packet.Header().m_connectionID)
+                packet.Header().m_connectionID = connID;
+
+            if (packet.Header().m_bodyLength == 0) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: Empty BatchAppendRequest\n");
+                SendBatchAppendResponse(packet, 0, 1);  // fail fast, same as a parse error
+                return;
+            }
+
+            auto batchReq = std::make_shared<BatchRemoteAppendRequest>();
+            if (batchReq->Read(packet.Body(), packet.Header().m_bodyLength) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: BatchAppendRequest parse failed\n");
+                SendBatchAppendResponse(packet, 0, 1);
+                return;
+            }
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Debug,
+                "RemotePostingOps: Received batch of %u appends\n", batchReq->m_count);
+
+            // Submit each item as a normal-priority Job to its layer's searcher
+            // pool. Pool workers run the local Append callback exactly like a
+            // local insert would, and the last completion ACKs the sender. This
+            // puts remote work on the SAME concurrency budget as local
+            // Split/Merge/Reassign instead of a separate bg executor.
+            auto packetPtr = std::make_shared<Socket::Packet>(std::move(packet));
+            const size_t total = batchReq->m_items.size();
+            if (total == 0) {
+                SendBatchAppendResponse(*packetPtr, 0, 0);
+                return;
+            }
+            auto remaining    = std::make_shared<std::atomic<size_t>>(total);
+            auto successCount = std::make_shared<std::atomic<std::uint32_t>>(0);
+            auto failCount    = std::make_shared<std::atomic<std::uint32_t>>(0);
+
+            if (m_jobSubmitters.empty()) {
+                // Fallback: process inline on the network thread. Should not
+                // happen once ExtraDynamicSearcher has wired its pool.
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: no job submitter wired; running BatchAppend synchronously\n");
+                std::shared_lock<std::shared_timed_mutex> cbLock(m_callbackLifetimeMutex);
+                for (auto& req : batchReq->m_items) {
+                    ErrorCode r = ErrorCode::Fail;
+                    const auto* cb = LookupAppendCallback_Locked(req.m_layer);
+                    if (cb) {
+                        auto hv = std::make_shared<std::string>(std::move(req.m_headVec));
+                        r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting);
+                    }
+                    (r == ErrorCode::Success ? *successCount : *failCount).fetch_add(1);
+                }
+                SendBatchAppendResponse(*packetPtr, successCount->load(), failCount->load());
+                return;
+            }
+
+            for (size_t i = 0; i < total; i++) {
+                // Route to the per-layer searcher pool matching this item's
+                // m_layer so local Append/Split/Merge on layer N and remote
+                // appends targeting layer N share the same thread budget.
+                int layer = batchReq->m_items[i].m_layer;
+                const JobSubmitter* sub = nullptr;
+                if (layer >= 0 && static_cast<size_t>(layer) < m_jobSubmitters.size()
+                    && m_jobSubmitters[layer]) {
+                    sub = &m_jobSubmitters[layer];
+                } else {
+                    // Layer's pool not yet wired — fall back to whichever
+                    // submitter we have.
+                    for (auto& s : m_jobSubmitters) { if (s) { sub = &s; break; } }
+                }
+                if (sub) {
+                    // Normal priority: high priority starved split once every worker
+                    // was busy with appends. Widen the pool (AppendThreadNum) instead.
+                    auto* job = new BatchAppendItemJob(
+                        this, batchReq, i, remaining, successCount, failCount, packetPtr);
+                    (*sub)(job, /*high=*/false);
+                    continue;
+                }
+                // No usable submitter: fail the item here, and still ACK if it
+                // was the last outstanding one so the sender is not left waiting.
+                failCount->fetch_add(1);
+                if (remaining->fetch_sub(1) == 1)
+                    SendBatchAppendResponse(*packetPtr, successCount->load(), failCount->load());
+            }
+        }
+
+        void HandleBatchAppendResponse(Socket::ConnectionID connID, Socket::Packet packet) {
+            // Resolve the promise registered under this packet's resource ID.
+            // A non-Ok packet, an unparsable body, or any failed item resolves
+            // it with Fail; only a batch with zero failures resolves with Success.
+            auto pending = TakePendingResponse(packet.Header().m_resourceID);
+            if (!pending) return;
+
+            ErrorCode outcome = ErrorCode::Fail;
+            if (packet.Header().m_processStatus == Socket::PacketProcessStatus::Ok) {
+                BatchRemoteAppendResponse reply;
+                const bool parsed = (reply.Read(packet.Body()) != nullptr);
+                if (parsed && reply.m_failCount == 0) {
+                    outcome = ErrorCode::Success;
+                }
+            }
+
+            pending->set_value(outcome);
+        }
+
+        void HandleHeadSyncRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            // Body layout: uint32 entry count, then that many serialized HeadSyncEntry records.
+            std::shared_lock<std::shared_timed_mutex> cbLock(m_callbackLifetimeMutex);
+            if (m_headSyncCallbacks.empty()) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: HeadSyncRequest but no callbacks registered\n");
+                return;
+            }
+
+            const std::uint32_t bodyLength = packet.Header().m_bodyLength;
+            const std::uint8_t* buf = packet.Body();
+            const std::uint8_t* bufEnd = buf + bodyLength;
+            std::uint32_t entryCount = 0;
+            const bool hasCount = bodyLength >= sizeof(std::uint32_t);  // never read past a short body
+            if (hasCount) buf = Socket::SimpleSerialization::SimpleReadBuffer(buf, entryCount);
+            if (!hasCount || entryCount > (bodyLength - sizeof(std::uint32_t)) / 8) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: HeadSyncRequest entryCount=%u exceeds bodyLength=%u\n",
+                    entryCount, bodyLength);
+                return;
+            }
+
+            for (std::uint32_t i = 0; i < entryCount; i++) {
+                if (buf >= bufEnd) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: HeadSync buffer overrun at entry %u/%u\n", i, entryCount);
+                    break;
+                }
+                HeadSyncEntry entry;
+                buf = entry.Read(buf);
+                if (!buf || buf > bufEnd) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: HeadSync parse error at entry %u/%u\n", i, entryCount);
+                    break;
+                }
+                m_headSyncRecvEntries.fetch_add(1, std::memory_order_relaxed);
+                const auto* cb = LookupHeadSyncCallback_Locked(entry.m_layer);
+                if (cb) {
+                    (*cb)(entry);
+                } else {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "RemotePostingOps: HeadSyncEntry layer=%d has no callback registered (op=%d, vid=%d)\n",
+                        entry.m_layer, static_cast<int>(entry.op), (int)entry.headVID);
+                }
+            }
+        }
+
+        // ==================================================================
+        //  Merge — fire-and-forget cross-node hint
+        // ==================================================================
+
+        /// Send a batch of merge hints to one peer. Fire-and-forget: no
+        /// response is expected and no retry queue is maintained. Receiver-
+        /// side m_mergeList dedups, and the owner discovers underfull
+        /// postings through its own paths (own search, own Append) if any
+        /// notification is dropped.
+        void SendBatchRemoteMerge(int targetNodeIndex,
+                                  const std::vector<RemoteMergeRequest>& items)
+        {
+            const std::uint64_t hintCount = items.size();
+            if (hintCount == 0) return;
+
+            const Socket::ConnectionID peerConn = m_net->GetPeerConnection(targetNodeIndex);
+            if (peerConn == Socket::c_invalidConnectionID) {  // no connection: whole batch counts as failed
+                m_mergeBroadcastSendFail.fetch_add(hintCount, std::memory_order_relaxed);
+                return;
+            }
+
+            BatchRemoteMergeRequest request;
+            request.m_items = items;
+            request.m_count = static_cast<std::uint32_t>(items.size());
+
+            // Fill in the header, then serialize the body followed by the header itself.
+            const auto payloadBytes = static_cast<std::uint32_t>(request.EstimateBufferSize());
+            Socket::Packet outgoing;
+            outgoing.Header().m_packetType = Socket::PacketType::MergeRequest;
+            outgoing.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            outgoing.Header().m_connectionID = Socket::c_invalidConnectionID;
+            outgoing.Header().m_resourceID = 0;
+            outgoing.Header().m_bodyLength = payloadBytes;
+            outgoing.AllocateBuffer(payloadBytes);
+            request.Write(outgoing.Body());
+            outgoing.Header().WriteBuffer(outgoing.HeaderBuffer());
+
+            auto onSent = [this, targetNodeIndex, hintCount](bool delivered) {
+                if (!delivered) {
+                    m_mergeBroadcastSendFail.fetch_add(hintCount, std::memory_order_relaxed);
+                    m_net->InvalidatePeerConnection(targetNodeIndex);
+                    return;
+                }
+                m_mergeBroadcastSendOK.fetch_add(hintCount, std::memory_order_relaxed);
+            };
+            m_net->GetClient()->SendPacket(peerConn, std::move(outgoing), onSent);
+        }
+
+        void HandleMergeRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            (void)connID;  // merge hints are one-way; nothing is sent back on this connection
+            BatchRemoteMergeRequest hints;
+            if (hints.Read(packet.Body(), packet.Header().m_bodyLength) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: MergeRequest parse failed (bodyLength=%u)\n",
+                    packet.Header().m_bodyLength);
+                return;
+            }
+
+            std::shared_lock<std::shared_timed_mutex> cbLock(m_callbackLifetimeMutex);
+            for (const auto& hint : hints.m_items) {
+                const auto* onMerge = LookupMergeCallback_Locked(hint.m_layer);
+                if (onMerge == nullptr) {  // no callback for this layer: count the hint as dropped
+                    m_mergeRecvDropped.fetch_add(1, std::memory_order_relaxed);
+                    continue;
+                }
+                (*onMerge)(hint.m_headID);
+                m_mergeRecvHints.fetch_add(1, std::memory_order_relaxed);
+            }
+        }
+
+        void HandleRemoteLockRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            // Runs the layer's lock callback and always answers a parsed request (Granted or Denied).
+            RemoteLockRequest request;
+            if (request.Read(packet.Body()) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: Failed to parse RemoteLockRequest\n");
+                return;
+            }
+
+            // Denied unless a callback is registered for the layer and reports success.
+            bool granted = false;
+            {
+                std::shared_lock<std::shared_timed_mutex> cbLock(m_callbackLifetimeMutex);
+                const auto* onLock = LookupRemoteLockCallback_Locked(request.m_layer);
+                if (onLock == nullptr) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "RemotePostingOps: RemoteLockRequest layer=%d has no callback registered\n",
+                        request.m_layer);
+                } else {
+                    granted = (*onLock)(request.m_headID, request.m_op == RemoteLockRequest::Op::Lock);
+                }
+            }
+
+            RemoteLockResponse reply;
+            reply.m_status = granted ? RemoteLockResponse::Status::Granted
+                                     : RemoteLockResponse::Status::Denied;
+            const auto replyBytes = static_cast<std::uint32_t>(reply.EstimateBufferSize());
+            Socket::Packet out;
+            out.Header().m_packetType = Socket::PacketType::RemoteLockResponse;
+            out.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            out.Header().m_connectionID = connID;
+            out.Header().m_resourceID = packet.Header().m_resourceID;
+            out.Header().m_bodyLength = replyBytes;
+            out.AllocateBuffer(replyBytes);
+            reply.Write(out.Body());
+            out.Header().WriteBuffer(out.HeaderBuffer());
+            m_net->GetServer()->SendPacket(connID, std::move(out), nullptr);
+        }
+
+        void HandleRemoteLockResponse(Socket::ConnectionID connID, Socket::Packet packet) {
+            // Resolves the pending lock promise; only an Ok packet that parses to Granted succeeds.
+            auto promise = TakePendingResponse(packet.Header().m_resourceID);
+            if (!promise) return;
+            RemoteLockResponse resp;
+            // Same status gate as the append response handlers: do not trust a non-Ok packet's body.
+            if (packet.Header().m_processStatus != Socket::PacketProcessStatus::Ok ||
+                resp.Read(packet.Body()) == nullptr) {
+                promise->set_value(ErrorCode::Fail);
+                return;
+            }
+            promise->set_value(resp.m_status == RemoteLockResponse::Status::Granted
+                ? ErrorCode::Success : ErrorCode::Fail);
+        }
+
+        // ---- Response matching helpers ----
+
+        std::pair<std::future<ErrorCode>, bool> CreatePendingResponse(Socket::ResourceID resID) {
+            // Registers a promise under resID; second is false if resID was already pending (nothing registered).
+            std::promise<ErrorCode> promise;
+            auto future = promise.get_future();
+            std::lock_guard<std::mutex> lock(m_pendingMutex);
+            return {std::move(future), m_pendingResponses.emplace(resID, std::move(promise)).second};
+        }
+
+        void ErasePending(Socket::ResourceID resID) {  // Drops resID's pending promise, if any, without setting a value.
+            std::lock_guard<std::mutex> lock(m_pendingMutex);
+            m_pendingResponses.erase(resID);
+        }
+
+        /// Take a pending promise out of the map (returns nullptr if not found).
+        std::unique_ptr<std::promise<ErrorCode>> TakePendingResponse(Socket::ResourceID resID) {
+            std::lock_guard<std::mutex> guard(m_pendingMutex);
+            auto entry = m_pendingResponses.find(resID);
+            if (entry != m_pendingResponses.end()) {
+                auto owned = std::make_unique<std::promise<ErrorCode>>(std::move(entry->second));
+                m_pendingResponses.erase(entry);  // the promise itself was already moved out
+                return owned;
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                "RemotePostingOps: Response for unknown resourceID %u\n", resID);
+            return nullptr;
+        }
+
+        /// Create a send-failure callback that resolves the pending promise.
+        std::function<void(bool)> MakeSendFailHandler(Socket::ResourceID resID) {
+            // The returned callback does nothing after a successful send; after a
+            // failed one it resolves resID's promise with Fail if it is still pending.
+            return [this, resID](bool delivered) {
+                if (delivered) return;
+                std::lock_guard<std::mutex> guard(m_pendingMutex);
+                auto pending = m_pendingResponses.find(resID);
+                if (pending == m_pendingResponses.end()) return;
+                pending->second.set_value(ErrorCode::Fail);
+                m_pendingResponses.erase(pending);
+            };
+        }
+
+        void SendAppendResponse(Socket::Packet& srcPacket, RemoteAppendResponse::Status status) {
+            // Reply on the request's own connection, echoing its resource ID.
+            const auto& origin = srcPacket.Header();
+            RemoteAppendResponse reply;
+            reply.m_status = status;
+            const auto replyBytes = static_cast<std::uint32_t>(reply.EstimateBufferSize());
+
+            Socket::Packet out;
+            out.Header().m_packetType = Socket::PacketType::AppendResponse;
+            out.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            out.Header().m_connectionID = origin.m_connectionID;
+            out.Header().m_resourceID = origin.m_resourceID;
+            out.Header().m_bodyLength = replyBytes;
+            out.AllocateBuffer(replyBytes);
+            reply.Write(out.Body());
+            out.Header().WriteBuffer(out.HeaderBuffer());
+            m_net->GetServer()->SendPacket(origin.m_connectionID, std::move(out), nullptr);
+        }
+
+        void SendBatchAppendResponse(Socket::Packet& srcPacket,
+            std::uint32_t successCount, std::uint32_t failCount) {
+            // Reports both tallies back on the connection and resource ID of the request.
+            BatchRemoteAppendResponse tally;
+            tally.m_successCount = successCount;
+            tally.m_failCount = failCount;
+            const auto tallyBytes = static_cast<std::uint32_t>(tally.EstimateBufferSize());
+
+            const auto peerConn = srcPacket.Header().m_connectionID;
+            Socket::Packet out;
+            out.Header().m_packetType = Socket::PacketType::BatchAppendResponse;
+            out.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            out.Header().m_connectionID = peerConn;
+            out.Header().m_resourceID = srcPacket.Header().m_resourceID;
+            out.Header().m_bodyLength = tallyBytes;
+            out.AllocateBuffer(tallyBytes);
+            tally.Write(out.Body());
+            out.Header().WriteBuffer(out.HeaderBuffer());
+            m_net->GetServer()->SendPacket(peerConn, std::move(out), nullptr);
+        }
+
+        // ==================================================================
+        //  [Bug 26] Background executor — slow-lane for batch RPC handlers
+        // ==================================================================
+        //
+        // Why: the network server thread pool has only 8 threads
+        // (NetworkNode.h). HandleBatchAppendRequest does heavy TiKV work
+        // (fan out to 4 sub-workers and join), each call tying up its
+        // network thread for tens of seconds during inserts.
+        // Once 4–8 such handlers run concurrently, every network thread is
+        // blocked and latency-sensitive RPCs (HeadSync, RemoteLock) cannot be
+        // serviced.
+        //
+        // Fix: parse on the network thread (fast), then enqueue the heavy
+        // work onto a dedicated background thread pool and return. The
+        // network thread immediately becomes available for other RPCs.
+        // The background worker eventually sends the response itself.
+        //
+        // Sizing rationale:
+        //   - Threads default to 8: matches the network pool so we never
+        //     under-utilize CPU even if every network thread is parsing a
+        //     batch. Tunable via env SPTAG_BG_EXEC_THREADS.
+        //   - Queue cap default 256: plenty of headroom for typical bursts;
+        //     when full, falls back to synchronous execution to preserve
+        //     correctness rather than dropping requests.
+
+        // Background executor removed: BatchAppend now runs as sub-Jobs on
+        // the per-layer searcher pools (m_jobSubmitters[layer]) so it shares
+        // a single concurrency budget with local Split/Merge/Reassign. Jobs
+        // are submitted at normal priority. See HandleBatchAppendRequest.
+
+        // ==================================================================
+        //  HeadSync retry thread — periodic best-effort drain of per-peer
+        //  backlogs that were populated by failed BroadcastHeadSync sends.
+        //
+        //  Why: BroadcastHeadSync is fire-and-forget by design (we don't
+        //  want to block the layer-1 split path on a slow peer). When the
+        //  TCP send completion reports failure, we previously dropped the
+        //  entries forever and the peer's headIndex / m_pSamples diverged,
+        //  causing the receiver's BKTree to miss heads at search time and
+        //  recall to collapse on later batches. The retry queue + this
+        //  thread make HeadSync delivery reliable best-effort.
+        // ==================================================================
+
+        struct HeadSyncBacklog {  // Per-node retry queue; instances are created by GetOrCreateBacklog.
+            std::mutex mu;  // presumably guards `queue` — confirm at the use sites
+            std::deque<HeadSyncEntry> queue;
+            // Matches m_addCountForRebuild scale per peer. If we ever hit
+            // this we log + drop (fall back to manual reconcile).
+            static constexpr size_t kMaxEntries = 1u << 18;  // 262144
+        };
+
+        void StartHeadSyncRetryThread() {
+            // Interval: 500 ms unless SPTAG_HEADSYNC_RETRY_INTERVAL_MS parses (then max(50, value)).
+            int periodMs = 500;
+            if (const char* raw = std::getenv("SPTAG_HEADSYNC_RETRY_INTERVAL_MS")) {
+                try { periodMs = std::max(50, std::stoi(raw)); } catch (...) {}
+            }
+            m_headSyncRetryIntervalMs = periodMs;
+            m_headSyncRetryStop.store(false, std::memory_order_release);
+            m_headSyncRetryThread = std::thread([this]() { HeadSyncRetryLoop(); });
+        }
+
+        void StopHeadSyncRetryThread() {  // Requests loop exit, then blocks until the thread finishes.
+            m_headSyncRetryStop.store(true, std::memory_order_release);
+            if (m_headSyncRetryThread.joinable()) m_headSyncRetryThread.join();  // skipped if never started or already joined
+        }
+
+        void HeadSyncRetryLoop() {
+            // Periodic phase: sleep one interval, then drain, until stop is requested.
+            for (;;) {
+                if (m_headSyncRetryStop.load(std::memory_order_acquire)) break;
+                std::this_thread::sleep_for(std::chrono::milliseconds(m_headSyncRetryIntervalMs));
+                if (m_net) DrainHeadSyncBacklog();
+            }
+            // Final drain pass to give the network a chance to flush: at most
+            // five rounds, stopping early once a round returns 0.
+            int roundsLeft = 5;
+            while (roundsLeft-- > 0 && m_net) {
+                if (DrainHeadSyncBacklog() == 0) break;
+                std::this_thread::sleep_for(std::chrono::milliseconds(200));
+            }
+            const bool sawHeadSync = m_headSyncBroadcastEntries.load(std::memory_order_relaxed) > 0
+                || m_headSyncRecvEntries.load(std::memory_order_relaxed) > 0;
+            if (sawHeadSync) DumpHeadSyncStats("shutdown");
+            const bool sawMerge = m_mergeBroadcastSendOK.load(std::memory_order_relaxed) > 0
+                || m_mergeRecvHints.load(std::memory_order_relaxed) > 0;
+            if (sawMerge) DumpMergeRequestStats("shutdown");
+        }
+
+        std::shared_ptr<HeadSyncBacklog> GetOrCreateBacklog(int nodeIdx) {
+            {   // Common case: the backlog exists, so a shared lock is enough.
+                std::shared_lock<std::shared_timed_mutex> readLock(m_headSyncBacklogsMu);
+                const auto found = m_headSyncBacklogs.find(nodeIdx);
+                if (found != m_headSyncBacklogs.end()) return found->second;
+            }
+            std::unique_lock<std::shared_timed_mutex> writeLock(m_headSyncBacklogsMu);  // exclusive: may insert
+            std::shared_ptr<HeadSyncBacklog>& entry = m_headSyncBacklogs[nodeIdx];
+            if (entry == nullptr) entry = std::make_shared<HeadSyncBacklog>();  // re-check: another thread may have created it
+            return entry;
+        }
+
+        // ---- State ----
+
+        NetworkAccess* m_net = nullptr;
+
+        // Per-layer callback registries. Indexed by ExtraDynamicSearcher layer
+        // (m_layer at the call site). Resized lazily by SetXxxCallback. The
+        // empty/null entry at layer 0 is preserved so a single-layer caller
+        // (legacy or test) without explicit Set keeps the no-op default.
+        //
+        // The shared-callback design existed because the original SPANN had
+        // a single ExtraDynamicSearcher (Layers=1). With Layers>=2, each
+        // layer's lambda captures its own `this` (hence m_layer) and dispatch
+        // by request.m_layer is required to avoid routing layer-0 events to
+        // layer-1's storage and vice versa.
+        std::vector<AppendCallback> m_appendCallbacks;
+        std::vector<HeadSyncCallback> m_headSyncCallbacks;
+        std::vector<RemoteLockCallback> m_remoteLockCallbacks;
+        std::vector<MergeCallback> m_mergeCallbacks;
+
+        // Per-layer ownership tokens. Each ExtraDynamicSearcher claims its
+        // layer slot at SetWorker time and releases it on destruction; this
+        // prevents earlier-layer destructors from wiping a later-layer's
+        // callbacks (the original ClaimCallbackOwnership purpose, now
+        // applied per-layer instead of globally).
+        std::vector<std::atomic<const void*>> m_callbackOwners;
+
+        // Guards the lifetime of the captured `this` inside the callbacks.
+        // Held in shared mode by every callback invocation site, and in
+        // exclusive mode by ClearCallbacks() / SetXxxCallback() so that
+        // (re)assigning a callback can never race with an in-flight invocation.
+        mutable std::shared_timed_mutex m_callbackLifetimeMutex;
+
+        std::atomic<Socket::ResourceID> m_nextResourceId{1};  // starts at 1; SendBatchRemoteMerge stamps 0 on packets that expect no reply
+        std::mutex m_pendingMutex;  // guards m_pendingResponses
+        std::unordered_map<Socket::ResourceID, std::promise<ErrorCode>> m_pendingResponses;  // resource ID -> promise awaiting a response packet
+
+        // Per-item Job: each remote append request becomes one Job submitted
+        // to the searcher's shared SPDKThreadPool. The last completing Job
+        // ACKs the sender. Identical to how a local insert thread would call
+        // Append; the only difference is the request originated on a peer.
+        class BatchAppendItemJob : public Helper::ThreadPool::Job {  // one item of a BatchRemoteAppendRequest, run as a pool Job
+        public:
+            BatchAppendItemJob(RemotePostingOps* ops,
+                               std::shared_ptr<BatchRemoteAppendRequest> batchReq,
+                               size_t index,
+                               std::shared_ptr<std::atomic<size_t>> remaining,
+                               std::shared_ptr<std::atomic<std::uint32_t>> successCount,
+                               std::shared_ptr<std::atomic<std::uint32_t>> failCount,
+                               std::shared_ptr<Socket::Packet> replyPacket)
+                : m_ops(ops), m_batchReq(std::move(batchReq)), m_index(index),
+                  m_remaining(std::move(remaining)),
+                  m_success(std::move(successCount)),
+                  m_fail(std::move(failCount)),
+                  m_replyPacket(std::move(replyPacket)) {}
+
+            void exec(IAbortOperation*) override { run(); }  // variant without a workspace
+            void exec(void* workspace, IAbortOperation*) override {
+                void* prev = tls_preallocAppendWorkSpace;  // remember the current thread-local value
+                tls_preallocAppendWorkSpace = workspace;  // expose the workspace for the duration of run()
+                run();
+                tls_preallocAppendWorkSpace = prev;  // restore what was there before
+            }
+
+        private:
+            void run() {
+                {
+                    std::shared_lock<std::shared_timed_mutex> cbLock(m_ops->m_callbackLifetimeMutex);  // held while the callback is looked up and invoked
+                    auto& req = m_batchReq->m_items[m_index];
+                    ErrorCode r = ErrorCode::Fail;  // stays Fail when no callback is registered for the layer
+                    const auto* cb = m_ops->LookupAppendCallback_Locked(req.m_layer);
+                    if (cb) {
+                        auto hv = std::make_shared<std::string>(std::move(req.m_headVec));  // moves the head vector out of the request item
+                        r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting);
+                    }
+                    if (r == ErrorCode::Success) m_success->fetch_add(1);
+                    else                         m_fail->fetch_add(1);
+                }
+                if (m_remaining->fetch_sub(1) == 1) {  // fetch_sub returns the prior value: 1 means this was the last outstanding item
+                    m_ops->SendBatchAppendResponse(
+                        *m_replyPacket, m_success->load(), m_fail->load());
+                }
+            }
+
+            RemotePostingOps* m_ops;  // non-owning pointer
+            std::shared_ptr<BatchRemoteAppendRequest> m_batchReq;  // parsed batch, shared by all Jobs of that batch
+            size_t m_index;  // index of this Job's item in m_batchReq->m_items
+            std::shared_ptr<std::atomic<size_t>> m_remaining;  // items not yet finished
+            std::shared_ptr<std::atomic<std::uint32_t>> m_success;
+            std::shared_ptr<std::atomic<std::uint32_t>> m_fail;
+            std::shared_ptr<Socket::Packet> m_replyPacket;  // request packet; its header supplies the reply's connection and resource IDs
+        };
+
+        // [Bug 26 retired] bg executor removed — see HandleBatchAppendRequest.
+        // m_bgWorkers etc were replaced by per-layer job submission into the
+        // searcher's shared SPDKThreadPool via m_jobSubmitters[layer].
+        std::vector<JobSubmitter> m_jobSubmitters;
+
+        // HeadSync delivery diagnostics + retry queue (v33). Counters give
+        // observability for sender/receiver gaps; per-peer backlogs +
+        // retry thread make broadcast reliable best-effort.
+        std::atomic<std::uint64_t> m_headSyncBroadcastEntries{0};
+        std::atomic<std::uint64_t> m_headSyncBroadcastSendOK{0};
+        std::atomic<std::uint64_t> m_headSyncBroadcastSendFail{0};
+        std::atomic<std::uint64_t> m_headSyncRecvEntries{0};
+        std::atomic<std::uint64_t> m_headSyncApplyAdd{0};
+        std::atomic<std::uint64_t> m_headSyncApplyDelete{0};
+        std::atomic<std::uint64_t> m_headSyncRetryEnqueued{0};
+        std::atomic<std::uint64_t> m_headSyncRetrySucceeded{0};
+        std::atomic<std::uint64_t> m_headSyncRetryDropped{0};
+
+        // Cross-node merge hint counters. No retry queue: dropped
+        // notifications are recoverable since the owner discovers underfull
+        // postings via its own paths too.
+        std::atomic<std::uint64_t> m_mergeBroadcastSendOK{0};
+        std::atomic<std::uint64_t> m_mergeBroadcastSendFail{0};
+        std::atomic<std::uint64_t> m_mergeRecvHints{0};
+        std::atomic<std::uint64_t> m_mergeRecvDropped{0};
+
+        mutable std::shared_timed_mutex m_headSyncBacklogsMu;  // guards m_headSyncBacklogs: shared for lookup, exclusive for insert
+        std::unordered_map<int, std::shared_ptr<HeadSyncBacklog>> m_headSyncBacklogs;  // keyed by node index
+        std::thread m_headSyncRetryThread;  // runs HeadSyncRetryLoop
+        std::atomic<bool> m_headSyncRetryStop{false};  // set by StopHeadSyncRetryThread, polled by the loop
+        int m_headSyncRetryIntervalMs{500};  // sleep between drain passes; assigned in StartHeadSyncRetryThread
+    };
+
+} // namespace SPTAG::SPANN
diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h
new file mode 100644
index 000000000..8af906fcc
--- /dev/null
+++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h
@@ -0,0 +1,616 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#ifndef _SPTAG_SPANN_WORKERNODE_H_
+#define _SPTAG_SPANN_WORKERNODE_H_
+
+#include "inc/Core/SPANN/Distributed/NetworkNode.h"
+#include "inc/Helper/KeyValueIO.h"
+#include "inc/Helper/CommonHelper.h"
+#include "inc/Socket/SimpleSerialization.h"
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <map>
+#include <set>
+#include <functional>
+#include <future>
+#include <atomic>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <thread>
+
+namespace SPTAG::SPANN {
+
+    /// Distributed compute worker node.
+    ///
+    /// Responsibilities:
+    ///   - Route headIDs to owner nodes via consistent hash ring
+    ///   - Queue and flush remote appends (batched RPC)
+    ///   - HeadSync broadcast and remote locking
+    ///   - Register with dispatcher and receive ring updates
+    ///   - Handle incoming dispatch commands from the driver
+    class WorkerNode : public NetworkNode {
+    public:
+        using AppendCallback = RemotePostingOps::AppendCallback;
+        using DispatchCallback = DispatchCoordinator::DispatchCallback;
+        using HeadSyncCallback = RemotePostingOps::HeadSyncCallback;
+        using RemoteLockCallback = RemotePostingOps::RemoteLockCallback;
+
+        /// Initialize with separate dispatcher/worker/store addresses.
+        /// workerIndex is 0-based (0 = driver/local, 1+ = remote) and must index workerAddrs.
+        /// Internal node index = workerIndex + 1 (0 is reserved for dispatcher).
+        bool Initialize(
+            std::shared_ptr<Helper::KeyValueIO> p_db,
+            int workerIndex,
+            const std::pair<std::string, std::string>& dispatcherAddr,
+            const std::vector<std::pair<std::string, std::string>>& workerAddrs,
+            const std::vector<std::string>& storeAddrs,
+            int vnodeCount = 150)
+        {
+            if (storeAddrs.empty()) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "WorkerNode::Initialize: storeAddrs is empty\n");
+                return false;
+            }
+            const int numWorkers = static_cast<int>(workerAddrs.size());
+            const int numStores = static_cast<int>(storeAddrs.size());
+            if (workerIndex < 0 || workerIndex >= numWorkers) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "WorkerNode::Initialize: workerIndex %d out of range [0, %d)\n", workerIndex, numWorkers);
+                return false;
+            }
+
+            // Build combined addr list: [dispatcher, worker0, worker1, ...]
+            std::vector<std::pair<std::string, std::string>> allAddrs;
+            allAddrs.push_back(dispatcherAddr);
+            allAddrs.insert(allAddrs.end(), workerAddrs.begin(), workerAddrs.end());
+            int internalIdx = workerIndex + 1;  // 0 = dispatcher, 1..N = workers
+            if (!InitializeNetwork(internalIdx, allAddrs, vnodeCount)) return false;
+            // [Bug 30] Populate compute-role fields so callers can ask
+            // "how many data shards?" / "which shard am I?" without
+            // accidentally including the dispatcher slot.
+            m_numDispatchNodes = 1;
+            m_numWorkerNodes = numWorkers;
+            m_workerNodeIndex = workerIndex;
+            m_db = p_db;
+            m_nodeStores = storeAddrs;
+
+            // Build store → node list mapping (worker internal indices 1..N).
+            // Cleared first so a repeated Initialize cannot append duplicates.
+            m_storeToNodes.clear();
+            for (int wi = 0; wi < numWorkers; wi++) {
+                m_storeToNodes[storeAddrs[wi % numStores]].push_back(wi + 1);
+            }
+            for (auto& [store, nodes] : m_storeToNodes) {
+                std::string nodeList;
+                for (int n : nodes) { nodeList += std::to_string(n) + " "; }
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "WorkerNode: store %s → nodes [%s]\n", store.c_str(), nodeList.c_str());
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "WorkerNode: initialized (workerIndex=%d, internalIdx=%d, %d stores, %d vnodes/node)\n",
+                workerIndex, internalIdx, numStores, vnodeCount);
+            m_dispatch.SetNetwork(this);
+            m_remoteOps.SetNetwork(this);
+            return true;
+        }
+
+    public:
+        bool Start() { return StartNetwork(); }
+
+        // ---- Callbacks ----
+        //
+        // ExtraDynamicSearcher passes its m_layer when binding callbacks so
+        // that with multi-layer SPANN (Layers >= 2) each layer has its own
+        // captured `this` and request dispatch on the receiver side routes by
+        // request.m_layer.
+
+        void SetAppendCallback(int layer, AppendCallback cb) { m_remoteOps.SetAppendCallback(layer, std::move(cb)); }
+        void SetHeadSyncCallback(int layer, HeadSyncCallback cb) { m_remoteOps.SetHeadSyncCallback(layer, std::move(cb)); }
+        void SetRemoteLockCallback(int layer, RemoteLockCallback cb) { m_remoteOps.SetRemoteLockCallback(layer, std::move(cb)); }
+        // Inject the searcher's shared compute pool so receiver-side
+        // BatchAppend work runs there (high-priority Jobs) instead of in a
+        // separate executor. Idempotent: safe to call multiple times.
+        void SetJobSubmitter(int layer, RemotePostingOps::JobSubmitter s) {
+            m_remoteOps.SetJobSubmitter(layer, std::move(s));
+        }
+        /// Atomically clear all RPC callbacks (every layer) and wait for any
+        /// in-flight invocation to finish.
+        void ClearCallbacks() {
+            m_remoteOps.ClearCallbacks();
+        }
+        /// Per-layer ownership API used by ExtraDynamicSearcher to avoid having
+        /// one layer's destructor wipe another layer's still-active callbacks.
+        /// SetWorker calls ClaimCallbackOwnership(m_layer, this) before
+        /// registering; the destructor calls ClearCallbacksIfOwner(m_layer, this).
+        void ClaimCallbackOwnership(int layer, const void* owner) {
+            m_remoteOps.ClaimCallbackOwnership(layer, owner);
+        }
+        bool ClearCallbacksIfOwner(int layer, const void* owner) {
+            return m_remoteOps.ClearCallbacksIfOwner(layer, owner);
+        }
+        void SetDispatchCallback(DispatchCallback cb) { m_dispatch.SetDispatchCallback(std::move(cb)); }
+        void ClearDispatchCallback() { m_dispatch.ClearDispatchCallback(); }
+
+        // ---- Routing ----
+
+        RouteTarget GetOwner(SizeType headID) {
+            // Default answer: this node owns the head.
+            RouteTarget route;
+            route.isLocal = true;
+            route.nodeIndex = m_localNodeIndex;
+            if (!m_enabled) {
+                ++m_routeStats.disabled;
+                return route;
+            }
+
+            // Snapshot the ring once; a ring with at most one node routes locally.
+            const auto ringSnapshot = std::atomic_load(&m_hashRing);
+            if (!ringSnapshot || ringSnapshot->NodeCount() <= 1) {
+                ++m_routeStats.local;
+                return route;
+            }
+            route.nodeIndex = ringSnapshot->GetOwner(headID);
+            route.isLocal = (route.nodeIndex == m_localNodeIndex);
+            if (route.isLocal) ++m_routeStats.local;
+            else ++m_routeStats.remote;
+            return route;
+        }
+
+        void LogRouteStats(const char* context = "") {
+            // Counters are loaded one by one, so the line is not an atomic snapshot.
+            const RouteStats& s = m_routeStats;
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "WorkerNode stats%s: local=%d remote=%d disabled=%d keyMiss=%d noMapping=%d\n",
+                context, s.local.load(), s.remote.load(), s.disabled.load(), s.keyMiss.load(), s.noMapping.load());
+        }
+
+        void ResetRouteStats() {
+            // Zero every routing counter, in declaration order; each store is individually atomic.
+            for (std::atomic<int>* counter : { &m_routeStats.local, &m_routeStats.remote,
+                                               &m_routeStats.disabled, &m_routeStats.keyMiss, &m_routeStats.noMapping }) {
+                counter->store(0);
+            }
+        }
+
+        // ---- Remote posting ops ----
+
+        ErrorCode SendRemoteAppend(int targetNodeIndex, int layer, SizeType headID,
+            const std::shared_ptr<std::string>& headVec, int appendNum,
+            std::string& appendPosting)
+        {
+            return m_remoteOps.SendRemoteAppend(targetNodeIndex, layer, headID, headVec, appendNum, appendPosting);
+        }
+
+        ErrorCode SendBatchRemoteAppend(int targetNodeIndex, std::vector<RemoteAppendRequest>& items) {
+            return m_remoteOps.SendBatchRemoteAppend(targetNodeIndex, items);
+        }
+
+        void BroadcastHeadSync(const std::vector<HeadSyncEntry>& entries) {
+            if (!m_enabled) return;
+            m_remoteOps.BroadcastHeadSync(entries);
+        }
+
+        // v33: expose HeadSync delivery diagnostics + retry queue.
+        void DumpHeadSyncStats(const char* label) const {
+            m_remoteOps.DumpHeadSyncStats(label);
+        }
+        // Cross-node merge-hint channel diagnostics.
+        void DumpMergeRequestStats(const char* label) const {
+            m_remoteOps.DumpMergeRequestStats(label);
+        }
+        size_t GetHeadSyncBacklogSize() const {
+            return m_remoteOps.GetHeadSyncBacklogSize();
+        }
+        size_t DrainHeadSyncBacklog(size_t maxBatch = 1024) {
+            return m_remoteOps.DrainHeadSyncBacklog(maxBatch);
+        }
+        void NoteHeadSyncApplyAdd() {
+            m_remoteOps.NoteHeadSyncApplyAdd();
+        }
+        void NoteHeadSyncApplyDelete() {
+            m_remoteOps.NoteHeadSyncApplyDelete();
+        }
+
+        bool SendRemoteLock(int nodeIndex, int layer, SizeType headID, bool lock) {
+            if (!m_enabled) return false;
+            return m_remoteOps.SendRemoteLock(nodeIndex, layer, headID, lock);
+        }
+
+        void SetMergeCallback(int layer, RemotePostingOps::MergeCallback cb) {
+            m_remoteOps.SetMergeCallback(layer, std::move(cb));
+        }
+
+        // ---- Append queue ----
+
+        void QueueRemoteAppend(int nodeIndex, RemoteAppendRequest req) {
+            std::vector<RemoteAppendRequest> toFlush;
+            bool didReserveSlot = false;  // true once this call has taken an in-flight slot for nodeIndex
+            {
+                std::lock_guard<std::mutex> lock(m_appendQueueMutex);
+                auto& q = m_appendQueue[nodeIndex];
+                q.push_back(std::move(req));
+                m_remoteQueueSize.fetch_add(1, std::memory_order_relaxed);
+                // [PERF] Auto-flush per node once we have a full chunk worth
+                // (kAutoFlushThreshold items). Without this, every remote
+                // append accumulates until end-of-batch FlushRemoteAppends —
+                // which then sends hundreds of thousands of items serially
+                // (10k chunks * ~3s/chunk) AFTER all insert compute is done.
+                // Auto-flushing while inserts keep running overlaps the
+                // network with CPU and drops end-of-batch tail latency.
+                //
+                // [v38] Allow up to kMaxInflightPerNode concurrent in-flight
+                // chunks per node so a producer burst (split fan-out, reassign
+                // wave) can saturate the receiver's bg-executor pool instead of
+                // queueing up serially behind a single per-node mutex.
+                if (q.size() >= kAutoFlushThreshold
+                    && m_perNodeInflight[nodeIndex] < kMaxInflightPerNode) {
+                    toFlush.swap(q);
+                    m_remoteQueueSize.fetch_sub(toFlush.size(), std::memory_order_relaxed);
+                    ++m_perNodeInflight[nodeIndex];
+                    didReserveSlot = true;
+                }
+            }
+            if (!didReserveSlot) return;  // below threshold or all slots busy: the item stays queued
+
+            // Fire-and-forget async send. After the initial chunk completes,
+            // the same thread loops to pick up any further accumulation so we
+            // avoid thread-spawn churn while keeping per-node concurrency at
+            // kMaxInflightPerNode. Order across batches is best-effort: the
+            // receiver runs 8 worker threads on each chunk that already
+            // interleave items within a chunk, so cross-chunk ordering adds
+            // no extra correctness risk for the per-posting RMW path.
+            auto items = std::make_shared<std::vector<RemoteAppendRequest>>(std::move(toFlush));
+            m_inflightAppendFlushes.fetch_add(1, std::memory_order_relaxed);
+            std::thread([this, nodeIndex, items]() {
+                while (true) {
+                    ErrorCode ret = SendBatchRemoteAppend(nodeIndex, *items);
+                    if (ret != ErrorCode::Success) {
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                            "QueueRemoteAppend auto-flush: batch to node %d failed (%zu items)\n",
+                            nodeIndex, items->size());
+                    }
+                    items->clear();  // a failed batch is cleared as well; only the log above records it
+                    {
+                        std::lock_guard<std::mutex> lock(m_appendQueueMutex);
+                        auto it = m_appendQueue.find(nodeIndex);
+                        if (it == m_appendQueue.end()
+                            || it->second.size() < kAutoFlushThreshold) {
+                            --m_perNodeInflight[nodeIndex];  // release the slot under the same lock that reserved it
+                            break;
+                        }
+                        items->swap(it->second);
+                        m_remoteQueueSize.fetch_sub(items->size(),
+                            std::memory_order_relaxed);
+                    }
+                }
+                m_inflightAppendFlushes.fetch_sub(1, std::memory_order_relaxed);
+            }).detach();  // NOTE(review): detached thread captures this — presumably WorkerNode outlives it; confirm
+        }
+
+        size_t GetRemoteQueueSize() const {
+            return m_remoteQueueSize.load(std::memory_order_relaxed);
+        }
+
+        /// Sends everything still queued; returns Fail if any batch send failed.
+        ErrorCode FlushRemoteAppends() {
+            // Drain the queue under m_flushMutex so concurrent flush callers
+            // serialize. Loop in case items get queued mid-send. This avoids
+            // the thundering-herd of 100+ concurrent FlushRemoteAppends calls
+            // (one per split worker) overwhelming the remote node's tiny
+            // (8-thread, 256-connection-pool) network server.
+            std::lock_guard<std::mutex> flushGuard(m_flushMutex);
+
+            // Wait for any in-flight async auto-flushes triggered by
+            // QueueRemoteAppend (>= kAutoFlushThreshold) to drain so the
+            // residue we send below is the actual tail. Callers invoke
+            // FlushRemoteAppends after all producers (AddIndex / split /
+            // reassign) have quiesced, so no new auto-flushes will start
+            // here.
+            while (m_inflightAppendFlushes.load(std::memory_order_relaxed) > 0) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(20));
+            }
+
+            int errors = 0;  // failed batches; their items are not re-queued
+            while (true) {
+                std::unordered_map<int, std::vector<RemoteAppendRequest>> toSend;
+                {
+                    std::lock_guard<std::mutex> lock(m_appendQueueMutex);
+                    if (m_appendQueue.empty()) break;
+                    toSend.swap(m_appendQueue);
+                    m_remoteQueueSize.store(0, std::memory_order_relaxed);
+                }
+                if (toSend.empty()) break;
+
+                std::atomic<int> iterErrors{0};
+                std::vector<std::thread> threads;
+                for (auto& [nodeIdx, items] : toSend) {
+                    if (items.empty()) continue;
+                    threads.emplace_back([this, &iterErrors, nodeIdx, &items]() {
+                        // Per-node mutex keeps sends to the same node from this
+                        // final-drain path ordered. Auto-flush threads do not
+                        // take it; they were already waited for above.
+                        std::mutex& nodeMtx = GetPerNodeAppendFlushMutex(nodeIdx);
+                        std::lock_guard<std::mutex> nlock(nodeMtx);
+                        ErrorCode ret = SendBatchRemoteAppend(nodeIdx, items);
+                        if (ret != ErrorCode::Success) {
+                            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                                "FlushRemoteAppends: batch to node %d failed (%d items)\n",
+                                nodeIdx, (int)items.size());
+                            iterErrors++;
+                        }
+                    });
+                }
+                for (auto& t : threads) t.join();
+                errors += iterErrors.load();
+            }
+            return errors > 0 ? ErrorCode::Fail : ErrorCode::Success;
+        }
+
+        // ---- Cross-node merge hint queue ----
+        //
+        // Search-side fire-and-forget notifications: node X sees posting H
+        // underfull, where H is owned by Y. We dedup (layer, headID) within
+        // a flush window and batch-send to Y in one packet. The receiver's
+        // m_mergeList dedups on top of this, so an occasional dropped or
+        // duplicated notification only costs a few cycles.
+        void QueueRemoteMerge(int nodeIndex, int layer, SizeType headID) {
+            // Dedup key: layer in the high 32 bits, headID's 32 bits below it.
+            const std::int64_t packed = (static_cast<std::int64_t>(layer) << 32)
+                                      | static_cast<std::uint32_t>(headID);
+            std::vector<RemoteMergeRequest> batch;
+            {
+                std::lock_guard<std::mutex> guard(m_mergeQueueMutex);
+                auto& pending = m_mergeQueue[nodeIndex];
+                if (!pending.insert(packed).second) return;  // already pending
+                m_mergeQueueSize.fetch_add(1, std::memory_order_relaxed);
+                if (pending.size() < kMergeAutoFlushThreshold) return;
+                // Bucket is full: unpack every key into a request and empty it.
+                const size_t drained = pending.size();
+                batch.reserve(drained);
+                for (std::int64_t k : pending) {
+                    RemoteMergeRequest req;
+                    req.m_layer = static_cast<std::int32_t>(k >> 32);
+                    req.m_headID = static_cast<SizeType>(static_cast<std::int32_t>(k & 0xFFFFFFFF));
+                    batch.push_back(std::move(req));
+                }
+                m_mergeQueueSize.fetch_sub(drained, std::memory_order_relaxed);
+                pending.clear();
+            }
+            // Send outside the lock; the result is not inspected.
+            if (!batch.empty()) m_remoteOps.SendBatchRemoteMerge(nodeIndex, batch);
+        }
+
+        ErrorCode FlushRemoteMerges() {
+            std::unordered_map<int, std::vector<RemoteMergeRequest>> outbound;
+            {  // snapshot and empty every bucket under the lock; sends happen outside it
+                std::lock_guard<std::mutex> guard(m_mergeQueueMutex);
+                if (m_mergeQueue.empty()) return ErrorCode::Success;
+                for (auto& [target, pending] : m_mergeQueue) {
+                    auto& batch = outbound[target];
+                    batch.reserve(pending.size());
+                    for (std::int64_t packed : pending) {
+                        RemoteMergeRequest req;
+                        req.m_layer = static_cast<std::int32_t>(packed >> 32);
+                        req.m_headID = static_cast<SizeType>(static_cast<std::int32_t>(packed & 0xFFFFFFFF));
+                        batch.push_back(std::move(req));
+                    }
+                }
+                m_mergeQueue.clear();
+                m_mergeQueueSize.store(0, std::memory_order_relaxed);
+            }
+            for (auto& [target, batch] : outbound) {
+                if (!batch.empty()) m_remoteOps.SendBatchRemoteMerge(target, batch);
+            }
+            return ErrorCode::Success;  // send results are not inspected
+        }
+
+        // ---- Ring protocol (worker side) ----
+
+        bool WaitForRing(int timeoutSec = 120) {
+            // Poll every 200 ms until the ring holds a node or the deadline passes.
+            const auto giveUpAt = std::chrono::steady_clock::now() + std::chrono::seconds(timeoutSec);
+            for (; std::chrono::steady_clock::now() < giveUpAt; std::this_thread::sleep_for(std::chrono::milliseconds(200))) {
+                const auto snapshot = std::atomic_load(&m_hashRing);
+                if (snapshot && snapshot->NodeCount() > 0) return true;
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                "WorkerNode: Timed out waiting for ring (%ds)\n", timeoutSec);
+            return false;
+        }
+
+        // ---- Data members (public for ExtraDynamicSearcher access) ----
+
+        std::shared_ptr<Helper::KeyValueIO> m_db;
+        std::vector<std::string> m_nodeStores;
+        std::unordered_map<std::string, std::vector<int>> m_storeToNodes;
+
+        struct RouteStats {
+            std::atomic<int> local{0};
+            std::atomic<int> remote{0};
+            std::atomic<int> disabled{0};
+            std::atomic<int> keyMiss{0};
+            std::atomic<int> noMapping{0};
+        } m_routeStats;
+
+    protected:
+        void RegisterServerHandlers(Socket::PacketHandlerMapPtr& handlers) override {
+            handlers->emplace(Socket::PacketType::AppendRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleAppendRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::BatchAppendRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleBatchAppendRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::HeadSyncRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleHeadSyncRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::RemoteLockRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleRemoteLockRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::MergeRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleMergeRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::DispatchCommand,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchCommand(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::DispatchResult,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::RingUpdate,
+                [this](Socket::ConnectionID c, Socket::Packet p) { HandleRingUpdate(c, std::move(p)); });
+        }
+
+        void RegisterClientHandlers(Socket::PacketHandlerMapPtr& handlers) override {
+            handlers->emplace(Socket::PacketType::AppendResponse,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleAppendResponse(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::BatchAppendResponse,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleBatchAppendResponse(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::RemoteLockResponse,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleRemoteLockResponse(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::DispatchResult,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); });
+        }
+
+        void BgProtocolStep() override {
+            // Nothing to do once the ring holds at least one node.
+            auto ringNow = std::atomic_load(&m_hashRing);
+            if (ringNow && ringNow->NodeCount() != 0) return;
+
+            // Ring still empty: re-send NodeRegister, but only when the
+            // dispatcher's connection slot exists and holds a valid ID.
+            Socket::ConnectionID dispatcherConn = Socket::c_invalidConnectionID;
+            {
+                std::lock_guard<std::mutex> guard(m_connMutex);
+                const int slots = (int)m_peerConnections.size();
+                if (m_dispatcherNodeIndex < slots) dispatcherConn = m_peerConnections[m_dispatcherNodeIndex];
+            }
+            if (dispatcherConn != Socket::c_invalidConnectionID) SendNodeRegister();
+        }
+
+        bool IsRingSettled() const override {
+            auto ring = std::atomic_load(&m_hashRing);
+            return ring && ring->NodeCount() > 0;
+        }
+
+    private:
+        void SendNodeRegister() {
+            NodeRegisterMsg msg;
+            msg.m_nodeIndex = m_localNodeIndex;
+            msg.m_host = m_nodeAddrs[m_localNodeIndex].first;
+            msg.m_port = m_nodeAddrs[m_localNodeIndex].second;
+            // Worker's 0-based index = m_localNodeIndex - 1 (since 0 is dispatcher)
+            int workerIdx = m_localNodeIndex - 1;  // NOTE(review): negative if m_localNodeIndex is 0 — presumably never true for a worker; confirm
+            int numStores = static_cast<int>(m_nodeStores.size());
+            msg.m_store = (numStores > 0) ? m_nodeStores[workerIdx % numStores] : "";  // stores are assigned round-robin; "" when none are configured
+
+            std::size_t bodySize = msg.EstimateBufferSize();
+            Socket::Packet pkt;
+            pkt.Header().m_packetType = Socket::PacketType::NodeRegisterRequest;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = 0;
+            pkt.Header().m_bodyLength = static_cast<std::uint32_t>(bodySize);
+            pkt.AllocateBuffer(static_cast<std::uint32_t>(bodySize));
+            msg.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            auto connID = GetPeerConnection(m_dispatcherNodeIndex);
+            if (connID != Socket::c_invalidConnectionID) {  // without a dispatcher connection the built packet is discarded silently
+                m_client->SendPacket(connID, std::move(pkt), nullptr);
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "WorkerNode: Sent NodeRegister (node %d) to dispatcher\n", m_localNodeIndex);
+            }
+        }
+
+        void HandleRingUpdate(Socket::ConnectionID connID, Socket::Packet packet) {  // connID is not used in this body
+            RingUpdateMsg msg;
+            if (!msg.Read(packet.Body())) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "WorkerNode: Failed to parse RingUpdate\n");
+                return;  // a malformed update is logged and not ACKed
+            }
+
+            auto newRing = std::make_shared<ConsistentHashRing>(msg.m_vnodeCount);  // built off to the side, then published in one atomic store
+            for (auto idx : msg.m_nodeIndices) {
+                newRing->AddNode(idx);
+            }
+            {
+                std::lock_guard<std::mutex> guard(m_ringWriteMutex);
+                std::atomic_store(&m_hashRing,
+                    std::shared_ptr<const ConsistentHashRing>(std::move(newRing)));
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "WorkerNode: Ring updated — %d nodes (v%u)\n",
+                (int)msg.m_nodeIndices.size(), msg.m_ringVersion);
+
+            SendRingUpdateACK(msg.m_ringVersion);  // NOTE(review): version is logged and ACKed but never compared with a previous one — confirm stale updates cannot arrive
+        }
+
+        void SendRingUpdateACK(std::uint32_t ringVersion) {
+            RingUpdateACKMsg msg;
+            msg.m_nodeIndex = m_localNodeIndex;
+            msg.m_ringVersion = ringVersion;
+
+            std::size_t bodySize = msg.EstimateBufferSize();
+            Socket::Packet pkt;
+            pkt.Header().m_packetType = Socket::PacketType::RingUpdateACK;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = 0;
+            pkt.Header().m_bodyLength = static_cast<std::uint32_t>(bodySize);
+            pkt.AllocateBuffer(static_cast<std::uint32_t>(bodySize));
+            msg.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            auto connID = GetPeerConnection(m_dispatcherNodeIndex);
+            if (connID != Socket::c_invalidConnectionID) {
+                m_client->SendPacket(connID, std::move(pkt), nullptr);
+            }
+        }
+
+        int m_dispatcherNodeIndex = 0;
+        RemotePostingOps m_remoteOps;
+        DispatchCoordinator m_dispatch;
+
+        mutable std::mutex m_appendQueueMutex;
+        std::unordered_map<int, std::vector<RemoteAppendRequest>> m_appendQueue;
+        std::atomic<size_t> m_remoteQueueSize{0};
+        // Serializes concurrent FlushRemoteAppends() callers so we don't open
+        // hundreds of simultaneous RPC streams to the remote worker (which has
+        // only 8 server threads / 256 connection slots). With this mutex, only
+        // one thread sends at a time; concurrent callers either wait for the
+        // current flush to finish or contribute their items to the queue.
+        std::mutex m_flushMutex;
+
+        // Per-node mutex used by end-of-batch FlushRemoteAppends so concurrent
+        // sends to the SAME node from the final-drain path remain ordered.
+        // Auto-flushes (QueueRemoteAppend) instead use m_perNodeInflight to
+        // cap concurrency at kMaxInflightPerNode per node.
+        std::mutex m_perNodeAppendFlushMutexMapLock;
+        std::unordered_map<int, std::unique_ptr<std::mutex>> m_perNodeAppendFlushMutex;
+        std::atomic<int> m_inflightAppendFlushes{0};
+        std::unordered_map<int, int> m_perNodeInflight; // guarded by m_appendQueueMutex
+        static constexpr size_t kAutoFlushThreshold = 50000;
+        static constexpr int kMaxInflightPerNode = 4;
+
+        std::mutex& GetPerNodeAppendFlushMutex(int nodeIndex) {
+            // Lazily create one mutex per node; entries are heap-allocated, so the
+            // returned reference stays valid when other nodes are added later.
+            std::lock_guard<std::mutex> mapGuard(m_perNodeAppendFlushMutexMapLock);
+            std::unique_ptr<std::mutex>& slot = m_perNodeAppendFlushMutex[nodeIndex];
+            if (!slot) {
+                slot = std::make_unique<std::mutex>();
+            }
+            return *slot;
+        }
+
+        // Cross-node merge hint queue. Per-target dedup set of packed
+        // (layer << 32 | headID) values; QueueRemoteMerge inserts and
+        // auto-flushes when the per-target bucket reaches threshold.
+        mutable std::mutex m_mergeQueueMutex;
+        std::unordered_map<int, std::unordered_set<std::int64_t>> m_mergeQueue;
+        std::atomic<size_t> m_mergeQueueSize{0};
+        // Merge hints are non-urgent (best-effort optimization). A larger
+        // bucket trades a small amount of latency for much better dedup and
+        // network batching. End-of-batch FlushRemoteMerges() guarantees no
+        // hint is permanently dropped.
+        static constexpr size_t kMergeAutoFlushThreshold = 8192;
+    };
+
+} // namespace SPTAG::SPANN
+
+#endif // _SPTAG_SPANN_WORKERNODE_H_
diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h
index fe3d306a1..29129bdb4 100644
--- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h
+++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h
@@ -19,6 +19,7 @@
 #include "inc/Core/Common/LocalVersionMap.h"
 #include "inc/Core/Common/TiKVVersionMap.h"
 #include "ExtraFileController.h"
+#include "Distributed/WorkerNode.h"
 #include <chrono>
 #include <cstdint>
 #include <algorithm>
@@ -207,15 +208,29 @@ namespace SPTAG::SPANN {
         };
 
     private:
+        std::atomic<int> m_workspaceCount = 0;
+
         std::shared_ptr<Helper::KeyValueIO> db;
+        WorkerNode* m_worker = nullptr;  // externally owned, set via SetWorker()
+
+    public:
+        // Expose the underlying KV handle so a standalone WorkerNode can be wired to the
+        // same DB this searcher already opened, instead of opening a second one.
+        std::shared_ptr<Helper::KeyValueIO> GetDB() const { return db; }
 
+    private:
         SPANN::Index<ValueType>* m_headIndex;
         std::unique_ptr<COMMON::IVersionMap> m_versionMap;
         Options* m_opt;
         int m_layer;
+        SizeType m_initialVectorSize = 0;  // vector count at build time (before inserts)
 
         COMMON::FineGrainedRWLock m_rwLocks;
 
+        // Per-bucket flags for remote (cross-node) locking.
+        static constexpr int kRemoteLockPoolSize = 32767;
+        std::unique_ptr<std::atomic<bool>[]> m_remoteBucketLocked;
+
         IndexStats m_stat;
 
         std::shared_ptr<PersistentBuffer> m_wal;
@@ -339,9 +354,247 @@ namespace SPTAG::SPANN {
             SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Posting size limit: %d, search limit: %f, merge threshold: %d\n", m_postingSizeLimit, p_opt.m_latencyLimit, m_mergeThreshold);
             SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[CONFIG] layer=%d DistributedVersionMap=%s SearchCheckVersionMapOnlyLayer0=%s UseMultiChunkPosting=%s PostingPageLimit=%d\n",
                 layer, p_opt.m_distributedVersionMap ? "true" : "false", p_opt.m_searchCheckVersionMapOnlyLayer0 ? "true" : "false", p_opt.m_useMultiChunkPosting ? "true" : "false", p_opt.m_postingPageLimit);
+
+            // Initialize per-bucket remote lock flags
+            m_remoteBucketLocked.reset(new std::atomic<bool>[kRemoteLockPoolSize + 1]{});
+        }
+
+        ~ExtraDynamicSearcher() {
+            // Hand this layer's callback slot back to the worker (it checks that
+            // `this` is the owner), then forget the externally owned pointer.
+            if (m_worker != nullptr) m_worker->ClearCallbacksIfOwner(m_layer, this);
+            m_worker = nullptr;
+        }
+
+        int GetNumWorkerNodes() const {
+            // Without an attached, enabled worker this searcher counts as a single node.
+            const bool distributed = (m_worker != nullptr) && m_worker->IsEnabled();
+            if (!distributed) return 1;
+            return std::max(1, m_worker->GetNumWorkerNodes());  // never report fewer than one node
+        }
+
+        int GetWorkerNodeIndex() const {
+            // Falls back to 0 in single-node mode or when the worker reports a negative index.
+            const bool distributed = (m_worker != nullptr) && m_worker->IsEnabled();
+            if (!distributed) return 0;
+            const int reported = m_worker->GetWorkerNodeIndex();
+            return (reported < 0) ? 0 : reported;
+        }
+
+        // Maps a local VID to a global one: build-time VIDs pass through, later inserts are striped across workers.
+        SizeType AllocateGlobalVID(SizeType localVID) const override {
+            const int workerCount = GetNumWorkerNodes();
+            const bool striped = (workerCount > 1) && (localVID >= m_initialVectorSize);
+            return striped ? m_initialVectorSize + (localVID - m_initialVectorSize) * workerCount + GetWorkerNodeIndex() : localVID;
+        }
+
+        // Registers, for m_layer, a submitter on the worker that forwards jobs to
+        // m_splitThreadPool (add_high when flagged high priority, add otherwise).
+        // A no-op until both worker and pool exist, so either setup path may call it.
+        void WireJobSubmitterIfReady() {
+            const bool ready = m_worker && m_splitThreadPool;
+            if (!ready) return;
+            auto submit = [threadPool = m_splitThreadPool](Helper::ThreadPool::Job* job, bool urgent) {
+                if (urgent) { threadPool->add_high(job); return; }
+                threadPool->add(job);
+            };
+            m_worker->SetJobSubmitter(m_layer, std::move(submit));
+        }
+
+        /// Stores the externally owned WorkerNode and, when it is non-null, registers this
+        /// instance's callbacks (append, head-sync, remote-lock, merge-hint) under m_layer.
+        void SetWorker(WorkerNode* router) override {
+            m_worker = router;  // NOTE(review): a previously bound worker keeps its callbacks (capturing `this`) - confirm intended
+            if (!m_worker) return;
+
+            WireJobSubmitterIfReady();  // no-op while m_splitThreadPool is unset
+
+            // Claim ownership so the matching destructor's IfOwner check
+            // clears the right slot if/when we are deleted (multi-layer SPANN
+            // each layer has its own slot keyed by m_layer).
+            m_worker->ClaimCallbackOwnership(m_layer, this);
+
+            // Append callback: routes incoming remote appends to local Append()
+            m_worker->SetAppendCallback(m_layer,
+                [this](SizeType headID, std::shared_ptr<std::string> headVec,
+                       int appendNum, std::string& appendPosting) -> ErrorCode {
+                    // Reuse SPDKThreadPool's per-worker pre-allocated workspace
+                    // when called from BatchAppendItemJob on m_splitThreadPool.
+                    ExtraWorkSpace localWorkSpace;
+                    ExtraWorkSpace* ws = static_cast<ExtraWorkSpace*>(tls_preallocAppendWorkSpace);
+                    if (!ws) {
+                        m_headIndex->InitWorkSpace(&localWorkSpace);
+                        ws = &localWorkSpace;
+                    }
+                    bool wasMissing = !m_headIndex->ContainSample(headID, m_layer + 1);  // head not present in the local head index
+                    if (wasMissing && headVec && !headVec->empty()) {
+                        DimensionType dim = static_cast<DimensionType>(
+                            headVec->size() / sizeof(ValueType));  // byte length -> element count
+                        m_headIndex->AddHeadIndex(headVec->data(), headID, 0,
+                            dim, m_layer + 1, ws);
+                    }
+
+                    // Mirror sender's version map for the records we're about
+                    // to persist so MergePostings + SearchIndex don't drop
+                    // them as "stale". Records or local entries at version 0xfe are left untouched.
+                    {
+                        const uint8_t* basePtr = reinterpret_cast<const uint8_t*>(appendPosting.data());
+                        size_t totalRec = appendPosting.size() / m_vectorInfoSize;
+                        EnsureVersionMapCoversPosting(basePtr, totalRec, "AppendCallback", headID);  // grow the map before the range check below
+
+                        const SizeType localCount = m_versionMap->Count();  // read after the possible grow
+                        std::vector<SizeType> batchVids;
+                        std::vector<uint8_t> batchVers;
+                        batchVids.reserve(totalRec);
+                        batchVers.reserve(totalRec);
+                        for (size_t i = 0; i < totalRec; ++i) {
+                            const uint8_t* p = basePtr + i * m_vectorInfoSize;
+                            SizeType vid = *reinterpret_cast<const SizeType*>(p);  // record starts with its VID ...
+                            uint8_t recVer = *(p + sizeof(SizeType));  // ... followed by a one-byte version
+                            if (vid < 0 || vid >= localCount) continue;  // not addressable in the local map
+                            if (recVer == 0xfe) continue;
+                            uint8_t curVer = m_versionMap->GetVersion(vid);
+                            if (curVer == 0xfe) continue;
+                            if (curVer == recVer) continue;  // already in sync
+                            batchVids.push_back(vid);
+                            batchVers.push_back(recVer);
+                        }
+                        if (!batchVids.empty()) {
+                            m_versionMap->SetVersionBatch(batchVids, batchVers);
+                        }
+                    }
+                    return Append(ws, headID, appendNum, appendPosting, 0);
+                });
+
+            // Head sync callback: apply head index updates from peers
+            auto* headIndex = m_headIndex;  // copied into locals so the lambda below does not capture `this`
+            int layer = m_layer;
+            auto* worker = m_worker;
+            m_worker->SetHeadSyncCallback(m_layer, [headIndex, layer, worker](const HeadSyncEntry& entry) {
+                if (entry.op == HeadSyncEntry::Op::Add) {
+                    headIndex->AddHeadIndex(entry.headVector.data(), entry.headVID, 0,
+                        static_cast<DimensionType>(entry.headVector.size() / sizeof(ValueType)),
+                        layer + 1, nullptr);  // no workspace is supplied on this path
+                    if (worker) worker->NoteHeadSyncApplyAdd();
+                } else {
+                    headIndex->DeleteIndex(entry.headVID, layer + 1);  // every non-Add op is treated as a delete
+                    if (worker) worker->NoteHeadSyncApplyDelete();
+                }
+            });
+
+            // Remote lock callback: per-bucket atomic flags; returns false when the bucket is busy
+            m_worker->SetRemoteLockCallback(m_layer, [this](SizeType headID, bool lock) -> bool {
+                unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast<unsigned>(headID));
+                if (lock) {
+                    bool expected = false;
+                    if (!m_remoteBucketLocked[bucket].compare_exchange_strong(expected, true)) {
+                        return false;  // another remote locker already holds this bucket
+                    }
+                    if (!m_rwLocks[headID].try_lock()) {
+                        m_remoteBucketLocked[bucket].store(false);  // local work in progress: roll the flag back
+                        return false;
+                    }
+                    m_rwLocks[headID].unlock();  // probe only: the local lock is released, just the flag stays set
+                    return true;
+                } else {
+                    m_remoteBucketLocked[bucket].store(false);
+                    return true;
+                }
+            });
+
+            // Cross-node merge hint callback
+            m_worker->SetMergeCallback(m_layer, [this](SizeType headID) {
+                MergeAsync(headID);
+            });
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "WorkerNode bound to ExtraDynamicSearcher (layer %d)\n", m_layer);
+        }
 
-        ~ExtraDynamicSearcher() {}
+        // Polls (1 ms sleeps, 5 s cap) until no peer holds the advisory flag for headID's bucket; on timeout it warns and proceeds.
+        void WaitForRemoteBucketUnlocked(SizeType headID) const {
+            const bool distributed = m_worker && m_worker->IsEnabled();
+            if (!distributed) return;
+            const unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast<unsigned>(headID));
+            const auto& flag = m_remoteBucketLocked[bucket];
+            if (!flag.load(std::memory_order_acquire)) return;  // fast path: nothing to wait for
+            constexpr int kMaxRemoteBucketWaitMs = 5000;
+            const auto giveUpAt = std::chrono::steady_clock::now() + std::chrono::milliseconds(kMaxRemoteBucketWaitMs);
+            for (; flag.load(std::memory_order_acquire); std::this_thread::sleep_for(std::chrono::milliseconds(1))) {
+                if (std::chrono::steady_clock::now() <= giveUpAt) continue;
+                // Deadline passed: stop waiting so the owner-side operation is not blocked forever.
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "WaitForRemoteBucketUnlocked: headID=%lld bucket=%u stuck for %d ms, proceeding\n",
+                    (std::int64_t)headID, bucket, kMaxRemoteBucketWaitMs);
+                return;
+            }
+        }
+
+        // Builds a RemoteAppendRequest (head id, this layer, record count,
+        // posting bytes) and queues it on the worker for nodeIndex. When
+        // headVecBytes is non-null, m_vectorDataSize bytes are copied from it
+        // into the request; otherwise the head-vector field keeps its default.
+        // The caller must have resolved the owner; m_worker is not null-checked.
+        void EnqueueRemoteAppend(int nodeIndex,
+                                 SizeType headID,
+                                 int appendNum,
+                                 std::string posting,
+                                 const void* headVecBytes = nullptr) {
+            RemoteAppendRequest request;
+            request.m_layer = m_layer;
+            request.m_headID = headID;
+            request.m_appendNum = appendNum;
+            request.m_appendPosting = std::move(posting);
+            const char* centroid = static_cast<const char*>(headVecBytes);
+            if (centroid) request.m_headVec.assign(centroid, m_vectorDataSize);
+            m_worker->QueueRemoteAppend(nodeIndex, std::move(request));
+        }
+
+        // Routes an append to headID's remote owner when there is one.
+        // Returns true if the request was queued for another node (the caller
+        // must then skip its local write); false means "handle it locally".
+        bool TryRouteRemoteAppend(SizeType headID,
+                                  int appendNum,
+                                  std::string posting,
+                                  const void* headVecBytes = nullptr) {
+            // Routing only applies to layer 0 with an enabled worker. Inner
+            // layers (m_layer > 0) keep per-node-local state (no shared head
+            // VID space, no cross-node TiKV key naming contract), so an RPC
+            // for them could not be meaningfully applied by the receiver.
+            const bool routable = m_worker && m_worker->IsEnabled() && m_layer == 0;
+            if (!routable) return false;
+            // Ask the worker which node owns this head.
+            const auto owner = m_worker->GetOwner(headID);
+            if (!owner.isLocal) {
+                EnqueueRemoteAppend(owner.nodeIndex, headID, appendNum,
+                                    std::move(posting), headVecBytes);
+                return true;
+            }
+            return false;
+        }
+
+        // Grows the local version map so the largest VID in a posting about to be
+        // written is addressable; otherwise remote postings with VIDs >= Count()
+        // get dropped silently. A failed grow is logged as an error (void return).
+        void EnsureVersionMapCoversPosting(const uint8_t* p_basePtr, size_t p_totalRec,
+                                           const char* p_caller, SizeType p_headID) {
+            const SizeType localCount = m_versionMap->Count();
+            SizeType maxVid = -1;
+            for (size_t i = 0; i < p_totalRec; ++i) {  // each record begins with its VID
+                SizeType vid = *reinterpret_cast<const SizeType*>(p_basePtr + i * m_vectorInfoSize);
+                if (vid > maxVid) maxVid = vid;
+            }
+            if (maxVid < localCount) return;  // already covered (also the empty-posting case)
+            const SizeType need = (maxVid - localCount) + 1;  // subtract first: maxVid + 1 could overflow
+            if (m_versionMap->AddBatch(need) != ErrorCode::Success) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "%s: failed to extend local versionMap by %lld (head=%lld maxVid=%lld)\n",
+                    p_caller, (std::int64_t)need, (std::int64_t)p_headID, (std::int64_t)maxVid);
+                return;
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Debug,
+                "%s: extended local versionMap by %lld (head=%lld maxVid=%lld localCount=%lld)\n",
+                p_caller, (std::int64_t)need, (std::int64_t)p_headID, (std::int64_t)maxVid, (std::int64_t)localCount);
+        }
 
         virtual bool Available() override
         {
@@ -419,7 +672,12 @@ namespace SPTAG::SPANN {
         
         virtual ErrorCode AddIDCapacity(SizeType capa, bool deleted) override
         {
-            return m_versionMap->AddBatch(capa, deleted);
+            // Distributed mode: size the version map for the whole cluster's
+            // batch (capa * numWorkers) rather than this node's share, because
+            // AllocateGlobalVID stripes inserts and can hand out globalVIDs up
+            // to m_initialVectorSize + insertCount * numWorkers.
+            const int workerCount = GetNumWorkerNodes();  // 1 when not distributed
+            return m_versionMap->AddBatch(capa * workerCount, deleted);
         }
 
         SPANN::Index<ValueType>* GetHeadIndex() const { return m_headIndex; }
@@ -616,6 +874,23 @@ namespace SPTAG::SPANN {
             double elapsedMSeconds;
             uint64_t splitPostingVectors = 0;
             uint64_t splitNewHeadCount = 0;
+
+            // Only the OWNER of headID should run Split. Remote-issued
+            // splits get dropped early so we don't mutate a posting that
+            // doesn't live on this node.
+            if (m_worker && m_worker->IsEnabled()) {
+                auto target = m_worker->GetOwner(headID);
+                if (!target.isLocal) {
+                    std::unique_lock<std::shared_timed_mutex> tmplock(m_splitListLock);
+                    m_splitList.unsafe_erase(headID);
+                    return ErrorCode::Success;
+                }
+            }
+
+            // Owner-side: wait for any in-flight remote-initiated lock on
+            // this bucket to release the advisory flag before we mutate.
+            WaitForRemoteBucketUnlocked(headID);
+
             {
                 std::unique_lock<std::shared_timed_mutex> lock(m_rwLocks[headID], std::defer_lock);
                 if (requirelock) {
@@ -838,6 +1113,17 @@ namespace SPTAG::SPANN {
                             //SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Split: new head VID %lld already exists in head index. Do merging...\n", (std::int64_t)(newHeadVID));
                             m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed);
 
+                            // If newHeadVID's owner is a remote node, route
+                            // the new posting via RemoteAppend; the owner
+                            // will merge it into the existing posting list.
+                            if (TryRouteRemoteAppend(
+                                    newHeadVID,
+                                    (int)(newPostingLists[k].size() / m_vectorInfoSize),
+                                    newPostingLists[k],
+                                    args.centers + k * args._D)) {
+                                if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) anotherLock.unlock();
+                                continue;
+                            }
 
                             std::string mergedPostingList;
                             std::set<SizeType> vectorIdSet;
@@ -925,20 +1211,36 @@ namespace SPTAG::SPANN {
                                 SplitAsync(newHeadVID, currentLength);
                             }
                         } else {
-                            auto splitPutBegin = std::chrono::high_resolution_clock::now();
-                            if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) {
-                                SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID));
-                                return ret;
+                            // If newHeadVID's owner is a remote node, route
+                            // the initial posting via RemoteAppend so it
+                            // ends up in the owner's TiKV. We still add the
+                            // head locally and rely on BroadcastHeadSync
+                            // (after this loop) to spread the head index
+                            // update to all nodes. The receiver's
+                            // AppendCallback materializes the head if its
+                            // HeadSync hasn't arrived yet.
+                            bool remoteCreated = TryRouteRemoteAppend(
+                                newHeadVID,
+                                (int)(newPostingLists[k].size() / m_vectorInfoSize),
+                                newPostingLists[k],
+                                args.centers + k * args._D);
+
+                            if (!remoteCreated) {
+                                auto splitPutBegin = std::chrono::high_resolution_clock::now();
+                                if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) {
+                                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID));
+                                    return ret;
+                                }
+                                CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting");
+                                auto splitPutEnd = std::chrono::high_resolution_clock::now();
+                                elapsedMSeconds = std::chrono::duration_cast<std::chrono::microseconds>(splitPutEnd - splitPutBegin).count();
+                                m_stat.m_putCost += elapsedMSeconds;
                             }
-                            CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting");
-                            auto splitPutEnd = std::chrono::high_resolution_clock::now();
-                            elapsedMSeconds = std::chrono::duration_cast<std::chrono::microseconds>(splitPutEnd - splitPutBegin).count();
-                            m_stat.m_putCost += elapsedMSeconds;
 
                             auto updateHeadBegin = std::chrono::high_resolution_clock::now();
                             if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) {
                                 SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID));
-                                if (db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) {
+                                if (!remoteCreated && db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) {
                                     SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete gc posting %lld\n", (std::int64_t)(newHeadVID));
                                 }
                                 return ret;
@@ -962,6 +1264,35 @@ namespace SPTAG::SPANN {
                     }
                 }
 
+                // Broadcast HeadSync to peer nodes when the head update lands
+                // in our local BKT (in-memory, per-compute). Lower-layer head
+                // adds that resolve to m_extraSearchers[m_layer+1]->AddIndex
+                // already write to shared TiKV so re-broadcasting them would
+                // only duplicate.
+                if (m_worker && m_worker->IsEnabled()
+                    && m_headIndex->GetDiskIndex(m_layer + 1) == nullptr) {
+                    std::vector<HeadSyncEntry> headSyncEntries;
+                    for (int k = 0; k < 2; k++) {
+                        if (args.counts[k] == 0 || (int)newHeadsID.size() <= k) continue;
+                        HeadSyncEntry entry;
+                        entry.op = HeadSyncEntry::Op::Add;
+                        entry.headVID = newHeadsID[k];
+                        entry.m_layer = m_layer;
+                        // headVector carries raw bytes: copy m_vectorDataSize bytes of center k, not that many ValueType elements.
+                        const char* centerBytes = reinterpret_cast<const char*>(args.centers + k * args._D);
+                        entry.headVector.assign(centerBytes, centerBytes + m_vectorDataSize);
+                        headSyncEntries.push_back(std::move(entry));
+                    }
+                    if (!theSameHead) {
+                        HeadSyncEntry entry;
+                        entry.op = HeadSyncEntry::Op::Delete;
+                        entry.headVID = headID;
+                        entry.m_layer = m_layer;
+                        headSyncEntries.push_back(std::move(entry));
+                    }
+                    if (!headSyncEntries.empty()) m_worker->BroadcastHeadSync(headSyncEntries);
+                }
+
                 {
                     std::unique_lock<std::shared_timed_mutex> tmplock(m_splitListLock);
                     //SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"erase: %d\n", headID);
@@ -1003,6 +1334,18 @@ namespace SPTAG::SPANN {
 
         ErrorCode MergePostings(ExtraWorkSpace *p_exWorkSpace, SizeType headID)
         {
+            // The owner runs its own merge passes. Skip when this head is
+            // owned by another node — we'd just be racing the owner.
+            if (m_worker && m_worker->IsEnabled()) {
+                auto target = m_worker->GetOwner(headID);
+                if (!target.isLocal) {
+                    std::unique_lock<std::shared_timed_mutex> tmplock(m_mergeListLock);
+                    m_mergeList.unsafe_erase(headID);
+                    return ErrorCode::Success;
+                }
+            }
+            WaitForRemoteBucketUnlocked(headID);
+
             std::unique_lock<std::shared_timed_mutex> lock(m_rwLocks[headID]);
 
             if (!m_headIndex->ContainSample(headID, m_layer + 1)) {
@@ -1102,23 +1445,61 @@ namespace SPTAG::SPANN {
                 int deletedLength = 0;
                 {
                     std::unique_lock<std::shared_timed_mutex> anotherLock(m_rwLocks[queryResult->VID], std::defer_lock);
-                    // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Locked: %d, to be lock: %d\n", headID, queryResult->VID);
-                    if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) {
-                        if (!anotherLock.try_lock()) {
-                            auto* curJob = new MergeAsyncJob(this, headID, nullptr);
-                            // Re-queue counts as a new submission; matched by the
-                            // m_mergeJobsInFlight-- / m_totalMergeCompleted++ in
-                            // MergeAsyncJob::exec(). Without these increments
-                            // m_mergeJobsInFlight underflows to a huge uint64
-                            // and m_totalMergeCompleted exceeds m_totalMergeSubmitted.
-                            m_mergeJobsInFlight++;
-                            m_totalMergeSubmitted++;
-                            m_splitThreadPool->add(curJob);
-                            return ErrorCode::Success;
+
+                    // RAII guard for the advisory remote bucket lock.
+                    struct RemoteLockGuard {
+                        WorkerNode* router = nullptr;
+                        int nodeIndex = -1;
+                        int layer = 0;
+                        SizeType headID = -1;
+                        bool active = false;
+                        ~RemoteLockGuard() { if (active && router) router->SendRemoteLock(nodeIndex, layer, headID, false); }
+                        void release() { active = false; }
+                    } remoteLockGuard;
+
+                    bool isRemoteCandidate = false;
+                    int remoteNodeIndex = -1;
+                    if (m_worker && m_worker->IsEnabled()) {
+                        auto target = m_worker->GetOwner(queryResult->VID);
+                        if (!target.isLocal) {
+                            isRemoteCandidate = true;
+                            remoteNodeIndex = target.nodeIndex;
+                            if (!m_worker->SendRemoteLock(remoteNodeIndex, m_layer, queryResult->VID, true)) {
+                                // Remote owner busy; skip this candidate.
+                                continue;
+                            }
+                            remoteLockGuard.router = m_worker;
+                            remoteLockGuard.nodeIndex = remoteNodeIndex;
+                            remoteLockGuard.layer = m_layer;
+                            remoteLockGuard.headID = queryResult->VID;
+                            remoteLockGuard.active = true;
                         }
                     }
-                    if (!m_headIndex->ContainSample(queryResult->VID, m_layer + 1)) continue;
+
+                    if (!isRemoteCandidate) {
+                        // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Locked: %d, to be lock: %d\n", headID, queryResult->VID);
+                        if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) {
+                            if (!anotherLock.try_lock()) {
+                                auto* curJob = new MergeAsyncJob(this, headID, nullptr);
+                                // Re-queue counts as a new submission; matched by the
+                                // m_mergeJobsInFlight-- / m_totalMergeCompleted++ in
+                                // MergeAsyncJob::exec(). Without these increments
+                                // m_mergeJobsInFlight underflows to a huge uint64
+                                // and m_totalMergeCompleted exceeds m_totalMergeSubmitted.
+                                m_mergeJobsInFlight++;
+                                m_totalMergeSubmitted++;
+                                m_splitThreadPool->add(curJob);
+                                return ErrorCode::Success;
+                            }
+                        }
+                        if (!m_headIndex->ContainSample(queryResult->VID, m_layer + 1)) continue;
+                    }
+
                     if ((ret=db->Get(DBKey(queryResult->VID), &nextPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) {
+                        if (isRemoteCandidate) {
+                            // Stale fetch on remote side; skip and let next round retry.
+                            continue;
+                        }
                         SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
                                         "Fail to get to be merged posting: %lld, get size:%d\n",
                                         (std::int64_t)(queryResult->VID), (int)(nextPostingList.size()));
@@ -1143,6 +1524,14 @@ namespace SPTAG::SPANN {
                         nextLength++;
                     }
                     if (resultVec == nullptr) {
+                        if (isRemoteCandidate) {
+                            // Stale fetch / version skew on remote side. Skip
+                            // and let the next merge round retry.
+                            SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                                "MergePostings: remote candidate %lld has no head record in fetched posting, skipping\n",
+                                (std::int64_t)(queryResult->VID));
+                            continue;
+                        }
                         SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail: cannot find another head vector in posting! headID:%lld\n", (std::int64_t)(queryResult->VID));
                         return ErrorCode::Fail;
                     }
@@ -1158,11 +1547,25 @@ namespace SPTAG::SPANN {
                             return ret;
                         }
                         CheckCentroid(headID, mergedPostingList, "MergePostings-currentLength >= nextLength");
-                        m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1);
-                        if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success)
-                        {
-                            SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(queryResult->VID));
-                            return ret;
+                        if (isRemoteCandidate) {
+                            // Survivor is local; delete remote loser first
+                            // (so we don't have duplicate VID across nodes),
+                            // then drop local head-index entry.
+                            if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success
+                                && ret != ErrorCode::Key_NotFound) {
+                                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                                    "MergePostings: remote-loser Delete(%lld) failed; survivor %lld is durable\n",
+                                    (std::int64_t)queryResult->VID, (std::int64_t)headID);
+                                return ret;
+                            }
+                            m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1);
+                        } else {
+                            m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1);
+                            if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success)
+                            {
+                                SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(queryResult->VID));
+                                return ret;
+                            }
                         }
                         nextHeadID = headID;
                         nextHeadVec = headVec;
@@ -1175,6 +1578,12 @@ namespace SPTAG::SPANN {
                             mergedPostingList += *resultVec;
                         }
                         if ((ret=db->Put(DBKey(queryResult->VID), mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) {
+                            if (isRemoteCandidate) {
+                                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                                    "MergePostings: remote-survivor Put(%lld) failed; no state mutated, next round will retry\n",
+                                    (std::int64_t)queryResult->VID);
+                                return ret;
+                            }
                             SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail to override posting %lld after merge\n", (std::int64_t)(queryResult->VID));
                             return ret;
                         }
@@ -1182,6 +1591,12 @@ namespace SPTAG::SPANN {
                         m_headIndex->DeleteIndex(headID, m_layer + 1);
                         if ((ret = db->Delete(DBKey(headID))) != ErrorCode::Success)
                         {
+                            if (isRemoteCandidate) {
+                                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                                    "MergePostings: local-loser Delete(%lld) failed; remote survivor %lld is durable\n",
+                                    (std::int64_t)headID, (std::int64_t)queryResult->VID);
+                                return ret;
+                            }
                             SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(headID));
                             return ret;
                         }
@@ -1191,7 +1606,15 @@ namespace SPTAG::SPANN {
                         deletedPostingList = &currentPostingList;
                         deletedLength = currentLength;
                     }
-                    if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) anotherLock.unlock();
+                    if (isRemoteCandidate) {
+                        // Release advisory remote lock before reassign below.
+                        if (remoteLockGuard.active) {
+                            remoteLockGuard.router->SendRemoteLock(
+                                remoteLockGuard.nodeIndex, remoteLockGuard.layer,
+                                remoteLockGuard.headID, false);
+                            remoteLockGuard.release();
+                        }
+                    } else if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) anotherLock.unlock();
                 }
 
                 // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Release: %d, Release: %d\n", headID, queryResult->VID);
@@ -1553,6 +1976,38 @@ namespace SPTAG::SPANN {
                 SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Error!, headID :%lld, appendNum:%d\n", (std::int64_t)headID, appendNum);
             }
 
+            // If this head is owned by a remote node, route the append via
+            // QueueRemoteAppend instead of touching local TiKV. appendNum is
+            // captured BEFORE std::move(appendPosting) to avoid use-after-move.
+            // If the batch carries the head's own self-entry (VID == headID),
+            // forward its vector bytes so the receiver can materialize the
+            // head index before the BroadcastHeadSync arrives. See the
+            // matching scan in BatchAppend() for rationale.
+            {
+                const uint8_t* basePtr =
+                    reinterpret_cast<const uint8_t*>(appendPosting.data());
+                const void* headVecBytes = nullptr;
+                for (int i = 0; i < appendNum; ++i) {
+                    const uint8_t* p = basePtr + i * m_vectorInfoSize;
+                    SizeType vid = *reinterpret_cast<const SizeType*>(p);
+                    if (vid == headID) {
+                        headVecBytes = p + m_metaDataSize;
+                        break;
+                    }
+                }
+                if (TryRouteRemoteAppend(headID, appendNum, appendPosting, headVecBytes)) {
+                    if (!reassignThreshold) {
+                        m_totalAppendCount++;
+                        m_stat.m_appendTaskNum++;
+                    }
+                    return ErrorCode::Success;
+                }
+            }
+
+            // If a remote initiator is currently holding the advisory lock
+            // on this bucket, wait it out before we touch the posting.
+            WaitForRemoteBucketUnlocked(headID);
+
         checkDeleted:
             if (!m_headIndex->ContainSample(headID, m_layer + 1)) {
                 for (int i = 0; i < appendNum; i++)
@@ -1684,6 +2139,41 @@ namespace SPTAG::SPANN {
                 auto appendIt = headAppends.find(headID);
                 if (appendIt == headAppends.end()) continue;
 
+                // Owner gate: forward heads owned by a remote node via the
+                // batched RemoteAppend queue. Local heads fall through to
+                // the standard MultiMerge path below. Without this hook,
+                // every node writes to every head's TiKV key and the owner
+                // ring is ignored (no remote RPC, no route stats).
+                //
+                // Pass headVecBytes when this batch carries the head's own
+                // self-entry (VID == headID). During Build-time seed the
+                // receiver may not yet have the head index entry; without
+                // headVecBytes its AppendCallback can't materialize the head
+                // and falls into the ReassignAsync redirect path, dropping
+                // the self-entry from the posting and later causing
+                // "MergePostings fail: cannot find head vector in posting!".
+                {
+                    const std::string& posting = appendIt->second;
+                    const uint8_t* basePtr =
+                        reinterpret_cast<const uint8_t*>(posting.data());
+                    size_t totalRec = posting.size() / m_vectorInfoSize;
+                    const void* headVecBytes = nullptr;
+                    for (size_t i = 0; i < totalRec; ++i) {
+                        const uint8_t* p = basePtr + i * m_vectorInfoSize;
+                        SizeType vid = *reinterpret_cast<const SizeType*>(p);
+                        if (vid == headID) {
+                            headVecBytes = p + m_metaDataSize;
+                            break;
+                        }
+                    }
+                    if (TryRouteRemoteAppend(headID,
+                                             (int)(posting.size() / m_vectorInfoSize),
+                                             posting,
+                                             headVecBytes)) {
+                        continue;
+                    }
+                }
+
                 std::unique_lock<std::shared_timed_mutex> headLock(m_rwLocks[headID]);
 
                 if (!m_headIndex->ContainSample(headID, m_layer + 1)) {
@@ -1788,6 +2278,10 @@ namespace SPTAG::SPANN {
                 //LOG(Helper::LogLevel::LL_Info, "Reassign: oldVID:%d, replicaCount:%d, candidateNum:%d, dist0:%f\n", oldVID, replicaCount, i, selections[0].distance);
                 for (int i = 0; i < replicaCount && m_versionMap->GetVersion(VID) == version; i++) {
                     //LOG(Helper::LogLevel::LL_Info, "Reassign: headID :%d, oldVID:%d, newVID:%d, posting length: %d, dist: %f, string size: %d\n", headID, oldVID, VID, m_postingSizes[headID].load(), selections[i].distance, newPart.size());
+                    if (TryRouteRemoteAppend(selections[i].VID, 1, *vectorInfo,
+                                             selections[i].Vec.Data())) {
+                        continue;
+                    }
                     // [FIX H3] use reassignThreshold=0 so that an oversized
                     // target posting triggers SplitAsync (not a synchronous
                     // Split on this worker thread). This matches the
@@ -1813,6 +2307,7 @@ namespace SPTAG::SPANN {
 
         bool LoadIndex(Options& p_opt) override {
             m_opt = &p_opt;
+            m_initialVectorSize = p_opt.m_vectorSize;  // initial count for VID stripe
             SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "DataBlockSize: %d, Capacity: %d\n", m_opt->m_datasetRowsInBlock, m_opt->m_datasetCapacity);
             std::string versionmapPath = m_opt->m_indexDirectory + FolderSep + m_opt->m_deleteIDFile + "_" + std::to_string(m_layer);
             if (m_opt->m_recovery) {
@@ -1901,13 +2396,33 @@ namespace SPTAG::SPANN {
 	    }
             if (m_opt->m_update) {
                 if (m_splitThreadPool == nullptr) {
-                    SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum);
-
-                    m_splitThreadPool = std::make_shared<SPDKThreadPool>();
-                    m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this);
-                    //m_reassignThreadPool = std::make_shared<SPDKThreadPool>();
-                    //m_reassignThreadPool->initSPDK(m_opt->m_reassignThreadNum, this);
-                    SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization\n");
+                    // Only layer 0 participates in the shared-pool slot:
+                    // it both adopts (if a sibling published first) and
+                    // publishes (so the WorkerNode receiver and any later
+                    // layer-0 instance can reuse the same threads).
+                    // Inner layers (m_layer > 0) always create their own
+                    // pool, matching qianxi's per-instance pool design.
+                    if (m_layer == 0 && m_headIndex) {
+                        auto shared = m_headIndex->GetSharedSplitPool();
+                        if (shared) {
+                            m_splitThreadPool = std::static_pointer_cast<SPDKThreadPool>(shared);
+                        }
+                    }
+                    if (m_splitThreadPool == nullptr) {
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum);
+
+                        m_splitThreadPool = std::make_shared<SPDKThreadPool>();
+                        m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this);
+                        //m_reassignThreadPool = std::make_shared<SPDKThreadPool>();
+                        //m_reassignThreadPool->initSPDK(m_opt->m_reassignThreadNum, this);
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization\n");
+                        if (m_layer == 0 && m_headIndex) m_headIndex->SetSharedSplitPool(m_splitThreadPool);
+                    } else {
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: adopted shared split pool from sibling layer\n");
+                    }
+                    // Pool is now ready: re-attempt wiring the worker's job
+                    // submitter (may have been set before pool was alive).
+                    WireJobSubmitterIfReady();
                 }
                 
                 if (m_opt->m_enableWAL && !m_opt->m_persistentBufferPath.empty()) {
@@ -2345,6 +2860,7 @@ namespace SPTAG::SPANN {
             {
                 auto fullVectors = p_reader->GetVectorSet();
                 fullCount = fullVectors->Count();
+                m_initialVectorSize = fullCount;  // remember bulk-build count for stripe formula
                 m_metaDataSize = sizeof(SizeType) + sizeof(uint8_t);
                 m_vectorDataSize = fullVectors->PerVectorDataSize();
                 m_vectorInfoSize = m_vectorDataSize + m_metaDataSize;
@@ -2556,10 +3072,20 @@ namespace SPTAG::SPANN {
 
             if (m_opt->m_update && !m_opt->m_allowZeroReplica && zeroReplicaCount > 0)
             {
-                SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum);
-                m_splitThreadPool = std::make_shared<SPDKThreadPool>();
-                m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this);
-                SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization, zeroReplicaCount:%zu\n", zeroReplicaCount);
+                if (m_splitThreadPool == nullptr && m_layer == 0 && m_headIndex) {
+                    auto shared = m_headIndex->GetSharedSplitPool();
+                    if (shared) {
+                        m_splitThreadPool = std::static_pointer_cast<SPDKThreadPool>(shared);
+                    }
+                }
+                if (m_splitThreadPool == nullptr) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum);
+                    m_splitThreadPool = std::make_shared<SPDKThreadPool>();
+                    m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this);
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization, zeroReplicaCount:%zu\n", zeroReplicaCount);
+                    if (m_layer == 0 && m_headIndex) m_headIndex->SetSharedSplitPool(m_splitThreadPool);
+                }
+                WireJobSubmitterIfReady();
 
                 uint32_t splitNumBeforeZeroReplica = m_stat.m_splitNum;
                 uint32_t reassignNumBeforeZeroReplica = m_stat.m_reAssignNum;
@@ -2834,6 +3360,16 @@ namespace SPTAG::SPANN {
             return ErrorCode::VectorNotFound;
         }
 
+        // Flush remote appends through m_worker, then log and reset its route stats.
+        ErrorCode FlushRemoteAppends() {
+            if (!m_worker || !m_worker->IsEnabled()) return ErrorCode::Success;
+            // Stats are logged and reset whatever the flush returned; its result is passed through.
+            ErrorCode ret = m_worker->FlushRemoteAppends();
+            m_worker->LogRouteStats(" (batch flush)");
+            m_worker->ResetRouteStats();
+            return ret;
+        }
+
         bool AllFinished() {
             if (!m_splitThreadPool) return true;
 
diff --git a/AnnService/inc/Core/SPANN/ExtraTiKVController.h b/AnnService/inc/Core/SPANN/ExtraTiKVController.h
index d7528d479..0541eaad1 100644
--- a/AnnService/inc/Core/SPANN/ExtraTiKVController.h
+++ b/AnnService/inc/Core/SPANN/ExtraTiKVController.h
@@ -12,6 +12,7 @@
 #include "kvproto/tikvpb.grpc.pb.h"
 #include "kvproto/kvrpcpb.pb.h"
 #include "kvproto/metapb.pb.h"
+#include "kvproto/pdpb.pb.h"
 #include "kvproto/pdpb.grpc.pb.h"
 
 #include <map>
diff --git a/AnnService/inc/Core/SPANN/IExtraSearcher.h b/AnnService/inc/Core/SPANN/IExtraSearcher.h
index 554b02421..ec8d8bf95 100644
--- a/AnnService/inc/Core/SPANN/IExtraSearcher.h
+++ b/AnnService/inc/Core/SPANN/IExtraSearcher.h
@@ -22,6 +22,11 @@
 namespace SPTAG {
     namespace SPANN {
 
+        // Forward declaration; the only IExtraSearcher API that touches WorkerNode
+        // is the SetWorker() hook below. Concrete searchers that care
+        // (ExtraDynamicSearcher) include the full header and override.
+        class WorkerNode;
+
         struct SearchStats
         {
             SearchStats()
@@ -589,6 +594,11 @@ namespace SPTAG {
                 SizeType p_begin) { return ErrorCode::Undefined; }
             virtual ErrorCode DeleteIndex(SizeType p_id) { return ErrorCode::Undefined; }
 
+            // Translate a VID from this node's BKT counter into a global VID.
+            // The default is the identity; ExtraDynamicSearcher overrides it
+            // with the stripe formula when m_worker is enabled.
+            virtual SizeType AllocateGlobalVID(SizeType p_localVID) const { return p_localVID; }
+
             virtual SizeType GetNumSamples() const = 0;
 
             virtual bool ContainSample(const SizeType idx) const
@@ -624,6 +634,11 @@ namespace SPTAG {
                 return ErrorCode::Undefined;
             }
 
+            // Bind a routing worker (no-op by default). ExtraDynamicSearcher
+            // overrides this to install the cross-node append + put +
+            // fetch-postings callbacks. ExtraStaticSearcher etc. ignore it.
+            virtual void SetWorker(WorkerNode* /*worker*/) {}
+
             virtual bool AllFinished() { return false; }
             virtual void GetDBStats() { return; }
             virtual int64_t GetNumBlocks() { return 0; }
@@ -640,6 +655,8 @@ namespace SPTAG {
             }
 
             virtual ErrorCode Checkpoint(std::string prefix) { return ErrorCode::Success; }
+
+            virtual void InitWorkSpace(ExtraWorkSpace* p_exWorkSpace, bool clear = false) {}
         };
     } // SPANN
 } // SPTAG
diff --git a/AnnService/inc/Core/SPANN/Index.h b/AnnService/inc/Core/SPANN/Index.h
index 5479d2d42..255043a58 100644
--- a/AnnService/inc/Core/SPANN/Index.h
+++ b/AnnService/inc/Core/SPANN/Index.h
@@ -47,6 +47,11 @@ namespace SPTAG
         template<typename T>
 	    class SPANNResultIterator;
 
+        // Forward-declare so Index<T> can hold/forward a WorkerNode pointer
+        // without dragging in the full Distributed/WorkerNode.h header (and
+        // thus its boost-asio + grpc transitive deps) into Index.h.
+        class WorkerNode;
+
         template<typename T>
         class Index;
         template<typename T>
@@ -63,6 +68,12 @@ namespace SPTAG
             std::vector<std::shared_ptr<IExtraSearcher>> m_extraSearchers;
             std::unique_ptr<SPTAG::COMMON::IWorkSpaceFactory<ExtraWorkSpace>> m_workSpaceFactory;
 
+            // Routing worker bound BEFORE BuildIndex so that
+            // ExtraDynamicSearcher::WriteDownAllPostingToDB and other build
+            // hooks see a non-null m_worker as each layer's searcher is
+            // emplaced. SPFreshTest sets this in BuildOnly+Distributed mode.
+            WorkerNode* m_pendingWorker = nullptr;
+
             Options m_options;
 
             std::function<float(const T*, const T*, DimensionType)> m_fComputeDistance;
@@ -85,6 +96,14 @@ namespace SPTAG
             std::shared_ptr<Helper::Concurrent::ConcurrentQueue<int>> m_freeWorkSpaceIds;
             std::atomic<int> m_workspaceCount = 0;
 
+            // Shared split/append thread pool slot. In this patch only
+            // layer-0 searchers use it: the first to create its pool
+            // publishes it here and later layer-0 instances adopt it.
+            // Inner layers (m_layer > 0) create their own pools, so the
+            // total worker count is not capped at AppendThreadNum.
+            mutable std::mutex m_sharedSplitPoolMutex;
+            std::shared_ptr<Helper::ThreadPool> m_sharedSplitPool;
+
         public:
             Index()
             {
@@ -124,6 +143,27 @@ namespace SPTAG
             inline std::shared_ptr<IExtraSearcher> GetDiskIndex(int layer = 0) { if (layer < m_extraSearchers.size()) return m_extraSearchers[layer]; else return nullptr; }
             inline Options* GetOptions() { return &m_options; }
 
+            // Bind a routing worker. Forwards to all currently-existing
+            // extraSearchers and remembers the pointer so newly-emplaced
+            // searchers (created during BuildIndexInternalLayer) also pick
+            // it up. Pass nullptr to detach.
+            void SetWorker(WorkerNode* worker) {
+                m_pendingWorker = worker;
+                for (auto& searcher : m_extraSearchers) {
+                    if (searcher) searcher->SetWorker(worker);
+                }
+            }
+            inline WorkerNode* GetPendingWorker() const { return m_pendingWorker; }
+
+            inline std::shared_ptr<Helper::ThreadPool> GetSharedSplitPool() const {
+                std::lock_guard<std::mutex> lk(m_sharedSplitPoolMutex);
+                return m_sharedSplitPool;
+            }
+            inline void SetSharedSplitPool(std::shared_ptr<Helper::ThreadPool> pool) {
+                std::lock_guard<std::mutex> lk(m_sharedSplitPoolMutex);
+                m_sharedSplitPool = std::move(pool);
+            }
+
             inline SizeType GetNumSamples() const { return GetNumSamples(0); }
             inline SizeType GetNumSamples(int layer) const { if (layer < m_extraSearchers.size()) return m_extraSearchers[layer]->GetNumSamples(); else return m_topIndex->GetNumSamples(); }
             inline DimensionType GetFeatureDim() const { return m_topIndex->GetFeatureDim(); }
diff --git a/AnnService/inc/Core/VectorIndex.h b/AnnService/inc/Core/VectorIndex.h
index a25bf1e63..62e2ca843 100644
--- a/AnnService/inc/Core/VectorIndex.h
+++ b/AnnService/inc/Core/VectorIndex.h
@@ -5,6 +5,7 @@
 #define _SPTAG_VECTORINDEX_H_
 
 #include <unordered_set>
+#include <map>
 #include "Common.h"
 #include "Common/WorkSpace.h"
 #include "inc/Helper/DiskIO.h"
@@ -160,6 +161,14 @@ class VectorIndex
 
     static ErrorCode LoadIndex(const std::string& p_loaderFilePath, std::shared_ptr<VectorIndex>& p_vectorIndex);
 
+    /// LoadIndex with config overrides applied between LoadIndexConfig and LoadIndexData,
+    /// so settings such as TiKVPDAddresses take effect before the underlying KV connection
+    /// is constructed. Override keys may be section-qualified ("Section.Param"); unqualified
+    /// keys default to the "BuildSSDIndex" section.
+    static ErrorCode LoadIndex(const std::string& p_loaderFilePath,
+                               const std::map<std::string, std::string>& p_paramOverrides,
+                               std::shared_ptr<VectorIndex>& p_vectorIndex);
+
     static ErrorCode LoadIndexFromFile(const std::string& p_file, std::shared_ptr<VectorIndex>& p_vectorIndex);
 
     static ErrorCode LoadIndex(const std::string& p_config, const std::vector<ByteArray>& p_indexBlobs, std::shared_ptr<VectorIndex>& p_vectorIndex);
diff --git a/AnnService/inc/Helper/KeyValueIO.h b/AnnService/inc/Helper/KeyValueIO.h
index a7c3c25b8..9d7c1e2a3 100644
--- a/AnnService/inc/Helper/KeyValueIO.h
+++ b/AnnService/inc/Helper/KeyValueIO.h
@@ -34,6 +34,20 @@ namespace SPTAG
 
             virtual ErrorCode Put(const SizeType key, const std::string& value, const std::chrono::microseconds& timeout, std::vector<Helper::AsyncReadRequest>* reqs) = 0;
 
+            // Batched writes/deletes. Default implementations return Undefined so that
+            // backends without native batching (RocksDB, FileIO) can ignore them.
+            // TiKVIO overrides these to issue a single batched RPC per region group,
+            // which dramatically reduces the number of synchronous gRPC round-trips
+            // when callers (e.g. SPANN AddIndex Phase 2 / PutPostingToDB) want to
+            // commit several keys at once.
+            virtual ErrorCode MultiPut(const std::vector<std::string>& keys,
+                                       const std::vector<std::string>& values,
+                                       const std::chrono::microseconds& timeout,
+                                       std::vector<Helper::AsyncReadRequest>* reqs) { return ErrorCode::Undefined; }
+
+            virtual ErrorCode MultiDelete(const std::vector<std::string>& keys,
+                                          const std::chrono::microseconds& timeout) { return ErrorCode::Undefined; }
+
             virtual ErrorCode Merge(const SizeType key, const std::string &value,
                                     const std::chrono::microseconds &timeout,
                                     std::vector<Helper::AsyncReadRequest> *reqs, int& size) = 0;
diff --git a/AnnService/inc/Helper/ThreadPool.h b/AnnService/inc/Helper/ThreadPool.h
index 01c82e2a7..a351a75c8 100644
--- a/AnnService/inc/Helper/ThreadPool.h
+++ b/AnnService/inc/Helper/ThreadPool.h
@@ -5,7 +5,7 @@
 #define _SPTAG_HELPER_THREADPOOL_H_
 
 #include <atomic>
-#include <deque>
+#include <queue>
 #include <vector>
 #include <thread>
 #include <mutex>
@@ -78,28 +78,42 @@ namespace SPTAG
             {
                 {
                     std::lock_guard<std::mutex> lock(m_lock);
-                    m_jobs.push_back(j);
+                    m_jobs.push(j);
                 }
                 m_cond.notify_one();
             }
 
-            void addfront(Job* j)
+            // High-priority push: jobs in m_highJobs always run before m_jobs.
+            // Used by the distributed receiver to let inbound BatchAppend RPC
+            // work jump ahead of local Split/Merge/Reassign so the sender
+            // (driver) doesn't time out waiting for the chunk ack while the
+            // local pool drains long-running rebalance work.
+            void add_high(Job* j)
             {
                 {
                     std::lock_guard<std::mutex> lock(m_lock);
-                    m_jobs.push_front(j);
+                    m_highJobs.push(j);
                 }
                 m_cond.notify_one();
             }
 
+            // Alias kept for callers of addfront() (e.g., split-async path). Forwards to
+            // add_high: still ahead of m_jobs, but FIFO among high jobs (was push_front).
+            void addfront(Job* j) { add_high(j); }
+
             bool get(Job*& j)
             {
                 std::unique_lock<std::mutex> lock(m_lock);
-                while (m_jobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock);
+                while (m_jobs.empty() && m_highJobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock);
                 if (!m_abort.ShouldAbort()) {
-                    j = m_jobs.front();
+                    if (!m_highJobs.empty()) {
+                        j = m_highJobs.front();
+                        m_highJobs.pop();
+                    } else {
+                        j = m_jobs.front();
+                        m_jobs.pop();
+                    }
                     currentJobs++;
-                    m_jobs.pop_front();
                     return true;
                 }
                 return false;
@@ -108,7 +122,7 @@ namespace SPTAG
             size_t jobsize()
             {
                 std::lock_guard<std::mutex> lock(m_lock);
-                return m_jobs.size();
+                return m_jobs.size() + m_highJobs.size();
             }
 
             inline uint32_t runningJobs() { return currentJobs; }
@@ -122,7 +136,8 @@ namespace SPTAG
 
         protected:
             std::atomic_uint32_t currentJobs{ 0 };
-            std::deque<Job*> m_jobs;
+            std::queue<Job*> m_jobs;
+            std::queue<Job*> m_highJobs;
             Abort m_abort;
             std::mutex m_lock;
             std::condition_variable m_cond;
diff --git a/AnnService/inc/Socket/ConnectionManager.h b/AnnService/inc/Socket/ConnectionManager.h
index e487c6105..0c199ecb1 100644
--- a/AnnService/inc/Socket/ConnectionManager.h
+++ b/AnnService/inc/Socket/ConnectionManager.h
@@ -41,7 +41,11 @@ class ConnectionManager : public std::enable_shared_from_this<ConnectionManager>
     inline static std::uint32_t GetPosition(ConnectionID p_connectionID);
 
 private:
-    static constexpr std::uint32_t c_connectionPoolSize = 1 << 8;
+    // Bumped from 1<<8 (256) to 1<<12 (4096) to avoid silently dropping new
+    // connections when reconnect storms (e.g., from concurrent FlushRemoteAppends
+    // timeouts) saturate the pool. Each ConnectionItem is small; 4096 slots is
+    // ~64KB per ConnectionManager, which is negligible.
+    static constexpr std::uint32_t c_connectionPoolSize = 1 << 12;
 
     static constexpr std::uint32_t c_connectionPoolMask = c_connectionPoolSize - 1;
 
diff --git a/AnnService/inc/Socket/Packet.h b/AnnService/inc/Socket/Packet.h
index 8c99b09fe..6d8c1d146 100644
--- a/AnnService/inc/Socket/Packet.h
+++ b/AnnService/inc/Socket/Packet.h
@@ -27,13 +27,47 @@ enum class PacketType : std::uint8_t
 
     SearchRequest = 0x03,
 
+    AppendRequest = 0x04,
+
+    BatchAppendRequest = 0x05,
+
+    HeadSyncRequest = 0x07,
+
+    RemoteLockRequest = 0x08,
+
+    DispatchCommand = 0x09,
+
+    NodeRegisterRequest = 0x0A,
+
+    RingUpdate = 0x0B,
+
+    RingUpdateACK = 0x0C,
+
+    // Cross-node merge hint. Search on node X observes posting H is
+    // underfull, but H is owned by node Y. X sends MergeRequest to Y so
+    // Y can schedule its own MergeAsync(H). Fire-and-forget (no response
+    // packet): the receiver's MergeAsync already dedups via m_mergeList,
+    // a lost notification just means Y discovers H underfull via some
+    // other path (own search, own Append, explicit RefineIndex).
+    MergeRequest = 0x11,
+
     ResponseMask = 0x80,
 
+    NodeRegisterResponse = ResponseMask | NodeRegisterRequest,
+
     HeartbeatResponse = ResponseMask | HeartbeatRequest,
 
     RegisterResponse = ResponseMask | RegisterRequest,
 
-    SearchResponse = ResponseMask | SearchRequest
+    SearchResponse = ResponseMask | SearchRequest,
+
+    AppendResponse = ResponseMask | AppendRequest,
+
+    BatchAppendResponse = ResponseMask | BatchAppendRequest,
+
+    RemoteLockResponse = ResponseMask | RemoteLockRequest,
+
+    DispatchResult = ResponseMask | DispatchCommand,
 };
 
 
diff --git a/AnnService/inc/Socket/SimpleSerialization.h b/AnnService/inc/Socket/SimpleSerialization.h
index 6da925625..e0b8141dd 100644
--- a/AnnService/inc/Socket/SimpleSerialization.h
+++ b/AnnService/inc/Socket/SimpleSerialization.h
@@ -82,6 +82,58 @@ namespace SimpleSerialization
     }
 
 
+    /// Bounds-checked variants of SimpleReadBuffer. Each returns the advanced read position, or
+    /// nullptr if p_buffer is nullptr or the read would overrun [p_buffer, p_bufEnd). A nullptr
+    /// p_bufEnd disables the bounds check. The string/ByteArray overloads clear p_val up front.
+    template<typename T>
+    inline const std::uint8_t*
+    SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, T& p_val)
+    {
+        static_assert(std::is_fundamental<T>::value || std::is_enum<T>::value,
+                      "Only applied for fundanmental type.");
+        if (p_buffer == nullptr) return nullptr;
+        // Reject an inverted range too: the unsigned cast would turn a negative span into a huge one.
+        if (p_bufEnd != nullptr && (p_bufEnd < p_buffer || static_cast<std::size_t>(p_bufEnd - p_buffer) < sizeof(T))) return nullptr;
+        std::memcpy(&p_val, p_buffer, sizeof(T));  // bytewise copy: p_buffer may be misaligned for T
+        return p_buffer + sizeof(T);
+    }
+
+
+    inline const std::uint8_t*
+    SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, std::string& p_val)
+    {
+        p_val.clear();
+        if (p_buffer == nullptr) return nullptr;
+        std::uint32_t len = 0;
+        p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, len);
+        if (p_buffer == nullptr) return nullptr;
+        if (len > 0)
+        {
+            if (p_bufEnd != nullptr && static_cast<std::size_t>(p_bufEnd - p_buffer) < len) return nullptr;
+            p_val.assign(reinterpret_cast<const char*>(p_buffer), len);
+        }
+        return p_buffer + len;
+    }
+
+
+    inline const std::uint8_t*
+    SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, ByteArray& p_val)
+    {
+        p_val.Clear();
+        if (p_buffer == nullptr) return nullptr;
+        std::uint32_t len = 0;
+        p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, len);
+        if (p_buffer == nullptr) return nullptr;
+        if (len > 0)
+        {
+            if (p_bufEnd != nullptr && static_cast<std::size_t>(p_bufEnd - p_buffer) < len) return nullptr;
+            p_val = ByteArray::Alloc(len);
+            std::memcpy(p_val.Data(), p_buffer, len);
+        }
+        return p_buffer + len;
+    }
+
+
     template<>
     inline std::size_t
     EstimateBufferSize<std::string>(const std::string& p_val)
diff --git a/AnnService/src/Core/SPANN/ExtraFileController.cpp b/AnnService/src/Core/SPANN/ExtraFileController.cpp
index 24c839455..b5db83822 100644
--- a/AnnService/src/Core/SPANN/ExtraFileController.cpp
+++ b/AnnService/src/Core/SPANN/ExtraFileController.cpp
@@ -25,7 +25,7 @@ bool FileIO::BlockController::Initialize(SPANN::Options &p_opt, int p_layer)
 #ifndef _MSC_VER
             O_RDWR | O_DIRECT, numblocks, 2, 2,
             max(p_opt.m_ioThreads, (2 * max(p_opt.m_searchThreadNum, p_opt.m_iSSDNumberOfThreads) +
-                                    (p_opt.m_layers + 1) * (p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum))),
+                                    p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum)),
             ((std::uint64_t)p_opt.m_startFileSize) << 30
 #else
             GENERIC_READ | GENERIC_WRITE, numblocks, 2, 2,
diff --git a/AnnService/src/Core/SPANN/SPANNIndex.cpp b/AnnService/src/Core/SPANN/SPANNIndex.cpp
index f3f83dca6..38ea1c72d 100644
--- a/AnnService/src/Core/SPANN/SPANNIndex.cpp
+++ b/AnnService/src/Core/SPANN/SPANNIndex.cpp
@@ -1227,6 +1227,15 @@ template <typename T> ErrorCode Index<T>::BuildIndexInternalLayer(std::shared_pt
             m_extraSearchers.emplace_back(std::make_shared<ExtraDynamicSearcher<T>>(m_options, m_extraSearchers.size(), this, m_db));
         }
 
+        // Hand the routing worker (if any) to the freshly-created searcher
+        // before BuildIndex runs. Build itself no longer routes postings
+        // (shared TiKV cluster — the driver writes straight to TiKV and PD
+        // routes each key to the owning store), but other build-time hooks
+        // that consult m_worker still benefit from seeing a non-null value.
+        if (m_pendingWorker) {
+            m_extraSearchers.back()->SetWorker(m_pendingWorker);
+        }
+
         {
             std::shared_ptr<Helper::DiskIO> ptr = SPTAG::f_createIO();
             if (ptr == nullptr ||
@@ -1862,7 +1871,74 @@ ErrorCode Index<T>::AddIndex(const void *p_data, SizeType p_vectorNum, Dimension
     }
     workSpace->m_deduper.clear();
     workSpace->m_postingIDs.clear();
-    return m_extraSearchers[0]->AddIndex(workSpace.get(), vectorSet, begin);
+
+    // Use multiple threads for RNGSelection + Append when vector count is large enough.
+    // Each thread fetch_add's one vector and calls ExtraDynamicSearcher::AddIndex with a
+    // single-vector view, so AppendBatchAsync flushes per-vector and pipelines with the
+    // worker side rather than queuing the whole batch behind a single huge flush.
+    if (p_vectorNum > 1 && m_options.m_iSSDNumberOfThreads > 1) {
+        int numThreads = std::min((int)p_vectorNum, m_options.m_iSSDNumberOfThreads);
+        std::atomic_int nextVec{0};
+        std::atomic<ErrorCode> globalError{ErrorCode::Success};
+        int printStep = std::max(1, p_vectorNum / 50);
+
+        auto worker = [&](bool isFirst) {
+            std::unique_ptr<ExtraWorkSpace> ws;
+            ExtraWorkSpace* wsPtr;
+            if (isFirst) {
+                wsPtr = workSpace.get();
+            } else {
+                ws = m_workSpaceFactory->GetWorkSpace();
+                if (!ws) {
+                    ws.reset(new ExtraWorkSpace());
+                    InitWorkSpace(ws.get(), false);
+                } else {
+                    InitWorkSpace(ws.get(), true);
+                }
+                ws->m_deduper.clear();
+                ws->m_postingIDs.clear();
+                wsPtr = ws.get();
+            }
+
+            while (globalError.load(std::memory_order_relaxed) == ErrorCode::Success) {
+                int v = nextVec.fetch_add(1);
+                if (v >= p_vectorNum) break;
+
+                if (v % printStep == 0) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "AddIndex bulk: %d/%d (%.1f%%)\n",
+                                 v, p_vectorNum, v * 100.0 / p_vectorNum);
+                    GetDBStat();
+                }
+
+                std::shared_ptr<VectorSet> singleVec = std::make_shared<BasicVectorSet>(
+                    ByteArray((std::uint8_t*)vectorSet->GetVector(v),
+                              sizeof(T) * p_dimension, false),
+                    GetEnumValueType<T>(), p_dimension, 1);
+                ErrorCode ret = m_extraSearchers[0]->AddIndex(wsPtr, singleVec,
+                    m_extraSearchers[0]->AllocateGlobalVID(begin + v));
+                if (ret != ErrorCode::Success) {
+                    globalError.store(ret, std::memory_order_relaxed);
+                }
+            }
+
+            if (!isFirst && ws) {
+                m_workSpaceFactory->ReturnWorkSpace(std::move(ws));
+            }
+        };
+
+        std::vector<std::thread> threads;
+        threads.reserve(numThreads - 1);
+        for (int t = 1; t < numThreads; t++) {
+            threads.emplace_back(worker, false);
+        }
+        worker(true);
+        for (auto& t : threads) t.join();
+
+        return globalError.load();
+    }
+
+    return m_extraSearchers[0]->AddIndex(workSpace.get(), vectorSet,
+        m_extraSearchers[0]->AllocateGlobalVID(begin));
 }
 
 template <typename T>
diff --git a/AnnService/src/Core/VectorIndex.cpp b/AnnService/src/Core/VectorIndex.cpp
index 2f8ebfd13..35bcaf585 100644
--- a/AnnService/src/Core/VectorIndex.cpp
+++ b/AnnService/src/Core/VectorIndex.cpp
@@ -793,6 +793,14 @@ std::shared_ptr<VectorIndex> VectorIndex::CreateInstance(IndexAlgoType p_algo, V
 }
 
 ErrorCode VectorIndex::LoadIndex(const std::string &p_loaderFilePath, std::shared_ptr<VectorIndex> &p_vectorIndex)
+{
+    static const std::map<std::string, std::string> emptyOverrides;
+    return LoadIndex(p_loaderFilePath, emptyOverrides, p_vectorIndex);
+}
+
+ErrorCode VectorIndex::LoadIndex(const std::string &p_loaderFilePath,
+                                 const std::map<std::string, std::string> &p_paramOverrides,
+                                 std::shared_ptr<VectorIndex> &p_vectorIndex)
 {
     std::string folderPath(p_loaderFilePath);
     if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep)
@@ -816,6 +824,23 @@ ErrorCode VectorIndex::LoadIndex(const std::string &p_loaderFilePath, std::share
     if ((ret = p_vectorIndex->LoadIndexConfig(iniReader)) != ErrorCode::Success)
         return ret;
 
+    // Apply param overrides AFTER LoadIndexConfig but BEFORE LoadIndexData, so that
+    // settings like TiKVPDAddresses are reflected in m_options before the KV connection
+    // is constructed inside LoadIndexData -> PrepareDB.
+    for (const auto &kv : p_paramOverrides)
+    {
+        const std::string &key = kv.first;
+        const std::string &val = kv.second;
+        auto dotPos = key.find('.');
+        if (dotPos != std::string::npos) {
+            std::string section = key.substr(0, dotPos);
+            std::string param = key.substr(dotPos + 1);
+            p_vectorIndex->SetParameter(param.c_str(), val.c_str(), section.c_str());
+        } else {
+            p_vectorIndex->SetParameter(key.c_str(), val.c_str(), "BuildSSDIndex");
+        }
+    }
+
     std::shared_ptr<std::vector<std::string>> indexfiles = p_vectorIndex->GetIndexFiles();
     if (iniReader.DoesSectionExist("MetaData"))
     {
diff --git a/AnnService/src/Socket/Connection.cpp b/AnnService/src/Socket/Connection.cpp
index 150889d2f..444c7afb0 100644
--- a/AnnService/src/Socket/Connection.cpp
+++ b/AnnService/src/Socket/Connection.cpp
@@ -26,10 +26,19 @@ Connection::Connection(ConnectionID p_connectionID, boost::asio::ip::tcp::socket
 
 void Connection::Start()
 {
-    SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Start, local: %u, remote: %s:%u\n",
-                 static_cast<uint32_t>(m_socket.local_endpoint().port()),
-                 m_socket.remote_endpoint().address().to_string().c_str(),
-                 static_cast<uint32_t>(m_socket.remote_endpoint().port()));
+    boost::system::error_code localEc, remoteEc;  // one code per lookup: each call overwrites the error_code it is given
+    auto localEp = m_socket.local_endpoint(localEc);
+    auto remoteEp = m_socket.remote_endpoint(remoteEc);
+    if (!localEc && !remoteEc) {
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Start, local: %u, remote: %s:%u\n",
+                     static_cast<uint32_t>(localEp.port()),
+                     remoteEp.address().to_string().c_str(),
+                     static_cast<uint32_t>(remoteEp.port()));
+    } else {  // an endpoint lookup failed: log the first error and do not start this connection
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Connection Start, socket not connected: %s\n",
+                     (localEc ? localEc : remoteEc).message().c_str());
+        return;
+    }
 
     if (!m_stopped.exchange(false))
     {
@@ -42,10 +51,15 @@ void Connection::Start()
 
 void Connection::Stop()
 {
-    SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Stop, local: %u, remote: %s:%u\n",
-                 static_cast<uint32_t>(m_socket.local_endpoint().port()),
-                 m_socket.remote_endpoint().address().to_string().c_str(),
-                 static_cast<uint32_t>(m_socket.remote_endpoint().port()));
+    boost::system::error_code localEc, remoteEc;  // one code per lookup so neither result can overwrite the other
+    auto localEp = m_socket.local_endpoint(localEc);
+    auto remoteEp = m_socket.remote_endpoint(remoteEc);
+    if (!localEc && !remoteEc) {  // log only when both endpoints resolved; Stop continues either way
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Stop, local: %u, remote: %s:%u\n",
+                     static_cast<uint32_t>(localEp.port()),
+                     remoteEp.address().to_string().c_str(),
+                     static_cast<uint32_t>(remoteEp.port()));
+    }
 
     if (m_stopped.exchange(true))
     {
diff --git a/AnnService/src/Socket/Server.cpp b/AnnService/src/Socket/Server.cpp
index 9781bf1d4..8be0682c6 100644
--- a/AnnService/src/Socket/Server.cpp
+++ b/AnnService/src/Socket/Server.cpp
@@ -26,7 +26,7 @@ Server::Server(const std::string &p_address, const std::string &p_port, const Pa
 
     boost::asio::ip::tcp::endpoint endpoint = *(endPoints.begin());
     m_acceptor.open(endpoint.protocol());
-    m_acceptor.set_option(boost::asio::ip::tcp::acceptor::reuse_address(false));
+    m_acceptor.set_option(boost::asio::ip::tcp::acceptor::reuse_address(true));
 
     m_acceptor.bind(endpoint, errCode);
     if (errCode)
diff --git a/Test/CMakeLists.txt b/Test/CMakeLists.txt
index 52f4168a9..27bdeebb5 100644
--- a/Test/CMakeLists.txt
+++ b/Test/CMakeLists.txt
@@ -24,7 +24,7 @@ if (NOT LIBRARYONLY)
     file(GLOB TEST_HDR_FILES ${PROJECT_SOURCE_DIR}/Test/inc/Test.h)
     file(GLOB TEST_SRC_FILES ${PROJECT_SOURCE_DIR}/Test/src/*.cpp)
     add_executable(SPTAGTest ${TEST_SRC_FILES} ${TEST_HDR_FILES})
-    target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES})
+    target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle)
 
     install(TARGETS SPTAGTest
       RUNTIME DESTINATION bin  
diff --git a/Test/inc/TestDataGenerator.h b/Test/inc/TestDataGenerator.h
index 5820c8422..9f958f43d 100644
--- a/Test/inc/TestDataGenerator.h
+++ b/Test/inc/TestDataGenerator.h
@@ -29,7 +29,20 @@ namespace TestUtils {
 
         static std::shared_ptr<SPTAG::MetadataSet> LoadMetadataSet(const std::string pmetaset, const std::string pmetaidx, SPTAG::SizeType start = 0, SPTAG::SizeType count = -1);
 
-        static float EvaluateRecall(const std::vector<SPTAG::QueryResult> &res, std::shared_ptr<SPTAG::VectorSet> &truth, int recallK, int k, int batch, int totalbatches);
+        // Compute recall against truth file.
+        //
+        // Distributed (per-node) recall: when each node only owns a SUBSET of
+        // the global query set, pass the global query count and this node's
+        // query offset so the truth row indexing is computed in global terms.
+        // The truth file is laid out as:
+        //   [iter=0 VIDs for queries 0..Q-1] [iter=1 VIDs ...] ...
+        //   [iter=0 dists for queries 0..Q-1] [iter=1 dists ...] ...
+        // where Q is the GLOBAL query count, NOT res.size(). With the legacy
+        // res.size()-based formula, distributed batches > 0 read the wrong
+        // rows (off by Q-myCount), giving near-random recall that's noise.
+        // totalQueries=-1 (default) preserves the legacy single-node formula.
+        static float EvaluateRecall(const std::vector<SPTAG::QueryResult> &res, std::shared_ptr<SPTAG::VectorSet> &truth, int recallK, int k, int batch, int totalbatches,
+                                    int totalQueries = -1, int queryOffset = 0);
 
         void RunBatches(std::shared_ptr<SPTAG::VectorSet> &vecset, std::shared_ptr<SPTAG::MetadataSet> &metaset,
                         std::shared_ptr<SPTAG::VectorSet> &addvecset, std::shared_ptr<SPTAG::MetadataSet> &addmetaset,
diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp
index 95c1fc4d5..9ab420db9 100644
--- a/Test/src/SPFreshTest.cpp
+++ b/Test/src/SPFreshTest.cpp
@@ -5,6 +5,10 @@
 #include "inc/Core/Common/DistanceUtils.h"
 #include "inc/Core/Common/QueryResultSet.h"
 #include "inc/Core/SPANN/Index.h"
+#include "inc/Core/SPANN/Distributed/WorkerNode.h"
+#include "inc/Core/SPANN/Distributed/DispatcherNode.h"
+#include "inc/Core/SPANN/ExtraDynamicSearcher.h"
+#include "inc/Core/SPANN/ExtraTiKVController.h"
 #include "inc/Core/SPANN/SPANNResultIterator.h"
 #include "inc/Core/VectorIndex.h"
 #include "inc/Core/Common/IQuantizer.h"
@@ -17,10 +21,13 @@
 #include "inc/Test.h"
 #include "inc/TestDataGenerator.h"
 
+#include <algorithm>
 #include <atomic>
 #include <chrono>
+#include <cstring>
 #include <filesystem>
 #include <fstream>
+#include <future>
 #include <iomanip>
 #include <map>
 #include <memory>
@@ -55,6 +62,181 @@ static __attribute__((constructor)) void install_segfault_handler() {
 
 using namespace SPTAG;
 
+// ---------------------------------------------------------------------------
+// Stride sharding (a.k.a. odd/even sharding) experiment
+// ---------------------------------------------------------------------------
+// When the env var SPFRESH_SHARD_STRIDE is "1", "true", "TRUE" or "yes", each node, instead
+// of inserting a contiguous slice [n*B/N, (n+1)*B/N) of the per-iteration batch,
+// inserts the strided rows {n, n+N, n+2*N, ...} where n=nodeIndex, N=numNodes.
+// This breaks any spatial structure in the input dataset (e.g. SIFT files that
+// are roughly sorted by visual feature), letting us check whether the layer-0
+// split skew (driver 71 vs worker 2 in v18) is caused by contiguous slicing
+// landing similar vectors on the same node and overflowing a small set of heads.
+//
+// The total number of vectors inserted across all nodes per iteration is the
+// same; only the assignment changes. Recall measurement still works because
+// the dataset and ground truth are unchanged — only insert routing differs.
+static bool IsStrideShardEnabled() {
+    // Enabled only for the exact values "1", "true", "TRUE" or "yes"; an unset variable means disabled.
+    const char* raw = std::getenv("SPFRESH_SHARD_STRIDE");
+    const std::string flag(raw != nullptr ? raw : "");
+    return flag == "1" || flag == "true" || flag == "TRUE" || flag == "yes";
+}
+
+// Number of indices i in [0, total) with i % stride == offset; stride <= 1 returns total unchanged.
+static SizeType StrideCount(SizeType total, int stride, int offset) {
+    if (stride <= 1) return total;
+    const bool offsetInRange = (offset >= 0) && (offset < stride);
+    if (!offsetInRange || total <= offset) return 0;
+    return 1 + (total - 1 - offset) / stride;
+}
+
+// Build a strided sub-VectorSet by copying every `stride`-th vector starting at `offset`
+// into a packed ByteArray. Null input -> nullptr; stride <= 1 copies everything (offset ignored).
+static std::shared_ptr<VectorSet> ExtractStridedVectors(
+    const std::shared_ptr<VectorSet>& full, int stride, int offset)
+{
+    if (!full) return nullptr;
+    // StrideCount() reports `total` for stride <= 1, so read rows 0..total-1 in that case
+    // instead of offset + i * stride, which would repeat a row or index out of range.
+    if (stride <= 1) { stride = 1; offset = 0; }
+    SizeType outCount = StrideCount(full->Count(), stride, offset);
+    auto vt = full->GetValueType();
+    auto dim = full->Dimension();
+    size_t perVecSize = full->PerVectorDataSize();
+    if (outCount <= 0) {
+        return std::make_shared<BasicVectorSet>(ByteArray::Alloc(0), vt, dim, 0);
+    }
+    ByteArray buf = ByteArray::Alloc(static_cast<size_t>(outCount) * perVecSize);
+    for (SizeType i = 0; i < outCount; ++i) {
+        SizeType srcIdx = static_cast<SizeType>(offset) + i * static_cast<SizeType>(stride);
+        std::memcpy(buf.Data() + static_cast<size_t>(i) * perVecSize, full->GetVector(srcIdx), perVecSize);
+    }
+    return std::make_shared<BasicVectorSet>(buf, vt, dim, outCount);
+}
+
+// Build a strided sub-MetadataSet (two-pass: compute offsets, then copy). stride <= 1 copies every entry, offset ignored.
+static std::shared_ptr<MetadataSet> ExtractStridedMetadata(
+    const std::shared_ptr<MetadataSet>& full, int stride, int offset)
+{
+    if (!full) return nullptr;
+    if (stride <= 1) { stride = 1; offset = 0; }  // keep srcIdx inside [0, total) when StrideCount() returns total
+    SizeType outCount = StrideCount(full->Count(), stride, offset);
+    if (outCount <= 0) {
+        ByteArray emptyMeta = ByteArray::Alloc(0);
+        ByteArray offBuf = ByteArray::Alloc(sizeof(std::uint64_t));
+        *reinterpret_cast<std::uint64_t*>(offBuf.Data()) = 0ULL;
+        return std::make_shared<MemMetadataSet>(emptyMeta, offBuf, 0);
+    }
+    std::vector<std::uint64_t> offsets(static_cast<size_t>(outCount) + 1, 0ULL);  // offsets[i] = start byte of entry i
+    std::uint64_t total = 0;
+    for (SizeType i = 0; i < outCount; ++i) {
+        SizeType srcIdx = static_cast<SizeType>(offset) + i * static_cast<SizeType>(stride);
+        ByteArray meta = full->GetMetadata(srcIdx);
+        offsets[i] = total;
+        total += meta.Length();
+    }
+    offsets[outCount] = total;  // final slot holds the packed size
+    ByteArray metaBuf = ByteArray::Alloc(total > 0 ? total : 1);  // request at least one byte even if all entries are empty
+    for (SizeType i = 0; i < outCount; ++i) {
+        SizeType srcIdx = static_cast<SizeType>(offset) + i * static_cast<SizeType>(stride);
+        ByteArray meta = full->GetMetadata(srcIdx);
+        if (meta.Length() > 0) {
+            std::memcpy(metaBuf.Data() + offsets[i], meta.Data(), meta.Length());
+        }
+    }
+    ByteArray offBuf = ByteArray::Alloc((static_cast<size_t>(outCount) + 1) * sizeof(std::uint64_t));
+    std::memcpy(offBuf.Data(), offsets.data(), offsets.size() * sizeof(std::uint64_t));
+    return std::make_shared<MemMetadataSet>(metaBuf, offBuf, outCount);
+}
+
+// Helper: split "host:port,host:port,..." into (host, port) string pairs, in input order.
+static std::vector<std::pair<std::string, std::string>> ParseNodeAddrs(const std::string& addrStr) {
+    std::vector<std::pair<std::string, std::string>> endpoints;
+    for (const auto& entry : Helper::StrUtils::SplitString(addrStr, ",")) {
+        const auto fields = Helper::StrUtils::SplitString(entry, ":");
+        if (fields.size() != 2) continue;  // entries that are not exactly host:port are dropped silently
+        endpoints.emplace_back(fields[0], fields[1]);
+    }
+    return endpoints;
+}
+
+// Helper: bind a WorkerNode to ALL ExtraDynamicSearcher layers of a SPANN::Index<T>
+// (for sites that have already extracted the spannIndex pointer).
+// Calls SetWorker() which wires up append, head-sync, and remote-lock callbacks.
+// All layers must have the worker bound so that AddIDCapacity (called per-layer) sees
+// the correct numNodes and grows each layer's TiKVVersionMap to cover the full global
+// VID space (capa * numNodes), not just this node's slice.
+// Layers are probed from 0 upwards until GetDiskIndex(layer) returns null; a layer whose
+// disk index is not an ExtraDynamicSearcher<T> is skipped. A null spannIndex is a no-op.
+template <typename T>
+static void BindWorkerToAllLayers(SPANN::WorkerNode* worker, SPANN::Index<T>* spannIndex) {
+    if (!spannIndex) return;
+    for (int layer = 0; ; ++layer) {
+        // A null disk index marks the end of the layer list.
+        auto diskIndex = spannIndex->GetDiskIndex(layer);
+        if (!diskIndex) break;
+        // Other searcher implementations are left untouched.
+        auto* searcher = dynamic_cast<SPANN::ExtraDynamicSearcher<T>*>(diskIndex.get());
+        if (searcher) searcher->SetWorker(worker);
+    }
+}
+
+// Helper: same as BindWorkerToAllLayers but starts from a generic VectorIndex handle.
+// The handle is down-cast to SPANN::Index<T>; when the cast fails (wrong index type or
+// empty pointer) the resulting nullptr makes BindWorkerToAllLayers return immediately,
+// so nothing is bound. Sharing one loop keeps the two entry points from drifting apart.
+template <typename T>
+static void BindWorkerToIndex(SPANN::WorkerNode* worker, std::shared_ptr<VectorIndex>& index) {
+    BindWorkerToAllLayers<T>(worker, dynamic_cast<SPANN::Index<T>*>(index.get()));
+}
+
+// Configuration for distributed mode, read from [Distributed] ini section (workerIndex comes from the WORKER_INDEX env var).
+struct DistributedConfig {
+    bool enabled = false;
+    int workerIndex = 0;          // 0-based: 0 = driver (dispatcher + worker 0), 1+ = remote worker
+    std::string dispatcherAddr;   // "host:port"
+    std::string workerAddrs;      // "host:port,host:port,..."
+    std::string storeAddrs;       // "addr,addr,..."
+    std::string pdAddrs;          // "host:port,host:port,..." (per-worker PD)
+
+    // Number of workers (for query/insert partitioning): comma count + 1, or 1 when disabled / no workers listed
+    int GetNumWorkers() const {
+        if (!enabled || workerAddrs.empty()) return 1;
+        return (int)std::count(workerAddrs.begin(), workerAddrs.end(), ',') + 1;
+    }
+
+    // Parse dispatcher address into host:port pair; {"", ""} when it is not exactly "host:port"
+    std::pair<std::string, std::string> GetDispatcherAddr() const {
+        auto hp = Helper::StrUtils::SplitString(dispatcherAddr, ":");
+        if (hp.size() == 2) return {hp[0], hp[1]};
+        return {"", ""};
+    }
+
+    // Get PD address for this worker; "" means none configured (caller is expected to fall back to global TiKVPDAddresses)
+    std::string GetLocalPDAddr() const {
+        if (pdAddrs.empty()) return "";
+        auto addrs = Helper::StrUtils::SplitString(pdAddrs, ",");
+        if (workerIndex >= 0 && workerIndex < (int)addrs.size()) return addrs[workerIndex];
+        return addrs.empty() ? std::string() : addrs[0];  // out-of-range (incl. negative WORKER_INDEX) -> first entry
+    }
+
+    static DistributedConfig FromIni(Helper::IniReader& ini) {
+        DistributedConfig cfg;
+        cfg.enabled = ini.GetParameter("Distributed", "Enabled", false);
+        cfg.dispatcherAddr = ini.GetParameter("Distributed", "DispatcherAddr", std::string(""));
+        cfg.workerAddrs = ini.GetParameter("Distributed", "WorkerAddrs", std::string(""));
+        cfg.storeAddrs = ini.GetParameter("Distributed", "StoreAddrs", std::string(""));
+        cfg.pdAddrs = ini.GetParameter("Distributed", "PDAddrs", std::string(""));
+
+        // Worker index from env var (0 = driver, 1+ = remote worker); unset or non-numeric text yields 0 via atoi
+        const char* wiEnv = std::getenv("WORKER_INDEX");
+        cfg.workerIndex = wiEnv ? std::atoi(wiEnv) : 0;
+
+        return cfg;
+    }
+};
+
 namespace SPFreshTest
 {
 SizeType N = 10000;
@@ -306,13 +488,17 @@ std::shared_ptr<VectorIndex> BuildIndex(const std::string &outDirectory, std::sh
 
 template <typename T>
 std::shared_ptr<VectorIndex> BuildLargeIndex(const std::string &outDirectory, std::string &pvecset,
-                                        std::string& pmetaset, std::string& pmetaidx, Helper::IniReader& iniReader, const std::string &distMethod = "L2",
+                                        std::string& pmetaset, std::string& pmetaidx, const std::string &distMethod = "L2",
                                         int searchthread = 2, int insertthread = 2, int layers = 1,
-                                        std::shared_ptr<COMMON::IQuantizer> quantizer = nullptr, std::string quantizerFilePath = "quantizer.bin")
+                                        std::shared_ptr<COMMON::IQuantizer> quantizer = nullptr, std::string quantizerFilePath = "quantizer.bin",
+                                        const std::map<std::string, std::string>& ssdOverrides = {},
+                                        bool ssdOnly = false,
+                                        SPANN::WorkerNode* p_worker = nullptr)
 {
     auto vecIndex = VectorIndex::CreateInstance(IndexAlgoType::SPANN, GetEnumValueType<T>());
     int maxthreads = std::thread::hardware_concurrency();
     int postingLimit = 4 * sizeof(T);
+    remove((outDirectory + FolderSep + "ssdmapping_0_postings").c_str());
     std::string configuration = R"(
         [Base]
             DistCalcMethod=)" + distMethod + R"(
@@ -399,15 +585,29 @@ std::shared_ptr<VectorIndex> BuildLargeIndex(const std::string &outDirectory, st
         }
     }
 
-    for (const auto &sec : sections)
+    // Apply overrides (e.g., Storage, TiKV settings, SelectHead/BuildHead params)
+    for (const auto &[key, val] : ssdOverrides)
     {
-        auto params = iniReader.GetParameters(sec.c_str());
-        for (const auto &[key, val] : params)
-        {
-            vecIndex->SetParameter(key.c_str(), val.c_str(), sec.c_str());
+        // Keys prefixed with "SectionName." are routed to the corresponding section
+        auto dotPos = key.find('.');
+        if (dotPos != std::string::npos) {
+            std::string section = key.substr(0, dotPos);
+            std::string param = key.substr(dotPos + 1);
+            vecIndex->SetParameter(param.c_str(), val.c_str(), section.c_str());
+        } else {
+            vecIndex->SetParameter(key.c_str(), val.c_str(), "BuildSSDIndex");
         }
     }
 
+    // SSD-only mode: skip SelectHead and BuildHead, resume from specified layer
+    if (ssdOnly)
+    {
+        // ResumeLayer is forced to 0 here, replacing any ResumeLayer set by ssdOverrides above
+        // (rebuild SSD for all layers, reusing existing head indexes)
+        int resumeLayer = 0;
+        vecIndex->SetParameter("ResumeLayer", std::to_string(resumeLayer).c_str(), "BuildSSDIndex");
+    }
+
     if (quantizer)
     {
         vecIndex->SetParameter("QuantizerFilePath", quantizerFilePath.c_str(), "Base");
@@ -415,6 +615,20 @@ std::shared_ptr<VectorIndex> BuildLargeIndex(const std::string &outDirectory, st
         vecIndex->SetQuantizerADC(false);
         vecIndex->SetParameter("Dim", std::to_string(quantizer->GetNumSubvectors()).c_str(), "Base");
     }
+
+    // Bind a routing worker (if any) to the freshly-created SSD searcher
+    // before BuildIndex runs. Build itself does not route postings any more
+    // (shared TiKV cluster — driver writes directly), so in buildOnly mode
+    // the workerPtr will simply be nullptr and this block is a no-op.
+    if (p_worker) {
+        if (auto* spannIdx = dynamic_cast<SPANN::Index<T>*>(vecIndex.get())) {
+            spannIdx->SetWorker(p_worker);
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "BuildLargeIndex: bound routing worker (numNodes=%d)\n",
+                p_worker->GetNumNodes());
+        }
+    }
+
     auto buildStatus = vecIndex->BuildIndex();
     if (buildStatus != ErrorCode::Success)
         return nullptr;
@@ -452,9 +666,19 @@ float Search(std::shared_ptr<VectorIndex> &vecIndex, std::shared_ptr<VectorSet>
     return TestUtils::TestDataGenerator<T>::EvaluateRecall(results, truth, k, k, batch, totalbatches);
 }
 
+template <typename T>
+double ExecutePartitionedSearch(VectorIndex* index,
+                                std::shared_ptr<VectorSet>& queryset,
+                                int myStart, int myCount,
+                                int searchK, int numThreads,
+                                std::vector<QueryResult>& results,
+                                std::vector<float>* latenciesOut,
+                                std::vector<SPANN::SearchStats>* statsOut);
+
 template <typename ValueType>
 void InsertVectors(SPANN::Index<ValueType> *p_index, int insertThreads, int step,
-                   std::shared_ptr<VectorSet> addset, std::shared_ptr<MetadataSet> &metaset, int searchThreads = 0, std::shared_ptr<VectorSet> queryset = nullptr, int numQueries = 0, int k = 5, std::ostream* benchmarkData = nullptr, int start = 0)
+                   std::shared_ptr<VectorSet> addset, std::shared_ptr<MetadataSet> &metaset, int searchThreads = 0, std::shared_ptr<VectorSet> queryset = nullptr, int numQueries = 0, int k = 5, std::ostream* benchmarkData = nullptr, int start = 0,
+                   SPANN::WorkerNode* router = nullptr)
 {
     p_index->ForceCompaction();
     p_index->GetDBStat();
@@ -462,8 +686,15 @@ void InsertVectors(SPANN::Index<ValueType> *p_index, int insertThreads, int step
     std::vector<std::thread> threads;
 
     int printstep = step / 50;
+
+    // Bulk path: one thread issues a single Index::AddIndex call for the whole step.
+    // Index<T>::AddIndex fans the vectors out over up to m_iSSDNumberOfThreads threads and
+    // appends them one vector at a time, so parallelism comes from there, not insertThreads.
+    bool useBulk = (router && router->GetNumNodes() > 1);
+
+    // Per-vector insert (original path): each thread grabs one vector at a time
     std::atomic_size_t vectorsSent(start);
-    auto func = [&]() {
+    auto perVecFunc = [&]() {
         size_t index = start;
         while (true)
         {
@@ -500,43 +731,48 @@ void InsertVectors(SPANN::Index<ValueType> *p_index, int insertThreads, int step
         }
     };
 
-    if (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr) {
-        std::vector<float> latencies(numQueries);
-        std::vector<QueryResult> results(numQueries);
-        std::vector<float> duration(searchThreads);
-
-        for (int i = 0; i < numQueries; i++)
+    // Bulk insert (router path): single call, parallelism inside SPANNIndex::AddIndex
+    auto bulkFunc = [&]() {
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "InsertVectors: bulk AddIndex for %d vectors (router enabled)\n", step);
+        ErrorCode ret = p_index->AddIndex(addset->GetVector((SizeType)start), step, addset->Dimension(), metaset, true);
+        if (ret != ErrorCode::Success)
         {
-            results[i] = QueryResult((const ValueType *)queryset->GetVector(i), k, false);
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                         "AddIndex bulk failed. start:%d count:%d Dim:%d Error:%d\n",
+                         start, step, addset->Dimension(), static_cast<int>(ret));
         }
+        BOOST_REQUIRE(ret == ErrorCode::Success);
+    };
 
-        std::atomic_size_t queriesSent(0);
-        auto search = [&](int tid) {
-            auto s1 = std::chrono::high_resolution_clock::now();
-            size_t qid;
-            while ((qid = queriesSent.fetch_add(1)) < numQueries)
-            {
-                auto t1 = std::chrono::high_resolution_clock::now();
-                p_index->SearchIndex(results[qid]);
-                auto t2 = std::chrono::high_resolution_clock::now();
-                latencies[qid] = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1000.0f;
-            }
-            auto s2 = std::chrono::high_resolution_clock::now();
-            duration[tid] = std::chrono::duration_cast<std::chrono::microseconds>(s2 - s1).count() / 1000.0f;
-        };
+    std::function<void()> func;
+    int insertThreadCount;
+    if (useBulk) {
+        func = bulkFunc;
+        insertThreadCount = 1;
+    } else {
+        func = perVecFunc;
+        insertThreadCount = insertThreads;
+    }
+
+    if (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr) {
+        std::vector<float> latencies;
+        std::vector<QueryResult> results;
+        double searchWallSeconds = 0.0;
 
-        for (int j = 0; j < insertThreads; j++)
+        for (int j = 0; j < insertThreadCount; j++)
         {
             threads.emplace_back(func);
         }
-        for (int j = 0; j < searchThreads; j++)
-        {
-            threads.emplace_back(search, j);
-        }
+        std::thread searchThread([&]() {
+            searchWallSeconds = ExecutePartitionedSearch<ValueType>(
+                p_index, queryset, /*myStart=*/0, numQueries, k, searchThreads,
+                results, &latencies, /*statsOut=*/nullptr);
+        });
         for (auto &thread : threads)
         {
             thread.join();
         }
+        searchThread.join();
 
         // Calculate statistics
         float mean = 0, minLat = (std::numeric_limits<float>::max)(), maxLat = 0;
@@ -553,10 +789,7 @@ void InsertVectors(SPANN::Index<ValueType> *p_index, int insertThreads, int step
         float p90 = latencies[static_cast<size_t>(numQueries * 0.90)];
         float p95 = latencies[static_cast<size_t>(numQueries * 0.95)];
         float p99 = latencies[static_cast<size_t>(numQueries * 0.99)];
-        float maxBatchLatency = 1e-6;
-        for (int i = 0; i < searchThreads; i++)
-            if (maxBatchLatency < duration[i]) maxBatchLatency = duration[i];
-        float qps = numQueries / maxBatchLatency;
+        float qps = numQueries / std::max(static_cast<float>(searchWallSeconds), 1e-6f);
 
         *benchmarkData << "        \"numQueries\": " << numQueries << ",\n";
         *benchmarkData << "        \"meanLatency\": " << mean << ",\n";
@@ -567,6 +800,17 @@ void InsertVectors(SPANN::Index<ValueType> *p_index, int insertThreads, int step
         *benchmarkData << "        \"minLatency\": " << minLat << ",\n";
         *benchmarkData << "        \"maxLatency\": " << maxLat << ",\n";
         *benchmarkData << "        \"qps\": " << qps << ",\n";
+    } else {
+        // No search-during-insert path: just run the insert threads.
+        // (Used by worker dispatch and any caller that doesn't need stats.)
+        for (int j = 0; j < insertThreadCount; j++)
+        {
+            threads.emplace_back(func);
+        }
+        for (auto &thread : threads)
+        {
+            thread.join();
+        }
     }
     auto barrierStart = std::chrono::high_resolution_clock::now();
     size_t barrierPolls = 0;
@@ -587,72 +831,82 @@ void InsertVectors(SPANN::Index<ValueType> *p_index, int insertThreads, int step
 }
 
 
+
+
+
 template <typename T>
 void BenchmarkQueryPerformance(std::shared_ptr<VectorIndex> &index, std::shared_ptr<VectorSet> &queryset,
                                std::shared_ptr<VectorSet> &truth, const std::string &truthPath,
                                SizeType baseVectorCount, int topK, int searchK, int numThreads, int numQueries, int batches, int totalbatches,
-                               std::ostream &benchmarkData, std::string prefix = "")
+                               std::ostream &benchmarkData, std::string prefix = "",
+                               int nodeIndex = 0, SPANN::WorkerNode* router = nullptr,
+                               SPANN::DispatcherNode* dispatcher = nullptr)
 {
-    // Benchmark: Query performance with detailed latency stats
-    std::vector<float> latencies(numQueries);
-    std::atomic_size_t queriesSent(0);
-    std::vector<QueryResult> results(numQueries);
-    std::vector<SPANN::SearchStats> searchStats(numQueries);
-    auto* spannIndex = dynamic_cast<SPANN::Index<T>*>(index.get());
-
-    for (int i = 0; i < numQueries; i++)
-    {
-        results[i] = QueryResult((const T *)queryset->GetVector(i), searchK, false);
+    // Use hash ring node count (workers only) for partitioning, not GetNumNodes() (includes dispatcher)
+    auto ring = (router && router->IsEnabled()) ? router->GetHashRing() : nullptr;
+    int nodeCount = ring ? static_cast<int>(ring->NodeCount()) : 1;
+    bool distributed = (dispatcher != nullptr && router != nullptr && router->IsEnabled() && nodeCount > 1);
+
+    // Determine this node's query range (balanced contiguous partition)
+    int myStart = 0, myCount = numQueries;
+    if (distributed) {
+        myStart = (int)((long long)nodeIndex * numQueries / nodeCount);
+        int myEnd = (int)((long long)(nodeIndex + 1) * numQueries / nodeCount);
+        myCount = myEnd - myStart;
     }
 
-    std::vector<std::thread> threads;
-    threads.reserve(numThreads);
-
-    auto batchStart = std::chrono::high_resolution_clock::now();
-
-    for (int i = 0; i < numThreads; i++)
-    {
-        threads.emplace_back([&]() {
-            size_t qid;
-            while ((qid = queriesSent.fetch_add(1)) < numQueries)
-            {
-                auto t1 = std::chrono::high_resolution_clock::now();
-                if (spannIndex != nullptr)
-                {
-                    spannIndex->SearchIndex(results[qid], &searchStats[qid]);
-                }
-                else
-                {
-                    index->SearchIndex(results[qid]);
-                }
-                auto t2 = std::chrono::high_resolution_clock::now();
-                latencies[qid] = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1000.0f;
-            }
-        });
+    // Dispatch search command to all workers via TCP (distributed only)
+    std::int64_t dispatchId = -1;
+    int round = 0;
+    if (distributed) {
+        static std::atomic<int> s_searchRound{0};
+        round = s_searchRound.fetch_add(1);
+        dispatchId = dispatcher->BroadcastDispatchCommand(
+            SPANN::DispatchCommand::Type::Search, static_cast<std::uint32_t>(round));
     }
 
-    for (auto &thread : threads)
-        thread.join();
+    // Run this node's share of queries.
+    std::vector<QueryResult> results;
+    std::vector<float> latencies;
+    std::vector<SPANN::SearchStats> searchStats;
+    double localWallTime = ExecutePartitionedSearch<T>(
+        index.get(), queryset, myStart, myCount, searchK, numThreads,
+        results, &latencies, &searchStats);
+    float batchLatency = static_cast<float>(localWallTime);
+    auto* spannIndex = dynamic_cast<SPANN::Index<T>*>(index.get());
 
-    auto batchEnd = std::chrono::high_resolution_clock::now();
-    float batchLatency =
-        std::chrono::duration_cast<std::chrono::microseconds>(batchEnd - batchStart).count() / 1000000.0f;
+    if (distributed) {
+        // Driver also runs searches against its local node, so it can have
+        // outgoing merge hints queued. Drain before we move on.
+        if (router) {
+            router->FlushRemoteMerges();
+        }
+        // Collect worker timings via TCP; QPS is governed by the slowest node.
+        auto workerTimes = dispatcher->WaitForAllResults(dispatchId, 300);
+        for (double wt : workerTimes) {
+            batchLatency = std::max(batchLatency, static_cast<float>(wt));
+        }
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+            "BenchmarkQueryPerformance round %d: local=%.1fms (%d queries), max=%.1fms, QPS=%.1f\n",
+            round, localWallTime * 1000, myCount, batchLatency * 1000, numQueries / batchLatency);
+    }
 
-    // Calculate statistics
+    // Calculate statistics (from this node's queries)
+    int statsCount = myCount;
     float mean = 0, minLat = (std::numeric_limits<float>::max)(), maxLat = 0;
-    for (int i = 0; i < numQueries; i++)
+    for (int i = 0; i < statsCount; i++)
     {
         mean += latencies[i];
         minLat = (std::min)(minLat, latencies[i]);
         maxLat = (std::max)(maxLat, latencies[i]);
     }
-    mean /= numQueries;
+    mean /= statsCount;
 
     std::sort(latencies.begin(), latencies.end());
-    float p50 = latencies[static_cast<size_t>(numQueries * 0.50)];
-    float p90 = latencies[static_cast<size_t>(numQueries * 0.90)];
-    float p95 = latencies[static_cast<size_t>(numQueries * 0.95)];
-    float p99 = latencies[static_cast<size_t>(numQueries * 0.99)];
+    float p50 = latencies[static_cast<size_t>(statsCount * 0.50)];
+    float p90 = latencies[static_cast<size_t>(statsCount * 0.90)];
+    float p95 = latencies[static_cast<size_t>(statsCount * 0.95)];
+    float p99 = latencies[static_cast<size_t>(statsCount * 0.99)];
     float qps = numQueries / batchLatency;
 
     BOOST_TEST_MESSAGE("  Queries: " << numQueries);
@@ -749,7 +1003,7 @@ void BenchmarkQueryPerformance(std::shared_ptr<VectorIndex> &index, std::shared_
         benchmarkData << prefix << "      },\n";
     }
 
-    // Recall evaluation (if truth file provided)
+    // Recall evaluation
     if (!truth || truthPath.empty() || truthPath == "none")
     {
         BOOST_TEST_MESSAGE("  Recall evaluation skipped (no truth data)");
@@ -760,7 +1014,13 @@ void BenchmarkQueryPerformance(std::shared_ptr<VectorIndex> &index, std::shared_
 
     BOOST_TEST_MESSAGE("Checking for truth file: " << truthPath);
     std::shared_ptr<VectorSet> pvecset, paddvecset;
-    float avgRecall = TestUtils::TestDataGenerator<T>::EvaluateRecall(results, truth, topK, searchK, batches, totalbatches);
+    // In distributed mode, this node only searched queries [myStart, myStart+myCount).
+    // Pass the global query count and this node's offset so EvaluateRecall indexes
+    // the truth file in global terms (BATCH > 0 reads the wrong truth rows otherwise).
+    int recallTotalQueries = distributed ? numQueries : -1;
+    int recallQueryOffset = distributed ? myStart : 0;
+    float avgRecall = TestUtils::TestDataGenerator<T>::EvaluateRecall(results, truth, topK, searchK, batches, totalbatches,
+                                                                      recallTotalQueries, recallQueryOffset);
     BOOST_TEST_MESSAGE("  Recall" << topK << "@" << searchK << " = " << (avgRecall * 100.0f) << "%");
     BOOST_TEST_MESSAGE("  (Evaluated on " << numQueries << " queries against base vectors)");
     benchmarkData << std::fixed << std::setprecision(4);
@@ -772,6 +1032,115 @@ void BenchmarkQueryPerformance(std::shared_ptr<VectorIndex> &index, std::shared_
     benchmarkData << prefix << "    }";
 }
 
+// Run queries [myStart, myStart+myCount) of `queryset` against `index` and return the
+// wall time in seconds (thread launch through join). Threads pull query ids from a
+// shared atomic counter; the thread count is clamped to [1, max(myCount, 1)] so a
+// zero/negative `numThreads` cannot skip the workload or reach reserve() as a negative.
+// `results` is resized to myCount. Non-null `latenciesOut` (ms) / `statsOut` are reset to
+// myCount entries; stats are filled only when `index` is a SPANN::Index<T>.
+template <typename T>
+double ExecutePartitionedSearch(VectorIndex* index, std::shared_ptr<VectorSet>& queryset,
+                                int myStart, int myCount, int searchK, int numThreads,
+                                std::vector<QueryResult>& results,
+                                std::vector<float>* latenciesOut,
+                                std::vector<SPANN::SearchStats>* statsOut)
+{
+    auto* spannIndex = dynamic_cast<SPANN::Index<T>*>(index);
+    bool useStats = (statsOut != nullptr && spannIndex != nullptr);
+
+    results.resize(myCount);
+    for (int i = 0; i < myCount; i++) {
+        results[i] = QueryResult((const T*)queryset->GetVector(myStart + i), searchK, false);
+    }
+    // Size every requested output, so stale entries never survive a non-SPANN run.
+    if (statsOut) statsOut->assign(myCount, SPANN::SearchStats());
+    if (latenciesOut) latenciesOut->assign(myCount, 0.0f);
+
+    std::atomic_size_t queriesSent(0);
+    // Never more threads than queries, and never fewer than one.
+    int nThreads = std::max(1, std::min(numThreads, std::max(myCount, 1)));
+    std::vector<std::thread> threads;
+    threads.reserve(nThreads);
+
+    auto t0 = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < nThreads; i++) {
+        threads.emplace_back([&]() {
+            size_t qid;
+            while ((qid = queriesSent.fetch_add(1)) < static_cast<size_t>(myCount)) {
+                auto t1 = std::chrono::high_resolution_clock::now();
+                if (useStats) {
+                    spannIndex->SearchIndex(results[qid], &(*statsOut)[qid]);
+                } else if (spannIndex != nullptr) {
+                    spannIndex->SearchIndex(results[qid]);
+                } else {
+                    index->SearchIndex(results[qid]);
+                }
+                auto t2 = std::chrono::high_resolution_clock::now();
+                if (latenciesOut) {
+                    (*latenciesOut)[qid] = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1000.0f;
+                }
+            }
+        });
+    }
+    for (auto& t : threads) t.join();
+    auto t3 = std::chrono::high_resolution_clock::now();
+    return std::chrono::duration_cast<std::chrono::microseconds>(t3 - t0).count() / 1000000.0;
+}
+
+ErrorCode QuantizeVectors(const std::shared_ptr<COMMON::IQuantizer>& quantizer,
+                          const std::shared_ptr<VectorSet>& source,
+                          ByteArray& dest);
+
+// Loads rows [insertStart, insertStart + loadCount) of the add-set plus their metadata,
+// quantizes the vectors when `quantizer` is set, keeps only this node's strided rows
+// when `strideShard` is set, then inserts them via InsertVectors<T>. When `router` is
+// non-null, queued remote appends/merges are flushed and route stats logged and reset.
+template <typename T>
+void LoadAndInsertBatch(SPANN::Index<T>* spannIndex,
+                        const std::string& paddset,
+                        const std::string& paddmeta,
+                        const std::string& paddmetaidx,
+                        int dimension,
+                        int insertStart, int loadCount, int perNodeBatch,
+                        bool strideShard, int numNodes, int nodeIndex,
+                        int numInsertThreads,
+                        SPANN::WorkerNode* router,
+                        std::shared_ptr<COMMON::IQuantizer> quantizer,
+                        int searchDuringInsertThreads,
+                        std::shared_ptr<VectorSet> queryset,
+                        int numQueries, int searchK,
+                        std::ostream* benchmarkData,
+                        const char* logPrefix)
+{
+    auto vectors = TestUtils::TestDataGenerator<T>::LoadVectorSet(paddset, dimension, insertStart, loadCount);
+    if (quantizer) {
+        // Quantize from the float conversion; codes are UInt8, GetNumSubvectors() per row.
+        auto floatSet = ConvertToFloatVectorSet(vectors);
+        BOOST_REQUIRE(floatSet != nullptr);
+        ByteArray codes = ByteArray::Alloc(static_cast<size_t>(floatSet->Count()) *
+                                           static_cast<size_t>(quantizer->GetNumSubvectors()));
+        BOOST_REQUIRE(QuantizeVectors(quantizer, floatSet, codes) == ErrorCode::Success);
+        vectors = std::make_shared<BasicVectorSet>(codes, VectorValueType::UInt8,
+                                                   quantizer->GetNumSubvectors(), floatSet->Count());
+    }
+    auto metadata = TestUtils::TestDataGenerator<T>::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, loadCount);
+    if (strideShard) {
+        vectors = ExtractStridedVectors(vectors, numNodes, nodeIndex);
+        metadata = ExtractStridedMetadata(metadata, numNodes, nodeIndex);
+        const int kept = vectors ? (int)vectors->Count() : 0;
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                     "%s: stride-shard batchStart=%d loadCount=%d -> kept=%d (every %d-th, offset=%d)\n",
+                     logPrefix, insertStart, loadCount, kept, numNodes, nodeIndex);
+    }
+    InsertVectors<T>(spannIndex, numInsertThreads, perNodeBatch, vectors, metadata,
+                     searchDuringInsertThreads, queryset, numQueries, searchK, benchmarkData, 0, router);
+    if (router == nullptr) return;
+    router->FlushRemoteAppends();
+    router->FlushRemoteMerges();
+    router->LogRouteStats(" (batch flush)");
+    router->ResetRouteStats();
+}
+
 template <typename T>
 void LogCheckpointLayerStats(const std::shared_ptr<VectorIndex>& index, int layers, int currentBatch, int totalBatches)
 {
@@ -836,9 +1205,13 @@ ErrorCode QuantizeVectors(const std::shared_ptr<COMMON::IQuantizer>& quantizer,
 template <typename T>
 void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, const std::string &truthPath,
                   DistCalcMethod distMethod, const std::string &indexPath, int dimension, int baseVectorCount,
-                  int insertVectorCount, int deleteVectorCount, int batches, int topK, int numSearchThreads, int numInsertThreads, int numSearchDuringInsertThreads, int numQueries, Helper::IniReader& iniReader,
+                  int insertVectorCount, int deleteVectorCount, int batches, int topK, int numSearchThreads, int numInsertThreads, int numSearchDuringInsertThreads, int numQueries,
                   const std::string &outputFile = "output.json", const bool rebuild = true, const int resume = -1,
-                  const std::string &quantizerFilePath = std::string(""), int quantizedDim = 0, int layers = 1)
+                  const std::string &quantizerFilePath = std::string(""), int quantizedDim = 0, int layers = 1,
+                  const std::map<std::string, std::string>& ssdOverrides = {},
+                  bool rebuildSsdOnly = false,
+                  bool buildOnly = false,
+                  const DistributedConfig& distCfg = {})
 {
     int oldM = M, oldK = K, oldN = N, oldQueries = queries;
     N = baseVectorCount;
@@ -849,6 +1222,27 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
     int insertBatchSize = insertVectorCount / max(batches, 1);
     int deleteBatchSize = deleteVectorCount / max(batches, 1);
 
+    // Use distributed config for multi-node partitioning
+    int nodeIndex = distCfg.workerIndex;
+    int numNodes = distCfg.GetNumWorkers();
+    bool strideShard = IsStrideShardEnabled() && numNodes > 1;
+    int myInsertStart, myInsertEnd, perNodeBatch;
+    if (strideShard) {
+        // Stride mode: each node loads the FULL per-iter batch then keeps rows
+        // where (rowIdx % numNodes) == nodeIndex. myInsertStart/End span the
+        // full batch; perNodeBatch is the count of strided rows.
+        myInsertStart = 0;
+        myInsertEnd = insertBatchSize;
+        perNodeBatch = static_cast<int>(StrideCount(insertBatchSize, numNodes, nodeIndex));
+    } else {
+        myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0;
+        myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize;
+        perNodeBatch = myInsertEnd - myInsertStart;
+    }
+    SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                 "RunBenchmark: nodeIndex=%d numNodes=%d insertBatchSize=%d myInsertStart=%d myInsertEnd=%d perNodeBatch=%d strideShard=%d\n",
+                 nodeIndex, numNodes, insertBatchSize, myInsertStart, myInsertEnd, perNodeBatch, strideShard ? 1 : 0);
+
     // Variables to collect JSON output data
     std::ostringstream tmpbenchmark;
 
@@ -902,12 +1296,78 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
     jsonFile << "  \"results\": {\n";
 
     int SearchK = enableQuantization? topK * 4 : topK;
+    // Distributed routing: dispatcher + local worker (driver node is both)
+    std::unique_ptr<SPANN::DispatcherNode> dispatcher;
+    std::unique_ptr<SPANN::WorkerNode> worker;
+    SPANN::WorkerNode* workerPtr = nullptr;  // convenience alias
     std::shared_ptr<VectorIndex> index;
     std::shared_ptr<COMMON::IQuantizer> quantizer;
-    
+
+    // Distributed setup: when running a non-buildOnly distributed benchmark
+    // (i.e. the search/insert run phase), create the dispatcher + worker0
+    // so the driver can broadcast the hash ring and accept remote callbacks.
+    // BuildOnly mode skips this entirely — build runs single-node and writes
+    // straight to the shared TiKV cluster (PD routes each key to the owning
+    // store), so no dispatcher / worker plumbing is needed for the build
+    // path.
+    if (distCfg.enabled && !buildOnly) {
+        auto dispAddr = distCfg.GetDispatcherAddr();
+        auto workerAddrs = ParseNodeAddrs(distCfg.workerAddrs);
+        auto storeAddrs = Helper::StrUtils::SplitString(distCfg.storeAddrs, ",");
+
+        dispatcher.reset(new SPANN::DispatcherNode());
+        BOOST_REQUIRE_MESSAGE(dispatcher->Initialize(dispAddr, workerAddrs),
+            "DispatcherNode initialization failed (build-phase setup)");
+        BOOST_REQUIRE(dispatcher->Start());
+
+        worker.reset(new SPANN::WorkerNode());
+        // Pre-build: pass nullptr DB. After BuildIndex, swap in the real DB
+        // via SetDB() (or rebuild the worker on top of it for run mode).
+        BOOST_REQUIRE_MESSAGE(
+            worker->Initialize(nullptr, 0, dispAddr, workerAddrs, storeAddrs),
+            "WorkerNode initialization failed (build-phase setup)");
+        BOOST_REQUIRE(worker->Start());
+        workerPtr = worker.get();
+
+        dispatcher->SetLocalWorkerIndex(worker->GetLocalNodeIndex());
+        worker->SetHashRing(dispatcher->GetHashRing());
+
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+            "Pre-build: waiting for all peer connections...\n");
+        BOOST_REQUIRE_MESSAGE(dispatcher->WaitForAllPeersConnected(180),
+            "Timed out waiting for peer connections (build-phase)");
+
+        auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(180);
+        while (std::chrono::steady_clock::now() < deadline) {
+            if (dispatcher->AllWorkersAcked()) break;
+            std::this_thread::sleep_for(std::chrono::milliseconds(200));
+        }
+        BOOST_REQUIRE_MESSAGE(dispatcher->AllWorkersAcked(),
+            "Timed out waiting for workers to ACK ring (build-phase)");
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+            "Pre-build: all %d workers connected and ring synchronized\n", numNodes);
+
+        // Start heartbeat pump so remote workers can detect driver failure
+        // and exit cleanly instead of relying on a fixed wall-clock receiver
+        // timeout. Worker side enforces HeartbeatTimeoutSec (default 180s).
+        // Interval is fixed at 30s; six missed pings before worker bails.
+        dispatcher->StartHeartbeat(30);
+    }
+
     // Build initial index
     BOOST_TEST_MESSAGE("\n=== Building Index ===");
-    if (rebuild || !direxists(indexPath.c_str())) {
+    if (rebuild || rebuildSsdOnly || !direxists(indexPath.c_str())) {
+        if (!rebuildSsdOnly) {
+            // Allow empty or non-existent directories; block only if index files already exist
+            if (direxists(indexPath.c_str()) && fileexists((indexPath + FolderSep + "indexloader.ini").c_str())) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "Index directory '%s' already exists with index files. Refusing to delete. "
+                    "Remove it manually or use RebuildSSDOnly=true to resume.\n",
+                    indexPath.c_str());
+                BOOST_FAIL("Index directory already exists: " + indexPath);
+                return;
+            }
+        }
         auto buildstart = std::chrono::high_resolution_clock::now();
 
         if (enableQuantization)
@@ -932,13 +1392,13 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
                 quantizedBase->Save(pquanvecset);
             }
 
-            index = BuildLargeIndex<uint8_t>(indexPath, pquanvecset, pmeta, pmetaidx, iniReader, dist, numSearchThreads, numInsertThreads, layers, quantizer, "quantizer.bin");
+            index = BuildLargeIndex<uint8_t>(indexPath, pquanvecset, pmeta, pmetaidx, dist, numSearchThreads, numInsertThreads, layers, quantizer, "quantizer.bin", ssdOverrides, rebuildSsdOnly, workerPtr);
             BOOST_REQUIRE(index != nullptr);
             index->SetQuantizerADC(true);
         }
         else
         {
-            index = BuildLargeIndex<T>(indexPath, pvecset, pmeta, pmetaidx, iniReader, dist, numSearchThreads, numInsertThreads, layers);
+            index = BuildLargeIndex<T>(indexPath, pvecset, pmeta, pmetaidx, dist, numSearchThreads, numInsertThreads, layers, nullptr, "quantizer.bin", ssdOverrides, rebuildSsdOnly, workerPtr);
             BOOST_REQUIRE(index != nullptr);
         }
 
@@ -954,6 +1414,23 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
         BOOST_REQUIRE(index != nullptr);
     }
 
+    // Set up distributed routing for RUN mode if configured.
+    // (Build-phase needs no dispatcher/worker; the run-phase dispatcher+worker
+    // were created in the pre-build block above.) The driver node is both
+    // dispatcher (ring management) and worker 0 (compute).
+    if (distCfg.enabled && !buildOnly) {
+        // Bind worker to ALL searcher layers (wires append + headsync + lock + fetch callbacks).
+        // Every layer must see the worker so AddIDCapacity grows each layer's
+        // version map by capa * numNodes (not just capa).
+        auto* spannIndex = dynamic_cast<SPANN::Index<T>*>(index.get());
+        BOOST_REQUIRE(spannIndex != nullptr);
+        BindWorkerToAllLayers<T>(workerPtr, spannIndex);
+
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+            "Run mode: worker bound to all %d layers\n",
+            (int)spannIndex->GetOptions()->m_layers);
+    }
+
     auto queryset = TestUtils::TestDataGenerator<T>::LoadVectorSet(pqueryset, M);
     BOOST_REQUIRE(queryset != nullptr);
 
@@ -973,32 +1450,50 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
         truth = TestUtils::TestDataGenerator<float>::LoadVectorSet(ptruth, K);
     }
 
-    // Benchmark 0: Query performance before insertions (round 1 — cold cache)
-    BOOST_TEST_MESSAGE("\n=== Benchmark 0: Query Before Insertions (Round 1) ===");
-    BenchmarkQueryPerformance<T>(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK,
-                                 numSearchThreads, numQueries, 0, batches, tmpbenchmark);
-    jsonFile << "    \"benchmark0_query_before_insert\": ";
-    BenchmarkQueryPerformance<T>(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK,
-                                 numSearchThreads, numQueries, 0, batches, jsonFile);
-    jsonFile << ",\n";
-    jsonFile.flush();
-
-    // Benchmark 0b: Query performance before insertions (round 2 — warm cache)
-    BOOST_TEST_MESSAGE("\n=== Benchmark 0b: Query Before Insertions (Round 2) ===");
-    BenchmarkQueryPerformance<T>(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK,
-                                 numSearchThreads, numQueries, 0, batches, tmpbenchmark);
-    jsonFile << "    \"benchmark0b_query_before_insert_round2\": ";
-    BenchmarkQueryPerformance<T>(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK,
-                                 numSearchThreads, numQueries, 0, batches, jsonFile);
-    jsonFile << ",\n";
-    jsonFile.flush();
+    // Benchmark 0/0b: query performance before insertions. Skip in BuildOnly
+    // mode (no point measuring queries when we're about to exit; queries also
+    // require workers to be running for distributed scatter-gather).
+    if (!buildOnly) {
+        // Benchmark 0: Query performance before insertions (round 1 — cold cache)
+        BOOST_TEST_MESSAGE("\n=== Benchmark 0: Query Before Insertions (Round 1) ===");
+        BenchmarkQueryPerformance<T>(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK,
+                                     numSearchThreads, numQueries, 0, batches, tmpbenchmark, "",
+                                     nodeIndex, workerPtr, dispatcher.get());
+        jsonFile << "    \"benchmark0_query_before_insert\": ";
+        BenchmarkQueryPerformance<T>(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK,
+                                     numSearchThreads, numQueries, 0, batches, jsonFile, "",
+                                     nodeIndex, workerPtr, dispatcher.get());
+        jsonFile << ",\n";
+        jsonFile.flush();
+
+        // Benchmark 0b: Query performance before insertions (round 2 — warm cache)
+        BOOST_TEST_MESSAGE("\n=== Benchmark 0b: Query Before Insertions (Round 2) ===");
+        BenchmarkQueryPerformance<T>(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK,
+                                     numSearchThreads, numQueries, 0, batches, tmpbenchmark, "",
+                                     nodeIndex, workerPtr, dispatcher.get());
+        jsonFile << "    \"benchmark0b_query_before_insert_round2\": ";
+        BenchmarkQueryPerformance<T>(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK,
+                                     numSearchThreads, numQueries, 0, batches, jsonFile, "",
+                                     nodeIndex, workerPtr, dispatcher.get());
+        jsonFile << ",\n";
+        jsonFile.flush();
+    } else {
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "BuildOnly=true: skipping Benchmark 0/0b query rounds\n");
+        jsonFile << "    \"benchmark0_query_before_insert\": {},\n";
+        jsonFile << "    \"benchmark0b_query_before_insert_round2\": {},\n";
+        jsonFile.flush();
+    }
 
     BOOST_REQUIRE(index->SaveIndex(indexPath) == ErrorCode::Success);
     index = nullptr;
 
 
     // Benchmark 1: Insert performance
-    if (insertBatchSize > 0)
+    if (buildOnly) {
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "BuildOnly=true: skipping insert batches, index saved to %s\n", indexPath.c_str());
+        jsonFile << "    \"benchmark1_insert\": {}\n";
+    }
+    else if (insertBatchSize > 0)
     {
         BOOST_TEST_MESSAGE("\n=== Benchmark 1: Insert Performance ===");
         {
@@ -1076,31 +1571,53 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
                 SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Cloned index from %s to %s, check:%d, time: %f seconds\n",
                              prevPath.c_str(), clonePath.c_str(), (int)(cloneret == ErrorCode::Success), seconds);
 
-                int insertStart = iter * insertBatchSize;
+                // Re-bind the worker to ALL layers of the new cloned index's searchers
+                // (every layer must see the worker so AddIDCapacity grows each layer's
+                // version map by capa * numNodes).
+                if (workerPtr) {
+                    BindWorkerToIndex<T>(workerPtr, cloneIndex);
+                }
+
+                // Dispatch insert command to workers via TCP
+                std::uint64_t insertDispatchId = 0;
+                if (dispatcher && numNodes > 1) {
+                    insertDispatchId = dispatcher->BroadcastDispatchCommand(
+                        SPANN::DispatchCommand::Type::Insert, static_cast<std::uint32_t>(iter));
+                }
+
+                // Each node inserts its partition. Default mode: contiguous slice
+                // [iter*batchSize + myInsertStart, +perNodeBatch). Stride mode:
+                // every numNodes-th row of the full batch starting at nodeIndex
+                // (loads full batch then filters down to perNodeBatch rows).
+                int insertStart = iter * insertBatchSize + myInsertStart;
+                int loadCount = strideShard ? insertBatchSize : perNodeBatch;
                 {
-                    std::shared_ptr<VectorSet> addset = TestUtils::TestDataGenerator<T>::LoadVectorSet(paddset, M, insertStart, insertBatchSize);
-                    ByteArray quantizedAddBytes;
-                    if (enableQuantization) {
-                        auto addFloat = ConvertToFloatVectorSet(addset);
-                        BOOST_REQUIRE(addFloat != nullptr);
-                        quantizedAddBytes = ByteArray::Alloc((size_t)addFloat->Count() * (size_t)(quantizer->GetNumSubvectors()));
-                        BOOST_REQUIRE(QuantizeVectors(quantizer, addFloat, quantizedAddBytes) == ErrorCode::Success);
-                        addset = std::make_shared<BasicVectorSet>(quantizedAddBytes,
-                                                                 VectorValueType::UInt8,
-                                                                 quantizer->GetNumSubvectors(),
-                                                                 addFloat->Count());
-                    }
-                    std::shared_ptr<MetadataSet> addmetaset = TestUtils::TestDataGenerator<T>::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, insertBatchSize);
+                    std::string driverTag = "RunBenchmark iter=" + std::to_string(iter);
                     start = std::chrono::high_resolution_clock::now();
-                    InsertVectors<T>(static_cast<SPANN::Index<T> *>(cloneIndex.get()), numInsertThreads, insertBatchSize,
-                                     addset, addmetaset, numSearchDuringInsertThreads, queryset, numQueries, SearchK, &jsonFile, 0);
-                    end = std::chrono::high_resolution_clock::now();
+                    LoadAndInsertBatch<T>(static_cast<SPANN::Index<T>*>(cloneIndex.get()),
+                                          paddset, paddmeta, paddmetaidx, M,
+                                          insertStart, loadCount, perNodeBatch,
+                                          strideShard, numNodes, nodeIndex,
+                                          numInsertThreads, workerPtr,
+                                          enableQuantization ? quantizer : nullptr,
+                                          numSearchDuringInsertThreads, queryset,
+                                          numQueries, SearchK, &jsonFile,
+                                          driverTag.c_str());
                 }
+
+                // Wait for all worker nodes to finish this batch via TCP.
+                if (insertDispatchId > 0) {
+                    auto workerTimes = dispatcher->WaitForAllResults(insertDispatchId, 7200);
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Driver: all %d workers finished batch %d\n",
+                                 (int)workerTimes.size(), iter + 1);
+                }
+
+                end = std::chrono::high_resolution_clock::now();
                 seconds =
                     std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / 1000000.0f;
                 double throughput = insertBatchSize / seconds;
 
-                BOOST_TEST_MESSAGE("  Inserted: " << insertBatchSize << " vectors");
+                BOOST_TEST_MESSAGE("  Inserted: " << insertBatchSize << " vectors (" << perNodeBatch << " local)");
                 BOOST_TEST_MESSAGE("  Time: " << seconds << " seconds");
                 BOOST_TEST_MESSAGE("  Throughput: " << throughput << " vectors/sec");
 
@@ -1164,17 +1681,21 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
                 BOOST_TEST_MESSAGE("\n=== Benchmark 2: Query After Insertions and Deletions ===");
                 jsonFile << "        \"search\":";
                 BenchmarkQueryPerformance<T>(cloneIndex, queryset, truth, truthPath, baseVectorCount, topK, SearchK, numSearchThreads,
-                                             numQueries, iter + 1, batches, tmpbenchmark, "    ");
+                                             numQueries, iter + 1, batches, tmpbenchmark, "    ",
+                                             nodeIndex, workerPtr, dispatcher.get());
                 BenchmarkQueryPerformance<T>(cloneIndex, queryset, truth, truthPath, baseVectorCount,
-                                             topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, "    ");
+                                             topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, "    ",
+                                             nodeIndex, workerPtr, dispatcher.get());
                 jsonFile << ",\n";
 
                 BOOST_TEST_MESSAGE("\n=== Benchmark 2b: Query After Insertions and Deletions (Round 2) ===");
                 jsonFile << "        \"search_round2\":";
                 BenchmarkQueryPerformance<T>(cloneIndex, queryset, truth, truthPath, baseVectorCount, topK, SearchK, numSearchThreads,
-                                             numQueries, iter + 1, batches, tmpbenchmark, "    ");
+                                             numQueries, iter + 1, batches, tmpbenchmark, "    ",
+                                             nodeIndex, workerPtr, dispatcher.get());
                 BenchmarkQueryPerformance<T>(cloneIndex, queryset, truth, truthPath, baseVectorCount,
-                                             topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, "    ");
+                                             topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, "    ",
+                                             nodeIndex, workerPtr, dispatcher.get());
                 jsonFile << ",\n";
 
                 start = std::chrono::high_resolution_clock::now();
@@ -1223,6 +1744,18 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
     jsonFile << "}\n";
     jsonFile.close();
 
+    // Stop workers in distributed mode
+    if (dispatcher && numNodes > 1) {
+        // Stop the heartbeat pump first so we don't race a stray Heartbeat
+        // packet against the Stop dispatch on the same connection.
+        dispatcher->StopHeartbeat();
+        auto dispatchId = dispatcher->BroadcastDispatchCommand(SPANN::DispatchCommand::Type::Stop, 0);
+        // Wait briefly for ACKs so workers exit cleanly before the driver
+        // tears down the network (which would force-kill in-flight RPCs).
+        dispatcher->WaitForAllResults(dispatchId, 60);
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Driver: sent Stop command to all workers\n");
+    }
+
     M = oldM;
     K = oldK;
     N = oldN;
@@ -2198,6 +2731,14 @@ BOOST_AUTO_TEST_CASE(IterativeSearchPerf)
     std::filesystem::remove_all("original_index");
 }
 
+// Forward declaration
+template <typename T>
+void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount,
+               int insertVectorCount, int batches, int topK, int numSearchThreads,
+               int numInsertThreads, int numQueries, VectorValueType valueType,
+               const std::map<std::string, std::string>& ssdOverrides,
+               const DistributedConfig& distCfg, int workerTimeout);
+
 BOOST_AUTO_TEST_CASE(BenchmarkFromConfig)
 {
     using namespace SPFreshTest;
@@ -2245,14 +2786,59 @@ BOOST_AUTO_TEST_CASE(BenchmarkFromConfig)
     int topK = iniReader.GetParameter("Benchmark", "TopK", 10);
     int numSearchThreads = iniReader.GetParameter("Benchmark", "NumSearchThreads", 8);
     int numInsertThreads = iniReader.GetParameter("Benchmark", "NumInsertThreads", 8);
-    int appendThreadNum = iniReader.GetParameter("Benchmark", "AppendThreadNum", 0);
     int numSearchDuringInsertThreads = iniReader.GetParameter("Benchmark", "NumSearchDuringInsertThreads", 1);
+    int appendThreadNum = iniReader.GetParameter("Benchmark", "AppendThreadNum", 0);
     int numQueries = iniReader.GetParameter("Benchmark", "NumQueries", 1000);
     int layers = iniReader.GetParameter("Benchmark", "Layers", 1);
     DistCalcMethod distMethod = iniReader.GetParameter("Benchmark", "DistMethod", DistCalcMethod::L2);
-    bool rebuild = (iniReader.GetParameter("Benchmark", "Rebuild", true) || iniReader.GetParameter("Benchmark", "RebuildSSDOnly", false));
+    bool rebuild = iniReader.GetParameter("Benchmark", "Rebuild", true);
+    bool rebuildSsdOnly = iniReader.GetParameter("Benchmark", "RebuildSSDOnly", false);
+    bool buildOnly = iniReader.GetParameter("Benchmark", "BuildOnly", false);
     int resume = iniReader.GetParameter("Benchmark", "Resume", -1);
 
+    // Read storage backend overrides for BuildSSDIndex
+    std::map<std::string, std::string> ssdOverrides;
+    std::string storage = iniReader.GetParameter("Benchmark", "Storage", std::string(""));
+    if (!storage.empty()) {
+        ssdOverrides["Storage"] = storage;
+    }
+    std::string tikvKeyPrefix = iniReader.GetParameter("Benchmark", "TiKVKeyPrefix", std::string(""));
+    if (!tikvKeyPrefix.empty()) {
+        ssdOverrides["TiKVKeyPrefix"] = tikvKeyPrefix;
+    }
+    if (appendThreadNum > 0) {
+        ssdOverrides["AppendThreadNum"] = std::to_string(appendThreadNum);
+    }
+
+    // Pass through any [BuildSSDIndex] section params from the ini as overrides
+    auto buildSSDParams = iniReader.GetParameters("BuildSSDIndex");
+    for (const auto &[key, val] : buildSSDParams) {
+        ssdOverrides[key] = val;
+    }
+
+    // Read distributed config from [Distributed] section
+    auto distCfg = DistributedConfig::FromIni(iniReader);
+
+    // Shared TiKV raft cluster: every compute node connects to the FULL PD
+    // endpoint list. The TiKV client uses PD-raft to route reads/writes to
+    // whichever store owns the region, so any compute can access any posting.
+    if (!distCfg.pdAddrs.empty()) {
+        ssdOverrides["TiKVPDAddresses"] = distCfg.pdAddrs;
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+            "Using PD address: %s (workerIndex=%d)\n",
+            distCfg.pdAddrs.c_str(), distCfg.workerIndex);
+    }
+
+    // Pass through [SelectHead] and [BuildHead] params as overrides too
+    auto selectHeadParams = iniReader.GetParameters("SelectHead");
+    for (const auto &[key, val] : selectHeadParams) {
+        ssdOverrides["SelectHead." + key] = val;
+    }
+    auto buildHeadParams = iniReader.GetParameters("BuildHead");
+    for (const auto &[key, val] : buildHeadParams) {
+        ssdOverrides["BuildHead." + key] = val;
+    }
+
     BOOST_TEST_MESSAGE("=== Benchmark Configuration ===");
     BOOST_TEST_MESSAGE("Vector Path: " << vectorPath);
     BOOST_TEST_MESSAGE("Query Path: " << queryPath);
@@ -2273,31 +2859,224 @@ BOOST_AUTO_TEST_CASE(BenchmarkFromConfig)
         BOOST_TEST_MESSAGE("QuantizedDim: " << quantizedDim);
     }
 
+    // Worker node path: if distributed and workerIndex > 0, run as remote worker and return
+    if (distCfg.enabled && distCfg.workerIndex > 0) {
+        int workerTimeout = iniReader.GetParameter("Benchmark", "WorkerTimeout", 3600);
+        BOOST_TEST_MESSAGE("Running as worker node " << distCfg.workerIndex);
+        if (valueType == VectorValueType::Float)
+            RunWorker<float>(indexPath, dimension, baseVectorCount, insertVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, valueType, ssdOverrides, distCfg, workerTimeout);
+        else if (valueType == VectorValueType::Int8)
+            RunWorker<std::int8_t>(indexPath, dimension, baseVectorCount, insertVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, valueType, ssdOverrides, distCfg, workerTimeout);
+        else if (valueType == VectorValueType::UInt8)
+            RunWorker<std::uint8_t>(indexPath, dimension, baseVectorCount, insertVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, valueType, ssdOverrides, distCfg, workerTimeout);
+        return;
+    }
+
     // Get output file path from environment variable or use default
     const char *outputPath = std::getenv("BENCHMARK_OUTPUT");
     std::string outputFile = outputPath ? std::string(outputPath) : "output.json";
     BOOST_TEST_MESSAGE("Output File: " << outputFile);
 
-    // Dispatch to appropriate type
+    // Driver path (nodeIndex == 0 or single-node mode)
     if (valueType == VectorValueType::Float)
     {
         RunBenchmark<float>(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount,
-                    insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, 
-                    outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers);
+                    insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, outputFile, 
+                    rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly, buildOnly, distCfg);
     }
     else if (valueType == VectorValueType::Int8)
     {
         RunBenchmark<std::int8_t>(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount,
-                      insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader,
-                      outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers);
+                      insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries,
+                      outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly, buildOnly, distCfg);
     }
     else if (valueType == VectorValueType::UInt8)
     {
         RunBenchmark<std::uint8_t>(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount,
-                       insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader,
-                       outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers);
+                       insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries,
+                       outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly, buildOnly, distCfg);
+    }
+}
+
+/// Worker-side entry point of the distributed benchmark (distCfg.workerIndex > 0).
+/// Loads the pre-built index from indexPath with ssdOverrides applied, starts a
+/// WorkerNode, and serves the driver's dispatch commands until Stop or workerTimeout.
+template <typename T>
+void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount,
+               int insertVectorCount, int batches, int topK, int numSearchThreads,
+               int numInsertThreads, int numQueries, VectorValueType valueType,
+               const std::map<std::string, std::string>& ssdOverrides,
+               const DistributedConfig& distCfg, int workerTimeout)
+{
+    int oldN = N, oldM = M, oldK = K, oldQ = queries; // saved; restored at the end of this function
+    N = baseVectorCount; M = dimension; K = topK; queries = numQueries;
+
+    int nodeIndex = distCfg.workerIndex;
+    int numNodes = distCfg.GetNumWorkers();
+    int insertBatchSize = insertVectorCount / std::max(batches, 1); // max() guards against batches <= 0
+    bool strideShard = IsStrideShardEnabled() && numNodes > 1;
+    int myInsertStart, myInsertEnd, perNodeBatch;
+    if (strideShard) { // stride mode: the load window spans the whole batch
+        myInsertStart = 0;
+        myInsertEnd = insertBatchSize;
+        perNodeBatch = static_cast<int>(StrideCount(insertBatchSize, numNodes, nodeIndex));
+    } else { // contiguous mode: node i takes the i-th of numNodes near-equal slices
+        myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0;
+        myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize;
+        perNodeBatch = myInsertEnd - myInsertStart;
+    }
+
+    BOOST_TEST_MESSAGE("Worker node " << nodeIndex << ": Loading index from " << indexPath);
+    std::shared_ptr<VectorIndex> index;
+    // IMPORTANT: Pass ssdOverrides through LoadIndex so that worker-specific settings
+    // (especially TiKVPDAddresses pointing at this worker's local PD) are applied
+    // BEFORE the underlying TiKV connection is constructed in PrepareDB. Without this,
+    // the worker would inherit the driver's PD address from the saved indexloader.ini
+    // and route every KV write back to the driver's TiKV instead of its own.
+    BOOST_REQUIRE(VectorIndex::LoadIndex(indexPath, ssdOverrides, index) == ErrorCode::Success);
+    BOOST_REQUIRE(index != nullptr);
+
+    // Gather the addresses and the layer-0 DB handle that WorkerNode::Initialize takes
+    auto dispAddr = distCfg.GetDispatcherAddr();
+    auto workerAddrs = ParseNodeAddrs(distCfg.workerAddrs);
+    auto storeAddrs = Helper::StrUtils::SplitString(distCfg.storeAddrs, ",");
+
+    auto* spannIndex = dynamic_cast<SPANN::Index<T>*>(index.get());
+    BOOST_REQUIRE_MESSAGE(spannIndex != nullptr, "Failed to cast to SPANN::Index<T>");
+    auto diskIndex = spannIndex->GetDiskIndex(0);
+    BOOST_REQUIRE(diskIndex != nullptr);
+    auto* searcher = dynamic_cast<SPANN::ExtraDynamicSearcher<T>*>(diskIndex.get());
+    BOOST_REQUIRE(searcher != nullptr);
+    auto workerDb = searcher->GetDB();
+    BOOST_REQUIRE_MESSAGE(workerDb != nullptr, "Worker: could not extract db from index");
+
+    SPANN::WorkerNode workerNode;
+    BOOST_REQUIRE_MESSAGE(workerNode.Initialize(workerDb, nodeIndex, dispAddr, workerAddrs, storeAddrs),
+                          "WorkerNode initialization failed");
+    BOOST_REQUIRE(workerNode.Start());
+    auto* router = &workerNode;
+
+    // Bind worker to ALL searcher layers (every layer must see the worker so
+    // AddIDCapacity grows each layer's version map by capa * numNodes).
+    BindWorkerToAllLayers<T>(router, spannIndex);
+
+    // Wait for ring from dispatcher
+    BOOST_REQUIRE_MESSAGE(router->WaitForRing(120),
+                          "Worker: Timed out waiting for ring from dispatcher");
+
+    BOOST_TEST_MESSAGE("Worker " << nodeIndex << ": Ready, numNodes=" << numNodes
+                       << " perNodeBatch=" << perNodeBatch);
+
+    // Names of the insert vector / metadata files handed to LoadAndInsertBatch
+    std::string typeStr = Helper::Convert::ConvertToString(valueType);
+    std::string paddset = "perftest_addvector.bin." + typeStr + "_" + std::to_string(insertVectorCount) + "_" + std::to_string(dimension);
+    std::string paddmeta = "perftest_addmeta.bin." + std::to_string(baseVectorCount) + "_" + std::to_string(insertVectorCount);
+    std::string paddmetaidx = "perftest_addmetaidx.bin." + std::to_string(baseVectorCount) + "_" + std::to_string(insertVectorCount);
+
+    // Load the query set that Search commands run against
+    int searchK = topK;
+    std::string pqueryset = "perftest_query.bin." + typeStr + "_" + std::to_string(numQueries) + "_" + std::to_string(dimension);
+    auto queryset = TestUtils::TestDataGenerator<T>::LoadVectorSet(pqueryset, dimension);
+    BOOST_REQUIRE_MESSAGE(queryset != nullptr, "Worker: Failed to load query set from " << pqueryset);
+
+    // Dispatch callback: handles one command per call; Stop fulfils stopPromise exactly once
+    std::promise<void> stopPromise;
+    auto stopFuture = stopPromise.get_future();
+    std::once_flag stopOnce;
+
+    router->SetDispatchCallback([&](const SPANN::DispatchCommand& cmd) -> SPANN::DispatchResult {
+        SPANN::DispatchResult result;
+        result.m_dispatchId = cmd.m_dispatchId;
+        result.m_round = cmd.m_round;
+
+        if (cmd.m_type == SPANN::DispatchCommand::Type::Stop) {
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Stop command received\n", nodeIndex);
+            std::call_once(stopOnce, [&]() { stopPromise.set_value(); });
+            result.m_status = SPANN::DispatchResult::Status::Success;
+            return result;
+        }
+
+        if (cmd.m_type == SPANN::DispatchCommand::Type::Heartbeat) {
+            // Driver sends a Heartbeat every HeartbeatIntervalSec; the result
+            // is dropped by DispatchCoordinator. Acknowledge silently so we
+            // don't log noise every 30s during the insert phase.
+            result.m_status = SPANN::DispatchResult::Status::Success;
+            return result;
+        }
+
+        if (cmd.m_type == SPANN::DispatchCommand::Type::Search) {
+            int myStart = (int)((long long)nodeIndex * numQueries / numNodes); // 64-bit product avoids int overflow
+            int myEnd = (int)((long long)(nodeIndex + 1) * numQueries / numNodes);
+            int myCount = myEnd - myStart;
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Search round %u - %d queries [%d, %d)\n",
+                         nodeIndex, cmd.m_round, myCount, myStart, myEnd);
+
+            std::vector<QueryResult> results;
+            double wallTime = ExecutePartitionedSearch<T>(
+                index.get(), queryset, myStart, myCount, searchK,
+                std::min(numSearchThreads, myCount), // never more threads than queries in this slice
+                results, /*latenciesOut=*/nullptr, /*statsOut=*/nullptr);
+
+            // Drain merge hints accumulated during this search round.
+            // Search-side AsyncMergeInSearch on remote-owned heads enqueues
+            // notifications via QueueRemoteMerge; auto-flush only fires when
+            // a per-target bucket reaches kMergeAutoFlushThreshold, so the
+            // tail of every round (and any sparse rounds) needs an explicit
+            // drain to guarantee no hint is dropped.
+            router->FlushRemoteMerges();
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Search round %u done - %.1fms\n",
+                         nodeIndex, cmd.m_round, wallTime * 1000);
+            result.m_status = SPANN::DispatchResult::Status::Success;
+            result.m_wallTime = wallTime;
+            return result;
+        }
+
+        if (cmd.m_type == SPANN::DispatchCommand::Type::Insert) {
+            int insertStart = cmd.m_round * insertBatchSize + myInsertStart; // round's batch base + this node's offset
+            int loadCount = strideShard ? insertBatchSize : perNodeBatch;
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u - inserting %d vectors (offset %d, strideShard=%d)\n",
+                         nodeIndex, cmd.m_round + 1, perNodeBatch, insertStart, strideShard ? 1 : 0);
+
+            auto t1 = std::chrono::high_resolution_clock::now();
+            std::string workerTag =
+                "Worker " + std::to_string(nodeIndex) + " batch=" + std::to_string(cmd.m_round + 1);
+            LoadAndInsertBatch<T>(spannIndex, paddset, paddmeta, paddmetaidx, dimension,
+                                  insertStart, loadCount, perNodeBatch,
+                                  strideShard, numNodes, nodeIndex,
+                                  numInsertThreads, router,
+                                  /*quantizer=*/nullptr,
+                                  /*searchDuringInsertThreads=*/0,
+                                  /*queryset=*/nullptr,
+                                  /*numQueries=*/0, /*searchK=*/5,
+                                  /*benchmarkData=*/nullptr,
+                                  workerTag.c_str());
+            auto t2 = std::chrono::high_resolution_clock::now();
+            double secs = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1000000.0;
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u done - %d vectors in %.2f s (%.1f vec/s)\n",
+                         nodeIndex, cmd.m_round + 1, perNodeBatch, secs, perNodeBatch / secs);
+
+            result.m_status = SPANN::DispatchResult::Status::Success;
+            result.m_wallTime = secs;
+            return result;
+        }
+
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Worker %d: Unknown command type %d\n",
+                     nodeIndex, (int)cmd.m_type);
+        result.m_status = SPANN::DispatchResult::Status::Failed;
+        return result;
+    });
+
+    SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Waiting for dispatch commands\n", nodeIndex);
+
+    auto status = stopFuture.wait_for(std::chrono::seconds(workerTimeout)); // returns on Stop or after workerTimeout seconds
+    if (status == std::future_status::timeout) {
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Worker %d: Timeout after %ds\n", nodeIndex, workerTimeout);
     }
 
-    //std::filesystem::remove_all(indexPath);
+    router->ClearDispatchCallback(); // the callback captures locals by reference ([&]); cleared before they are destroyed
+    N = oldN; M = oldM; K = oldK; queries = oldQ;
+    BOOST_TEST_MESSAGE("Worker " << nodeIndex << ": Shutting down");
 }
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/Test/src/TestDataGenerator.cpp b/Test/src/TestDataGenerator.cpp
index cb3318548..c32f19e0a 100644
--- a/Test/src/TestDataGenerator.cpp
+++ b/Test/src/TestDataGenerator.cpp
@@ -274,7 +274,8 @@ void TestDataGenerator<T>::GenerateBatchTruth(const std::string &filename, std::
 }
 
 template <typename T>
-float TestDataGenerator<T>::EvaluateRecall(const std::vector<SPTAG::QueryResult> &res, std::shared_ptr<SPTAG::VectorSet> &truth, int recallK, int k, int batch, int totalbatches)
+float TestDataGenerator<T>::EvaluateRecall(const std::vector<SPTAG::QueryResult> &res, std::shared_ptr<SPTAG::VectorSet> &truth, int recallK, int k, int batch, int totalbatches,
+                                           int totalQueries, int queryOffset)
 {
     if (!truth)
     {
@@ -285,14 +286,17 @@ float TestDataGenerator<T>::EvaluateRecall(const std::vector<SPTAG::QueryResult>
     recallK = min(recallK, static_cast<int>(truth->Dimension()));
     float totalRecall = 0.0f;
     float eps = 1e-4f;
-    SizeType distbase = truth->Count() - (totalbatches + 1) * res.size();
+    // Use global queryCount when caller provides it (distributed path); otherwise
+    // assume single-node where res.size() IS the global query count.
+    SizeType queryCount = (totalQueries > 0) ? static_cast<SizeType>(totalQueries) : static_cast<SizeType>(res.size());
+    SizeType distbase = truth->Count() - (totalbatches + 1) * queryCount;
     for (SizeType i = 0; i < res.size(); ++i)
     {
-        const SizeType *truthNN = reinterpret_cast<const SizeType *>(truth->GetData()) + batch * res.size() + i;
+        const SizeType *truthNN = reinterpret_cast<const SizeType *>(truth->GetVector(batch * queryCount + queryOffset + i));
         float *truthD = nullptr;
         if (truth->Count() > distbase)
         {
-            truthD = reinterpret_cast<float *>(truth->GetVector(distbase + batch * res.size() + i));
+            truthD = reinterpret_cast<float *>(truth->GetVector(distbase + batch * queryCount + queryOffset + i));
         }
         for (int j = 0; j < recallK; ++j)
         {
diff --git a/Test/src/main.cpp b/Test/src/main.cpp
index c1a5cde60..ab8d1342c 100644
--- a/Test/src/main.cpp
+++ b/Test/src/main.cpp
@@ -7,9 +7,7 @@
 
 #include <boost/test/tree/visitor.hpp>
 #include <string>
-#ifdef TIKV
 #include <absl/synchronization/mutex.h>
-#endif
 
 using namespace boost::unit_test;
 
@@ -38,9 +36,8 @@ struct GlobalFixture
         // adds GraphCycles bookkeeping under a global spinlock on every Lock();
         // observed to consume ~12% CPU under high worker-thread parallelism in
         // gRPC client paths (perf-recorded 2026-05-06).
-#ifdef TIKV
-    	absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore);
-#endif
+        absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore);
+
         SPTAGVisitor visitor;
         traverse_test_tree(framework::master_test_suite(), visitor, false);
     }
diff --git a/benchmark.ini b/benchmark.ini
new file mode 100644
index 000000000..e2b400767
--- /dev/null
+++ b/benchmark.ini
@@ -0,0 +1,19 @@
+[Benchmark]
+VectorPath=sift1b/base.100M.u8bin
+QueryPath=sift1b/query.public.10K.u8bin
+TruthPath=none
+IndexPath=proidx/spann_index
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=10000
+InsertVectorCount=10000
+DeleteVectorCount=0
+BatchNum=10
+TopK=5
+NumThreads=8
+NumQueries=100
+DistMethod=L2
+Rebuild=true
+Resume=-1
+QuantizerFilePath=quantizer.bin
+QuantizedDim=64
diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md
new file mode 100644
index 000000000..1f24bc865
--- /dev/null
+++ b/evaluation/distributed/README.md
@@ -0,0 +1,294 @@
+# Distributed Benchmark Evaluation — Insert Dominant
+
+Multi-machine SPTAG SPANN distributed benchmark for an **insert-dominant** workload
+(1M base + 10M inserts in 10 batches, with concurrent search-during-insert) on
+SIFT1B. Each physical node runs its own independent PD + TiKV (no shared Raft
+replication — see "TiKV deployment model" below).
+
+## Files in this folder
+
+| File | Purpose |
+| --- | --- |
+| `configs/benchmark_insert_dominant_template.ini` | Benchmark template; `run_distributed.sh` fills `IndexPath`, `TiKVPDAddresses`, `TiKVKeyPrefix`, and `[Distributed]` from `cluster.conf`. |
+| `run_distributed.sh` | Orchestrator: `deploy` / `start-tikv` / `run` / `bench` / `stop-tikv` / `cleanup`. |
+| `README.md` | This file. |
+
+## Architecture
+
+```
+                    ┌──────────────┐
+                    │   Driver     │  (node 0)
+                    │  RunBenchmark│
+                    │   + Router   │
+                    └──┬───┬───┬──┘
+           TCP Dispatch│   │   │
+              ┌────────┘   │   └────────┐
+              ▼            ▼            ▼
+        ┌──────────┐ ┌──────────┐ ┌──────────┐
+        │ Worker 1 │ │ Worker 2 │ │ Worker N │
+        │  + Router│ │  + Router│ │  + Router│
+        └────┬─────┘ └────┬─────┘ └────┬─────┘
+             │            │            │
+             ▼            ▼            ▼
+        ┌──────────┐ ┌──────────┐ ┌──────────┐
+        │  TiKV 1  │ │  TiKV 2  │ │  TiKV N  │ (one PD + one TiKV per node)
+        └──────────┘ └──────────┘ └──────────┘
+```
+
+- **Driver** (node 0): Builds the index, sends Search/Insert/Stop commands via TCP dispatch.
+- **Workers** (nodes 1..N): Receive commands, execute their shard locally, report results back.
+- **TiKV (per node)**: Each node runs its own independent PD + TiKV pair. Postings
+  for a head live on the node that owns that head's hash partition.
+- **PostingRouter**: Hash-based head routing, remote append, head sync, dispatch protocol.
+
+## TiKV deployment model
+
+Unlike a single-machine multi-docker TiKV (3 PD + 3 TiKV behind 127.0.0.1 ports
+22791-3 / 20161-3 sharing one Raft cluster), in this multi-machine setup **each
+node runs its own isolated PD + TiKV pair** under host networking. Heads are
+routed to nodes by hash, and each node's TiKV stores only its own shard. There
+is no Raft replication between nodes (no cross-node region quorum), which is
+intentional for insert-dominated benchmarks where Raft log overhead would dominate.
+
+Per-node ports (defaults from `cluster.conf`):
+
+| Service | Port | Notes |
+| --- | --- | --- |
+| PD client | `2379` | Local app uses `<node_ip>:2379`. |
+| PD peer | `2380` | Inter-PD; isolated cluster of 1 PD per node. |
+| TiKV client | `20161` | The node-local SPTAG worker connects here. |
+| Router | `30001+` | TCP dispatch / posting routing between nodes. |
+
+## Prerequisites
+
+- `Release/SPTAGTest` built with TiKV support on the driver node:
+  ```bash
+  cd <SPTAG_ROOT>
+  cd ThirdParty/kvproto && ./generate_cpp.sh && cd ../..
+  mkdir -p Release && cd Release
+  cmake .. -DTIKV=ON -DTBB=ON -DCMAKE_BUILD_TYPE=Release -DGPU=OFF
+  cmake --build . --target SPTAGTest -j$(nproc)
+  ```
+  *Note: building the full project may fail on the Java wrapper (`JAVASPTAGFileIO`)
+  due to a pre-existing `FileIOInterface.h` signature mismatch — the `SPTAGTest`
+  target alone is sufficient.*
+- Passwordless SSH from driver to every other node (configure `ssh_key` in `cluster.conf`).
+- Docker installed on every node (TiKV/PD run as containers in host network mode).
+- Same dataset path on every node (default `/mnt/nvme/sift1b/`):
+  - `/mnt/nvme/sift1b/bigann_base.u8bin` (1B × 128 × u8)
+  - `/mnt/nvme/sift1b/query.10K.u8bin`
+- Same fast-storage path for index + TiKV data on every node (`data_dir` in `cluster.conf`,
+  default `/mnt/nvme`).
+
+## Step 1 — Cluster config
+
+```bash
+cp evaluation/distributed/cluster.conf.example cluster.conf
+vim cluster.conf
+```
+
+Example:
+
+```ini
+[cluster]
+ssh_user=superbench
+sptag_dir=/home/superbench/zhangt/SPTAG
+data_dir=/mnt/nvme
+tikv_version=v7.5.1
+pd_version=v7.5.1
+
+[nodes]
+# host           router_port
+10.0.1.1         30001          # driver (always first)
+10.0.1.2         30002          # worker 1
+10.0.1.3         30003          # worker 2
+
+[tikv]
+# host           pd_client  pd_peer  tikv_port
+10.0.1.1         2379       2380     20161
+10.0.1.2         2379       2380     20161
+10.0.1.3         2379       2380     20161
+```
+
+`run_distributed.sh` reads this file to fill the template's `[Distributed]`,
+`TiKVPDAddresses`, `IndexPath`, and `TiKVKeyPrefix` automatically.
+
+## Step 2 — Deploy
+
+```bash
+./evaluation/distributed/run_distributed.sh deploy cluster.conf
+```
+
+This rsyncs `Release/SPTAGTest` (and required shared libs) to every node and
+ensures the per-node TiKV / PD data directories exist under `data_dir`.
+
+## Step 3 — Start TiKV (per-node, independent)
+
+```bash
+./evaluation/distributed/run_distributed.sh start-tikv cluster.conf
+```
+
+This starts one PD + one TiKV per node in host-network containers. Single-replica
+placement (`max-replicas=1`) is set so we measure benchmark performance without
+3-way Raft replication.
+
+Health check (run on the driver; the loop queries every node's PD):
+
+```bash
+for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do
+  curl -s "http://$ip:2379/pd/api/v1/stores" \
+    | python3 -c 'import json,sys; print([s["store"]["state_name"] for s in json.load(sys.stdin)["stores"]])'
+done
+# Each node should report ['Up'].
+```
+
+### Pre-split & scatter (optional but recommended)
+
+To spread the insert-dominant workload's writes evenly across the regions of a
+node's TiKV, pre-split the keyspace on byte 0 of the little-endian encoding of
+`DBKey(headID) = MaxID*layer + headID`. The TiKV raw key is
+`TiKVKeyPrefix + "_" + uint32_le(DBKey)`; in multi-chunk mode it appends `\x00` /
+`\x02` for chunk / count keys, but we split *only* on the head-key prefix so all
+chunk and count variants for a head share a region. Boundaries used: byte 0 =
+`0x02, 0x04, …, 0xfe` (127 split points → 128 regions).
+
+Driver-side helper (each PD is independent, so run per node):
+
+```bash
+PREFIX="bench_insert_dominant_3node"   # keep in sync with KEY_PREFIX in run_distributed.sh
+for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do
+  PD="http://$ip:2379"
+  PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v7.5.1 -u "$PD")
+  python3 - "$PREFIX" "${PDCTL[@]}" <<'PY'
+import json, subprocess, sys
+prefix = sys.argv[1].encode() + b'_'
+pdctl = sys.argv[2:]
+def run(args): return subprocess.check_output(pdctl + args, text=True)
+def region_for(hex_key): return json.loads(run(['region', 'key', '--format=hex', hex_key]))['id']
+for b in range(2, 256, 2):
+    key = (prefix + bytes([b, 0, 0, 0])).hex()
+    rid = region_for(key)
+    run(['operator', 'add', 'split-region', str(rid), '--policy=usekey', '--keys', key])
+for r in json.loads(run(['region', 'scan']))['regions']:
+    run(['operator', 'add', 'scatter-region', str(r['id'])])
+PY
+done
+```
+
+Skip this on the very first run if you don't have load skew — `start-tikv` works
+without it. For 1B-scale insert-dominant runs on a single node it materially
+reduces head-region hot-spotting.
+
+## Step 4 — Run the benchmark
+
+```bash
+# Single scale, explicit node count (driver + (N-1) workers):
+./evaluation/distributed/run_distributed.sh run cluster.conf insert_dominant 3
+
+# Or sweep 1-node baseline + N-node distributed for one or more scales:
+./evaluation/distributed/run_distributed.sh bench cluster.conf insert_dominant
+```
+
+What `run` does:
+
+1. **Build** (driver only): driver builds the index locally with router
+   *disabled* (`Rebuild=true`, no `[Router]`). Output goes to `…_n0/spann_index`.
+2. **Distribute**: rsync head index + perftest files from driver to each worker.
+3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` and
+   the per-node ini (router enabled, `Rebuild=false`).
+4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. The
+   driver dispatches Insert / Search commands across batches via TCP.
+5. **Collect**: driver sends Stop, joins worker logs into `benchmark_logs/`.
+
+Useful environment overrides (see header of `run_distributed.sh`):
+
+- `NOCACHE=1` — disable TiKV block cache, OS pagecache, and `VersionCacheMaxChunks`.
+- `BUILD_WITH_CACHE=1` — build with caches, then drop caches before search/insert (NOCACHE only).
+- `SKIP_TIKV_SWAP=1` — when using `BUILD_WITH_CACHE`, skip the destructive TiKV
+  container restart that has corrupted recall at 100M scale.
+- `SKIP_SAVE_LOAD=1` — skip post-build SaveIndex / per-batch Load+Clone+Save (NOCACHE only).
+- `SKIP_HEAD_BUILD=1` — reuse existing HeadIndex if present (RebuildSSDOnly).
+
+## Step 5 — Stop / cleanup
+
+```bash
+./evaluation/distributed/run_distributed.sh stop-tikv cluster.conf
+./evaluation/distributed/run_distributed.sh cleanup cluster.conf   # remove deployed files
+```
+
+## Key knobs in `benchmark_insert_dominant_template.ini`
+
+| Key | Value | Meaning |
+| --- | --- | --- |
+| `BaseVectorCount` | 1_000_000 | Initial index build size. |
+| `InsertVectorCount` / `BatchNum` | 1_000_000 / 1 | One batch of 1M inserts as shipped; the 10× scale-up variant uses 10_000_000 / 10 (10 batches × 1M inserts). |
+| `NumSearchThreads` | 4 | Threads for the standalone post-batch query benchmark. |
+| `NumInsertThreads` | 4 | Threads driving `AddIndex` calls on the driver. |
+| `AppendThreadNum` | 16 | Async append worker pool size. Each thread is I/O-bound on TiKV RPCs, so overprovisioning (e.g. 144 ≈ 3× cores) increases in-flight RPCs. |
+| `NumSearchDuringInsertThreads` | 1 | Concurrent search threads while inserting (continuous loop, ~1s sleep per query). |
+| `NumQueries` | 200 | Size of the rotating query pool (in-insert search loops over it). |
+| `WorkerTimeout` | 14400 | Seconds a worker waits for the driver before exiting. |
+| `Storage` / `TiKVKeyPrefix` / `TiKVPDAddresses` | `TIKVIO` / filled / filled | Filled by `run_distributed.sh` from `cluster.conf`. |
+| `Layers` | 2 | SPANN multi-layer head. |
+| `BuildSSDIndex.UseMultiChunkPosting` | false | Single-key posting layout (one TiKV value per head). |
+| `BuildSSDIndex.PostingPageLimit` | 8 | Posting page limit; runtime cap is logged as ~246 vectors. |
+| `BuildSSDIndex.PostingCountCacheCapacity` | 1_000_000 | Posting-count cache capacity. |
+| `BuildSSDIndex.DistributedVersionMap` | true | Use TiKV-backed distributed version map. |
+| `BuildSSDIndex.ReassignK` | 64 | Split/reassign target fanout knob. |
+| `BuildSSDIndex.AsyncMergeInSearch` | true | Async merge during search. |
+| `BuildSSDIndex.VersionCacheMaxChunks` | 100_000 | Local version-chunk cache (set ≤0 to disable). |
+| `BuildSSDIndex.LatencyLimit` | 100 | ms latency cap fed to SPANN. |
+| `BuildSSDIndex.MaxCheck` | 8192 | Max posting checks per query. |
+| `BuildSSDIndex.SearchInternalResultNum` | 64 | Internal candidate count during search. |
+
+## Output JSON structure (per batch)
+
+For each insert batch, `output.json/results.benchmark1_insert.batch_N` contains:
+
+- `Load timeSeconds` / `Load vectorCount` — reload of previous batch.
+- `Clone timeSeconds`.
+- In-insert concurrent search stats (continuous-loop variant):
+  `numQueries` (actual count issued), `meanLatency`, `p50/p90/p95/p99`, `qps`,
+  `batch barrier waitSeconds`.
+- `inserted`, `insert timeSeconds`, `insert throughput`.
+- `search` and `search_round2` — standalone `BenchmarkQueryPerformance` results
+  against the post-batch index (cold + warm), independent of the in-insert numbers.
+- `save timeSeconds`.
+
+Pre-insert baseline lives at `results.benchmark0_query_before_insert` and
+`results.benchmark0b_query_before_insert_round2`.
+
+## Dispatch Protocol
+
+The TCP dispatch protocol replaces file-based barriers. Communication flows through
+PostingRouter's existing TCP transport:
+
+| Packet | Direction | Purpose |
+|--------|-----------|---------|
+| `DispatchCommand (0x09)` | Driver → Worker | Search/Insert/Stop with `dispatchId` + round. |
+| `DispatchResult (0x89)` | Worker → Driver | Status + wallTime for aggregation. |
+
+- **Search**: Driver broadcasts to workers, runs local queries in parallel, collects
+  wall times for percentile stats.
+- **Insert**: Driver broadcasts batch index, workers insert their shard, driver
+  waits for all to finish.
+- **Stop**: Driver sends at end of benchmark; workers exit gracefully.
+
+Each command has a unique `dispatchId` (monotonic uint64) to avoid round collisions
+between search and insert operations.
+
+## Troubleshooting
+
+- **Workers don't connect**: confirm `RouterNodeAddrs` ports (default 30001+) are
+  reachable between every pair of nodes — the router uses TCP with 2 io_context
+  threads.
+- **TiKV timeout**: ensure each node's PD `advertise-client-urls` use a reachable
+  IP (not 127.0.0.1) — `start-tikv` sets this from `cluster.conf`. Check
+  `docker logs sptag-pd-0` on the affected node.
+- **Worker exits prematurely**: check the worker logs in `benchmark_logs/`.
+  Common causes: TiKV not ready, index path mismatch, router connection failure.
+- **Build fails on Java wrapper**: pre-existing issue unrelated to the benchmark.
+  Build only what's needed:
+  ```bash
+  cmake --build . --target SPTAGTest -j$(nproc)
+  ```
diff --git a/evaluation/distributed/configs/benchmark_100m_1node.ini b/evaluation/distributed/configs/benchmark_100m_1node.ini
new file mode 100644
index 000000000..42ec07f49
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_100m_1node.ini
@@ -0,0 +1,71 @@
+; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload).
+; 100× larger base index than insert_dominant. Tests how the system behaves when
+; the head index is large (~tens of millions of heads on layer 0) and the insert
+; rate is moderate. Layers=2, L2 distance, SIFT1B dataset.
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+;
+; Notes for 100M-scale operation:
+;   - First run MUST build the index (Rebuild=true). Build of 99M base takes hours;
+;     reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the
+;     HeadIndex on disk is intact.
+;   - Truth (top-5 over 99M) is recomputed at the start of each run; expect ~minutes.
+;   - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts;
+;     use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle.
+;   - TiKV data will grow to ~50-100GB per store at this scale; both nodes need
+;     plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
+QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
+TruthPath=truth
+IndexPath=/mnt/nvme/proidx_100m_1node/spann_index
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=99000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=true
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=bench100m_1node
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=10000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=1000000
+AsyncRpcMaxInflight=512
+
+[Distributed]
+Enabled=true
+DispatcherAddr=10.11.0.7:30001
+WorkerAddrs=10.11.0.7:30011
+StoreAddrs=10.11.0.7:20171
+PDAddrs=10.11.0.7:23791
diff --git a/evaluation/distributed/configs/benchmark_100m_2node.ini b/evaluation/distributed/configs/benchmark_100m_2node.ini
new file mode 100644
index 000000000..01b9c3e81
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_100m_2node.ini
@@ -0,0 +1,71 @@
+; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload).
+; 100× larger base index than insert_dominant. Tests how the system behaves when
+; the head index is large (~tens of millions of heads on layer 0) and the insert
+; rate is moderate. Layers=2, L2 distance, SIFT1B dataset.
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+;
+; Notes for 100M-scale operation:
+;   - First run MUST build the index (Rebuild=true). Build of 99M base takes hours;
+;     reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the
+;     HeadIndex on disk is intact.
+;   - Truth (top-5 over 99M) is recomputed at the start of each run; expect ~minutes.
+;   - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts;
+;     use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle.
+;   - TiKV data will grow to ~50-100GB per store at this scale; both nodes need
+;     plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
+QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
+TruthPath=truth
+IndexPath=/mnt/nvme/proidx_100m_2node/spann_index
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=99000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=false
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=bench100m_2node
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=10000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=1000000
+AsyncRpcMaxInflight=512
+
+[Distributed]
+Enabled=true
+DispatcherAddr=10.11.0.7:30001
+WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002
+StoreAddrs=10.11.0.7:20171,10.11.0.10:20171
+PDAddrs=10.11.0.7:23791,10.11.0.10:23791
diff --git a/evaluation/distributed/configs/benchmark_100m_template.ini b/evaluation/distributed/configs/benchmark_100m_template.ini
new file mode 100644
index 000000000..4a69f39a4
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_100m_template.ini
@@ -0,0 +1,71 @@
+; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload).
+; 100× larger base index than insert_dominant. Tests how the system behaves when
+; the head index is large (~tens of millions of heads on layer 0) and the insert
+; rate is moderate. Layers=2, L2 distance, SIFT1B dataset.
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+;
+; Notes for 100M-scale operation:
+;   - First run MUST build the index (Rebuild=true). Build of 99M base takes hours;
+;     reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the
+;     HeadIndex on disk is intact.
+;   - Truth (top-5 over 99M) is recomputed at the start of each run; expect ~minutes.
+;   - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts;
+;     use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle.
+;   - TiKV data will grow to ~50-100GB per store at this scale; both nodes need
+;     plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
+QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
+TruthPath=truth
+IndexPath=PLACEHOLDER
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=99000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=true
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=PLACEHOLDER
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=10000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=1000000
+AsyncRpcMaxInflight=512
+
+[Distributed]
+Enabled=true
+DispatcherAddr=PLACEHOLDER
+WorkerAddrs=PLACEHOLDER
+StoreAddrs=PLACEHOLDER
+PDAddrs=PLACEHOLDER
diff --git a/evaluation/distributed/configs/benchmark_10m_1node.ini b/evaluation/distributed/configs/benchmark_10m_1node.ini
new file mode 100644
index 000000000..56dbd9088
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_10m_1node.ini
@@ -0,0 +1,62 @@
+; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload).
+; 10× larger base index than insert_dominant, 10× smaller than 100m.
+; Useful for validating scaling between 1M and 100M without paying the
+; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset
+; (truncated to 10M of the 1B available).
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
+QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
+TruthPath=truth
+IndexPath=/mnt/nvme/proidx_10m_1node/spann_index
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=9000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=true
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=bench10m_1node
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=1000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=1000000
+AsyncRpcMaxInflight=512
+
+[Distributed]
+Enabled=true
+DispatcherAddr=10.11.0.7:30001
+WorkerAddrs=10.11.0.7:30011
+StoreAddrs=10.11.0.7:20171
+PDAddrs=10.11.0.7:23791
diff --git a/evaluation/distributed/configs/benchmark_10m_2node.ini b/evaluation/distributed/configs/benchmark_10m_2node.ini
new file mode 100644
index 000000000..4ed317ac3
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_10m_2node.ini
@@ -0,0 +1,62 @@
+; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload).
+; 10× larger base index than insert_dominant, 10× smaller than 100m.
+; Useful for validating scaling between 1M and 100M without paying the
+; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset
+; (truncated to 10M of the 1B available).
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
+QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
+TruthPath=truth
+IndexPath=/mnt/nvme/proidx_10m_2node/spann_index
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=9000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=false
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=bench10m_2node
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=1000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=1000000
+AsyncRpcMaxInflight=512
+
+[Distributed]
+Enabled=true
+DispatcherAddr=10.11.0.7:30001
+WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002
+StoreAddrs=10.11.0.7:20171,10.11.0.10:20171
+PDAddrs=10.11.0.7:23791,10.11.0.10:23791
diff --git a/evaluation/distributed/configs/benchmark_10m_template.ini b/evaluation/distributed/configs/benchmark_10m_template.ini
new file mode 100644
index 000000000..f40203559
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_10m_template.ini
@@ -0,0 +1,62 @@
+; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload).
+; 10× larger base index than insert_dominant, 10× smaller than 100m.
+; Useful for validating scaling between 1M and 100M without paying the
+; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset
+; (truncated to 10M of the 1B available).
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
+QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
+TruthPath=truth
+IndexPath=PLACEHOLDER
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=9000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=true
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=PLACEHOLDER
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=1000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=1000000
+AsyncRpcMaxInflight=512
+
+[Distributed]
+Enabled=true
+DispatcherAddr=PLACEHOLDER
+WorkerAddrs=PLACEHOLDER
+StoreAddrs=PLACEHOLDER
+PDAddrs=PLACEHOLDER
diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini
new file mode 100644
index 000000000..30fe77bbe
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini
@@ -0,0 +1,58 @@
+; insert_dominant: 1M base + 1M insert with concurrent search-during-insert.
+; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M).
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
+QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
+TruthPath=truth
+IndexPath=/mnt/nvme/proidx_insert_dominant_1node/spann_index
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=1000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=true
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=benchinsert_dominant_1node
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=1000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=100000
+
+[Distributed]
+Enabled=true
+DispatcherAddr=10.11.0.7:30001
+WorkerAddrs=10.11.0.7:30011
+StoreAddrs=10.11.0.7:20171
+PDAddrs=10.11.0.7:23791
diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini
new file mode 100644
index 000000000..d45870b50
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini
@@ -0,0 +1,58 @@
+; insert_dominant: 1M base + 1M insert with concurrent search-during-insert.
+; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M).
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
+QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
+TruthPath=truth
+IndexPath=/mnt/nvme/proidx_insert_dominant_2node/spann_index
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=1000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=false
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=benchinsert_dominant_2node
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=1000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=100000
+
+[Distributed]
+Enabled=true
+DispatcherAddr=10.11.0.7:30001
+WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002
+StoreAddrs=10.11.0.7:20171,10.11.0.10:20171
+PDAddrs=10.11.0.7:23791,10.11.0.10:23791
diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini
new file mode 100644
index 000000000..a8050732d
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini
@@ -0,0 +1,59 @@
+; insert_dominant: 1M base + 1M insert as configured below, with concurrent
+; search-during-insert. For the 10× scale-up variant (insert volume ≫ base), raise
+; InsertVectorCount / BatchNum (e.g. 10000000 / 10). Layers=2, L2, SIFT1B dataset.
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/data/sift1b/base.1B.u8bin
+QueryPath=/mnt/data/sift1b/query.public.10K.u8bin
+TruthPath=truth
+IndexPath=/mnt/md0/proidx_insert_dominant_3node/spann_index
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=1000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=false
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=benchinsert_dominant_3node
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=1000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=100000
+
+[Distributed]
+Enabled=true
+DispatcherAddr=172.27.0.4:30001
+WorkerAddrs=172.27.0.4:30011,172.27.0.5:30002,172.27.0.6:30003
+StoreAddrs=172.27.0.4:20171,172.27.0.5:20171,172.27.0.6:20171
+PDAddrs=172.27.0.4:23791,172.27.0.5:23791,172.27.0.6:23791
diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_template.ini b/evaluation/distributed/configs/benchmark_insert_dominant_template.ini
new file mode 100644
index 000000000..f8085c03b
--- /dev/null
+++ b/evaluation/distributed/configs/benchmark_insert_dominant_template.ini
@@ -0,0 +1,58 @@
+; insert_dominant: 1M base + 1M insert with concurrent search-during-insert.
+; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M).
+;
+; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
+; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
+[Benchmark]
+WorkerTimeout=14400
+VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
+QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
+TruthPath=truth
+IndexPath=PLACEHOLDER
+ValueType=UInt8
+Dimension=128
+BaseVectorCount=1000000
+InsertVectorCount=1000000
+DeleteVectorCount=0
+BatchNum=1
+TopK=5
+NumSearchThreads=4
+NumInsertThreads=4
+AppendThreadNum=16
+NumSearchDuringInsertThreads=1
+NumQueries=200
+DistMethod=L2
+Rebuild=true
+BuildOnly=false
+Resume=-1
+Layers=2
+
+Storage=TIKVIO
+TiKVPDAddresses=PLACEHOLDER
+TiKVKeyPrefix=PLACEHOLDER
+
+[SelectHead]
+ParallelBKTBuild=true
+
+[BuildHead]
+ParallelBKTBuild=true
+
+[BuildSSDIndex]
+LatencyLimit=100
+MaxCheck=8192
+SearchInternalResultNum=64
+UseMultiChunkPosting=false
+PostingPageLimit=8
+PostingCountCacheCapacity=1000000
+SearchCheckVersionMapOnlyLayer0=true
+DistributedVersionMap=true
+ReassignK=64
+AsyncMergeInSearch=true
+VersionCacheMaxChunks=100000
+
+[Distributed]
+Enabled=true
+DispatcherAddr=PLACEHOLDER
+WorkerAddrs=PLACEHOLDER
+StoreAddrs=PLACEHOLDER
+PDAddrs=PLACEHOLDER
diff --git a/evaluation/distributed/configs/cluster_2node.conf b/evaluation/distributed/configs/cluster_2node.conf
new file mode 100644
index 000000000..f94500487
--- /dev/null
+++ b/evaluation/distributed/configs/cluster_2node.conf
@@ -0,0 +1,31 @@
+# 2-node cluster: driver/worker0 on dev-000003 (10.11.0.7),
+#                 worker1 on dev-000006 (10.11.0.10).
+# On 000006, /mnt/nvme is symlinked to /mnt_ssd/data7/sptag-bench (data lives on data7 NVMe).
+#
+# Cluster mode: SHARED TiKV raft cluster. Both PDs form one raft group; both
+# TiKVs share the same cluster (max-replicas=1, so each region lives on
+# exactly one store and PD routes reads to it). Compute nodes are stateless
+# TiKV clients — no cross-compute fetch RPCs during RNGSelection.
+[cluster]
+ssh_user=superbench
+ssh_key=/home/superbench/.ssh/id_rsa
+sptag_dir=/home/superbench/zhangt/SPTAG
+data_dir=/mnt/nvme
+tikv_version=v8.5.1
+pd_version=v8.5.1
+# Image refs (optional). Defaults:
+#   tikv_image=sptag-tikv               (with tag :${tikv_version})
+#   pd_image=sptag-pd                   (with tag :${pd_version})
+#   helper_image=mcr.microsoft.com/mirror/docker/library/ubuntu:22.04
+# Override here to use different registries / replace with pingcap/* etc.
+
+[nodes]
+# host         router_port
+# node 0 (driver) router_port must differ from dispatcher port (hardcoded 30001).
+10.11.0.7      30011
+10.11.0.10     30002
+
+[tikv]
+# host         pd_client_port  pd_peer_port  tikv_port
+10.11.0.7      23791           23801          20171
+10.11.0.10     23791           23801          20171
diff --git a/evaluation/distributed/configs/cluster_3node.conf b/evaluation/distributed/configs/cluster_3node.conf
new file mode 100644
index 000000000..ff2ba8af4
--- /dev/null
+++ b/evaluation/distributed/configs/cluster_3node.conf
@@ -0,0 +1,34 @@
+# 3-node cluster: driver/worker0 on 172.27.0.4,
+#                 worker1 on 172.27.0.5 (20.92.202.166),
+#                 worker2 on 172.27.0.6 (20.5.138.158).
+# Data lives on /mnt/md0 (NVMe RAID0, ~11T per node).
+#
+# Cluster mode: SHARED TiKV raft cluster. All PDs form one raft group; all
+# TiKVs share the same cluster (max-replicas=1, so each region lives on
+# exactly one store and PD routes reads to it). Compute nodes are stateless
+# TiKV clients — no cross-compute fetch RPCs during RNGSelection.
+[cluster]
+ssh_user=azureuser
+ssh_key=/home/azureuser/.ssh/id_rsa
+sptag_dir=/home/azureuser/zhangt/SPTAG
+data_dir=/mnt/md0
+tikv_version=v8.5.1
+pd_version=v8.5.1
+# Image refs (optional). Defaults:
+#   tikv_image=sptag-tikv               (with tag :${tikv_version})
+#   pd_image=sptag-pd                   (with tag :${pd_version})
+#   helper_image=mcr.microsoft.com/mirror/docker/library/ubuntu:22.04
+# Override here to use different registries / replace with pingcap/* etc.
+
+[nodes]
+# host         router_port
+# node 0 (driver) router_port must differ from dispatcher port (hardcoded 30001).
+172.27.0.4      30011
+172.27.0.5      30002
+172.27.0.6      30003
+
+[tikv]
+# host         pd_client_port  pd_peer_port  tikv_port
+172.27.0.4      23791           23801          20171
+172.27.0.5      23791           23801          20171
+172.27.0.6      23791           23801          20171
diff --git a/evaluation/distributed/configs/tikv.toml b/evaluation/distributed/configs/tikv.toml
new file mode 100755
index 000000000..4ba5282c0
--- /dev/null
+++ b/evaluation/distributed/configs/tikv.toml
@@ -0,0 +1,74 @@
+memory-usage-limit = "80GB"
+
+[server]
+# v41: 16 → 32 to handle more concurrent gRPC streams. The 96-core host has
+# plenty of headroom; the previous value was an untuned guess.
+grpc-concurrency = 32
+grpc-memory-pool-quota = "16GB"
+
+[raftstore]
+region-max-size = "512MB"
+region-split-size = "384MB"
+region-max-keys = 5120000
+region-split-keys = 3840000
+# v41: 4 → 32. apply-pool is the path raft-log → RocksDB writes go through.
+# At 32 concurrent RMW ops per store (4 local insert + 16 receiver sub-workers
+# + 4 search + 4 search-during-insert + misc), a 4-thread apply pool meant
+# ~8× queue depth, which is the primary write-amp source we observed
+# (TiKV at 7/96 cores while ops are still queueing).
+apply-pool-size = 32
+# v41: 4 → 16. store-pool routes raft messages between peers and to apply.
+store-pool-size = 16
+# v41: batch up raft entries per fsync. If we're disk-fsync bound (likely),
+# this directly amortizes the sync cost.
+raft-write-batch-size = "1MB"
+
+[storage]
+reserve-space = "1GB"
+# v41: 4 (default) → 16. KV scheduler is the front-end before raftstore.
+scheduler-worker-pool-size = 16
+
+[storage.block-cache]
+capacity = "60GB"
+
+# v41: new section. Read pool default = 0.8×CPU = 76 on 96-core host, which
+# would let reads steal CPU from writes. Cap at 32 to leave room for write
+# path. Min 8 ensures reads stay responsive under light load.
+[readpool.unified]
+max-thread-count = 32
+min-thread-count = 8
+
+[rocksdb]
+max-background-jobs = 32
+max-sub-compactions = 8
+# v41: 8 dedicated flush threads (subset of max-background-jobs). Reduces
+# the chance that compaction monopolizes background-jobs and starves flushes.
+max-background-flushes = 8
+rate-bytes-per-sec = "0"
+
+[rocksdb.defaultcf]
+# v41: 512MB → 1GB. Bigger memtable means fewer flushes (and thus fewer L0
+# files), reducing the chance of slowdown/stop write triggers under burst.
+write-buffer-size = "1GB"
+# v41: 5 → 8. More memtables = more headroom before flush back-pressure.
+max-write-buffer-number = 8
+min-write-buffer-number-to-merge = 2
+level0-file-num-compaction-trigger = 12
+# v41: slowdown trigger 28 → 40, stop trigger 40 → 60. Loosens the L0 stall
+# thresholds so bursts have more slack. With 10K-item chunks (v39+) we generate
+# more small writes than v38 did, so we hit slowdown more often.
+level0-slowdown-writes-trigger = 40
+level0-stop-writes-trigger = 60
+max-bytes-for-level-base = "2GB"
+compression-per-level = ["no", "no", "no", "lz4", "lz4", "zstd", "zstd"]
+target-file-size-base = "128MB"
+
+[rocksdb.writecf]
+write-buffer-size = "128MB"
+max-write-buffer-number = 5
+
+[coprocessor]
+region-max-size = "512MB"
+region-split-size = "384MB"
+region-max-keys = 5120000
+region-split-keys = 3840000
diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh
new file mode 100755
index 000000000..c383a7eed
--- /dev/null
+++ b/evaluation/distributed/run_distributed.sh
@@ -0,0 +1,1364 @@
+#!/bin/bash
+# Multi-machine distributed benchmark orchestrator for SPTAG.
+#
+# Usage:
+#   ./run_distributed.sh deploy     <cluster.conf>                Deploy binary + data to all nodes
+#   ./run_distributed.sh setup-bins <cluster.conf>                Download tikv-server / pd-server to every node
+#   ./run_distributed.sh start-tikv <cluster.conf> [node_count]   Start TiKV/PD (standalone for 1 node, shared raft cluster for N>=2)
+#   ./run_distributed.sh stop-tikv  <cluster.conf> [node_count]   Stop TiKV/PD instances
+#   ./run_distributed.sh run        <cluster.conf> <scale> <node_count>  Run benchmark
+#   ./run_distributed.sh bench      <cluster.conf> <scale> [scale...]    Run 1-node + N-node for each scale
+#   ./run_distributed.sh cleanup    <cluster.conf>                Remove deployed files from remote nodes
+#
+# Environment variables:
+#   NOCACHE=1          Disable all caches (TiKV block cache, OS page cache, VersionCache)
+#   BUILD_WITH_CACHE=1 (only with NOCACHE=1) Use cached TiKV+VersionCache during the
+#                      build phase, then restart TiKV with nocache config and drop all
+#                      OS caches before the search/insert phase. Useful for large scales
+#                      (e.g. 100M) where building under nocache is impractical.
+#   SKIP_TIKV_SWAP=1   (only with BUILD_WITH_CACHE=1) Skip the TiKV container restart.
+#                      Drop OS caches and rely on VersionCache=0 INI overrides for "nocache"
+#                      semantics. Avoids docker rm -f corruption that has destroyed recall
+#                      at 100M scale; TiKV block cache stays warm but contains mostly recent
+#                      build writes (random search reads largely miss it anyway).
+#   SKIP_SAVE_LOAD=1   (only with NOCACHE=1) Bypass the post-build SaveIndex / per-batch
+#                      LoadIndex / Clone / SaveIndex cycles. For 1-node, build+search+insert
+#                      run in a single SPTAGTest process, dropping OS pagecache after build.
+#                      For 2-node, the build phase skips the broken final SaveIndex (relies
+#                      on the index files written during BuildLargeIndex). Required at 100M
+#                      scale where SaveIndex's "wait for all background jobs to finish" loop
+#                      never terminates and risks a gRPC SEGFAULT after several hours.
+#                      VersionCache cannot be reset mid-process so it stays warm from build.
+#   SKIP_HEAD_BUILD=1  Reuse existing HeadIndex if present (RebuildSSDOnly). Falls back to
+#                      full build if HeadIndex is missing.
+#
+# Prerequisites:
+#   - Passwordless SSH from driver to all nodes (configure ssh_key in cluster.conf)
+#   - Docker installed on all nodes (for TiKV)
+#   - cluster.conf configured (see cluster.conf.example)
+#
+# The driver (first node in [nodes]) orchestrates everything.
+# Compute nodes share a single TiKV raft cluster: all PDs join one raft group,
+# all TiKVs point to all PDs, max-replicas=1 (no replication, each region on
+# exactly one store). With 2 nodes this gives 2 PDs + 2 TiKV stores in one
+# cluster; any compute can read any posting via PD-routed TiKV calls, so the
+# distributed routing layer no longer needs to forward reads between computes.
+
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+LOGDIR="$(cd "$SCRIPT_DIR/../.." && pwd)/benchmark_logs"
+mkdir -p "$LOGDIR"
+
+# ─── Config Parsing ───
+
+declare -a NODE_HOSTS NODE_ROUTER_PORTS
+declare -a TIKV_HOSTS TIKV_PD_CLIENT_PORTS TIKV_PD_PEER_PORTS TIKV_PORTS
+declare SSH_USER SPTAG_DIR DATA_DIR TIKV_VERSION PD_VERSION SSH_KEY
+declare TIKV_IMAGE PD_IMAGE HELPER_IMAGE BIN_DIR MIRROR
+TOTAL_NODES=0
+
+parse_config() {  # Load <cluster.conf> into the globals declared above; exits 1 if the file or a required section is missing.
+    local CONF="$1"
+    if [ ! -f "$CONF" ]; then
+        echo "ERROR: Config file not found: $CONF" >&2
+        exit 1
+    fi
+
+    local SECTION="" line host rport pd_client pd_peer tikv_port  # keep the read targets out of global scope
+
+    while IFS= read -r line || [ -n "$line" ]; do
+        # Strip comments and whitespace
+        line="${line%%#*}"
+        line="$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
+        [ -z "$line" ] && continue
+
+        # Section header
+        if [[ "$line" =~ ^\[(.+)\]$ ]]; then
+            SECTION="${BASH_REMATCH[1]}"
+            continue
+        fi
+
+        case "$SECTION" in
+            cluster)
+                local key="${line%%=*}"
+                local val="${line#*=}"
+                case "$key" in
+                    ssh_user)     SSH_USER="$val" ;;
+                    sptag_dir)    SPTAG_DIR="$val" ;;
+                    data_dir)     DATA_DIR="$val" ;;
+                    tikv_version) TIKV_VERSION="$val" ;;
+                    pd_version)   PD_VERSION="$val" ;;
+                    tikv_image)   TIKV_IMAGE="$val" ;;
+                    pd_image)     PD_IMAGE="$val" ;;
+                    helper_image) HELPER_IMAGE="$val" ;;
+                    bin_dir)      BIN_DIR="$val" ;;
+                    mirror)       MIRROR="$val" ;;
+                    ssh_key)      SSH_KEY="$val" ;;
+                esac
+                ;;
+            nodes)
+                read -r host rport <<< "$line"
+                NODE_HOSTS+=("$host")
+                NODE_ROUTER_PORTS+=("$rport")
+                ;;
+            tikv)
+                read -r host pd_client pd_peer tikv_port <<< "$line"
+                TIKV_HOSTS+=("$host")
+                TIKV_PD_CLIENT_PORTS+=("$pd_client")
+                TIKV_PD_PEER_PORTS+=("$pd_peer")
+                TIKV_PORTS+=("$tikv_port")
+                ;;
+        esac
+    done < "$CONF"
+
+    # Defaults
+    SSH_USER="${SSH_USER:-$(whoami)}"
+    TIKV_VERSION="${TIKV_VERSION:-v8.5.1}"
+    PD_VERSION="${PD_VERSION:-v8.5.1}"
+    # Single image used for ALL containers (PD, TiKV, helper). Stock MCR
+    # ubuntu:22.04 — never modified, never layered, so security scanners see
+    # only the MCR base image. TiKV / PD binaries are downloaded to the host
+    # at $BIN_DIR by `setup-bins` and bind-mounted into the container.
+    HELPER_IMAGE="${HELPER_IMAGE:-mcr.microsoft.com/mirror/docker/library/ubuntu:22.04}"
+    TIKV_IMAGE="${TIKV_IMAGE:-${HELPER_IMAGE}}"
+    PD_IMAGE="${PD_IMAGE:-${HELPER_IMAGE}}"
+    # Host path on every node where tikv-server / pd-server live. Populated
+    # by `setup-bins`. Mounted read-only into containers as /sptag-bin.
+    BIN_DIR="${BIN_DIR:-${SPTAG_DIR}/evaluation/distributed/bin}"
+    MIRROR="${MIRROR:-https://tiup-mirrors.pingcap.com}"
+
+    # Expand ~ in ssh_key path
+    if [ -n "$SSH_KEY" ]; then
+        SSH_KEY="${SSH_KEY/#\~/$HOME}"
+    fi
+
+    TOTAL_NODES=${#NODE_HOSTS[@]}
+
+    if [ "$TOTAL_NODES" -lt 1 ]; then
+        echo "ERROR: No compute nodes defined in [nodes]" >&2
+        exit 1
+    fi
+    if [ ${#TIKV_HOSTS[@]} -lt 1 ]; then
+        echo "ERROR: No TiKV instances defined in [tikv]" >&2
+        exit 1
+    fi
+
+    echo "Cluster config loaded:"
+    echo "  Compute nodes: $TOTAL_NODES (driver: ${NODE_HOSTS[0]})"
+    echo "  TiKV instances: ${#TIKV_HOSTS[@]}"
+    echo "  SSH user: $SSH_USER"
+    echo "  SSH key: ${SSH_KEY:-(none)}"
+    echo "  SPTAG dir: $SPTAG_DIR"
+    echo "  Data dir: $DATA_DIR"
+}
+
+# ─── SSH Helpers ───
+
+# Emit the option string shared by every ssh/scp/rsync call (identity file appended when SSH_KEY is set)
+_ssh_opts() {
+    local key_arg=""
+    if [ -n "$SSH_KEY" ]; then
+        key_arg=" -i $SSH_KEY"
+    fi
+    echo "-o StrictHostKeyChecking=no -o ConnectTimeout=10${key_arg}"
+}
+
+# Execute a command string on <host>: eval'd in this shell for the driver/localhost, over ssh otherwise
+remote_exec() {
+    local host="$1"
+    shift
+    case "$host" in
+        "${NODE_HOSTS[0]}"|localhost|127.0.0.1) eval "$@" ;;
+        *) ssh $(_ssh_opts) "$SSH_USER@$host" "$@" ;;
+    esac
+}
+
+# rsync <src> to <dst> on <host>; a plain local rsync when <host> is the driver, localhost or 127.0.0.1
+remote_sync() {
+    local host="$1"
+    local src="$2"
+    local dst="$3"
+    if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ] || [ "$host" = "127.0.0.1" ]; then
+        # Local copy — skip if same path
+        if [ "$(realpath "$src")" != "$(realpath "$dst")" ]; then
+            rsync -az --progress "$src" "$dst"
+        fi
+    else
+        rsync -az --progress -e "ssh $(_ssh_opts)" "$src" "$SSH_USER@$host:$dst"
+    fi
+}
+
+# ─── Deploy ───
+
+cmd_deploy() {  # Push SPTAGTest, its shared/runtime libs and perftest_* data from the driver to every other node; exits 1 on failure.
+    echo ""
+    echo "=== Deploying SPTAG to ${#NODE_HOSTS[@]} nodes ==="
+    echo ""
+
+    # Validate SSH connectivity
+    for host in "${NODE_HOSTS[@]}"; do
+        if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi
+        echo -n "  Checking SSH to $host... "
+        if remote_exec "$host" "echo ok" >/dev/null 2>&1; then
+            echo "OK"
+        else
+            echo "FAILED"
+            echo "ERROR: Cannot SSH to $SSH_USER@$host"
+            exit 1
+        fi
+    done
+
+    # Deploy binary to all remote nodes
+    echo ""
+    echo "Deploying binary..."
+    local BINARY="$SPTAG_DIR/Release/SPTAGTest"
+    if [ ! -f "$BINARY" ]; then
+        echo "ERROR: Binary not found: $BINARY (run cmake build first)"
+        exit 1
+    fi
+
+    for host in "${NODE_HOSTS[@]}"; do
+        if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi
+        echo "  → $host:$SPTAG_DIR/Release/"
+        remote_exec "$host" "mkdir -p $SPTAG_DIR/Release"
+        remote_sync "$host" "$BINARY" "$SPTAG_DIR/Release/SPTAGTest"
+        # Also deploy any shared libraries (glob left unquoted: the shell, not rsync, must expand it)
+        if ls "$SPTAG_DIR/Release/"*.so >/dev/null 2>&1; then
+            rsync -az --progress -e "ssh $(_ssh_opts)" "$SPTAG_DIR/Release/"*.so "$SSH_USER@$host:$SPTAG_DIR/Release/"
+        fi
+        # Deploy bundled runtime libs (boost 1.73 / abseil / tbb / libstdc++)
+        # used by SPTAGTest. Not committed; produced locally on the driver.
+        if [ -d "$SPTAG_DIR/Release/runtime_libs" ]; then
+            remote_exec "$host" "mkdir -p $SPTAG_DIR/Release/runtime_libs"
+            rsync -az -e "ssh $(_ssh_opts)" \
+                "$SPTAG_DIR/Release/runtime_libs/" \
+                "$SSH_USER@$host:$SPTAG_DIR/Release/runtime_libs/"
+        fi
+    done
+
+    # Deploy data files (perftest_* vectors, queries)
+    echo ""
+    echo "Deploying data files..."
+    for host in "${NODE_HOSTS[@]}"; do
+        if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi
+        echo "  → $host:$SPTAG_DIR/ (perftest_* files)"
+        remote_exec "$host" "mkdir -p $SPTAG_DIR"
+        rsync -az --progress \
+            --include='perftest_*' --exclude='*' \
+            -e "ssh $(_ssh_opts)" \
+            "$SPTAG_DIR/" "$SSH_USER@$host:$SPTAG_DIR/"
+    done
+
+    echo ""
+    echo "Deploy complete."
+}
+
+# ─── TiKV/PD Binary Setup ───
+
+setup_bins_one_host() {
+    # Ensure tikv-server / pd-server are present at $BIN_DIR on $1; download from $MIRROR if missing or
+    # if --version mismatches (TiKV is matched without the leading "v", PD with it). Idempotent.
+    local host="$1"
+    local cmd
+    # shellcheck disable=SC2016
+    cmd='set -e
+        mkdir -p "'"$BIN_DIR"'"
+        cd "'"$BIN_DIR"'"
+        need_tikv=1
+        if [ -x tikv-server ] && ./tikv-server --version 2>/dev/null | grep -q "Release Version:[[:space:]]*'"${TIKV_VERSION#v}"'"; then
+            need_tikv=0
+        fi
+        if [ "$need_tikv" = "1" ]; then
+            echo "  Downloading tikv-'"${TIKV_VERSION}"'..."
+            curl -fsSL "'"${MIRROR}"'/tikv-'"${TIKV_VERSION}"'-linux-amd64.tar.gz" | tar -xz
+            chmod +x tikv-server
+        else
+            echo "  tikv-'"${TIKV_VERSION}"' already present"
+        fi
+        need_pd=1
+        if [ -x pd-server ] && ./pd-server --version 2>/dev/null | grep -q "Release Version:[[:space:]]*'"${PD_VERSION}"'"; then
+            need_pd=0
+        fi
+        if [ "$need_pd" = "1" ]; then
+            echo "  Downloading pd-'"${PD_VERSION}"'..."
+            curl -fsSL "'"${MIRROR}"'/pd-'"${PD_VERSION}"'-linux-amd64.tar.gz" | tar -xz
+            chmod +x pd-server pd-ctl pd-recover 2>/dev/null || true
+        else
+            echo "  pd-'"${PD_VERSION}"' already present"
+        fi'
+
+    if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ] || [ "$host" = "127.0.0.1" ]; then
+        bash -c "$cmd"  # child shell, so the snippet's `set -e` and `cd` do not leak into this script
+    else
+        remote_exec "$host" "$cmd"  # non-local host: remote_exec ships the snippet over ssh
+    fi
+}
+
+cmd_setup_bins() {
+    # Fetch tikv-server and pd-server into ${BIN_DIR} on each distinct cluster
+    # host (union of compute and TiKV hosts), visiting every host only once.
+    echo
+    printf '=== Setting up TiKV/PD binaries ===\n'
+    printf '  BIN_DIR : %s\n' "$BIN_DIR"
+    printf '  TIKV    : %s\n' "$TIKV_VERSION"
+    printf '  PD      : %s\n' "$PD_VERSION"
+    printf '  MIRROR  : %s\n' "$MIRROR"
+
+    # De-duplicate while keeping first-seen order.
+    local -A visited=()
+    local -a unique_hosts=()
+    local candidate
+    for candidate in "${NODE_HOSTS[@]}" "${TIKV_HOSTS[@]}"; do
+        [ -n "${visited[$candidate]:-}" ] && continue
+        visited[$candidate]=1
+        unique_hosts+=("$candidate")
+    done
+
+    local target
+    for target in "${unique_hosts[@]}"; do
+        printf '\n→ %s\n' "$target"
+        setup_bins_one_host "$target"
+    done
+
+    echo
+    printf 'Binary setup complete.\n'
+}
+
+# ─── TiKV Management (standalone for 1 node, shared raft cluster for N>=2) ───
+
+
+tikv_start() {
+    # Start the first <node_count> PD+TiKV pairs.
+    #
+    # node_count == 1: standalone PD + TiKV (1-node benchmarks).
+    # node_count >= 2: SHARED raft cluster — all PDs join one raft group,
+    #                  all TiKVs point to all PDs. max-replicas=1 so each
+    #                  region lives on exactly one store; PD routes reads
+    #                  to whichever store has the region.
+    local node_count="${1:-${#TIKV_HOSTS[@]}}"
+    echo ""
+    if [ "$node_count" -le 1 ]; then
+        echo "=== Starting 1 standalone TiKV instance ==="
+    else
+        echo "=== Starting $node_count-node SHARED TiKV raft cluster ==="
+    fi
+
+    # Ensure binaries are present on every host that will run a container.
+    # Cheap if already there (version-grep, no download).
+    local i_host i attempt  # loop counters stay local so a caller's own `i` is never clobbered
+    for (( i_host=0; i_host<node_count; i_host++ )); do
+        local h="${TIKV_HOSTS[$i_host]}"
+        # quick presence check; only call full setup if missing
+        local present
+        if [ "$h" = "${NODE_HOSTS[0]}" ] || [ "$h" = "localhost" ] || [ "$h" = "127.0.0.1" ]; then
+            present=$([ -x "$BIN_DIR/tikv-server" ] && [ -x "$BIN_DIR/pd-server" ] && echo yes || echo no)
+        else
+            present=$(remote_exec "$h" "[ -x $BIN_DIR/tikv-server ] && [ -x $BIN_DIR/pd-server ] && echo yes || echo no" 2>/dev/null | tr -d '[:space:]')
+        fi
+        if [ "$present" != "yes" ]; then
+            echo "  → $h: binaries missing, running setup-bins"
+            setup_bins_one_host "$h"
+        fi
+    done
+
+    # Build the initial-cluster string used by every PD.
+    # For 1-node it's a single-member raft; for N>=2 every PD lists all members.
+    local initial_cluster=""
+    for (( i=0; i<node_count; i++ )); do
+        local host="${TIKV_HOSTS[$i]}"
+        local peer_port="${TIKV_PD_PEER_PORTS[$i]}"
+        local pd_name="pd${i}"
+        [ -n "$initial_cluster" ] && initial_cluster+=","
+        initial_cluster+="${pd_name}=http://${host}:${peer_port}"
+    done
+
+    # Build the comma-separated pd-endpoints list for TiKV --pd-endpoints.
+    # For shared mode, every TiKV connects to every PD so PD-raft failover works.
+    local pd_endpoints=""
+    for (( i=0; i<node_count; i++ )); do
+        local host="${TIKV_HOSTS[$i]}"
+        local pd_port="${TIKV_PD_CLIENT_PORTS[$i]}"
+        [ -n "$pd_endpoints" ] && pd_endpoints+=","
+        pd_endpoints+="http://${host}:${pd_port}"
+    done
+
+    # Start PD instances. With node_count >= 2 they form a raft group.
+    echo "Starting PD instances (initial-cluster=${initial_cluster})..."
+    for (( i=0; i<node_count; i++ )); do
+        local host="${TIKV_HOSTS[$i]}"
+        local client_port="${TIKV_PD_CLIENT_PORTS[$i]}"
+        local peer_port="${TIKV_PD_PEER_PORTS[$i]}"
+        local pd_name="pd${i}"
+        echo "  PD $i on $host:$client_port"
+
+        remote_exec "$host" "docker rm -f sptag-pd-$i 2>/dev/null; \
+            docker run -d --name sptag-pd-$i --net host \
+            -v $DATA_DIR/tikv-data/pd-$i:/data \
+            -v ${BIN_DIR}:/sptag-bin:ro \
+            --entrypoint /sptag-bin/pd-server \
+            ${PD_IMAGE} \
+            --name=${pd_name} \
+            --data-dir=/data \
+            --client-urls=http://0.0.0.0:${client_port} \
+            --advertise-client-urls=http://${host}:${client_port} \
+            --peer-urls=http://0.0.0.0:${peer_port} \
+            --advertise-peer-urls=http://${host}:${peer_port} \
+            --initial-cluster=${initial_cluster}"
+    done
+
+    echo "Waiting for PD raft to form..."
+    sleep 5
+
+    # Wait until every PD reports the expected member count (raft quorum up).
+    for (( i=0; i<node_count; i++ )); do
+        local host="${TIKV_HOSTS[$i]}"
+        local pd_port="${TIKV_PD_CLIENT_PORTS[$i]}"
+        for attempt in $(seq 1 60); do
+            local members
+            members=$(curl -sf "http://${host}:${pd_port}/pd/api/v1/members" 2>/dev/null \
+                | python3 -c "import sys,json; d=json.load(sys.stdin); print(len(d.get('members',[])))" 2>/dev/null || echo 0)
+            if [ "$members" -ge "$node_count" ]; then
+                echo "  PD $i ($host:$pd_port) healthy (members=${members})"
+                break
+            fi
+            if [ "$attempt" -eq 60 ]; then
+                echo "  ERROR: PD $i ($host:$pd_port) only sees ${members}/${node_count} members after 60s"
+                return 1
+            fi
+            sleep 1
+        done
+    done
+
+    # NOTE: max-replicas is configured AFTER TiKV starts (see below). Setting
+    # placement rules requires cluster bootstrap, which only happens once a
+    # TiKV store joins. Before bootstrap, /pd/api/v1/config/rule returns 500
+    # ErrNotBootstrapped. We rely on the fact that no data is written until
+    # SPTAGTest connects (which happens after this function returns), so the
+    # brief window where bootstrap uses default max-replicas=3 is harmless.
+
+    # Start TiKV instances pointing at the shared PD endpoints.
+    echo "Starting TiKV instances (pd-endpoints=${pd_endpoints})..."
+    for (( i=0; i<node_count; i++ )); do
+        local host="${TIKV_HOSTS[$i]}"
+        local tikv_port="${TIKV_PORTS[$i]}"
+        echo "  TiKV $i on $host:$tikv_port → shared PD cluster"
+
+        # Deploy tikv.toml to remote host.
+        # When BUILD_WITH_CACHE=1 we always start with the cached config; the search
+        # phase will swap to tikv_nocache.toml via tikv_switch_to_nocache().
+        local TIKV_TOML="$SCRIPT_DIR/configs/tikv.toml"
+        if [[ "${NOCACHE:-0}" == "1" && "${BUILD_WITH_CACHE:-0}" != "1" \
+              && -f "$SCRIPT_DIR/configs/tikv_nocache.toml" ]]; then
+            TIKV_TOML="$SCRIPT_DIR/configs/tikv_nocache.toml"
+            echo "  [NOCACHE] Using tikv_nocache.toml (block cache = 1MB)"
+        elif [[ "${NOCACHE:-0}" == "1" && "${BUILD_WITH_CACHE:-0}" == "1" ]]; then
+            echo "  [NOCACHE+BUILD_WITH_CACHE] Starting with cached tikv.toml (will swap before run phase)"
+        fi
+        if [[ -f "$TIKV_TOML" ]]; then
+            remote_exec "$host" "docker run --rm -v $DATA_DIR/tikv-data:/data ${HELPER_IMAGE} mkdir -p /data/conf"
+            if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ] || [ "$host" = "127.0.0.1" ]; then
+                docker run --rm -v "$DATA_DIR/tikv-data/conf:/conf" -v "$(realpath "$TIKV_TOML"):/src/tikv.toml:ro" ${HELPER_IMAGE} cp /src/tikv.toml /conf/tikv.toml
+            else
+                scp $(_ssh_opts) "$TIKV_TOML" "${SSH_USER}@${host}:${SPTAG_DIR}/tikv.toml"
+                remote_exec "$host" "docker run --rm -v $DATA_DIR/tikv-data/conf:/conf -v ${SPTAG_DIR}/tikv.toml:/src/tikv.toml:ro ${HELPER_IMAGE} cp /src/tikv.toml /conf/tikv.toml"
+            fi
+        fi
+
+        remote_exec "$host" "docker rm -f sptag-tikv-$i 2>/dev/null; \
+            docker run -d --name sptag-tikv-$i --net host \
+            --ulimit nofile=1048576:1048576 \
+            -v $DATA_DIR/tikv-data/tikv-$i:/data \
+            -v $DATA_DIR/tikv-data/conf:/conf \
+            -v ${BIN_DIR}:/sptag-bin:ro \
+            --entrypoint /sptag-bin/tikv-server \
+            ${TIKV_IMAGE} \
+            --config=/conf/tikv.toml \
+            --addr=0.0.0.0:${tikv_port} \
+            --advertise-addr=${host}:${tikv_port} \
+            --data-dir=/data \
+            --pd-endpoints=${pd_endpoints}"
+    done
+
+    echo "Waiting for TiKV stores to register..."
+    sleep 5
+
+    # All stores show up in PD's store list (any PD works — they share state).
+    local pd_host="${TIKV_HOSTS[0]}"
+    local pd_port_first="${TIKV_PD_CLIENT_PORTS[0]}"
+    for attempt in $(seq 1 60); do
+        local store_count
+        store_count=$(curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/stores" 2>/dev/null \
+            | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('count',0))" 2>/dev/null || echo 0)
+        if [ "$store_count" -ge "$node_count" ]; then
+            echo "  All ${store_count} TiKV stores registered"
+            break
+        fi
+        if [ "$attempt" -eq 60 ]; then
+            echo "  WARNING: only ${store_count}/${node_count} TiKV stores registered after 60s"
+        fi
+        sleep 1
+    done
+
+    # Set max-replicas=1 on the shared cluster, NOW that cluster is bootstrapped.
+    #
+    # PD v6+ defaults to enable-placement-rules=true. The authoritative source
+    # for replica count is then the default placement rule, NOT the legacy
+    # max-replicas config. /config POST auto-syncs to the rule but is racy;
+    # we explicitly POST the rule too. Both endpoints require bootstrap.
+    # Bug seen v45: skipping this caused 30%+ of a 1-node run to execute with
+    # max-replicas=3 → PD endlessly tried to schedule replicas onto 1 store
+    # → constant region state changes → gRPC Deadline / region_error storm.
+    echo "Setting max-replicas=1 (default placement rule)..."
+    local target_replicas=1
+    local mr_ok=0
+    for attempt in $(seq 1 30); do
+        curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config" \
+            -X POST -d "{\"max-replicas\": ${target_replicas}}" >/dev/null 2>&1 || true
+        curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config/rule" \
+            -X POST -d "{\"group_id\":\"pd\",\"id\":\"default\",\"start_key\":\"\",\"end_key\":\"\",\"role\":\"voter\",\"count\":${target_replicas}}" \
+            >/dev/null 2>&1 || true
+        sleep 1
+        local got_cfg
+        got_cfg=$(curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config/replicate" 2>/dev/null \
+            | python3 -c 'import sys,json;print(json.load(sys.stdin).get("max-replicas"))' 2>/dev/null)
+        local got_rule
+        got_rule=$(curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config/rule/pd/default" 2>/dev/null \
+            | python3 -c 'import sys,json;print(json.load(sys.stdin).get("count"))' 2>/dev/null)
+        if [ "$got_cfg" = "$target_replicas" ] && [ "$got_rule" = "$target_replicas" ]; then
+            echo "  max-replicas=${target_replicas} set (attempt $attempt, config & rule verified)"
+            mr_ok=1
+            break
+        fi
+        sleep 1
+    done
+    if [ "$mr_ok" != "1" ]; then
+        echo "  ERROR: Failed to set max-replicas=${target_replicas} after 30 attempts. Aborting." >&2
+        return 1
+    fi
+
+    echo "TiKV cluster started ($node_count node(s))."
+}
+
+tikv_stop() {
+    # Force-remove the sptag-tikv-<i> / sptag-pd-<i> containers of the first <node_count> instances.
+    local node_count="${1:-${#TIKV_HOSTS[@]}}" i  # `i` is local so a caller's loop counter survives
+    echo ""
+    echo "=== Stopping $node_count TiKV instances ==="
+
+    for (( i=0; i<node_count; i++ )); do
+        local host="${TIKV_HOSTS[$i]}"
+        echo "  Stopping TiKV $i and PD $i on $host..."
+        remote_exec "$host" "docker rm -f sptag-tikv-$i sptag-pd-$i 2>/dev/null || true"
+    done
+
+    echo "TiKV instances stopped."
+}
+
+tikv_switch_to_nocache() {
+    # Restart TiKV containers (NOT PD) with the nocache config, so that the search
+    # and insert phases use cold block cache. Data on disk is preserved because we
+    # reuse the same data-dir; PD keeps the cluster metadata.
+    local node_count="${1:-${#TIKV_HOSTS[@]}}" i attempt  # loop counters kept local
+    if [[ ! -f "$SCRIPT_DIR/configs/tikv_nocache.toml" ]]; then
+        echo "  ERROR: configs/tikv_nocache.toml not found; cannot switch to nocache"
+        return 1
+    fi
+    echo ""
+    echo "=== Restarting $node_count TiKV instances with tikv_nocache.toml ==="
+
+    # Reconstruct the shared pd-endpoints list (same as tikv_start).
+    local pd_endpoints=""
+    for (( i=0; i<node_count; i++ )); do
+        local h="${TIKV_HOSTS[$i]}"
+        local pp="${TIKV_PD_CLIENT_PORTS[$i]}"
+        [ -n "$pd_endpoints" ] && pd_endpoints+=","
+        pd_endpoints+="http://${h}:${pp}"
+    done
+
+    for (( i=0; i<node_count; i++ )); do
+        local host="${TIKV_HOSTS[$i]}"
+        local tikv_port="${TIKV_PORTS[$i]}"
+        local TIKV_TOML="$SCRIPT_DIR/configs/tikv_nocache.toml"
+        echo "  TiKV $i on $host:$tikv_port → swapping config"
+
+        remote_exec "$host" "docker run --rm -v $DATA_DIR/tikv-data:/data ${HELPER_IMAGE} mkdir -p /data/conf"
+        if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ] || [ "$host" = "127.0.0.1" ]; then
+            docker run --rm -v "$DATA_DIR/tikv-data/conf:/conf" -v "$(realpath "$TIKV_TOML"):/src/tikv.toml:ro" ${HELPER_IMAGE} cp /src/tikv.toml /conf/tikv.toml
+        else
+            scp $(_ssh_opts) "$TIKV_TOML" "${SSH_USER}@${host}:${SPTAG_DIR}/tikv.toml"
+            remote_exec "$host" "docker run --rm -v $DATA_DIR/tikv-data/conf:/conf -v ${SPTAG_DIR}/tikv.toml:/src/tikv.toml:ro ${HELPER_IMAGE} cp /src/tikv.toml /conf/tikv.toml"
+        fi
+
+        remote_exec "$host" "docker stop -t 120 sptag-tikv-$i 2>/dev/null; \
+            docker rm -f sptag-tikv-$i 2>/dev/null; \
+            docker run -d --name sptag-tikv-$i --net host \
+            --ulimit nofile=1048576:1048576 \
+            -v $DATA_DIR/tikv-data/tikv-$i:/data \
+            -v $DATA_DIR/tikv-data/conf:/conf \
+            -v ${BIN_DIR}:/sptag-bin:ro \
+            --entrypoint /sptag-bin/tikv-server \
+            ${TIKV_IMAGE} \
+            --config=/conf/tikv.toml \
+            --addr=0.0.0.0:${tikv_port} \
+            --advertise-addr=${host}:${tikv_port} \
+            --data-dir=/data \
+            --pd-endpoints=${pd_endpoints}"
+    done
+
+    echo "Waiting for TiKV stores to re-register..."
+    sleep 5
+    local pd_host_first="${TIKV_HOSTS[0]}"
+    local pd_port_first="${TIKV_PD_CLIENT_PORTS[0]}"
+    for attempt in $(seq 1 60); do
+        local store_count
+        store_count=$(curl -sf "http://${pd_host_first}:${pd_port_first}/pd/api/v1/stores" 2>/dev/null \
+            | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('count',0))" 2>/dev/null || echo 0)
+        if [ "$store_count" -ge "$node_count" ]; then
+            echo "  All ${store_count} TiKV stores re-registered"
+            break
+        fi
+        if [ "$attempt" -eq 60 ]; then
+            echo "  WARNING: only ${store_count}/${node_count} stores re-registered after 60s"
+        fi
+        sleep 1
+    done
+    echo "TiKV switched to nocache mode."
+}
+
+tikv_clean() {
+    # Delete the tikv-<i> / pd-<i> data directories of the first <node_count> instances (via a helper container).
+    local node_count="${1:-${#TIKV_HOSTS[@]}}" i  # `i` is local so a caller's loop counter survives
+    echo ""
+    echo "=== Cleaning TiKV data ($node_count instances) ==="
+
+    for (( i=0; i<node_count; i++ )); do
+        local host="${TIKV_HOSTS[$i]}"
+        echo "  Cleaning TiKV data on $host..."
+        remote_exec "$host" "docker run --rm -v $DATA_DIR/tikv-data:/data ${HELPER_IMAGE} \
+            rm -rf /data/tikv-$i /data/pd-$i 2>/dev/null || true"
+    done
+}
+
+# Legacy wrappers for the main case block: forward the optional node_count (default: every [tikv] entry)
+cmd_start_tikv() { tikv_start "${1:-${#TIKV_HOSTS[@]}}"; }
+cmd_stop_tikv()  { tikv_stop  "${1:-${#TIKV_HOSTS[@]}}"; }
+
+# ─── Cache Management ───
+
+drop_all_caches() {
+    # Drop OS page cache + dentries/inodes on the first <node_count> nodes (no-op when SKIP_DROP_CACHES=1).
+    # Each node's drop is capped at 10s by `timeout`; a timeout or failure is reported but non-fatal.
+    local node_count="${1:-1}" i
+    if [[ "${SKIP_DROP_CACHES:-0}" == "1" ]]; then
+        echo "[SKIP_DROP_CACHES=1] skipping OS page-cache drop on $node_count node(s)"
+        return 0
+    fi
+    echo "Dropping OS page cache on $node_count node(s) (timeout 10s per node)..."
+    for (( i=0; i<node_count; i++ )); do
+        local host="${NODE_HOSTS[$i]}"
+        echo -n "  $host: "
+        remote_exec "$host" "timeout 10 sudo -n sh -c 'echo 3 > /proc/sys/vm/drop_caches'" && echo "done" || echo "timeout/failed (non-fatal)"
+    done
+    echo "Cache drop complete."
+}
+
+# ─── INI Generation ───
+
+generate_ini() {
+    # Generate a benchmark INI from a template, filling in [Distributed] fields.
+    # Usage: generate_ini <scale> <node_count> [overrides...]
+    local SCALE="$1"
+    local NODE_COUNT="$2"
+    shift 2
+
+    local IDX_PATH="$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index"
+    local KEY_PREFIX="bench${SCALE}_${NODE_COUNT}node"
+
+    # Build comma-separated address lists from the first node_count entries
+    local dispatcher_addr="${NODE_HOSTS[0]}:30001"
+    local worker_addrs="" store_addrs="" pd_addrs="" i override
+    for (( i=0; i<NODE_COUNT; i++ )); do
+        [ -n "$worker_addrs" ] && worker_addrs+=","
+        worker_addrs+="${NODE_HOSTS[$i]}:${NODE_ROUTER_PORTS[$i]}"
+        [ -n "$store_addrs" ] && store_addrs+=","
+        store_addrs+="${TIKV_HOSTS[$i]}:${TIKV_PORTS[$i]}"
+        [ -n "$pd_addrs" ] && pd_addrs+=","
+        pd_addrs+="${TIKV_HOSTS[$i]}:${TIKV_PD_CLIENT_PORTS[$i]}"
+    done
+
+    # Load the base INI template
+    local BASE_INI="$SCRIPT_DIR/configs/benchmark_${SCALE}_template.ini"
+    if [ ! -f "$BASE_INI" ]; then
+        echo "ERROR: Template INI not found: $BASE_INI" >&2
+        return 1
+    fi
+
+    local OUT="$SCRIPT_DIR/configs/benchmark_${SCALE}_${NODE_COUNT}node.ini"
+    cp "$BASE_INI" "$OUT"
+
+    # Fill in placeholder fields (one sed pass; each expression is anchored on a different key)
+    sed -i -e "s|^IndexPath=.*|IndexPath=${IDX_PATH}|" \
+        -e "s|^TiKVKeyPrefix=.*|TiKVKeyPrefix=${KEY_PREFIX}|" \
+        -e "s|^DispatcherAddr=.*|DispatcherAddr=${dispatcher_addr}|" \
+        -e "s|^WorkerAddrs=.*|WorkerAddrs=${worker_addrs}|" \
+        -e "s|^StoreAddrs=.*|StoreAddrs=${store_addrs}|" \
+        -e "s|^PDAddrs=.*|PDAddrs=${pd_addrs}|" "$OUT"
+
+    # Apply extra overrides (key=value pairs)
+    for override in "$@"; do
+        local key="${override%%=*}"
+        local val="${override#*=}"
+        if grep -q "^${key}=" "$OUT"; then
+            sed -i "s|^${key}=.*|${key}=${val}|" "$OUT"
+        else
+            # Append to [Benchmark] section
+            sed -i "/^\[Benchmark\]/a ${key}=${val}" "$OUT"
+        fi
+    done
+
+    echo "$OUT"  # the generated path is the function's stdout "return value"
+}
+
+# ─── Worker Management ───
+
+WORKER_SSH_PIDS=()
+
+start_remote_worker() {
+    # Start a worker on a remote node. Returns immediately; worker runs in background.
+    local NODE_IDX="$1"
+    local INI="$2"
+    local SCALE="$3"
+    local NODE_COUNT="$4"
+    local host="${NODE_HOSTS[$NODE_IDX]}"
+    local LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${NODE_IDX}.log"
+
+    # Copy the INI to the remote node (only the INI is synced here; the binary is pushed by `deploy`)
+    remote_sync "$host" "$INI" "$SPTAG_DIR/worker_n${NODE_IDX}.ini"
+
+    # Start worker via SSH (foreground on remote, background locally).
+    # Use `ssh -n` to redirect stdin from /dev/null so SSH doesn't try to
+    # acquire a TTY when the parent script runs under `nohup`. Without -n,
+    # the SSH client sometimes silently re-points fd1 → /dev/null and fd2
+    # → a deleted /tmp file, dropping the worker log.
+    ssh -n $(_ssh_opts) "$SSH_USER@$host" \
+        "cd $SPTAG_DIR && LD_LIBRARY_PATH=$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH:-} \
+         WORKER_INDEX=${NODE_IDX} BENCHMARK_CONFIG=worker_n${NODE_IDX}.ini \
+         SPFRESH_SHARD_STRIDE=${SPFRESH_SHARD_STRIDE:-0} \
+         ./Release/SPTAGTest --run_test=SPFreshTest/BenchmarkFromConfig 2>&1" \
+        </dev/null > "$LOG" 2>&1 &
+    local ssh_pid=$!
+    WORKER_SSH_PIDS+=("$ssh_pid")
+    echo "  Worker n${NODE_IDX} on $host (SSH PID: $ssh_pid, log: $LOG)"
+}
+
+wait_workers_ready() {
+    # Poll the local worker logs once per second until every worker
+    # (indices 1 .. NODE_COUNT-1) has logged a readiness line.
+    # Returns 0 when all are ready; 1 on timeout or if an ssh client died.
+    local SCALE="$1" NODE_COUNT="$2" TIMEOUT=120
+    local log_prefix="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker"
+    local ready_re="Worker.*[Rr]eady\|Waiting for dispatch"
+
+    echo "Waiting for ${#WORKER_SSH_PIDS[@]} workers to be ready..."
+    for attempt in $(seq 1 $TIMEOUT); do
+        local pending=0
+        for i in $(seq 1 $((NODE_COUNT - 1))); do
+            grep -q "$ready_re" "${log_prefix}${i}.log" 2>/dev/null || pending=$((pending + 1))
+        done
+        if [ "$pending" -eq 0 ]; then
+            echo "  All workers ready (${attempt}s)"
+            return 0
+        fi
+        # A dead ssh client means its worker can never become ready: fail fast.
+        for pid in "${WORKER_SSH_PIDS[@]}"; do
+            if ! kill -0 "$pid" 2>/dev/null; then
+                echo "  ERROR: Worker SSH PID $pid exited prematurely"
+                return 1
+            fi
+        done
+        sleep 1
+    done
+    echo "  WARNING: Not all workers ready after ${TIMEOUT}s"
+    return 1
+}
+
+stop_remote_workers() {
+    # Wait up to TIMEOUT seconds in total (arg 1, default 30) for the ssh clients in WORKER_SSH_PIDS to exit, SIGKILL stragglers, then clear the list.
+    local TIMEOUT=${1:-30}
+    if [ ${#WORKER_SSH_PIDS[@]} -eq 0 ]; then return; fi
+    local pid elapsed=0   # elapsed is shared by all PIDs: TIMEOUT is one overall deadline, not a per-worker budget
+    echo "Waiting for ${#WORKER_SSH_PIDS[@]} remote workers to exit (${TIMEOUT}s timeout)..."
+    for pid in "${WORKER_SSH_PIDS[@]}"; do
+        # Workers shut down concurrently, so time already spent on earlier PIDs counts for this one too.
+        while kill -0 "$pid" 2>/dev/null && [ "$elapsed" -lt "$TIMEOUT" ]; do
+            sleep 1
+            elapsed=$((elapsed + 1))
+        done
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "  WARNING: SSH PID $pid still alive, force killing"
+            kill -9 "$pid" 2>/dev/null || true
+            wait "$pid" 2>/dev/null || true
+        else
+            echo "  Worker (SSH PID $pid) exited gracefully"
+        fi
+    done
+    WORKER_SSH_PIDS=()
+}
+
+# Watchdog: detect driver death (segfault, OOM, SIGKILL by oom_killer, ...)
+# and tear down remote workers so they don't linger forever.
+# The C++ heartbeat watchdog inside the worker is the primary defense (bounded
+# at HeartbeatTimeoutSec, default 180s). This shell watchdog is a faster
+# secondary path: as soon as the driver PID is gone we (a) kill the local SSH
+# wrappers and (b) `pkill` the remote SPTAGTest processes.
+DRIVER_WATCHDOG_PID=""   # PID of the background watchdog subshell; empty when none has been started
+
+start_driver_watchdog() {
+    local DRIVER_PID="$1"   # PID of the local driver process to monitor
+    local NODE_COUNT="$2"   # total node count; workers are NODE_HOSTS[1..NODE_COUNT-1]
+    if [ "$NODE_COUNT" -lt 2 ]; then return; fi   # single-node run: no remote workers to tear down
+    if [ ${#WORKER_SSH_PIDS[@]} -eq 0 ]; then return; fi   # no workers launched yet: no watchdog is spawned
+
+    # Snapshot PIDs, hosts and SSH settings as space-joined strings before backgrounding; the loops below split them on whitespace.
+    local _ssh_pids="${WORKER_SSH_PIDS[*]}"
+    local _hosts=()
+    for (( i=1; i<NODE_COUNT; i++ )); do _hosts+=("${NODE_HOSTS[$i]}"); done
+    local _hosts_str="${_hosts[*]}"
+    local _ssh_user="$SSH_USER"
+    local _ssh_opts_str="$(_ssh_opts)"
+
+    (   # background subshell: wait for the driver PID to vanish, then tear down the workers
+        while kill -0 "$DRIVER_PID" 2>/dev/null; do
+            sleep 5   # poll interval
+        done
+        echo "[watchdog] Driver PID $DRIVER_PID is gone; tearing down remote workers" >&2
+        for pid in $_ssh_pids; do   # step 1: SIGTERM the local ssh clients
+            kill -TERM "$pid" 2>/dev/null || true
+        done
+        for host in $_hosts_str; do   # step 2: on each worker host, TERM then (5s later) KILL matching SPTAGTest processes
+            ssh -n $_ssh_opts_str "$_ssh_user@$host" \
+                "pkill -TERM -f 'SPTAGTest.*BenchmarkFromConfig' 2>/dev/null; \
+                 sleep 5; \
+                 pkill -KILL -f 'SPTAGTest.*BenchmarkFromConfig' 2>/dev/null; true" \
+                </dev/null >/dev/null 2>&1 || true
+        done
+        for pid in $_ssh_pids; do   # step 3: SIGKILL any ssh client that survived the SIGTERM
+            kill -0 "$pid" 2>/dev/null && kill -KILL "$pid" 2>/dev/null || true
+        done
+    ) &
+    DRIVER_WATCHDOG_PID=$!   # remembered so stop_driver_watchdog can cancel the subshell
+    echo "  Driver watchdog started (PID: $DRIVER_WATCHDOG_PID, monitoring driver $DRIVER_PID)"
+}
+
+stop_driver_watchdog() {
+    # Terminate and reap the background watchdog if it is still alive; always forget its PID.
+    local wd_pid="$DRIVER_WATCHDOG_PID"
+    DRIVER_WATCHDOG_PID=""
+    [ -n "$wd_pid" ] && kill -0 "$wd_pid" 2>/dev/null || return 0
+    kill -TERM "$wd_pid" 2>/dev/null || true
+    wait "$wd_pid" 2>/dev/null || true
+}
+
+# ─── Benchmark Run ───
+
+distribute_head_index() {
+    # Mirror the driver's spann_index directory to the same path on every
+    # worker host (NODE_HOSTS[1] .. NODE_HOSTS[NODE_COUNT-1]).
+    local SCALE="$1" NODE_COUNT="$2"
+    local idx_dir="$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index"
+    local host
+
+    echo "Distributing head index to $((NODE_COUNT - 1)) workers..."
+    for (( i=1; i<NODE_COUNT; i++ )); do
+        host="${NODE_HOSTS[$i]}"
+        echo "  → n${i} ($host)"
+        remote_exec "$host" "mkdir -p $idx_dir"
+        remote_sync "$host" "$idx_dir/" "$idx_dir/"
+    done
+}
+
+distribute_perftest_files() {
+    # Push entries named perftest_* from $SPTAG_DIR on the driver to the same
+    # directory on each worker; the filter pair excludes everything else.
+    local NODE_COUNT="$1" host
+    local only_perftest=(--include='perftest_*' --exclude='*')
+    echo "Distributing perftest_* data files to workers..."
+    for (( i=1; i<NODE_COUNT; i++ )); do
+        host="${NODE_HOSTS[$i]}"
+        echo "  → $host"
+        rsync -az --progress "${only_perftest[@]}" -e "ssh $(_ssh_opts)" \
+            "$SPTAG_DIR/" "$SSH_USER@$host:$SPTAG_DIR/"
+    done
+}
+
+# Determine build mode: full rebuild or SSD-only (reuse HeadIndex).
+# Sets BUILD_MODE_OVERRIDES array for generate_ini.
+# Usage: resolve_build_mode <scale> <node_count>
+resolve_build_mode() {
+    # Default to a full rebuild; switch to RebuildSSDOnly only when the caller
+    # opted in via SKIP_HEAD_BUILD=1 and a non-empty HeadIndex directory exists.
+    local SCALE="$1" NODE_COUNT="$2"
+    local HEAD_DIR="$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index/HeadIndex"
+
+    BUILD_MODE_OVERRIDES=("Rebuild=true")
+    if [[ "${SKIP_HEAD_BUILD:-0}" != "1" ]]; then return 0; fi
+
+    if [ -d "$HEAD_DIR" ] && [ -n "$(ls -A "$HEAD_DIR" 2>/dev/null)" ]; then
+        echo "HeadIndex found at $HEAD_DIR — using RebuildSSDOnly (skip SelectHead+BuildHead)"
+        BUILD_MODE_OVERRIDES=("RebuildSSDOnly=true")
+    else
+        echo "SKIP_HEAD_BUILD=1 but HeadIndex not found at $HEAD_DIR — falling back to full build"
+    fi
+}
+
+cmd_run() {
+    # Run one benchmark: prepare TiKV, build the index, run the workload, stop TiKV.
+    # Args: <scale> <node_count>; node_count 1 runs locally, otherwise driver + remote workers.
+    local SCALE="$1"
+    local NODE_COUNT="$2"
+    if [ -z "$SCALE" ] || [ -z "$NODE_COUNT" ]; then
+        echo "Usage: $0 run <cluster.conf> <scale> <node_count>"
+        exit 1
+    fi
+
+    local BINARY="$SPTAG_DIR/Release/SPTAGTest"
+
+    echo ""
+    echo "═══════════════════════════════════════════════════"
+    echo "  ${SCALE}: ${NODE_COUNT}-node benchmark${NOCACHE:+ [NOCACHE]}"
+    echo "  Start: $(date)"
+    echo "═══════════════════════════════════════════════════"
+
+    if [ "$NODE_COUNT" -eq 1 ]; then
+        # ─── Single-node flow ───
+        echo ""
+        echo "--- Phase 0: Prepare TiKV (1 instance) ---"
+        tikv_stop 1
+        tikv_clean 1
+        if ! tikv_start 1; then
+            echo "ERROR: tikv_start failed; aborting benchmark." >&2
+            return 1
+        fi
+
+        # Resolve build mode before cleaning (SKIP_HEAD_BUILD needs existing dir)
+        resolve_build_mode "$SCALE" "$NODE_COUNT"
+
+        if [[ " ${BUILD_MODE_OVERRIDES[*]} " != *"RebuildSSDOnly=true"* ]]; then
+            # Full build: clean old index dir
+            rm -rf "$DATA_DIR/proidx_${SCALE}_1node"
+        fi
+        mkdir -p "$DATA_DIR/proidx_${SCALE}_1node"
+
+        if [[ "${NOCACHE:-0}" == "1" ]]; then
+            # NOCACHE: Split into build + cache-drop + search
+            local BUILD_VERSIONCACHE_OVERRIDES=("VersionCacheTTLMs=0" "VersionCacheMaxChunks=0")
+            if [[ "${BUILD_WITH_CACHE:-0}" == "1" ]]; then
+                # Build phase keeps caches enabled; the run phase below switches to nocache
+                BUILD_VERSIONCACHE_OVERRIDES=()
+                echo ""
+                echo "--- Phase 1: Build only (BUILD_WITH_CACHE=1, caches enabled) ---"
+            else
+                echo ""
+                echo "--- Phase 1: Build only (NOCACHE) ---"
+            fi
+
+            if [[ "${SKIP_SAVE_LOAD:-0}" == "1" ]]; then
+                # Single-process flow: build + search + insert in one SPTAGTest invocation.
+                # SkipSaveLoadCycles=true bypasses the broken post-build SaveIndex and per-batch
+                # Load/Clone/Save. SPTAGTest itself drops OS pagecache after build, before query.
+                echo "[SKIP_SAVE_LOAD=1] running build + search + insert in a single SPTAGTest process"
+                local SINGLE_INI
+                SINGLE_INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}" \
+                    "SkipSaveLoadCycles=true" "${BUILD_VERSIONCACHE_OVERRIDES[@]}") || exit 1
+
+                ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$SINGLE_INI" \
+                  BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \
+                  "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \
+                    | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log"
+
+                echo "Done: $(date)"
+                tikv_stop 1
+                return 0
+            fi
+
+            local BUILD_INI
+            BUILD_INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}" "BuildOnly=true" "${BUILD_VERSIONCACHE_OVERRIDES[@]}") || exit 1
+
+            ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$BUILD_INI" \
+              BENCHMARK_OUTPUT="output_${SCALE}_1node_build.json" \
+              "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \
+                | tee "$LOGDIR/benchmark_${SCALE}_1node_build.log"
+
+            echo "Build done: $(date)"
+
+            if [[ "${BUILD_WITH_CACHE:-0}" == "1" && "${SKIP_TIKV_SWAP:-0}" != "1" ]]; then
+                echo ""
+                echo "--- Phase 1.4: Switch TiKV to nocache config ---"
+                tikv_switch_to_nocache 1
+            elif [[ "${SKIP_TIKV_SWAP:-0}" == "1" ]]; then
+                echo "[SKIP_TIKV_SWAP=1] keeping TiKV containers running; relying on drop_caches + VersionCache=0"
+            fi
+
+            echo ""
+            echo "--- Phase 1.5: Drop all caches (NOCACHE) ---"
+            drop_all_caches 1
+
+            echo ""
+            echo "--- Phase 2: Search+Insert (cold cache) ---"
+            local RUN_INI
+            RUN_INI=$(generate_ini "$SCALE" 1 "Rebuild=false" "VersionCacheTTLMs=0" "VersionCacheMaxChunks=0") || exit 1
+
+            ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \
+              BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \
+              "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \
+                | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log"
+        else
+            echo ""
+            echo "--- Phase 1: Single-node run ---"
+            local INI
+            INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}") || exit 1
+
+            echo "Starting driver on ${NODE_HOSTS[0]}..."
+            ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$INI" \
+              BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \
+              "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \
+                | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log"
+        fi
+
+        echo "Done: $(date)"
+        tikv_stop 1
+    else
+        # ─── Multi-node flow ───
+        echo ""
+        echo "--- Phase 0: Prepare TiKV ($NODE_COUNT instances) ---"
+        tikv_stop "$NODE_COUNT"
+        tikv_clean "$NODE_COUNT"
+        if ! tikv_start "$NODE_COUNT"; then
+            echo "ERROR: tikv_start failed; aborting benchmark." >&2
+            return 1
+        fi
+
+        echo ""
+        echo "--- Phase 1: Build index on driver ---"
+        local BUILD_INI
+        local NOCACHE_OVERRIDES=()
+        local BUILD_NOCACHE_OVERRIDES=()
+        if [[ "${NOCACHE:-0}" == "1" ]]; then
+            NOCACHE_OVERRIDES=("VersionCacheTTLMs=0" "VersionCacheMaxChunks=0" "WorkerTimeout=14400")
+            if [[ "${BUILD_WITH_CACHE:-0}" == "1" ]]; then
+                # Build with cache, only run phase is nocache
+                BUILD_NOCACHE_OVERRIDES=()
+                echo "[BUILD_WITH_CACHE=1] build phase keeps caches; will switch before run phase"
+            else
+                BUILD_NOCACHE_OVERRIDES=("${NOCACHE_OVERRIDES[@]}")
+            fi
+        fi
+
+        # Resolve build mode before cleaning (SKIP_HEAD_BUILD needs existing dir)
+        resolve_build_mode "$SCALE" "$NODE_COUNT"
+
+        if [[ " ${BUILD_MODE_OVERRIDES[*]} " != *"RebuildSSDOnly=true"* ]]; then
+            # Full build: clean old index dirs on all nodes
+            for (( i=0; i<NODE_COUNT; i++ )); do
+                local host="${NODE_HOSTS[$i]}"
+                remote_exec "$host" "rm -rf $DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node"
+            done
+        fi
+        mkdir -p "$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node"
+
+        local SKIP_SAVE_LOAD_OVERRIDES=()
+        if [[ "${SKIP_SAVE_LOAD:-0}" == "1" ]]; then
+            # In multi-node, the build phase still needs to persist files to disk so
+            # workers can LoadIndex them. SkipSaveLoadCycles=true skips ONLY the redundant
+            # post-build final SaveIndex (which truncates SPTAGHeadVectorIDs.bin and then
+            # blocks forever in the SaveIndexData drain at 100M scale). Files written by
+            # BuildLargeIndex during BuildHead remain valid on disk for the run phase.
+            SKIP_SAVE_LOAD_OVERRIDES=("SkipSaveLoadCycles=true")
+            echo "[SKIP_SAVE_LOAD=1] build phase will skip post-build SaveIndex"
+        fi
+
+        BUILD_INI=$(generate_ini "$SCALE" "$NODE_COUNT" "${BUILD_MODE_OVERRIDES[@]}" "BuildOnly=true" "${BUILD_NOCACHE_OVERRIDES[@]}" "${SKIP_SAVE_LOAD_OVERRIDES[@]}") || exit 1
+
+        # Build runs on the driver only — shared TiKV cluster routes each
+        # key to the owning store via PD, so the driver writes all postings
+        # straight to TiKV without any per-node dispatch. Workers are not
+        # launched during the build phase; they come up in Phase 3 (run).
+        local BUILD_LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_build.log"
+        echo "Starting driver build on ${NODE_HOSTS[0]}..."
+        ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$BUILD_INI" \
+          BENCHMARK_OUTPUT="output_${SCALE}_${NODE_COUNT}node_build.json" \
+          "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig ) \
+            > "$BUILD_LOG" 2>&1 &
+        local BUILD_PID=$!
+        echo "  Driver build PID: $BUILD_PID"
+
+        # Shell-side watchdog hook. NOTE: WORKER_SSH_PIDS is emptied just below, and
+        # start_driver_watchdog returns early on an empty list, so nothing is spawned here.
+        WORKER_SSH_PIDS=()
+        start_driver_watchdog "$BUILD_PID" "$NODE_COUNT"
+
+        # Wait for the driver build to finish
+        echo "  Waiting for driver build to complete..."
+        wait "$BUILD_PID"
+        local BUILD_RC=$?
+        echo "Driver build done (exit=$BUILD_RC): $(date)"
+        stop_driver_watchdog
+
+        if [[ $BUILD_RC -ne 0 ]] || grep -q "===== SEGFAULT" "$BUILD_LOG"; then
+            echo ""
+            echo "ERROR: Build phase failed (exit=$BUILD_RC, segfault=$(grep -c '===== SEGFAULT' "$BUILD_LOG"))"
+            echo "Refusing to proceed to run phase with broken build state."
+            echo "Tail of build log:"
+            tail -30 "$BUILD_LOG"
+            tikv_stop "$NODE_COUNT"
+            exit 1
+        fi
+
+        echo "Build done: $(date)"
+
+        # --- Phase 2: Distribute data ---
+        echo ""
+        echo "--- Phase 2: Distribute head index + data ---"
+        rm -f "$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index/checkpoint.txt"
+
+        distribute_head_index "$SCALE" "$NODE_COUNT"
+        distribute_perftest_files "$NODE_COUNT"
+
+        # Sync SPTAGTest binary + bundled runtime libs to all workers so
+        # they pick up the latest compiled changes. (cmd_deploy is a separate
+        # subcommand; without this step a stale binary on the worker silently
+        # diverges from the driver.)
+        echo ""
+        echo "Syncing SPTAGTest binary + runtime_libs to workers..."
+        for host in "${NODE_HOSTS[@]}"; do
+            if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi
+            remote_exec "$host" "mkdir -p $SPTAG_DIR/Release"
+            remote_sync "$host" "$SPTAG_DIR/Release/SPTAGTest" "$SPTAG_DIR/Release/SPTAGTest"
+            if [ -d "$SPTAG_DIR/Release/runtime_libs" ]; then
+                remote_exec "$host" "mkdir -p $SPTAG_DIR/Release/runtime_libs"
+                rsync -az -e "ssh $(_ssh_opts)" \
+                    "$SPTAG_DIR/Release/runtime_libs/" \
+                    "$SSH_USER@$host:$SPTAG_DIR/Release/runtime_libs/"
+            fi
+        done
+
+        # --- Pre-run cache handling (NOCACHE only); Phase 3 proper starts below ---
+        echo ""
+
+        # Drop caches if NOCACHE mode
+        if [[ "${NOCACHE:-0}" == "1" ]]; then
+            if [[ "${BUILD_WITH_CACHE:-0}" == "1" && "${SKIP_TIKV_SWAP:-0}" != "1" ]]; then
+                echo "--- Phase 2.4: Switch TiKV to nocache config ---"
+                tikv_switch_to_nocache "$NODE_COUNT"
+            elif [[ "${SKIP_TIKV_SWAP:-0}" == "1" ]]; then
+                echo "[SKIP_TIKV_SWAP=1] keeping TiKV containers running; relying on drop_caches + VersionCache=0"
+            fi
+            echo "--- Phase 2.5: Drop all caches (NOCACHE) ---"
+            drop_all_caches "$NODE_COUNT"
+        fi
+
+        echo "--- Phase 3: Distributed run ---"
+
+        local RUN_INI
+        RUN_INI=$(generate_ini "$SCALE" "$NODE_COUNT" "Rebuild=false" "${NOCACHE_OVERRIDES[@]}") || exit 1
+
+        # Start driver in background first — it contains the dispatcher that
+        # workers need to connect to for ring registration.
+        local DRIVER_LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_driver.log"
+        echo "Starting driver (dispatcher+worker0) on ${NODE_HOSTS[0]}..."
+        ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \
+          BENCHMARK_OUTPUT="output_${SCALE}_${NODE_COUNT}node.json" \
+          "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig ) \
+            > "$DRIVER_LOG" 2>&1 &
+        local DRIVER_PID=$!
+        echo "  Driver PID: $DRIVER_PID"
+
+        # Wait for dispatcher to start listening before launching workers (the port is hard-coded here)
+        local DISP_PORT=30001
+        echo "  Waiting for dispatcher to listen on port $DISP_PORT..."
+        for attempt in $(seq 1 60); do
+            if ss -tlnp 2>/dev/null | grep -q ":${DISP_PORT} " || \
+               netstat -tlnp 2>/dev/null | grep -q ":${DISP_PORT} "; then
+                echo "  Dispatcher listening (${attempt}s)"
+                break
+            fi
+            if ! kill -0 "$DRIVER_PID" 2>/dev/null; then
+                echo "  ERROR: Driver exited prematurely"
+                cat "$DRIVER_LOG"
+                tikv_stop "$NODE_COUNT"   # same cleanup as the build-failure path; no workers are running yet
+                return 1
+            fi
+            if [ "$attempt" -eq 60 ]; then
+                echo "  WARNING: Dispatcher not detected on port $DISP_PORT after 60s, proceeding anyway"
+            fi
+            sleep 1
+        done
+
+        # Now start remote workers — they can connect to the dispatcher
+        WORKER_SSH_PIDS=()
+        for (( i=1; i<NODE_COUNT; i++ )); do
+            start_remote_worker "$i" "$RUN_INI" "$SCALE" "$NODE_COUNT"
+        done
+
+        # Shell-side watchdog: tears the workers down if the driver PID disappears.
+        start_driver_watchdog "$DRIVER_PID" "$NODE_COUNT"
+
+        # Wait for driver to complete (it runs the full benchmark); its exit status is only logged, not returned
+        echo "  Waiting for driver to complete..."
+        wait "$DRIVER_PID"
+        local DRIVER_EXIT=$?
+        echo "Driver done (exit=$DRIVER_EXIT): $(date)"
+        stop_driver_watchdog
+        # Show driver output
+        tail -20 "$DRIVER_LOG"
+
+        # Driver sends TCP Stop to workers; wait for graceful exit
+        stop_remote_workers 60
+
+        # Collect remote logs (best effort: a missing remote file is silently ignored)
+        echo "Collecting remote logs..."
+        for (( i=1; i<NODE_COUNT; i++ )); do
+            local host="${NODE_HOSTS[$i]}"
+            local REMOTE_LOG="$SPTAG_DIR/worker_n${i}.log"
+            scp $(_ssh_opts) "$SSH_USER@$host:$REMOTE_LOG" \
+                "$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${i}_remote.log" 2>/dev/null || true
+        done
+
+        tikv_stop "$NODE_COUNT"
+    fi
+
+    echo ""
+    echo "═══════════════════════════════════════════════════"
+    echo "  ${SCALE} ${NODE_COUNT}-node done: $(date)"
+    echo "  Results: output_${SCALE}_${NODE_COUNT}node.json"
+    echo "  Logs:    $LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_*.log"
+    echo "═══════════════════════════════════════════════════"
+}
+
+cmd_bench() {
+    # Run the 1-node baseline and, if the cluster has more than one node, the TOTAL_NODES run for each scale.
+    # Usage: cmd_bench <scale> [scale...]   (cmd_run's return status is not checked; the suite always continues)
+    # Special scale "all" expands to every configs/benchmark_<scale>_template.ini that exists.
+    local scales=()
+    for arg in "$@"; do
+        if [ "$arg" = "all" ]; then
+            for tmpl in "$SCRIPT_DIR"/configs/benchmark_*_template.ini; do
+                [ -e "$tmpl" ] || continue   # an unmatched glob stays literal; do not queue a bogus "*" scale
+                local name="${tmpl##*/}"
+                name="${name#benchmark_}"
+                name="${name%_template.ini}"
+                scales+=("$name")
+            done
+        else
+            scales+=("$arg")
+        fi
+    done
+
+    if [ ${#scales[@]} -eq 0 ]; then
+        echo "Usage: $0 bench <cluster.conf> <scale> [scale...] | all"
+        echo "Available scales:"
+        for tmpl in "$SCRIPT_DIR"/configs/benchmark_*_template.ini; do
+            [ -e "$tmpl" ] || continue   # no templates present: list nothing rather than "*"
+            local name="${tmpl##*/}"
+            name="${name#benchmark_}"
+            name="${name%_template.ini}"
+            echo "  $name"
+        done
+        exit 1
+    fi
+
+    echo ""
+    echo "═══════════════════════════════════════════════════"
+    echo "  Benchmark suite: ${scales[*]}"
+    echo "  Cluster: $TOTAL_NODES nodes"
+    echo "  Start: $(date)"
+    echo "═══════════════════════════════════════════════════"
+
+    for scale in "${scales[@]}"; do
+        echo ""
+        echo "▶▶▶ Scale: $scale — 1-node baseline"
+        cmd_run "$scale" 1
+
+        if [ "$TOTAL_NODES" -gt 1 ]; then
+            echo ""
+            echo "▶▶▶ Scale: $scale — ${TOTAL_NODES}-node distributed"
+            cmd_run "$scale" "$TOTAL_NODES"
+        else
+            echo "  (Skipping multi-node: cluster has only 1 node)"
+        fi
+    done
+
+    echo ""
+    echo "═══════════════════════════════════════════════════"
+    echo "  Benchmark suite complete: $(date)"
+    echo "═══════════════════════════════════════════════════"
+}
+
+# ─── Cleanup ───
+
+cmd_cleanup() {
+    # Delete the deployed binary, perftest_* data, worker INIs and index dirs on every host except NODE_HOSTS[0].
+    echo ""
+    echo "=== Cleaning up remote nodes ==="
+    for (( i=1; i<${#NODE_HOSTS[@]}; i++ )); do
+        local host="${NODE_HOSTS[$i]}"
+        echo "  Cleaning $host..."
+        remote_exec "$host" "rm -rf $SPTAG_DIR/Release/SPTAGTest $SPTAG_DIR/perftest_* $SPTAG_DIR/worker_*.ini"
+        # Index directories sit under DATA_DIR, so they need their own removal.
+        remote_exec "$host" "rm -rf $DATA_DIR/proidx_*"
+    done
+    echo "Cleanup complete."
+}
+
+# ─── Main ───
+
+CMD="$1"    # sub-command name, dispatched by the case statement below
+CONF="$2"   # cluster config path, handed to parse_config
+
+if [ -z "$CMD" ] || [ -z "$CONF" ]; then
+    echo "Usage: $0 <command> <cluster.conf> [args...]"
+    echo ""
+    echo "Commands:"
+    echo "  deploy      Deploy binary and data to all nodes"
+    echo "  start-tikv  Start independent TiKV/PD instances"
+    echo "  stop-tikv   Stop TiKV/PD instances"
+    echo "  run         Run benchmark: $0 run cluster.conf <scale> <node_count>"
+    echo "  bench       Run full benchmark suite: $0 bench cluster.conf <scale> [scale...] | all"
+    echo "  cleanup     Remove deployed files from remote nodes"
+    exit 1
+fi
+
+parse_config "$CONF"
+
+# On SIGINT/SIGTERM (any sub-command): stop the watchdog, give workers 5s to exit, stop TiKV, exit 1.
+trap 'echo ""; echo "Interrupted!"; stop_driver_watchdog; stop_remote_workers 5; cmd_stop_tikv; exit 1' INT TERM
+
+case "$CMD" in
+    deploy)
+        cmd_deploy
+        ;;
+    setup-bins)   # NOTE(review): accepted here but not listed in the usage text above
+        cmd_setup_bins
+        ;;
+    start-tikv)
+        cmd_start_tikv "${3:-}"   # optional third argument is passed through
+        ;;
+    stop-tikv)
+        cmd_stop_tikv "${3:-}"   # optional third argument is passed through
+        ;;
+    run)
+        cmd_run "$3" "$4"   # <scale> <node_count>; cmd_run prints usage and exits if either is empty
+        ;;
+    bench)
+        shift 2  # skip cmd and conf
+        cmd_bench "$@"
+        ;;
+    cleanup)
+        cmd_cleanup
+        ;;
+    *)
+        echo "Unknown command: $CMD"
+        echo "Valid commands: deploy, setup-bins, start-tikv, stop-tikv, run, bench, cleanup"
+        exit 1
+        ;;
+esac

From 418674711afefef9a7548136618940061343f0de Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 07:21:04 +0000
Subject: [PATCH 02/12] Fix unneeded diff

---
 .gitignore                                    |  3 +-
 Test/src/main.cpp                             |  5 +-
 benchmark.ini                                 | 19 -----
 .../configs/benchmark_100m_1node.ini          | 71 -------------------
 .../configs/benchmark_100m_2node.ini          | 71 -------------------
 .../configs/benchmark_10m_1node.ini           | 62 ----------------
 .../configs/benchmark_10m_2node.ini           | 62 ----------------
 .../benchmark_insert_dominant_1node.ini       | 58 ---------------
 .../benchmark_insert_dominant_2node.ini       | 58 ---------------
 .../benchmark_insert_dominant_3node.ini       | 59 ---------------
 10 files changed, 5 insertions(+), 463 deletions(-)
 delete mode 100644 benchmark.ini
 delete mode 100644 evaluation/distributed/configs/benchmark_100m_1node.ini
 delete mode 100644 evaluation/distributed/configs/benchmark_100m_2node.ini
 delete mode 100644 evaluation/distributed/configs/benchmark_10m_1node.ini
 delete mode 100644 evaluation/distributed/configs/benchmark_10m_2node.ini
 delete mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_1node.ini
 delete mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_2node.ini
 delete mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_3node.ini

diff --git a/.gitignore b/.gitignore
index e3dc9796a..190ca29d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -464,5 +464,4 @@ FodyWeavers.xsd
 *.sln.iml
 
 # SPTAG benchmark generated artifacts
-/perftest_*
-/evaluation/2026-04-23/output_distributed_hostname_*.json
+*perftest_*
diff --git a/Test/src/main.cpp b/Test/src/main.cpp
index ab8d1342c..49ca39950 100644
--- a/Test/src/main.cpp
+++ b/Test/src/main.cpp
@@ -7,7 +7,9 @@
 
 #include <boost/test/tree/visitor.hpp>
 #include <string>
+#ifdef TIKV
 #include <absl/synchronization/mutex.h>
+#endif
 
 using namespace boost::unit_test;
 
@@ -36,8 +38,9 @@ struct GlobalFixture
         // adds GraphCycles bookkeeping under a global spinlock on every Lock();
         // observed to consume ~12% CPU under high worker-thread parallelism in
         // gRPC client paths (perf-recorded 2026-05-06).
+#ifdef TIKV
         absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore);
-
+#endif
         SPTAGVisitor visitor;
         traverse_test_tree(framework::master_test_suite(), visitor, false);
     }
diff --git a/benchmark.ini b/benchmark.ini
deleted file mode 100644
index e2b400767..000000000
--- a/benchmark.ini
+++ /dev/null
@@ -1,19 +0,0 @@
-[Benchmark]
-VectorPath=sift1b/base.100M.u8bin
-QueryPath=sift1b/query.public.10K.u8bin
-TruthPath=none
-IndexPath=proidx/spann_index
-ValueType=UInt8
-Dimension=128
-BaseVectorCount=10000
-InsertVectorCount=10000
-DeleteVectorCount=0
-BatchNum=10
-TopK=5
-NumThreads=8
-NumQueries=100
-DistMethod=L2
-Rebuild=true
-Resume=-1
-QuantizerFilePath=quantizer.bin
-QuantizedDim=64
diff --git a/evaluation/distributed/configs/benchmark_100m_1node.ini b/evaluation/distributed/configs/benchmark_100m_1node.ini
deleted file mode 100644
index 42ec07f49..000000000
--- a/evaluation/distributed/configs/benchmark_100m_1node.ini
+++ /dev/null
@@ -1,71 +0,0 @@
-; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload).
-; 100× larger base index than insert_dominant. Tests how the system behaves when
-; the head index is large (~tens of millions of heads on layer 0) and the insert
-; rate is moderate. Layers=2, L2 distance, SIFT1B dataset.
-;
-; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
-; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
-;
-; Notes for 100M-scale operation:
-;   - First run MUST build the index (Rebuild=true). Build of 99M base takes hours;
-;     reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the
-;     HeadIndex on disk is intact.
-;   - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes.
-;   - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts;
-;     use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle.
-;   - TiKV data will grow to ~50-100GB per store at this scale; both nodes need
-;     plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G).
-[Benchmark]
-WorkerTimeout=14400
-VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
-QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
-TruthPath=truth
-IndexPath=/mnt/nvme/proidx_100m_1node/spann_index
-ValueType=UInt8
-Dimension=128
-BaseVectorCount=99000000
-InsertVectorCount=1000000
-DeleteVectorCount=0
-BatchNum=1
-TopK=5
-NumSearchThreads=4
-NumInsertThreads=4
-AppendThreadNum=16
-NumSearchDuringInsertThreads=1
-NumQueries=200
-DistMethod=L2
-Rebuild=true
-BuildOnly=false
-Resume=-1
-Layers=2
-
-Storage=TIKVIO
-TiKVPDAddresses=PLACEHOLDER
-TiKVKeyPrefix=bench100m_1node
-
-[SelectHead]
-ParallelBKTBuild=true
-
-[BuildHead]
-ParallelBKTBuild=true
-
-[BuildSSDIndex]
-LatencyLimit=100
-MaxCheck=8192
-SearchInternalResultNum=64
-UseMultiChunkPosting=false
-PostingPageLimit=8
-PostingCountCacheCapacity=10000000
-SearchCheckVersionMapOnlyLayer0=true
-DistributedVersionMap=true
-ReassignK=64
-AsyncMergeInSearch=true
-VersionCacheMaxChunks=1000000
-AsyncRpcMaxInflight=512
-
-[Distributed]
-Enabled=true
-DispatcherAddr=10.11.0.7:30001
-WorkerAddrs=10.11.0.7:30011
-StoreAddrs=10.11.0.7:20171
-PDAddrs=10.11.0.7:23791
diff --git a/evaluation/distributed/configs/benchmark_100m_2node.ini b/evaluation/distributed/configs/benchmark_100m_2node.ini
deleted file mode 100644
index 01b9c3e81..000000000
--- a/evaluation/distributed/configs/benchmark_100m_2node.ini
+++ /dev/null
@@ -1,71 +0,0 @@
-; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload).
-; 100× larger base index than insert_dominant. Tests how the system behaves when
-; the head index is large (~tens of millions of heads on layer 0) and the insert
-; rate is moderate. Layers=2, L2 distance, SIFT1B dataset.
-;
-; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
-; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
-;
-; Notes for 100M-scale operation:
-;   - First run MUST build the index (Rebuild=true). Build of 99M base takes hours;
-;     reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the
-;     HeadIndex on disk is intact.
-;   - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes.
-;   - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts;
-;     use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle.
-;   - TiKV data will grow to ~50-100GB per store at this scale; both nodes need
-;     plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G).
-[Benchmark]
-WorkerTimeout=14400
-VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
-QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
-TruthPath=truth
-IndexPath=/mnt/nvme/proidx_100m_2node/spann_index
-ValueType=UInt8
-Dimension=128
-BaseVectorCount=99000000
-InsertVectorCount=1000000
-DeleteVectorCount=0
-BatchNum=1
-TopK=5
-NumSearchThreads=4
-NumInsertThreads=4
-AppendThreadNum=16
-NumSearchDuringInsertThreads=1
-NumQueries=200
-DistMethod=L2
-Rebuild=false
-BuildOnly=false
-Resume=-1
-Layers=2
-
-Storage=TIKVIO
-TiKVPDAddresses=PLACEHOLDER
-TiKVKeyPrefix=bench100m_2node
-
-[SelectHead]
-ParallelBKTBuild=true
-
-[BuildHead]
-ParallelBKTBuild=true
-
-[BuildSSDIndex]
-LatencyLimit=100
-MaxCheck=8192
-SearchInternalResultNum=64
-UseMultiChunkPosting=false
-PostingPageLimit=8
-PostingCountCacheCapacity=10000000
-SearchCheckVersionMapOnlyLayer0=true
-DistributedVersionMap=true
-ReassignK=64
-AsyncMergeInSearch=true
-VersionCacheMaxChunks=1000000
-AsyncRpcMaxInflight=512
-
-[Distributed]
-Enabled=true
-DispatcherAddr=10.11.0.7:30001
-WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002
-StoreAddrs=10.11.0.7:20171,10.11.0.10:20171
-PDAddrs=10.11.0.7:23791,10.11.0.10:23791
diff --git a/evaluation/distributed/configs/benchmark_10m_1node.ini b/evaluation/distributed/configs/benchmark_10m_1node.ini
deleted file mode 100644
index 56dbd9088..000000000
--- a/evaluation/distributed/configs/benchmark_10m_1node.ini
+++ /dev/null
@@ -1,62 +0,0 @@
-; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload).
-; 10× larger base index than insert_dominant, 10× smaller than 100m.
-; Useful for validating scaling between 1M and 100M without paying the
-; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset
-; (truncated to 10M of the 1B available).
-;
-; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
-; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
-[Benchmark]
-WorkerTimeout=14400
-VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
-QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
-TruthPath=truth
-IndexPath=/mnt/nvme/proidx_10m_1node/spann_index
-ValueType=UInt8
-Dimension=128
-BaseVectorCount=9000000
-InsertVectorCount=1000000
-DeleteVectorCount=0
-BatchNum=1
-TopK=5
-NumSearchThreads=4
-NumInsertThreads=4
-AppendThreadNum=16
-NumSearchDuringInsertThreads=1
-NumQueries=200
-DistMethod=L2
-Rebuild=true
-BuildOnly=false
-Resume=-1
-Layers=2
-
-Storage=TIKVIO
-TiKVPDAddresses=PLACEHOLDER
-TiKVKeyPrefix=bench10m_1node
-
-[SelectHead]
-ParallelBKTBuild=true
-
-[BuildHead]
-ParallelBKTBuild=true
-
-[BuildSSDIndex]
-LatencyLimit=100
-MaxCheck=8192
-SearchInternalResultNum=64
-UseMultiChunkPosting=false
-PostingPageLimit=8
-PostingCountCacheCapacity=1000000
-SearchCheckVersionMapOnlyLayer0=true
-DistributedVersionMap=true
-ReassignK=64
-AsyncMergeInSearch=true
-VersionCacheMaxChunks=1000000
-AsyncRpcMaxInflight=512
-
-[Distributed]
-Enabled=true
-DispatcherAddr=10.11.0.7:30001
-WorkerAddrs=10.11.0.7:30011
-StoreAddrs=10.11.0.7:20171
-PDAddrs=10.11.0.7:23791
diff --git a/evaluation/distributed/configs/benchmark_10m_2node.ini b/evaluation/distributed/configs/benchmark_10m_2node.ini
deleted file mode 100644
index 4ed317ac3..000000000
--- a/evaluation/distributed/configs/benchmark_10m_2node.ini
+++ /dev/null
@@ -1,62 +0,0 @@
-; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload).
-; 10× larger base index than insert_dominant, 10× smaller than 100m.
-; Useful for validating scaling between 1M and 100M without paying the
-; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset
-; (truncated to 10M of the 1B available).
-;
-; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
-; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
-[Benchmark]
-WorkerTimeout=14400
-VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
-QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
-TruthPath=truth
-IndexPath=/mnt/nvme/proidx_10m_2node/spann_index
-ValueType=UInt8
-Dimension=128
-BaseVectorCount=9000000
-InsertVectorCount=1000000
-DeleteVectorCount=0
-BatchNum=1
-TopK=5
-NumSearchThreads=4
-NumInsertThreads=4
-AppendThreadNum=16
-NumSearchDuringInsertThreads=1
-NumQueries=200
-DistMethod=L2
-Rebuild=false
-BuildOnly=false
-Resume=-1
-Layers=2
-
-Storage=TIKVIO
-TiKVPDAddresses=PLACEHOLDER
-TiKVKeyPrefix=bench10m_2node
-
-[SelectHead]
-ParallelBKTBuild=true
-
-[BuildHead]
-ParallelBKTBuild=true
-
-[BuildSSDIndex]
-LatencyLimit=100
-MaxCheck=8192
-SearchInternalResultNum=64
-UseMultiChunkPosting=false
-PostingPageLimit=8
-PostingCountCacheCapacity=1000000
-SearchCheckVersionMapOnlyLayer0=true
-DistributedVersionMap=true
-ReassignK=64
-AsyncMergeInSearch=true
-VersionCacheMaxChunks=1000000
-AsyncRpcMaxInflight=512
-
-[Distributed]
-Enabled=true
-DispatcherAddr=10.11.0.7:30001
-WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002
-StoreAddrs=10.11.0.7:20171,10.11.0.10:20171
-PDAddrs=10.11.0.7:23791,10.11.0.10:23791
diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini
deleted file mode 100644
index 30fe77bbe..000000000
--- a/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini
+++ /dev/null
@@ -1,58 +0,0 @@
-; insert_dominant: 1M base + 1M insert with concurrent search-during-insert.
-; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M).
-;
-; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
-; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
-[Benchmark]
-WorkerTimeout=14400
-VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
-QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
-TruthPath=truth
-IndexPath=/mnt/nvme/proidx_insert_dominant_1node/spann_index
-ValueType=UInt8
-Dimension=128
-BaseVectorCount=1000000
-InsertVectorCount=1000000
-DeleteVectorCount=0
-BatchNum=1
-TopK=5
-NumSearchThreads=4
-NumInsertThreads=4
-AppendThreadNum=16
-NumSearchDuringInsertThreads=1
-NumQueries=200
-DistMethod=L2
-Rebuild=true
-BuildOnly=false
-Resume=-1
-Layers=2
-
-Storage=TIKVIO
-TiKVPDAddresses=PLACEHOLDER
-TiKVKeyPrefix=benchinsert_dominant_1node
-
-[SelectHead]
-ParallelBKTBuild=true
-
-[BuildHead]
-ParallelBKTBuild=true
-
-[BuildSSDIndex]
-LatencyLimit=100
-MaxCheck=8192
-SearchInternalResultNum=64
-UseMultiChunkPosting=false
-PostingPageLimit=8
-PostingCountCacheCapacity=1000000
-SearchCheckVersionMapOnlyLayer0=true
-DistributedVersionMap=true
-ReassignK=64
-AsyncMergeInSearch=true
-VersionCacheMaxChunks=100000
-
-[Distributed]
-Enabled=true
-DispatcherAddr=10.11.0.7:30001
-WorkerAddrs=10.11.0.7:30011
-StoreAddrs=10.11.0.7:20171
-PDAddrs=10.11.0.7:23791
diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini
deleted file mode 100644
index d45870b50..000000000
--- a/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini
+++ /dev/null
@@ -1,58 +0,0 @@
-; insert_dominant: 1M base + 1M insert with concurrent search-during-insert.
-; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M).
-;
-; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
-; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
-[Benchmark]
-WorkerTimeout=14400
-VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin
-QueryPath=/mnt/nvme/sift1b/query.10K.u8bin
-TruthPath=truth
-IndexPath=/mnt/nvme/proidx_insert_dominant_2node/spann_index
-ValueType=UInt8
-Dimension=128
-BaseVectorCount=1000000
-InsertVectorCount=1000000
-DeleteVectorCount=0
-BatchNum=1
-TopK=5
-NumSearchThreads=4
-NumInsertThreads=4
-AppendThreadNum=16
-NumSearchDuringInsertThreads=1
-NumQueries=200
-DistMethod=L2
-Rebuild=false
-BuildOnly=false
-Resume=-1
-Layers=2
-
-Storage=TIKVIO
-TiKVPDAddresses=PLACEHOLDER
-TiKVKeyPrefix=benchinsert_dominant_2node
-
-[SelectHead]
-ParallelBKTBuild=true
-
-[BuildHead]
-ParallelBKTBuild=true
-
-[BuildSSDIndex]
-LatencyLimit=100
-MaxCheck=8192
-SearchInternalResultNum=64
-UseMultiChunkPosting=false
-PostingPageLimit=8
-PostingCountCacheCapacity=1000000
-SearchCheckVersionMapOnlyLayer0=true
-DistributedVersionMap=true
-ReassignK=64
-AsyncMergeInSearch=true
-VersionCacheMaxChunks=100000
-
-[Distributed]
-Enabled=true
-DispatcherAddr=10.11.0.7:30001
-WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002
-StoreAddrs=10.11.0.7:20171,10.11.0.10:20171
-PDAddrs=10.11.0.7:23791,10.11.0.10:23791
diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini
deleted file mode 100644
index a8050732d..000000000
--- a/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini
+++ /dev/null
@@ -1,59 +0,0 @@
-; insert_dominant: 1M base + 10M insert (10× scale-up) with concurrent search-during-insert.
-; Tests how the index handles insertion-dominated workloads where insertion volume
-; is much larger than the initial baseline. Layers=2, L2 distance, SIFT1B dataset.
-;
-; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from
-; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs).
-[Benchmark]
-WorkerTimeout=14400
-VectorPath=/mnt/data/sift1b/base.1B.u8bin
-QueryPath=/mnt/data/sift1b/query.public.10K.u8bin
-TruthPath=truth
-IndexPath=/mnt/md0/proidx_insert_dominant_3node/spann_index
-ValueType=UInt8
-Dimension=128
-BaseVectorCount=1000000
-InsertVectorCount=1000000
-DeleteVectorCount=0
-BatchNum=1
-TopK=5
-NumSearchThreads=4
-NumInsertThreads=4
-AppendThreadNum=16
-NumSearchDuringInsertThreads=1
-NumQueries=200
-DistMethod=L2
-Rebuild=false
-BuildOnly=false
-Resume=-1
-Layers=2
-
-Storage=TIKVIO
-TiKVPDAddresses=PLACEHOLDER
-TiKVKeyPrefix=benchinsert_dominant_3node
-
-[SelectHead]
-ParallelBKTBuild=true
-
-[BuildHead]
-ParallelBKTBuild=true
-
-[BuildSSDIndex]
-LatencyLimit=100
-MaxCheck=8192
-SearchInternalResultNum=64
-UseMultiChunkPosting=false
-PostingPageLimit=8
-PostingCountCacheCapacity=1000000
-SearchCheckVersionMapOnlyLayer0=true
-DistributedVersionMap=true
-ReassignK=64
-AsyncMergeInSearch=true
-VersionCacheMaxChunks=100000
-
-[Distributed]
-Enabled=true
-DispatcherAddr=172.27.0.4:30001
-WorkerAddrs=172.27.0.4:30011,172.27.0.5:30002,172.27.0.6:30003
-StoreAddrs=172.27.0.4:20171,172.27.0.5:20171,172.27.0.6:20171
-PDAddrs=172.27.0.4:23791,172.27.0.5:23791,172.27.0.6:23791

From ee97d3ff732f69c91c2b35158219c5f3f1873187 Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 08:21:07 +0000
Subject: [PATCH 03/12] Remove unused stride-shard experiment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Strip the SPFRESH_SHARD_STRIDE opt-in code path (4 helpers + plumbing
through LoadAndInsertBatch/RunBenchmark/RunWorker). No active config
sets the env var; we always use the contiguous slice partition.

Test/CMakeLists.txt: explicitly link ${TiKV_LIBRARIES} into SPTAGTest
so a clean build (no .o cache) resolves gpr_/grpc_ symbols pulled in
by the kvproto generated stubs.

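ThirdParty/kvproto/.gitignore: ignore generated/ so regenerated stubs
are not picked up going forward — they are environment-specific (must
match the protoc/grpc in the build env); regenerate locally via
generate_cpp.sh. Note that an ignore rule does not untrack files that
are already committed; those stay tracked until removed from the index.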

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 Test/CMakeLists.txt                       |   2 +-
 Test/src/SPFreshTest.cpp                  | 148 ++--------------------
 ThirdParty/kvproto/.gitignore             |   4 +
 evaluation/distributed/run_distributed.sh |   1 -
 4 files changed, 19 insertions(+), 136 deletions(-)
 create mode 100644 ThirdParty/kvproto/.gitignore

diff --git a/Test/CMakeLists.txt b/Test/CMakeLists.txt
index 27bdeebb5..9db640da2 100644
--- a/Test/CMakeLists.txt
+++ b/Test/CMakeLists.txt
@@ -24,7 +24,7 @@ if (NOT LIBRARYONLY)
     file(GLOB TEST_HDR_FILES ${PROJECT_SOURCE_DIR}/Test/inc/Test.h)
     file(GLOB TEST_SRC_FILES ${PROJECT_SOURCE_DIR}/Test/src/*.cpp)
     add_executable(SPTAGTest ${TEST_SRC_FILES} ${TEST_HDR_FILES})
-    target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle)
+    target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} ${TiKV_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle)
 
     install(TARGETS SPTAGTest
       RUNTIME DESTINATION bin  
diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp
index 9ab420db9..1a2140773 100644
--- a/Test/src/SPFreshTest.cpp
+++ b/Test/src/SPFreshTest.cpp
@@ -62,94 +62,6 @@ static __attribute__((constructor)) void install_segfault_handler() {
 
 using namespace SPTAG;
 
-// ---------------------------------------------------------------------------
-// Stride sharding (a.k.a. odd/even sharding) experiment
-// ---------------------------------------------------------------------------
-// When the env var SPFRESH_SHARD_STRIDE is set to "1"/"true", each node, instead
-// of inserting a contiguous slice [n*B/N, (n+1)*B/N) of the per-iteration batch,
-// inserts the strided rows {n, n+N, n+2*N, ...} where n=nodeIndex, N=numNodes.
-// This breaks any spatial structure in the input dataset (e.g. SIFT files that
-// are roughly sorted by visual feature), letting us check whether the layer-0
-// split skew (driver 71 vs worker 2 in v18) is caused by contiguous slicing
-// landing similar vectors on the same node and overflowing a small set of heads.
-//
-// The total number of vectors inserted across all nodes per iteration is the
-// same; only the assignment changes. Recall measurement still works because
-// the dataset and ground truth are unchanged — only insert routing differs.
-static bool IsStrideShardEnabled() {
-    const char* e = std::getenv("SPFRESH_SHARD_STRIDE");
-    if (!e) return false;
-    std::string v(e);
-    return v == "1" || v == "true" || v == "TRUE" || v == "yes";
-}
-
-// Compute count of indices i in [0, total) with (i % stride) == offset.
-static SizeType StrideCount(SizeType total, int stride, int offset) {
-    if (stride <= 1) return total;
-    if (offset < 0 || offset >= stride) return 0;
-    if (total <= offset) return 0;
-    return (total - 1 - offset) / stride + 1;
-}
-
-// Build a strided sub-VectorSet by copying every `stride`-th vector starting
-// at `offset` into a contiguous packed ByteArray. Returns a BasicVectorSet.
-static std::shared_ptr<VectorSet> ExtractStridedVectors(
-    const std::shared_ptr<VectorSet>& full, int stride, int offset)
-{
-    if (!full) return nullptr;
-    SizeType totalCount = full->Count();
-    SizeType outCount = StrideCount(totalCount, stride, offset);
-    auto vt = full->GetValueType();
-    auto dim = full->Dimension();
-    size_t perVecSize = full->PerVectorDataSize();
-    if (outCount <= 0) {
-        return std::make_shared<BasicVectorSet>(ByteArray::Alloc(0), vt, dim, 0);
-    }
-    ByteArray buf = ByteArray::Alloc(static_cast<size_t>(outCount) * perVecSize);
-    for (SizeType i = 0; i < outCount; ++i) {
-        SizeType srcIdx = static_cast<SizeType>(offset) + i * static_cast<SizeType>(stride);
-        std::memcpy(buf.Data() + static_cast<size_t>(i) * perVecSize,
-                    full->GetVector(srcIdx),
-                    perVecSize);
-    }
-    return std::make_shared<BasicVectorSet>(buf, vt, dim, outCount);
-}
-
-// Build a strided sub-MetadataSet. Two-pass: first compute offsets, then copy.
-static std::shared_ptr<MetadataSet> ExtractStridedMetadata(
-    const std::shared_ptr<MetadataSet>& full, int stride, int offset)
-{
-    if (!full) return nullptr;
-    SizeType totalCount = full->Count();
-    SizeType outCount = StrideCount(totalCount, stride, offset);
-    if (outCount <= 0) {
-        ByteArray emptyMeta = ByteArray::Alloc(0);
-        ByteArray offBuf = ByteArray::Alloc(sizeof(std::uint64_t));
-        *reinterpret_cast<std::uint64_t*>(offBuf.Data()) = 0ULL;
-        return std::make_shared<MemMetadataSet>(emptyMeta, offBuf, 0);
-    }
-    std::vector<std::uint64_t> offsets(static_cast<size_t>(outCount) + 1, 0ULL);
-    std::uint64_t total = 0;
-    for (SizeType i = 0; i < outCount; ++i) {
-        SizeType srcIdx = static_cast<SizeType>(offset) + i * static_cast<SizeType>(stride);
-        ByteArray meta = full->GetMetadata(srcIdx);
-        offsets[i] = total;
-        total += meta.Length();
-    }
-    offsets[outCount] = total;
-    ByteArray metaBuf = ByteArray::Alloc(total > 0 ? total : 1);
-    for (SizeType i = 0; i < outCount; ++i) {
-        SizeType srcIdx = static_cast<SizeType>(offset) + i * static_cast<SizeType>(stride);
-        ByteArray meta = full->GetMetadata(srcIdx);
-        if (meta.Length() > 0) {
-            std::memcpy(metaBuf.Data() + offsets[i], meta.Data(), meta.Length());
-        }
-    }
-    ByteArray offBuf = ByteArray::Alloc((static_cast<size_t>(outCount) + 1) * sizeof(std::uint64_t));
-    std::memcpy(offBuf.Data(), offsets.data(), offsets.size() * sizeof(std::uint64_t));
-    return std::make_shared<MemMetadataSet>(metaBuf, offBuf, outCount);
-}
-
 // Helper: parse "host:port,host:port,..." into vector of pairs.
 static std::vector<std::pair<std::string, std::string>> ParseNodeAddrs(const std::string& addrStr) {
     std::vector<std::pair<std::string, std::string>> result;
@@ -1098,7 +1010,6 @@ void LoadAndInsertBatch(SPANN::Index<T>* spannIndex,
                         const std::string& paddmetaidx,
                         int dimension,
                         int insertStart, int loadCount, int perNodeBatch,
-                        bool strideShard, int numNodes, int nodeIndex,
                         int numInsertThreads,
                         SPANN::WorkerNode* router,
                         std::shared_ptr<COMMON::IQuantizer> quantizer,
@@ -1121,14 +1032,6 @@ void LoadAndInsertBatch(SPANN::Index<T>* spannIndex,
                                                   addFloat->Count());
     }
     auto addmetaset = TestUtils::TestDataGenerator<T>::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, loadCount);
-    if (strideShard) {
-        addset = ExtractStridedVectors(addset, numNodes, nodeIndex);
-        addmetaset = ExtractStridedMetadata(addmetaset, numNodes, nodeIndex);
-        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
-                     "%s: stride-shard batchStart=%d loadCount=%d -> kept=%d (every %d-th, offset=%d)\n",
-                     logPrefix, insertStart, loadCount,
-                     (int)(addset ? addset->Count() : 0), numNodes, nodeIndex);
-    }
     InsertVectors<T>(spannIndex, numInsertThreads, perNodeBatch,
                      addset, addmetaset,
                      searchDuringInsertThreads, queryset, numQueries, searchK,
@@ -1225,23 +1128,12 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
     // Use distributed config for multi-node partitioning
     int nodeIndex = distCfg.workerIndex;
     int numNodes = distCfg.GetNumWorkers();
-    bool strideShard = IsStrideShardEnabled() && numNodes > 1;
-    int myInsertStart, myInsertEnd, perNodeBatch;
-    if (strideShard) {
-        // Stride mode: each node loads the FULL per-iter batch then keeps rows
-        // where (rowIdx % numNodes) == nodeIndex. myInsertStart/End span the
-        // full batch; perNodeBatch is the count of strided rows.
-        myInsertStart = 0;
-        myInsertEnd = insertBatchSize;
-        perNodeBatch = static_cast<int>(StrideCount(insertBatchSize, numNodes, nodeIndex));
-    } else {
-        myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0;
-        myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize;
-        perNodeBatch = myInsertEnd - myInsertStart;
-    }
+    int myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0;
+    int myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize;
+    int perNodeBatch = myInsertEnd - myInsertStart;
     SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
-                 "RunBenchmark: nodeIndex=%d numNodes=%d insertBatchSize=%d myInsertStart=%d myInsertEnd=%d perNodeBatch=%d strideShard=%d\n",
-                 nodeIndex, numNodes, insertBatchSize, myInsertStart, myInsertEnd, perNodeBatch, strideShard ? 1 : 0);
+                 "RunBenchmark: nodeIndex=%d numNodes=%d insertBatchSize=%d myInsertStart=%d myInsertEnd=%d perNodeBatch=%d\n",
+                 nodeIndex, numNodes, insertBatchSize, myInsertStart, myInsertEnd, perNodeBatch);
 
     // Variables to collect JSON output data
     std::ostringstream tmpbenchmark;
@@ -1585,19 +1477,16 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c
                         SPANN::DispatchCommand::Type::Insert, static_cast<std::uint32_t>(iter));
                 }
 
-                // Each node inserts its partition. Default mode: contiguous slice
-                // [iter*batchSize + myInsertStart, +perNodeBatch). Stride mode:
-                // every numNodes-th row of the full batch starting at nodeIndex
-                // (loads full batch then filters down to perNodeBatch rows).
+                // Each node inserts its contiguous slice
+                // [iter*batchSize + myInsertStart, +perNodeBatch).
                 int insertStart = iter * insertBatchSize + myInsertStart;
-                int loadCount = strideShard ? insertBatchSize : perNodeBatch;
+                int loadCount = perNodeBatch;
                 {
                     std::string driverTag = "RunBenchmark iter=" + std::to_string(iter);
                     start = std::chrono::high_resolution_clock::now();
                     LoadAndInsertBatch<T>(static_cast<SPANN::Index<T>*>(cloneIndex.get()),
                                           paddset, paddmeta, paddmetaidx, M,
                                           insertStart, loadCount, perNodeBatch,
-                                          strideShard, numNodes, nodeIndex,
                                           numInsertThreads, workerPtr,
                                           enableQuantization ? quantizer : nullptr,
                                           numSearchDuringInsertThreads, queryset,
@@ -2914,17 +2803,9 @@ void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount,
     int nodeIndex = distCfg.workerIndex;
     int numNodes = distCfg.GetNumWorkers();
     int insertBatchSize = insertVectorCount / std::max(batches, 1);
-    bool strideShard = IsStrideShardEnabled() && numNodes > 1;
-    int myInsertStart, myInsertEnd, perNodeBatch;
-    if (strideShard) {
-        myInsertStart = 0;
-        myInsertEnd = insertBatchSize;
-        perNodeBatch = static_cast<int>(StrideCount(insertBatchSize, numNodes, nodeIndex));
-    } else {
-        myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0;
-        myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize;
-        perNodeBatch = myInsertEnd - myInsertStart;
-    }
+    int myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0;
+    int myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize;
+    int perNodeBatch = myInsertEnd - myInsertStart;
 
     BOOST_TEST_MESSAGE("Worker node " << nodeIndex << ": Loading index from " << indexPath);
     std::shared_ptr<VectorIndex> index;
@@ -3035,16 +2916,15 @@ void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount,
 
         if (cmd.m_type == SPANN::DispatchCommand::Type::Insert) {
             int insertStart = cmd.m_round * insertBatchSize + myInsertStart;
-            int loadCount = strideShard ? insertBatchSize : perNodeBatch;
-            SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u - inserting %d vectors (offset %d, strideShard=%d)\n",
-                         nodeIndex, cmd.m_round + 1, perNodeBatch, insertStart, strideShard ? 1 : 0);
+            int loadCount = perNodeBatch;
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u - inserting %d vectors (offset %d)\n",
+                         nodeIndex, cmd.m_round + 1, perNodeBatch, insertStart);
 
             auto t1 = std::chrono::high_resolution_clock::now();
             std::string workerTag =
                 "Worker " + std::to_string(nodeIndex) + " batch=" + std::to_string(cmd.m_round + 1);
             LoadAndInsertBatch<T>(spannIndex, paddset, paddmeta, paddmetaidx, dimension,
                                   insertStart, loadCount, perNodeBatch,
-                                  strideShard, numNodes, nodeIndex,
                                   numInsertThreads, router,
                                   /*quantizer=*/nullptr,
                                   /*searchDuringInsertThreads=*/0,
diff --git a/ThirdParty/kvproto/.gitignore b/ThirdParty/kvproto/.gitignore
new file mode 100644
index 000000000..b2dab26f7
--- /dev/null
+++ b/ThirdParty/kvproto/.gitignore
@@ -0,0 +1,4 @@
+# Generated C++ stubs are environment-specific (protoc/grpc versions must
+# match the gRPC libs in the build env). Each developer should regenerate
+# locally via generate_cpp.sh instead of consuming the committed snapshot.
+generated/
diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh
index c383a7eed..bb982ab7d 100755
--- a/evaluation/distributed/run_distributed.sh
+++ b/evaluation/distributed/run_distributed.sh
@@ -744,7 +744,6 @@ start_remote_worker() {
     ssh -n $(_ssh_opts) "$SSH_USER@$host" \
         "cd $SPTAG_DIR && LD_LIBRARY_PATH=$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH:-} \
          WORKER_INDEX=${NODE_IDX} BENCHMARK_CONFIG=worker_n${NODE_IDX}.ini \
-         SPFRESH_SHARD_STRIDE=${SPFRESH_SHARD_STRIDE:-0} \
          ./Release/SPTAGTest --run_test=SPFreshTest/BenchmarkFromConfig 2>&1" \
         </dev/null > "$LOG" 2>&1 &
     local ssh_pid=$!

From 4df704f9897ede7997e6632568f7362ebe893449 Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 08:36:46 +0000
Subject: [PATCH 04/12] InsertVectors: dedupe branches, log that
 InsertThreadNum is ignored in bulk path

The previous if/else duplicated the thread launch+join. Restructure to
a single launch with an optional search-during-insert thread:
  - launch insertThreadCount workers
  - if benchmarking, launch one search thread in parallel
  - join all, then compute stats (only when search ran)

Also log a clear note when the bulk router path is used: the
user-supplied InsertThreadNum is unused there (driver runs one launcher
thread and parallelism comes from [BuildSSDIndex] AppendThreadNum
inside ExtraDynamicSearcher's append/split pool).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 Test/src/SPFreshTest.cpp | 50 ++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp
index 1a2140773..5bef228a3 100644
--- a/Test/src/SPFreshTest.cpp
+++ b/Test/src/SPFreshTest.cpp
@@ -661,29 +661,39 @@ void InsertVectors(SPANN::Index<ValueType> *p_index, int insertThreads, int step
     if (useBulk) {
         func = bulkFunc;
         insertThreadCount = 1;
+        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                     "InsertVectors: bulk path - driver launcher=1, internal parallelism comes from "
+                     "[BuildSSDIndex] AppendThreadNum (user-supplied InsertThreadNum=%d is unused on this path)\n",
+                     insertThreads);
     } else {
         func = perVecFunc;
         insertThreadCount = insertThreads;
     }
 
-    if (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr) {
-        std::vector<float> latencies;
-        std::vector<QueryResult> results;
-        double searchWallSeconds = 0.0;
+    bool withSearch = (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr);
 
-        for (int j = 0; j < insertThreadCount; j++)
-        {
-            threads.emplace_back(func);
-        }
-        std::thread searchThread([&]() {
+    for (int j = 0; j < insertThreadCount; j++)
+    {
+        threads.emplace_back(func);
+    }
+
+    std::vector<float> latencies;
+    std::vector<QueryResult> results;
+    double searchWallSeconds = 0.0;
+    std::thread searchThread;
+    if (withSearch) {
+        searchThread = std::thread([&]() {
             searchWallSeconds = ExecutePartitionedSearch<ValueType>(
                 p_index, queryset, /*myStart=*/0, numQueries, k, searchThreads,
                 results, &latencies, /*statsOut=*/nullptr);
         });
-        for (auto &thread : threads)
-        {
-            thread.join();
-        }
+    }
+
+    for (auto &thread : threads)
+    {
+        thread.join();
+    }
+    if (withSearch) {
         searchThread.join();
 
         // Calculate statistics
@@ -712,17 +722,6 @@ void InsertVectors(SPANN::Index<ValueType> *p_index, int insertThreads, int step
         *benchmarkData << "        \"minLatency\": " << minLat << ",\n";
         *benchmarkData << "        \"maxLatency\": " << maxLat << ",\n";
         *benchmarkData << "        \"qps\": " << qps << ",\n";
-    } else {
-        // No search-during-insert path: just run the insert threads.
-        // (Used by worker dispatch and any caller that doesn't need stats.)
-        for (int j = 0; j < insertThreadCount; j++)
-        {
-            threads.emplace_back(func);
-        }
-        for (auto &thread : threads)
-        {
-            thread.join();
-        }
     }
     auto barrierStart = std::chrono::high_resolution_clock::now();
     size_t barrierPolls = 0;
@@ -743,9 +742,6 @@ void InsertVectors(SPANN::Index<ValueType> *p_index, int insertThreads, int step
 }
 
 
-
-
-
 template <typename T>
 void BenchmarkQueryPerformance(std::shared_ptr<VectorIndex> &index, std::shared_ptr<VectorSet> &queryset,
                                std::shared_ptr<VectorSet> &truth, const std::string &truthPath,

From c27a109ac297d350521478b15bcb2e33b7e1827a Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 09:10:14 +0000
Subject: [PATCH 05/12] Restore (layers+1) multiplier in BlockController IO
 queue size

87160070 removed the (m_layers+1) multiplier in the SPDK BlockController
queue-depth formula. The change was based on an incorrect assumption
that the distributed port collapses all per-layer SPDK pools into the
single shared layer-0 pool. In practice only layer 0 + the RPC receiver
share a pool; every inner layer (m_layer >= 1) still creates its own
SPDKThreadPool in both BuildIndex and LoadIndex.

With Layers=2 (current active configs) we therefore have ~2 independent
pools, each running insert + reassign + append worker threads, so the
peak concurrent IO-submitter count is still bounded by the
qianxi-original (layers+1)*(insert+reassign+append) plus search threads.
Under-sizing the BlockController queue could stall IO submission under
heavy split/reassign + search load; over-sizing is harmless. Restore
the multiplier to match qianxi behaviour.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 AnnService/src/Core/SPANN/ExtraFileController.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/AnnService/src/Core/SPANN/ExtraFileController.cpp b/AnnService/src/Core/SPANN/ExtraFileController.cpp
index b5db83822..24c839455 100644
--- a/AnnService/src/Core/SPANN/ExtraFileController.cpp
+++ b/AnnService/src/Core/SPANN/ExtraFileController.cpp
@@ -25,7 +25,7 @@ bool FileIO::BlockController::Initialize(SPANN::Options &p_opt, int p_layer)
 #ifndef _MSC_VER
             O_RDWR | O_DIRECT, numblocks, 2, 2,
             max(p_opt.m_ioThreads, (2 * max(p_opt.m_searchThreadNum, p_opt.m_iSSDNumberOfThreads) +
-                                    p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum)),
+                                    (p_opt.m_layers + 1) * (p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum))),
             ((std::uint64_t)p_opt.m_startFileSize) << 30
 #else
             GENERIC_READ | GENERIC_WRITE, numblocks, 2, 2,

From f3a9de98da29a208ef8eeb7311ad6c433bcfd21b Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 09:23:17 +0000
Subject: [PATCH 06/12] SetVersionBatch: bypass LRU cache, read TiKV directly

All distributed runs override VersionCacheMaxChunks=0 (set by
run_distributed.sh in build/run/nocache phases), so the LRU cache is
effectively disabled. Using ReadChunkCached inside SetVersionBatch
adds bookkeeping noise (cache hit/miss path, refresh-mutex acquire)
that produces no benefit. Switch to direct ReadChunk; the dirty-byte
gating still saves the WriteChunk RPC when no version byte actually
changes.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 AnnService/inc/Core/Common/TiKVVersionMap.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/AnnService/inc/Core/Common/TiKVVersionMap.h b/AnnService/inc/Core/Common/TiKVVersionMap.h
index 69191fe1b..ff30306e8 100644
--- a/AnnService/inc/Core/Common/TiKVVersionMap.h
+++ b/AnnService/inc/Core/Common/TiKVVersionMap.h
@@ -386,7 +386,10 @@ namespace SPTAG
             }
 
             // Group writes by chunk: 1 ReadChunk + N byte-modifications + 1 WriteChunk
-            // per chunk, instead of N × (ReadChunk + WriteChunk). 
+            // per chunk, instead of N × (ReadChunk + WriteChunk). Bypasses the LRU
+            // cache because runs that exercise this path always have
+            // VersionCacheMaxChunks=0; reading TiKV directly removes a layer of
+            // bookkeeping (cache invalidate-on-write) we no longer benefit from.
             void SetVersionBatch(const std::vector<SizeType>& vids, const std::vector<uint8_t>& versions) override
             {
                 size_t n = std::min(vids.size(), versions.size());
@@ -408,7 +411,7 @@ namespace SPTAG
                     SizeType cid = kv.first;
                     auto& idxs = kv.second;
                     std::lock_guard<std::mutex> lock(ChunkMutex(cid));
-                    std::string chunk = ReadChunkCached(cid);
+                    std::string chunk = ReadChunk(cid);
                     if (chunk.empty()) {
                         chunk.assign(m_chunkSize, static_cast<char>(0xff));
                     }

From f35ae85bdb46d25d51585061de47c63b312f48c1 Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 09:42:39 +0000
Subject: [PATCH 07/12] Drop high-priority job queue from SPDKThreadPool

The distributed port introduced a separate m_highJobs queue + add_high
in ThreadPool plus 'urgent' parameters on AppendAsync/ReassignAsync.
We had already found that high-priority receiver dispatch starved Split
jobs, and switched the receiver to high=false. The remaining
urgent=true callers were:

  - AppendAsync in CollectReAssign's non-TiKV branch (dead under
    Storage::TIKVIO which is the only storage we use)
  - ReassignAsync on head-miss in Append/BatchAppend (same starvation
    risk against Split that motivated the receiver-side revert)

Restore ThreadPool.h to the upstream deque+addfront shape (no semantic
change vs. original) and drop the urgent parameter from AppendAsync/
ReassignAsync, the high flag from JobSubmitter, and the high path from
WireJobSubmitterIfReady.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../Core/SPANN/Distributed/RemotePostingOps.h | 27 ++++++---------
 .../inc/Core/SPANN/ExtraDynamicSearcher.h     | 29 +++++-----------
 AnnService/inc/Helper/ThreadPool.h            | 33 +++++--------------
 3 files changed, 28 insertions(+), 61 deletions(-)

diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h
index 577b91876..0f032c2ba 100644
--- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h
+++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h
@@ -95,7 +95,7 @@ namespace SPTAG::SPANN {
         // its own m_splitThreadPool, so BatchAppend items dispatch by the
         // request's m_layer to the matching pool. A single submitter would
         // pile both layers' remote appends into whichever pool wired last.
-        using JobSubmitter = std::function<void(Helper::ThreadPool::Job*, bool /*high*/)>;
+        using JobSubmitter = std::function<void(Helper::ThreadPool::Job*)>;
         void SetJobSubmitter(int layer, JobSubmitter submitter) {
             std::unique_lock<std::shared_timed_mutex> lk(m_callbackLifetimeMutex);
             EnsureLayerSlot_NoLock(layer);
@@ -756,13 +756,12 @@ namespace SPTAG::SPANN {
             SPTAGLIB_LOG(Helper::LogLevel::LL_Debug,
                 "RemotePostingOps: Received batch of %u appends\n", batchReq->m_count);
 
-            // Submit each item as a high-priority Job to the searcher's
-            // shared compute pool. Pool workers run the local Append callback
-            // exactly like a local insert would. Last completion ACKs the
-            // sender. This puts remote work on the SAME concurrency budget
-            // as local Split/Merge/Reassign — eliminating the over-subscribed
-            // TiKV behaviour of the old separate bg executor + transient
-            // sub-worker threads.
+            // Submit each item as a Job to the searcher's shared compute pool.
+            // Pool workers run the local Append callback exactly like a local
+            // insert would. Last completion ACKs the sender. This puts remote
+            // work on the SAME concurrency budget as local Split/Merge/Reassign
+            // — eliminating the over-subscribed TiKV behaviour of the old
+            // separate bg executor + transient sub-worker threads.
             auto packetPtr = std::make_shared<Socket::Packet>(std::move(packet));
             const size_t total = batchReq->m_items.size();
             if (total == 0) {
@@ -810,15 +809,9 @@ namespace SPTAG::SPANN {
                     // submitter we have.
                     for (auto& s : m_jobSubmitters) { if (s) { sub = &s; break; } }
                 }
-                // Normal priority. Per-layer routing (m_jobSubmitters[layer])
-                // already isolates layer-N append items from other layers'
-                // pools. High priority starved split entirely (split:N
-                // in_flight, 0 completed) because once all 16 worker threads
-                // are running long-tail append items, fresh high-prio appends
-                // keep cutting in front of split. Append throughput per chunk
-                // is limited by pool concurrency × per-item RMW; widen the
-                // pool (AppendThreadNum) instead of using priority hacks.
-                if (sub) (*sub)(job, /*high=*/false);
+                // Per-layer routing (m_jobSubmitters[layer]) isolates layer-N
+                // append items from other layers' pools.
+                if (sub) (*sub)(job);
                 else     { delete job; failCount->fetch_add(1); remaining->fetch_sub(1); }
             }
         }
diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h
index 29129bdb4..b8ca98e85 100644
--- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h
+++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h
@@ -395,10 +395,7 @@ namespace SPTAG::SPANN {
             if (!m_worker || !m_splitThreadPool) return;
             auto pool = m_splitThreadPool;
             m_worker->SetJobSubmitter(m_layer,
-                [pool](Helper::ThreadPool::Job* j, bool high) {
-                    if (high) pool->add_high(j);
-                    else      pool->add(j);
-                });
+                [pool](Helper::ThreadPool::Job* j) { pool->add(j); });
         }
 
         /// Set the external WorkerNode pointer and bind all callbacks
@@ -436,7 +433,7 @@ namespace SPTAG::SPANN {
 
                     // Mirror sender's version map for the records we're about
                     // to persist so MergePostings + SearchIndex don't drop
-                    // them as "stale". See HEAD git history for rationale.
+                    // them as "stale".
                     {
                         const uint8_t* basePtr = reinterpret_cast<const uint8_t*>(appendPosting.data());
                         size_t totalRec = appendPosting.size() / m_vectorInfoSize;
@@ -1713,28 +1710,20 @@ namespace SPTAG::SPANN {
             m_splitThreadPool->add(curJob);
         }
 
-        inline void AppendAsync(SizeType headID, std::shared_ptr<std::string> postingList, bool urgent = false,std::function<void()> p_callback = nullptr)
+        inline void AppendAsync(SizeType headID, std::shared_ptr<std::string> postingList, std::function<void()> p_callback = nullptr)
         {
             auto* curJob = new AppendAsyncJob(this, headID, std::move(postingList), p_callback);
             m_appendJobsInFlight++;
             m_totalAppendSubmitted++;
-            if (urgent) {
-                m_splitThreadPool->addfront(curJob);
-            } else {
-                m_splitThreadPool->add(curJob);
-            }
+            m_splitThreadPool->add(curJob);
         }
 
-        inline void ReassignAsync(std::shared_ptr<std::string> vectorInfo, SizeType headPrev, bool urgent = false, std::function<void()> p_callback = nullptr)
+        inline void ReassignAsync(std::shared_ptr<std::string> vectorInfo, SizeType headPrev, std::function<void()> p_callback = nullptr)
         {
             auto* curJob = new ReassignAsyncJob(this, std::move(vectorInfo), headPrev, p_callback);
             m_reassignJobsInFlight++;
             m_totalReassignSubmitted++;
-            if (urgent) {
-                m_splitThreadPool->addfront(curJob);
-            } else {
-                m_splitThreadPool->add(curJob);
-            }
+            m_splitThreadPool->add(curJob);
         }
 
         ErrorCode CollectReAssign(ExtraWorkSpace *p_exWorkSpace, SizeType headID, std::shared_ptr<std::string> headVec,
@@ -1901,7 +1890,7 @@ namespace SPTAG::SPANN {
             if (m_opt->m_storage == Storage::TIKVIO) ret = BatchAppend(p_exWorkSpace, batchReassign, "CollectReAssign");
             else {
                 for (auto& kv : batchReassign) {
-                    AppendAsync(kv.first, std::make_shared<std::string>(kv.second), true);
+                    AppendAsync(kv.first, std::make_shared<std::string>(kv.second));
                 }
             }
             if (batchReassignCount > 0) {
@@ -2019,7 +2008,7 @@ namespace SPTAG::SPANN {
                     if (m_versionMap->GetVersion(VID) == version) {
                         // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss To ReAssign: VID: %d, current version: %d\n", *(int*)(&appendPosting[idx]), version);
                         m_stat.m_headMiss++;
-                        ReassignAsync(vectorInfo, headID, true);
+                        ReassignAsync(vectorInfo, headID);
                     }
                     // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss Do Not To ReAssign: VID: %d, version: %d, current version: %d\n", *(int*)(&appendPosting[idx]), m_versionMap->GetVersion(*(int*)(&appendPosting[idx])), version);
                 }
@@ -2185,7 +2174,7 @@ namespace SPTAG::SPANN {
                         uint8_t version = *(uint8_t*)(ptr + sizeof(SizeType));
                         if (m_versionMap->GetVersion(VID) == version) {
                             m_stat.m_headMiss++;
-                            ReassignAsync(std::make_shared<std::string>((char*)ptr, m_vectorInfoSize), headID, true);
+                            ReassignAsync(std::make_shared<std::string>((char*)ptr, m_vectorInfoSize), headID);
                         }
                     }
                     continue;
diff --git a/AnnService/inc/Helper/ThreadPool.h b/AnnService/inc/Helper/ThreadPool.h
index a351a75c8..01c82e2a7 100644
--- a/AnnService/inc/Helper/ThreadPool.h
+++ b/AnnService/inc/Helper/ThreadPool.h
@@ -5,7 +5,7 @@
 #define _SPTAG_HELPER_THREADPOOL_H_
 
 #include <atomic>
-#include <queue>
+#include <deque>
 #include <vector>
 #include <thread>
 #include <mutex>
@@ -78,42 +78,28 @@ namespace SPTAG
             {
                 {
                     std::lock_guard<std::mutex> lock(m_lock);
-                    m_jobs.push(j);
+                    m_jobs.push_back(j);
                 }
                 m_cond.notify_one();
             }
 
-            // High-priority push: jobs in m_highJobs always run before m_jobs.
-            // Used by the distributed receiver to let inbound BatchAppend RPC
-            // work jump ahead of local Split/Merge/Reassign so the sender
-            // (driver) doesn't time out waiting for the chunk ack while the
-            // local pool drains long-running rebalance work.
-            void add_high(Job* j)
+            void addfront(Job* j)
             {
                 {
                     std::lock_guard<std::mutex> lock(m_lock);
-                    m_highJobs.push(j);
+                    m_jobs.push_front(j);
                 }
                 m_cond.notify_one();
             }
 
-            // Alias kept for compatibility with code that calls addfront()
-            // (e.g., split-async path). Same semantics as add_high.
-            void addfront(Job* j) { add_high(j); }
-
             bool get(Job*& j)
             {
                 std::unique_lock<std::mutex> lock(m_lock);
-                while (m_jobs.empty() && m_highJobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock);
+                while (m_jobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock);
                 if (!m_abort.ShouldAbort()) {
-                    if (!m_highJobs.empty()) {
-                        j = m_highJobs.front();
-                        m_highJobs.pop();
-                    } else {
-                        j = m_jobs.front();
-                        m_jobs.pop();
-                    }
+                    j = m_jobs.front();
                     currentJobs++;
+                    m_jobs.pop_front();
                     return true;
                 }
                 return false;
@@ -122,7 +108,7 @@ namespace SPTAG
             size_t jobsize()
             {
                 std::lock_guard<std::mutex> lock(m_lock);
-                return m_jobs.size() + m_highJobs.size();
+                return m_jobs.size();
             }
 
             inline uint32_t runningJobs() { return currentJobs; }
@@ -136,8 +122,7 @@ namespace SPTAG
 
         protected:
             std::atomic_uint32_t currentJobs{ 0 };
-            std::queue<Job*> m_jobs;
-            std::queue<Job*> m_highJobs;
+            std::deque<Job*> m_jobs;
             Abort m_abort;
             std::mutex m_lock;
             std::condition_variable m_cond;

From a49b26d5292b90c7ccd2ead91fb71176b8e5ae4b Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 09:58:06 +0000
Subject: [PATCH 08/12] Fix space

---
 Test/src/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Test/src/main.cpp b/Test/src/main.cpp
index 49ca39950..c1a5cde60 100644
--- a/Test/src/main.cpp
+++ b/Test/src/main.cpp
@@ -39,7 +39,7 @@ struct GlobalFixture
         // observed to consume ~12% CPU under high worker-thread parallelism in
         // gRPC client paths (perf-recorded 2026-05-06).
 #ifdef TIKV
-        absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore);
+    	absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore);
 #endif
         SPTAGVisitor visitor;
         traverse_test_tree(framework::master_test_suite(), visitor, false);

From 689e5b23e45da738b7ff77830a59283d0a58c5e4 Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 10:06:24 +0000
Subject: [PATCH 09/12] Fix distributed benchmark README + drop dead
 orchestrator code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

run_distributed.sh:
- Remove wait_workers_ready() — dead since the driver-listens-on-30001
  handshake replaced log-grep readiness detection.
- Drop the stale 'Binary already pushed; nothing else to do here' comment
  that sat immediately after the actual binary-push rsync block.

README.md:
- Correct the TiKV deployment model: the cluster is SHARED (all PDs in
  one raft group, all TiKVs registered as stores, max-replicas=1) — not
  one isolated PD+TiKV per node as the old text claimed. Architecture
  diagram, port table, and pre-split helper updated accordingly (one PD
  endpoint, not a per-node loop).
- Fix Step 1 cluster-config path: configs/cluster_2node.conf (an actual
  shipped file), not the non-existent cluster.conf.example.
- Update port defaults to match cluster_2node.conf (23791/23801/20171)
  and call out that the driver's router_port must not collide with the
  dispatcher port 30001 (cluster_2node.conf uses 30011 for this reason).
- List all shipped configs (10m, 100m, insert_dominant, tikv.toml,
  cluster_*.conf) in the file table.
- Document setup-bins subcommand alongside deploy.
- Flag the Build / Distribute / Run split as a workaround for the
  missing distributed SelectHead/BuildHead implementation, so readers
  don't mistake it for the steady-state design.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 evaluation/distributed/README.md          | 219 +++++++++++++---------
 evaluation/distributed/run_distributed.sh |  33 ----
 2 files changed, 126 insertions(+), 126 deletions(-)

diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md
index 1f24bc865..4717efc35 100644
--- a/evaluation/distributed/README.md
+++ b/evaluation/distributed/README.md
@@ -1,18 +1,26 @@
 # Distributed Benchmark Evaluation — Insert Dominant
 
 Multi-machine SPTAG SPANN distributed benchmark for an **insert-dominant** workload
-(1M base + 10M inserts in 10 batches, with concurrent search-during-insert) on
-SIFT1B. Each physical node runs its own independent PD + TiKV (no shared Raft
-replication — see "TiKV deployment model" below).
+(1M base + 1M-10M inserts in batches, with concurrent search-during-insert) on
+SIFT1B. All nodes share a single TiKV raft cluster (see "TiKV deployment model"
+below).
 
 ## Files in this folder
 
 | File | Purpose |
 | --- | --- |
-| `configs/benchmark_insert_dominant_template.ini` | Benchmark template; `run_distributed.sh` fills `IndexPath`, `TiKVPDAddresses`, `TiKVKeyPrefix`, and `[Distributed]` from `cluster.conf`. |
-| `run_distributed.sh` | Orchestrator: `deploy` / `start-tikv` / `run` / `stop-tikv` / `cleanup`. |
+| `configs/benchmark_insert_dominant_template.ini` | 1M base + 1M insert, search-during-insert workload. |
+| `configs/benchmark_10m_template.ini` | 9M base + 1M insert, growing-index workload. |
+| `configs/benchmark_100m_template.ini` | 99M base + 1M insert, steady-state/freshness workload. |
+| `configs/cluster_2node.conf`, `configs/cluster_3node.conf` | Example cluster topologies. Pick one (or write your own) and pass to the orchestrator. |
+| `configs/tikv.toml` | TiKV server config baked into the containers. |
+| `run_distributed.sh` | Orchestrator: `deploy` / `setup-bins` / `start-tikv` / `run` / `bench` / `stop-tikv` / `cleanup`. |
+| `bin/` | `tikv-server` + `pd-server` binaries used by the containers (`setup-bins` downloads them if missing). |
 | `README.md` | This file. |
 
+`run_distributed.sh` fills the template's `IndexPath`, `TiKVPDAddresses`,
+`TiKVKeyPrefix`, and `[Distributed]` section from the cluster config.
+
 ## Architecture
 
 ```
@@ -29,35 +37,42 @@ replication — see "TiKV deployment model" below).
         │  + Router│ │  + Router│ │  + Router│
         └────┬─────┘ └────┬─────┘ └────┬─────┘
              │            │            │
-             ▼            ▼            ▼
-        ┌──────────┐ ┌──────────┐ ┌──────────┐
-        │  TiKV 1  │ │  TiKV 2  │ │  TiKV N  │ (one PD + one TiKV per node)
-        └──────────┘ └──────────┘ └──────────┘
+             └────────────┼────────────┘
+                          ▼
+                ┌───────────────────┐
+                │ Shared TiKV raft  │  N PDs (one raft group) +
+                │ cluster           │  N TiKV stores (max-replicas=1)
+                └───────────────────┘
 ```
 
-- **Driver** (node 0): Builds the index, sends Search/Insert/Stop commands via TCP dispatch.
-- **Workers** (nodes 1..N): Receive commands, execute their shard locally, report results back.
-- **TiKV (per node)**: Each node runs its own independent PD + TiKV pair. Postings
-  for a head live on the node that owns that head's hash partition.
-- **PostingRouter**: Hash-based head routing, remote append, head sync, dispatch protocol.
+- **Driver** (node 0): builds the index, sends Search/Insert/Stop commands via
+  TCP dispatch.
+- **Workers** (nodes 1..N): receive commands, execute their shard locally,
+  report results back over the dispatch channel.
+- **Shared TiKV cluster**: every node runs a PD + TiKV container; all PDs join
+  one raft group, all TiKVs point to all PDs. PD routes each key to the store
+  that owns its region.
+- **PostingRouter**: hash-based head routing, remote append, head sync,
+  dispatch protocol.
 
 ## TiKV deployment model
 
-Unlike a single-machine multi-docker TiKV (3 PD + 3 TiKV behind 127.0.0.1 ports
-22791-3 / 20161-3 sharing one Raft cluster), in this multi-machine setup **each
-node runs its own isolated PD + TiKV pair** under host networking. Heads are
-routed to nodes by hash, and each node's TiKV stores only its own shard. There
-is no Raft replication between nodes (no cross-node region quorum), which is
-intentional for insert-dominated benchmarks where Raft log overhead would dominate.
+All nodes share **one** TiKV raft cluster: every node's PD joins the same raft
+group, every node's TiKV registers as a store in that cluster, and PD routes
+reads/writes to whichever store owns the region. `max-replicas=1` is set so
+each region lives on exactly one store — we measure benchmark performance
+without 3-way Raft replication. Compute nodes are stateless TiKV clients; they
+read any posting through the shared client, so there is no cross-compute fetch
+RPC during RNGSelection.
 
-Per-node ports (defaults from `cluster.conf`):
+Per-node ports (defaults from `configs/cluster_2node.conf`):
 
-| Service | Port | Notes |
+| Service | Default port | Notes |
 | --- | --- | --- |
-| PD client | `2379` | Local app uses `<node_ip>:2379`. |
-| PD peer | `2380` | Inter-PD; isolated cluster of 1 PD per node. |
-| TiKV client | `20161` | The node-local SPTAG worker connects here. |
-| Router | `30001+` | TCP dispatch / posting routing between nodes. |
+| PD client | `23791` | TiKV client + `pd-ctl` connect here. |
+| PD peer | `23801` | Inter-PD raft traffic. |
+| TiKV client | `20171` | Per-node TiKV listens here. |
+| Router | `30002+` | TCP dispatch / posting routing between nodes. **Driver's `router_port` must NOT be `30001`** — the dispatcher listens on `30001` and a collision will silently break worker registration. The shipped 2-node config uses `30011` on the driver for this reason. |
 
 ## Prerequisites
 
@@ -69,45 +84,47 @@ Per-node ports (defaults from `cluster.conf`):
   cmake .. -DTIKV=ON -DTBB=ON -DCMAKE_BUILD_TYPE=Release -DGPU=OFF
   cmake --build . --target SPTAGTest -j$(nproc)
   ```
-  *Note: building the full project may fail on the Java wrapper (`JAVASPTAGFileIO`)
-  due to a pre-existing `FileIOInterface.h` signature mismatch — the `SPTAGTest`
-  target alone is sufficient.*
-- Passwordless SSH from driver to every other node (configure `ssh_key` in `cluster.conf`).
+  *Note: building the full project may fail on the Java wrapper
+  (`JAVASPTAGFileIO`) due to a pre-existing `FileIOInterface.h` signature
+  mismatch — the `SPTAGTest` target alone is sufficient.*
+- Passwordless SSH from driver to every other node (configure `ssh_key` in
+  the cluster config).
 - Docker installed on every node (TiKV/PD run as containers in host network mode).
 - Same dataset path on every node (default `/mnt/nvme/sift1b/`):
   - `/mnt/nvme/sift1b/bigann_base.u8bin` (1B × 128 × u8)
   - `/mnt/nvme/sift1b/query.10K.u8bin`
-- Same fast-storage path for index + TiKV data on every node (`data_dir` in `cluster.conf`,
-  default `/mnt/nvme`).
+- Same fast-storage path for index + TiKV data on every node (`data_dir` in
+  the cluster config, default `/mnt/nvme`).
 
 ## Step 1 — Cluster config
 
+Pick one of the shipped templates and edit it for your hosts/paths:
+
 ```bash
-cp evaluation/distributed/cluster.conf.example cluster.conf
-vim cluster.conf
+cp evaluation/distributed/configs/cluster_2node.conf my_cluster.conf
+vim my_cluster.conf
 ```
 
-Example:
+Layout:
 
 ```ini
 [cluster]
 ssh_user=superbench
+ssh_key=/home/superbench/.ssh/id_rsa
 sptag_dir=/home/superbench/zhangt/SPTAG
 data_dir=/mnt/nvme
-tikv_version=v7.5.1
-pd_version=v7.5.1
+tikv_version=v8.5.1
+pd_version=v8.5.1
 
 [nodes]
-# host           router_port
-10.0.1.1         30001          # driver (always first)
-10.0.1.2         30002          # worker 1
-10.0.1.3         30003          # worker 2
+# host         router_port    (driver is first; router_port must not equal 30001)
+10.0.1.1       30011          # driver
+10.0.1.2       30002          # worker 1
 
 [tikv]
-# host           pd_client  pd_peer  tikv_port
-10.0.1.1         2379       2380     20161
-10.0.1.2         2379       2380     20161
-10.0.1.3         2379       2380     20161
+# host         pd_client_port  pd_peer_port  tikv_port
+10.0.1.1       23791           23801         20171
+10.0.1.2       23791           23801         20171
 ```
 
 `run_distributed.sh` reads this file to fill the template's `[Distributed]`,
@@ -116,50 +133,49 @@ pd_version=v7.5.1
 ## Step 2 — Deploy
 
 ```bash
-./evaluation/distributed/run_distributed.sh deploy cluster.conf
+./evaluation/distributed/run_distributed.sh deploy      my_cluster.conf
+./evaluation/distributed/run_distributed.sh setup-bins  my_cluster.conf
 ```
 
-This rsyncs `Release/SPTAGTest` (and required shared libs) to every node and
-ensures the per-node TiKV / PD data directories exist under `data_dir`.
+`deploy` rsyncs `Release/SPTAGTest` (and required shared libs) to every node
+and ensures per-node TiKV / PD data directories exist under `data_dir`.
+`setup-bins` downloads `tikv-server` / `pd-server` into `bin/` on every node
+(idempotent; skipped automatically by `start-tikv` if binaries are already
+present).
 
-## Step 3 — Start TiKV (per-node, independent)
+## Step 3 — Start the shared TiKV cluster
 
 ```bash
-./evaluation/distributed/run_distributed.sh start-tikv cluster.conf
+./evaluation/distributed/run_distributed.sh start-tikv my_cluster.conf
 ```
 
-This starts one PD + one TiKV per node in host-network containers. Single-replica
-placement (`max-replicas=1`) is set so we measure benchmark performance without
-3-way Raft replication.
+This starts one PD + one TiKV container per node in host-network mode and
+joins them into a single raft cluster (`max-replicas=1`, no 3-way replication).
 
-Health check (run on driver, repeat per node):
+Health check (single PD endpoint is enough — the cluster is shared):
 
 ```bash
-for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do
-  curl -s "http://$ip:2379/pd/api/v1/stores" \
-    | python3 -c 'import json,sys; print([s["store"]["state_name"] for s in json.load(sys.stdin)["stores"]])'
-done
-# Each node should report ['Up'].
+curl -s "http://10.0.1.1:23791/pd/api/v1/stores" \
+  | python3 -c 'import json,sys; print([s["store"]["state_name"] for s in json.load(sys.stdin)["stores"]])'
+# Expected: ['Up', 'Up'] (one entry per TiKV store).
 ```
 
 ### Pre-split & scatter (optional but recommended)
 
-For the insert-dominant workload to spread region writes evenly across regions
-within a node's TiKV, pre-split the keyspace at boundaries derived from
-`DBKey(headID) = MaxID*layer + headID` little-endian byte 0. The TiKV raw key is
-`TiKVKeyPrefix + "_" + uint32_le(DBKey)`; for multi-chunk it appends `\x00` /
-`\x02` for chunk / count keys, but we split *only* on the head-key prefix so all
-chunk and count variants for a head share a region. Boundaries used: `0x02, 0x04,
-…, 0xfe` (127 split points → 128 regions).
+For the insert-dominant workload, pre-split the keyspace so writes spread
+evenly across regions and stores. Boundaries derive from
+`DBKey(headID) = MaxID*layer + headID` little-endian byte 0; the TiKV raw key
+is `TiKVKeyPrefix + "_" + uint32_le(DBKey)`. We split *only* on the head-key
+prefix so all chunk/count variants for a head share a region. Split points
+used: `0x02, 0x04, …, 0xfe` (127 split points → 128 regions).
 
-Driver-side helper (each PD is independent, so run per node):
+Since the cluster is shared, run the helper **once** against any PD endpoint:
 
 ```bash
-PREFIX="bench_insert_dominant_3node"   # keep in sync with KEY_PREFIX in run_distributed.sh
-for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do
-  PD="http://$ip:2379"
-  PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v7.5.1 -u "$PD")
-  python3 - "$PREFIX" "${PDCTL[@]}" <<'PY'
+PREFIX="bench_insert_dominant_2node"   # keep in sync with KEY_PREFIX in run_distributed.sh
+PD="http://10.0.1.1:23791"
+PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v8.5.1 -u "$PD")
+python3 - "$PREFIX" "${PDCTL[@]}" <<'PY'
 import json, subprocess, sys
 prefix = sys.argv[1].encode() + b'_'
 pdctl = sys.argv[2:]
@@ -172,48 +188,65 @@ for b in range(2, 256, 2):
 for r in json.loads(run(['region', 'scan']))['regions']:
     run(['operator', 'add', 'scatter-region', str(r['id'])])
 PY
-done
 ```
 
-Skip this on the very first run if you don't have load skew — `start-tikv` works
-without it. For 1B-scale insert-dominant runs on a single node it materially
-reduces head-region hot-spotting.
+Skip this on the very first run if you don't have load skew — `start-tikv`
+works without it. For 1B-scale insert-dominant runs it materially reduces
+head-region hot-spotting.
 
 ## Step 4 — Run the benchmark
 
 ```bash
 # Single scale, explicit node count (driver + (N-1) workers):
-./evaluation/distributed/run_distributed.sh run cluster.conf insert_dominant 3
+./evaluation/distributed/run_distributed.sh run my_cluster.conf insert_dominant 2
 
 # Or sweep 1-node baseline + N-node distributed for one or more scales:
-./evaluation/distributed/run_distributed.sh bench cluster.conf insert_dominant
+./evaluation/distributed/run_distributed.sh bench my_cluster.conf insert_dominant
+./evaluation/distributed/run_distributed.sh bench my_cluster.conf all
 ```
 
 What `run` does:
 
 1. **Build** (driver only): driver builds the index locally with router
-   *disabled* (`Rebuild=true`, no `[Router]`). Output goes to `…_n0/spann_index`.
+   *disabled* (`Rebuild=true`, no `[Distributed]`). Output goes to
+   `…_n0/spann_index`. Because the TiKV cluster is shared, the driver writes
+   all postings straight to TiKV via PD-routed RPCs — there is no need for a
+   distributed build phase.
 2. **Distribute**: rsync head index + perftest files from driver to each worker.
-3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` and
-   the per-node ini (router enabled, `Rebuild=false`).
-4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. The
-   driver dispatches Insert / Search commands across batches via TCP.
+3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i`
+   and the per-node ini (router enabled, `Rebuild=false`).
+4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`.
+   The driver dispatches Insert / Search commands across batches via TCP.
 5. **Collect**: driver sends Stop, joins worker logs into `benchmark_logs/`.
 
-Useful environment overrides (see header of `run_distributed.sh`):
-
-- `NOCACHE=1` — disable TiKV block cache, OS pagecache, and `VersionCacheMaxChunks`.
-- `BUILD_WITH_CACHE=1` — build with caches, then drop caches before search/insert (NOCACHE only).
-- `SKIP_TIKV_SWAP=1` — when using `BUILD_WITH_CACHE`, skip the destructive TiKV
-  container restart that has corrupted recall at 100M scale.
-- `SKIP_SAVE_LOAD=1` — skip post-build SaveIndex / per-batch Load+Clone+Save (NOCACHE only).
-- `SKIP_HEAD_BUILD=1` — reuse existing HeadIndex if present (RebuildSSDOnly).
+> The "build on the driver, then distribute and run" split is a workaround:
+> we don't yet have a real distributed SelectHead/BuildHead implementation, so
+> Phase 1 is single-node-with-shared-TiKV. The `BuildOnly=true` /
+> `RebuildSSDOnly=true` / `SkipSaveLoadCycles=true` /
+> `tikv_switch_to_nocache` / `drop_caches` choreography exists because of
+> this split; it is not a feature of the steady-state design.
+
+Useful environment overrides (see the header of `run_distributed.sh` for the
+authoritative list):
+
+- `NOCACHE=1` — disable TiKV block cache, OS pagecache, and
+  `VersionCacheMaxChunks` for the search/insert phase.
+- `BUILD_WITH_CACHE=1` — build with caches enabled, then drop caches before
+  search/insert (requires `NOCACHE=1`). Used at 100M scale where building
+  under nocache is impractical.
+- `SKIP_TIKV_SWAP=1` — with `BUILD_WITH_CACHE`, skip the destructive TiKV
+  container restart that has corrupted recall at 100M scale. Relies on
+  drop_caches + `VersionCacheMaxChunks=0` for nocache semantics.
+- `SKIP_SAVE_LOAD=1` — skip the post-build SaveIndex / per-batch
+  Load+Clone+Save cycle (`SkipSaveLoadCycles=true`). Required at 100M scale.
+- `SKIP_HEAD_BUILD=1` — reuse existing HeadIndex if present
+  (`RebuildSSDOnly=true`); falls back to full build if HeadIndex is missing.
 
 ## Step 5 — Stop / cleanup
 
 ```bash
-./evaluation/distributed/run_distributed.sh stop-tikv cluster.conf
-./evaluation/distributed/run_distributed.sh cleanup cluster.conf   # remove deployed files
+./evaluation/distributed/run_distributed.sh stop-tikv my_cluster.conf
+./evaluation/distributed/run_distributed.sh cleanup   my_cluster.conf   # remove deployed files
 ```
 
 ## Key knobs in `benchmark_insert_dominant_template.ini`
diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh
index bb982ab7d..28404c8a3 100755
--- a/evaluation/distributed/run_distributed.sh
+++ b/evaluation/distributed/run_distributed.sh
@@ -751,37 +751,6 @@ start_remote_worker() {
     echo "  Worker n${NODE_IDX} on $host (SSH PID: $ssh_pid, log: $LOG)"
 }
 
-wait_workers_ready() {
-    local SCALE="$1"
-    local NODE_COUNT="$2"
-    local TIMEOUT=120
-
-    echo "Waiting for ${#WORKER_SSH_PIDS[@]} workers to be ready..."
-    for attempt in $(seq 1 $TIMEOUT); do
-        local all_ready=true
-        for i in $(seq 1 $((NODE_COUNT - 1))); do
-            local LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${i}.log"
-            if ! grep -q "Worker.*[Rr]eady\|Waiting for dispatch" "$LOG" 2>/dev/null; then
-                all_ready=false
-            fi
-        done
-        if $all_ready; then
-            echo "  All workers ready (${attempt}s)"
-            return 0
-        fi
-        # Check if any worker SSH process died
-        for idx in "${!WORKER_SSH_PIDS[@]}"; do
-            if ! kill -0 "${WORKER_SSH_PIDS[$idx]}" 2>/dev/null; then
-                echo "  ERROR: Worker SSH PID ${WORKER_SSH_PIDS[$idx]} exited prematurely"
-                return 1
-            fi
-        done
-        sleep 1
-    done
-    echo "  WARNING: Not all workers ready after ${TIMEOUT}s"
-    return 1
-}
-
 stop_remote_workers() {
     # Wait for workers to self-exit (driver sends TCP Stop), then force-kill.
     local TIMEOUT=${1:-30}
@@ -1140,8 +1109,6 @@ cmd_run() {
             fi
         done
 
-        # Binary already pushed; nothing else to do here.
-
         # --- Phase 3: Start driver first (contains dispatcher), then workers ---
         echo ""
 

From ee405d4ddff4ec218c6a827eb4084087d96432cc Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 10:09:08 +0000
Subject: [PATCH 10/12] README: clarify driver = worker 0 + dispatcher; workers
 peer-to-peer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous wording made it sound like the driver was a stateless
coordinator and workers only talked back to it. Reality: node 0 runs as
worker 0 (owns its hash shard like every other worker) and additionally
hosts the dispatcher; workers talk to each other directly through
PostingRouter for remote append, head sync, and merge hints — no
driver-mediated forwarding. Diagram and 'What run does' steps updated.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 evaluation/distributed/README.md | 55 +++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md
index 4717efc35..2b9c0950e 100644
--- a/evaluation/distributed/README.md
+++ b/evaluation/distributed/README.md
@@ -24,20 +24,23 @@ below).
 ## Architecture
 
 ```
-                    ┌──────────────┐
-                    │   Driver     │  (node 0)
-                    │  RunBenchmark│
-                    │   + Router   │
-                    └──┬───┬───┬──┘
-           TCP Dispatch│   │   │
-              ┌────────┘   │   └────────┐
-              ▼            ▼            ▼
+                    ┌────────────────────┐
+                    │   Driver = Worker 0│  (node 0)
+                    │   + Dispatcher     │
+                    └─┬──┬──┬────────────┘
+       TCP Dispatch  │  │  │       ▲ ▲ ▲
+        (broadcast)  │  │  │       │ │ │  status replies
+              ┌──────┘  │  └──────┐│ │ │
+              ▼         ▼         ▼│ │ │
         ┌──────────┐ ┌──────────┐ ┌──────────┐
         │ Worker 1 │ │ Worker 2 │ │ Worker N │
-        │  + Router│ │  + Router│ │  + Router│
-        └────┬─────┘ └────┬─────┘ └────┬─────┘
-             │            │            │
-             └────────────┼────────────┘
+        └──┬───▲───┘ └──┬───▲───┘ └──┬───▲───┘
+           │   │        │   │        │   │
+           └───┴────────┴───┴────────┴───┘
+              PostingRouter peer-to-peer
+              (remote append / head sync /
+               merge hints, by hash owner)
+                          │
                           ▼
                 ┌───────────────────┐
                 │ Shared TiKV raft  │  N PDs (one raft group) +
@@ -45,15 +48,19 @@ below).
                 └───────────────────┘
 ```
 
-- **Driver** (node 0): builds the index, sends Search/Insert/Stop commands via
-  TCP dispatch.
-- **Workers** (nodes 1..N): receive commands, execute their shard locally,
-  report results back over the dispatch channel.
+- **Driver** (node 0): also runs as **worker 0**. On top of the worker role,
+  it builds the initial index and hosts the dispatcher, which broadcasts
+  Search/Insert/Stop commands to the other workers over TCP dispatch.
+- **Workers** (nodes 0..N-1): each owns a shard of the head index by hash.
+  Workers talk to each other peer-to-peer through PostingRouter for remote
+  append, head sync, and merge hints — there is no driver-mediated forwarding.
+  On each `DispatchCommand` they execute the local part of the request and
+  report status back to the dispatcher.
 - **Shared TiKV cluster**: every node runs a PD + TiKV container; all PDs join
   one raft group, all TiKVs point to all PDs. PD routes each key to the store
   that owns its region.
-- **PostingRouter**: hash-based head routing, remote append, head sync,
-  dispatch protocol.
+- **PostingRouter**: hash-based head routing, remote append, head sync, and
+  the TCP dispatch transport used by the dispatcher.
 
 ## TiKV deployment model
 
@@ -213,10 +220,14 @@ What `run` does:
    all postings straight to TiKV via PD-routed RPCs — there is no need for a
    distributed build phase.
 2. **Distribute**: rsync head index + perftest files from driver to each worker.
-3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i`
-   and the per-node ini (router enabled, `Rebuild=false`).
-4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`.
-   The driver dispatches Insert / Search commands across batches via TCP.
+3. **Workers**: SSH-launches `SPTAGTest` on each remote worker (nodes 1..N-1)
+   with `WORKER_INDEX=i` and the per-node ini (router enabled,
+   `Rebuild=false`). Each worker wires up PostingRouter so it can reach every
+   peer directly for remote append / head sync / merge hints.
+4. **Driver**: relaunches `SPTAGTest` on node 0 with router enabled,
+   `Rebuild=false`. The same process acts as **worker 0** (owns its hash
+   shard like any other worker) **and** as the dispatcher (broadcasts Insert
+   / Search / Stop over TCP and waits for status replies).
 5. **Collect**: driver sends Stop, joins worker logs into `benchmark_logs/`.
 
 > The "build on the driver, then distribute and run" split is a workaround:

From 6cf7d36e922d01a86163377a1bbc5cdc3f07f6e8 Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 10:10:26 +0000
Subject: [PATCH 11/12] README: drop unused TiKV pre-split helper section

We never actually ran the pre-split/scatter helper in our benchmark
runs. Keeping it in the doc gives the false impression that it's part
of the recommended setup. Remove the whole section.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 evaluation/distributed/README.md | 34 --------------------------------
 1 file changed, 34 deletions(-)

diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md
index 2b9c0950e..7b2234908 100644
--- a/evaluation/distributed/README.md
+++ b/evaluation/distributed/README.md
@@ -167,40 +167,6 @@ curl -s "http://10.0.1.1:23791/pd/api/v1/stores" \
 # Expected: ['Up', 'Up'] (one entry per TiKV store).
 ```
 
-### Pre-split & scatter (optional but recommended)
-
-For the insert-dominant workload, pre-split the keyspace so writes spread
-evenly across regions and stores. Boundaries derive from
-`DBKey(headID) = MaxID*layer + headID` little-endian byte 0; the TiKV raw key
-is `TiKVKeyPrefix + "_" + uint32_le(DBKey)`. We split *only* on the head-key
-prefix so all chunk/count variants for a head share a region. Used split
-points: `0x02, 0x04, …, 0xfe` (127 split points → 128 regions).
-
-Since the cluster is shared, run the helper **once** against any PD endpoint:
-
-```bash
-PREFIX="bench_insert_dominant_2node"   # keep in sync with KEY_PREFIX in run_distributed.sh
-PD="http://10.0.1.1:23791"
-PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v8.5.1 -u "$PD")
-python3 - "$PREFIX" "${PDCTL[@]}" <<'PY'
-import json, subprocess, sys
-prefix = sys.argv[1].encode() + b'_'
-pdctl = sys.argv[2:]
-def run(args): return subprocess.check_output(pdctl + args, text=True)
-def region_for(hex_key): return json.loads(run(['region', 'key', '--format=hex', hex_key]))['id']
-for b in range(2, 256, 2):
-    key = (prefix + bytes([b, 0, 0, 0])).hex()
-    rid = region_for(key)
-    run(['operator', 'add', 'split-region', str(rid), '--policy=usekey', '--keys', key])
-for r in json.loads(run(['region', 'scan']))['regions']:
-    run(['operator', 'add', 'scatter-region', str(r['id'])])
-PY
-```
-
-Skip this on the very first run if you don't have load skew — `start-tikv`
-works without it. For 1B-scale insert-dominant runs it materially reduces
-head-region hot-spotting.
-
 ## Step 4 — Run the benchmark
 
 ```bash

From 07bdc03a6b1c3e89944da005d96cc073b733acfd Mon Sep 17 00:00:00 2001
From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com>
Date: Wed, 20 May 2026 10:11:38 +0000
Subject: [PATCH 12/12] Clean comment

---
 AnnService/inc/Core/Common/FineGrainedLock.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/AnnService/inc/Core/Common/FineGrainedLock.h b/AnnService/inc/Core/Common/FineGrainedLock.h
index 5cfad7ac6..1f7d1eab4 100644
--- a/AnnService/inc/Core/Common/FineGrainedLock.h
+++ b/AnnService/inc/Core/Common/FineGrainedLock.h
@@ -56,10 +56,6 @@ namespace SPTAG
                 return GetLock(idx);
             }
 
-            // Per-posting lock identity. Two indices share a lock iff they are
-            // the same posting, so external callers can use `hash_func(a) ==
-            // hash_func(b)` as a self-lock guard (e.g. in Split, to skip
-            // re-locking the same head VID).
             static inline unsigned hash_func(unsigned idx)
             {
                 return idx;