From 87160070304e257920e163b4bd705ebdd7e54b3b Mon Sep 17 00:00:00 2001 From: zhangt Date: Wed, 20 May 2026 03:16:16 +0000 Subject: [PATCH 01/12] Replay distributed work onto users/qiazh/pre-merge-tikv-bugfix Branch users/zhangt/merge-onto-qiazh ports our shared remote/local pool + per-layer routing changes from users/zhangt/merge-distributed-to-tikv on top of qianxi's TiKV bugfix branch (lock ordering, splitAsync, version check, etc.). Avoids the 21-block ExtraDynamicSearcher.h merge conflict on the merged_spfresh side by replaying instead of merging. Pragmatic approach for heavy files (ExtraDynamicSearcher.h, SPFreshTest.cpp): take our HEAD versions wholesale (which already contain our distributed + MultiChunk logic), and patch only the compile-breaking deltas caused by qianxi's refactors: - PostingCountCache moved from ExtraDynamicSearcher.h to ExtraTiKVController.h - KeyValueIO grew MultiMerge + LogAsyncWaitStatsAndReset virtuals (qianxi version kept; our MultiPut/MultiDelete virtuals re-added on top) - Options/ParameterDefinitionList: kept qianxi version (adds m_globalIDPath) - ThreadPool: kept our add_high + added addfront alias for qianxi callers Index.h / IExtraSearcher.h / SPANNIndex.cpp: applied small additive hooks on top of qianxi (forward-decl WorkerNode, SetWorker/GetSharedSplitPool accessors, BuildIndexInternalLayer + AddIndex worker loop). qianxi bugfixes preserved in those files. Build system: - CMakeLists updated for absl_cord + cordz family (kvproto 25.3 uses absl 2308, anaconda's grpc bundles 2111; explicit linkage avoids DSO-missing-from-command-line) - cmake invoked with gRPC_DIR/Protobuf_DIR/absl_DIR pointing at /usr/local so generated kvproto + libabsl 2308 versions align Verified: SPTAGTest links cleanly. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 3 +- AnnService/CMakeLists.txt | 8 +- AnnService/inc/Core/Common/FineGrainedLock.h | 25 +- AnnService/inc/Core/Common/IVersionMap.h | 12 + AnnService/inc/Core/Common/TiKVVersionMap.h | 52 + .../SPANN/Distributed/ConsistentHashRing.h | 93 ++ .../SPANN/Distributed/DispatchCoordinator.h | 364 +++++ .../Core/SPANN/Distributed/DispatcherNode.h | 293 ++++ .../SPANN/Distributed/DistributedProtocol.h | 651 ++++++++ .../inc/Core/SPANN/Distributed/NetworkNode.h | 319 ++++ .../Core/SPANN/Distributed/RemotePostingOps.h | 1325 ++++++++++++++++ .../inc/Core/SPANN/Distributed/WorkerNode.h | 616 ++++++++ .../inc/Core/SPANN/ExtraDynamicSearcher.h | 620 +++++++- .../inc/Core/SPANN/ExtraTiKVController.h | 1 + AnnService/inc/Core/SPANN/IExtraSearcher.h | 17 + AnnService/inc/Core/SPANN/Index.h | 40 + AnnService/inc/Core/VectorIndex.h | 9 + AnnService/inc/Helper/KeyValueIO.h | 14 + AnnService/inc/Helper/ThreadPool.h | 33 +- AnnService/inc/Socket/ConnectionManager.h | 6 +- AnnService/inc/Socket/Packet.h | 36 +- AnnService/inc/Socket/SimpleSerialization.h | 52 + .../src/Core/SPANN/ExtraFileController.cpp | 2 +- AnnService/src/Core/SPANN/SPANNIndex.cpp | 78 +- AnnService/src/Core/VectorIndex.cpp | 25 + AnnService/src/Socket/Connection.cpp | 30 +- AnnService/src/Socket/Server.cpp | 2 +- Test/CMakeLists.txt | 2 +- Test/inc/TestDataGenerator.h | 15 +- Test/src/SPFreshTest.cpp | 1071 +++++++++++-- Test/src/TestDataGenerator.cpp | 12 +- Test/src/main.cpp | 7 +- benchmark.ini | 19 + evaluation/distributed/README.md | 294 ++++ .../configs/benchmark_100m_1node.ini | 71 + .../configs/benchmark_100m_2node.ini | 71 + .../configs/benchmark_100m_template.ini | 71 + .../configs/benchmark_10m_1node.ini | 62 + .../configs/benchmark_10m_2node.ini | 62 + .../configs/benchmark_10m_template.ini | 62 + .../benchmark_insert_dominant_1node.ini | 58 + .../benchmark_insert_dominant_2node.ini | 58 + 
.../benchmark_insert_dominant_3node.ini | 59 + .../benchmark_insert_dominant_template.ini | 58 + .../distributed/configs/cluster_2node.conf | 31 + .../distributed/configs/cluster_3node.conf | 34 + evaluation/distributed/configs/tikv.toml | 74 + evaluation/distributed/run_distributed.sh | 1364 +++++++++++++++++ 48 files changed, 8050 insertions(+), 231 deletions(-) create mode 100644 AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/NetworkNode.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h create mode 100644 AnnService/inc/Core/SPANN/Distributed/WorkerNode.h create mode 100644 benchmark.ini create mode 100644 evaluation/distributed/README.md create mode 100644 evaluation/distributed/configs/benchmark_100m_1node.ini create mode 100644 evaluation/distributed/configs/benchmark_100m_2node.ini create mode 100644 evaluation/distributed/configs/benchmark_100m_template.ini create mode 100644 evaluation/distributed/configs/benchmark_10m_1node.ini create mode 100644 evaluation/distributed/configs/benchmark_10m_2node.ini create mode 100644 evaluation/distributed/configs/benchmark_10m_template.ini create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_1node.ini create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_2node.ini create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_3node.ini create mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_template.ini create mode 100644 evaluation/distributed/configs/cluster_2node.conf create mode 100644 evaluation/distributed/configs/cluster_3node.conf create mode 100755 evaluation/distributed/configs/tikv.toml create mode 100755 
evaluation/distributed/run_distributed.sh diff --git a/.gitignore b/.gitignore index 190ca29d3..e3dc9796a 100644 --- a/.gitignore +++ b/.gitignore @@ -464,4 +464,5 @@ FodyWeavers.xsd *.sln.iml # SPTAG benchmark generated artifacts -*perftest_* +/perftest_* +/evaluation/2026-04-23/output_distributed_hostname_*.json diff --git a/AnnService/CMakeLists.txt b/AnnService/CMakeLists.txt index cd23345fd..299faf3ed 100644 --- a/AnnService/CMakeLists.txt +++ b/AnnService/CMakeLists.txt @@ -10,6 +10,12 @@ include_directories(${Zstd}/lib) file(GLOB_RECURSE HDR_FILES ${AnnService}/inc/Core/*.h ${AnnService}/inc/Helper/*.h) file(GLOB_RECURSE SRC_FILES ${AnnService}/src/Core/*.cpp ${AnnService}/src/Helper/*.cpp) +# Include Socket sources in core lib for PostingRouter +file(GLOB SOCKET_HDR_FILES ${AnnService}/inc/Socket/*.h) +file(GLOB SOCKET_SRC_FILES ${AnnService}/src/Socket/*.cpp) +list(APPEND HDR_FILES ${SOCKET_HDR_FILES}) +list(APPEND SRC_FILES ${SOCKET_SRC_FILES}) + set(SPDK_LIBRARIES "") if (SPDK) set(Spdk ${PROJECT_SOURCE_DIR}/ThirdParty/spdk/build) @@ -73,7 +79,7 @@ endif() add_library (SPTAGLib SHARED ${SRC_FILES} ${HDR_FILES} ${TiKV_PROTO_SOURCES}) target_link_libraries (SPTAGLib DistanceUtils ${RocksDB_LIBRARIES} ${uring_LIBRARIES} libzstd_shared ${NUMA_LIBRARY} ${TBB_LIBRARIES} ${SPDK_LIBRARIES} ${TiKV_LIBRARIES}) add_library (SPTAGLibStatic STATIC ${SRC_FILES} ${HDR_FILES} ${TiKV_PROTO_SOURCES}) -target_link_libraries (SPTAGLibStatic DistanceUtils ${RocksDB_LIBRARIES} ${uring_LIBRARIES} libzstd_static ${NUMA_LIBRARY_STATIC} ${TBB_LIBRARIES} ${SPDK_LIBRARIES} ${TiKV_LIBRARIES}) +target_link_libraries (SPTAGLibStatic DistanceUtils ${RocksDB_LIBRARIES} ${uring_LIBRARIES} libzstd_static ${NUMA_LIBRARY_STATIC} ${TBB_LIBRARIES} ${SPDK_LIBRARIES} ${TiKV_LIBRARIES} ${Boost_LIBRARIES}) if (MSVC) # SPANNIndex.cpp can exceed COFF section limits in Debug without /bigobj. 
diff --git a/AnnService/inc/Core/Common/FineGrainedLock.h b/AnnService/inc/Core/Common/FineGrainedLock.h index 06c8f44d1..5cfad7ac6 100644 --- a/AnnService/inc/Core/Common/FineGrainedLock.h +++ b/AnnService/inc/Core/Common/FineGrainedLock.h @@ -56,10 +56,27 @@ namespace SPTAG return GetLock(idx); } + // Per-posting lock identity. Two indices share a lock iff they are + // the same posting, so external callers can use `hash_func(a) == + // hash_func(b)` as a self-lock guard (e.g. in Split, to skip + // re-locking the same head VID). static inline unsigned hash_func(unsigned idx) { return idx; } + + // Bucket index for the internal mutex-sharded unordered_map of + // per-posting locks. Exposed for callers that need an array sized + // to BucketCount and indexed by the same granularity as the lock + // pool (e.g. ExtraDynamicSearcher::m_remoteBucketLocked). + static inline unsigned BucketIndex(SizeType idx) + { + unsigned key = static_cast(idx); + return ((unsigned)(key * 99991) + _rotl(key, 2) + 101) & BucketMask; + } + + static const int BucketMask = 32767; + static const int BucketCount = BucketMask + 1; private: struct Bucket { std::mutex mutex; @@ -76,14 +93,6 @@ namespace SPTAG return *iter->second; } - static inline unsigned BucketIndex(SizeType idx) - { - unsigned key = static_cast(idx); - return ((unsigned)(key * 99991) + _rotl(key, 2) + 101) & BucketMask; - } - - static const int BucketMask = 32767; - static const int BucketCount = BucketMask + 1; mutable std::unique_ptr m_buckets; }; } diff --git a/AnnService/inc/Core/Common/IVersionMap.h b/AnnService/inc/Core/Common/IVersionMap.h index b939bd534..05d638cd9 100644 --- a/AnnService/inc/Core/Common/IVersionMap.h +++ b/AnnService/inc/Core/Common/IVersionMap.h @@ -43,6 +43,18 @@ namespace SPTAG virtual uint8_t GetVersion(const SizeType& key) = 0; virtual uint8_t GetVersion(const SizeType& key, VersionReadPolicy policy) { return GetVersion(key); } virtual void SetVersion(const SizeType& key, const uint8_t& 
version) = 0; + + /// Batch SetVersion: apply (vids[i] -> versions[i]) for all i. + /// Default impl is a per-VID loop. TiKV-backed maps override this + /// to group writes by chunk so N records in the same chunk only + /// trigger 1 ReadChunk + 1 WriteChunk RPC pair + virtual void SetVersionBatch(const std::vector& vids, const std::vector& versions) + { + size_t n = std::min(vids.size(), versions.size()); + for (size_t i = 0; i < n; i++) { + SetVersion(vids[i], versions[i]); + } + } /// Increment the version of a VID. /// @param expectedOld If not 0xff, the caller asserts the current version should be this value. /// If TiKV already holds (expectedOld+1)&0x7f, treat as success (another node did the same increment). diff --git a/AnnService/inc/Core/Common/TiKVVersionMap.h b/AnnService/inc/Core/Common/TiKVVersionMap.h index 0dce69ce8..69191fe1b 100644 --- a/AnnService/inc/Core/Common/TiKVVersionMap.h +++ b/AnnService/inc/Core/Common/TiKVVersionMap.h @@ -385,6 +385,58 @@ namespace SPTAG else if (oldVal != 0xfe && version == 0xfe) m_deleted++; } + // Group writes by chunk: 1 ReadChunk + N byte-modifications + 1 WriteChunk + // per chunk, instead of N × (ReadChunk + WriteChunk). + void SetVersionBatch(const std::vector& vids, const std::vector& versions) override + { + size_t n = std::min(vids.size(), versions.size()); + if (n == 0) return; + const SizeType localCount = m_count.load(); + + // Group (idx into vids/versions) by chunk id. 
+            std::unordered_map> byChunk;
+            byChunk.reserve(n);
+            for (size_t i = 0; i < n; i++) {
+                SizeType vid = vids[i];
+                if (vid < 0 || vid >= localCount) continue;
+                byChunk[ChunkId(vid)].push_back(i);
+            }
+            if (byChunk.empty()) return;
+
+            long deletedDelta = 0;
+            for (auto& kv : byChunk) {
+                SizeType cid = kv.first;
+                auto& idxs = kv.second;
+                std::lock_guard lock(ChunkMutex(cid));
+                std::string chunk = ReadChunkCached(cid);
+                if (chunk.empty()) {
+                    chunk.assign(m_chunkSize, static_cast(0xff));
+                }
+                bool dirty = false;
+                for (size_t i : idxs) {
+                    SizeType vid = vids[i];
+                    uint8_t newVal = versions[i];
+                    int offset = ChunkOffset(vid);
+                    if (offset < 0 || offset >= (int)chunk.size()) continue;
+                    uint8_t oldVal = static_cast(chunk[offset]);
+                    if (oldVal == newVal) continue;
+                    if (oldVal == 0xfe && newVal != 0xfe) deletedDelta--;
+                    else if (oldVal != 0xfe && newVal == 0xfe) deletedDelta++;
+                    chunk[offset] = static_cast(newVal);
+                    dirty = true;
+                }
+                if (dirty) {
+                    auto ret = WriteChunk(cid, chunk);
+                    if (ret != ErrorCode::Success) {
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                            "TiKVVersionMap::SetVersionBatch: WriteChunk failed chunk=%d layer=%d\n",
+                            cid, m_layer);
+                    }
+                }
+            }
+            if (deletedDelta != 0) m_deleted += deletedDelta;
+        }
+
         bool IncVersion(const SizeType& key, uint8_t* newVersion, uint8_t expectedOld = 0xff) override
         {
             if (key < 0 || key >= m_count.load()) {
diff --git a/AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h b/AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h
new file mode 100644
index 000000000..ec5c7855c
--- /dev/null
+++ b/AnnService/inc/Core/SPANN/Distributed/ConsistentHashRing.h
@@ -0,0 +1,107 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "inc/Core/Common.h"
+#include <cstdint>
+#include <map>
+#include <set>
+
+namespace SPTAG::SPANN {
+
+    /// Consistent hash ring for distributing headIDs across compute nodes.
+    /// Uses virtual nodes (vnodes) for balanced distribution.
+    /// When nodes are added/removed, only ~1/N of keys are remapped.
+    /// The class takes no lock of its own.
+    class ConsistentHashRing {
+    public:
+        /// @param vnodeCount Number of ring positions created per physical node.
+        explicit ConsistentHashRing(int vnodeCount = 150)
+            : m_vnodeCount(vnodeCount) {}
+
+        /// Add a physical node to the ring with its virtual nodes.
+        /// A position that is already taken (hash collision with another
+        /// node's vnode) is overwritten and now maps to this node.
+        void AddNode(int nodeIndex) {
+            for (int i = 0; i < m_vnodeCount; i++) {
+                uint32_t h = HashVNode(nodeIndex, i);
+                m_ring[h] = nodeIndex;
+            }
+            m_nodes.insert(nodeIndex);
+        }
+
+        /// Remove a physical node and all its virtual nodes from the ring.
+        /// Only positions that still map to `nodeIndex` are erased: after a
+        /// collision in AddNode() the slot belongs to whichever node wrote it
+        /// last, and erasing by hash alone would silently drop a vnode of
+        /// that other, still-active node.
+        void RemoveNode(int nodeIndex) {
+            for (int i = 0; i < m_vnodeCount; i++) {
+                auto it = m_ring.find(HashVNode(nodeIndex, i));
+                if (it != m_ring.end() && it->second == nodeIndex) {
+                    m_ring.erase(it);
+                }
+            }
+            m_nodes.erase(nodeIndex);
+        }
+
+        /// Find the owner node for a given key (headID).
+        /// The owner is the first ring position at or after the key's hash,
+        /// wrapping around to the smallest position when none follows.
+        /// Returns -1 if the ring is empty.
+        int GetOwner(SizeType headID) const {
+            if (m_ring.empty()) return -1;
+            uint32_t h = HashKey(headID);
+            auto it = m_ring.lower_bound(h);
+            if (it == m_ring.end()) it = m_ring.begin();
+            return it->second;
+        }
+
+        bool Empty() const { return m_ring.empty(); }
+        size_t NodeCount() const { return m_nodes.size(); }
+        bool HasNode(int nodeIndex) const { return m_nodes.count(nodeIndex) > 0; }
+        const std::set<int>& GetNodes() const { return m_nodes; }
+        int GetVNodeCount() const { return m_vnodeCount; }
+
+    private:
+        /// FNV-1a over the four bytes of the key, least-significant byte first.
+        static uint32_t HashKey(SizeType headID) {
+            uint32_t hash = 2166136261u; // FNV-1a offset basis
+            uint32_t val = static_cast<uint32_t>(headID);
+            for (int i = 0; i < 4; i++) {
+                hash ^= (val >> (i * 8)) & 0xFF;
+                hash *= 16777619u; // FNV prime
+            }
+            return hash;
+        }
+
+        /// Ring position of virtual node `vnodeIdx` of physical node `nodeIndex`.
+        static uint32_t HashVNode(int nodeIndex, int vnodeIdx) {
+            // Raw FNV-1a on tiny nodeIndex (1, 2, 3) produces a
+            // pathologically biased ring (71.9% vs 28.1% for nodes 1/2 with
+            // 150 vnodes). Pre-mix nodeIndex through Knuth's golden-ratio
+            // multiplier so small node IDs become full-spectrum uint32 values
+            // before they hit FNV's accumulator. Validated to give ≈50/50
+            // for K=2 and stay within ±15% of even split for K up to 8.
+            uint32_t saltedVnode =
+                static_cast<uint32_t>(vnodeIdx) ^
+                (static_cast<uint32_t>(nodeIndex) * 2654435761u);
+            uint32_t hash = 2166136261u;
+            auto mix = [&](uint32_t v) {
+                for (int i = 0; i < 4; i++) {
+                    hash ^= (v >> (i * 8)) & 0xFF;
+                    hash *= 16777619u;
+                }
+            };
+            mix(saltedVnode);
+            mix(static_cast<uint32_t>(nodeIndex));
+            return hash;
+        }
+
+        int m_vnodeCount;
+        std::map<uint32_t, int> m_ring; // hash position → nodeIndex
+        std::set<int> m_nodes;          // active physical node indices
+    };
+
+} // namespace SPTAG::SPANN
diff --git a/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h b/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h
new file mode 100644
index 000000000..8bb32a7eb
--- /dev/null
+++ b/AnnService/inc/Core/SPANN/Distributed/DispatchCoordinator.h
@@ -0,0 +1,364 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "inc/Core/SPANN/Distributed/DistributedProtocol.h"
+#include "inc/Socket/Client.h"
+#include "inc/Socket/Packet.h"
+#include "inc/Socket/SimpleSerialization.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace SPTAG::SPANN {
+
+    /// Coordinates driver↔worker dispatch for distributed benchmarks.
+    ///
+    /// The driver broadcasts Insert/Search/Stop commands to all workers and
+    /// collects their results. Workers execute commands via a callback and
+    /// report results back.
+    ///
+    /// This class is independent of posting routing — it only needs a way to
+    /// send packets to peer nodes (provided via PeerNetwork interface).
+    class DispatchCoordinator {
+    public:
+        /// Abstract interface for sending packets to peer nodes.
+        /// NetworkNode implements this so DispatchCoordinator doesn't
+        /// depend on the full node class.
+        class PeerNetwork {
+        public:
+            virtual ~PeerNetwork() = default;
+            /// Get connection to a peer node (reconnecting if needed).
+            virtual Socket::ConnectionID GetPeerConnection(int nodeIndex) = 0;
+            /// Total number of nodes in the cluster.
+            virtual int GetNumNodes() const = 0;
+            /// Index of this node.
+            virtual int GetLocalNodeIndex() const = 0;
+            /// Send a packet via the client socket.
+            virtual void SendPacket(Socket::ConnectionID connID, Socket::Packet&& pkt,
+                                    std::function callback) = 0;
+        };
+
+        using DispatchCallback = std::function;
+
+        DispatchCoordinator() = default;
+
+        /// Drops the callback and blocks until no dispatch thread is running.
+        ~DispatchCoordinator() {
+            ClearDispatchCallback();
+        }
+
+        /// Attach to a peer network (must outlive this coordinator).
+        void SetNetwork(PeerNetwork* network) {
+            m_network = network;
+        }
+
+        /// Mark a worker node as "local" — its work is done inline by the
+        /// driver so it should be skipped during broadcast/result collection.
+        void SetLocalWorkerIndex(int idx) { m_localWorkerIndex = idx; }
+
+        /// Set the callback for executing dispatch commands (worker side).
+        /// The callback is guarded by m_activeDispatchMutex because
+        /// HandleDispatchCommand() reads it from another thread.
+        void SetDispatchCallback(DispatchCallback cb) {
+            std::lock_guard<std::mutex> lock(m_activeDispatchMutex);
+            m_dispatchCallback = std::move(cb);
+        }
+
+        /// Clear the dispatch callback and wait for in-flight dispatch
+        /// threads to complete. Call before destroying callback state.
+        void ClearDispatchCallback() {
+            std::unique_lock<std::mutex> lock(m_activeDispatchMutex);
+            // Reset under the same mutex the handler uses to copy the
+            // callback and bump the in-flight counter: a handler either sees
+            // the null callback and bails out, or has already been counted
+            // and is waited for below.
+            m_dispatchCallback = nullptr;
+            m_activeDispatchCV.wait(lock, [this]() {
+                return m_activeDispatchCount == 0;
+            });
+        }
+
+        // ---- Driver side ----
+
+        /// Broadcast a dispatch command to all worker nodes.
+        /// Returns the dispatchId assigned to this command.
+        /// Stop and Heartbeat commands are fire-and-forget: no pending state
+        /// is registered for them, so WaitForAllResults() returns empty.
+        std::uint64_t BroadcastDispatchCommand(DispatchCommand::Type type, std::uint32_t round) {
+            std::uint64_t dispatchId = m_nextDispatchId.fetch_add(1);
+
+            DispatchCommand cmd;
+            cmd.m_type = type;
+            cmd.m_dispatchId = dispatchId;
+            cmd.m_round = round;
+
+            int numNodes = m_network->GetNumNodes();
+            int localIdx = m_network->GetLocalNodeIndex();
+
+            // Build list of nodes to skip (dispatcher + local worker if set)
+            auto shouldSkip = [&](int i) {
+                return i == localIdx || i == m_localWorkerIndex;
+            };
+
+            // Count remote workers (nodes we will actually dispatch to)
+            int remoteWorkers = 0;
+            for (int i = 0; i < numNodes; i++) {
+                if (!shouldSkip(i)) remoteWorkers++;
+            }
+
+            // Set up pending state for collecting results (not for Stop / Heartbeat)
+            if (type != DispatchCommand::Type::Stop &&
+                type != DispatchCommand::Type::Heartbeat &&
+                remoteWorkers > 0) {
+                auto state = std::make_shared();
+                state->remaining.store(remoteWorkers);
+                for (int i = 0; i < numNodes; i++) {
+                    if (!shouldSkip(i)) state->pendingNodes.insert(i);
+                }
+                {
+                    std::lock_guard lock(m_dispatchMutex);
+                    m_pendingDispatches[dispatchId] = state;
+                }
+            }
+
+            auto bodySize = static_cast(cmd.EstimateBufferSize());
+
+            for (int i = 0; i < numNodes; i++) {
+                if (shouldSkip(i)) continue;
+
+                Socket::ConnectionID connID = m_network->GetPeerConnection(i);
+                if (connID == Socket::c_invalidConnectionID) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "DispatchCoordinator: Cannot dispatch to node %d (no connection)\n", i);
+                    // An unreachable worker is accounted for as an error so
+                    // the waiter is not left hanging on a result that can
+                    // never arrive.
+                    if (type != DispatchCommand::Type::Stop &&
+                        type != DispatchCommand::Type::Heartbeat) {
+                        std::lock_guard lock(m_dispatchMutex);
+                        auto it = m_pendingDispatches.find(dispatchId);
+                        if (it != m_pendingDispatches.end()) {
+                            it->second->errors++;
+                            if (it->second->remaining.fetch_sub(1) == 1) {
+                                it->second->done.set_value();
+                            }
+                        }
+                    }
+                    continue;
+                }
+
+                Socket::Packet pkt;
+                pkt.Header().m_packetType = Socket::PacketType::DispatchCommand;
+                pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+                pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+                pkt.Header().m_resourceID = 0;
+                pkt.Header().m_bodyLength = bodySize;
+                pkt.AllocateBuffer(bodySize);
+                cmd.Write(pkt.Body());
+                pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+                m_network->SendPacket(connID, std::move(pkt), nullptr);
+            }
+
+            // Heartbeats fire every interval seconds — keep logs clean.
+            if (type != DispatchCommand::Type::Heartbeat) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "DispatchCoordinator: Dispatched %s (id=%llu round=%u) to %d workers\n",
+                    type == DispatchCommand::Type::Search ? "Search" :
+                    type == DispatchCommand::Type::Insert ? "Insert" : "Stop",
+                    (unsigned long long)dispatchId, round, remoteWorkers);
+            }
+
+            return dispatchId;
+        }
+
+        /// Wait for all workers to report results for a dispatch.
+        /// Returns collected wall times from workers. Empty on timeout.
+        /// Also returns empty when `dispatchId` has no pending entry. The
+        /// entry is removed whether or not the wait timed out, so results
+        /// that arrive afterwards are reported as late/expired.
+        std::vector WaitForAllResults(std::uint64_t dispatchId, int timeoutSec = 300) {
+            std::shared_ptr state;
+            {
+                std::lock_guard lock(m_dispatchMutex);
+                auto it = m_pendingDispatches.find(dispatchId);
+                if (it == m_pendingDispatches.end()) return {};
+                state = it->second;
+            }
+
+            auto future = state->done.get_future();
+            auto status = future.wait_for(std::chrono::seconds(timeoutSec));
+
+            {
+                std::lock_guard lock(m_dispatchMutex);
+                m_pendingDispatches.erase(dispatchId);
+            }
+
+            if (status == std::future_status::timeout) {
+                std::string nodeList;
+                {
+                    std::lock_guard lock(state->mutex);
+                    for (int n : state->pendingNodes) {
+                        if (!nodeList.empty()) nodeList += ",";
+                        nodeList += std::to_string(n);
+                    }
+                }
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "DispatchCoordinator: Timeout waiting for results (id=%llu, %d remaining, nodes=[%s])\n",
+                    (unsigned long long)dispatchId, state->remaining.load(), nodeList.c_str());
+                return {};
+            }
+
+            if (state->errors > 0) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "DispatchCoordinator: Dispatch %llu completed with %d errors\n",
+                    (unsigned long long)dispatchId, (int)state->errors);
+            }
+
+            std::lock_guard lock(state->mutex);
+            return state->wallTimes;
+        }
+
+        // ---- Worker side ----
+
+        /// Send a dispatch result back to the driver (worker side).
+        /// The driver is hard-wired to node 0; nothing is sent when this
+        /// node is node 0 itself.
+        void SendDispatchResult(const DispatchResult& result) {
+            int driverNode = 0;
+            if (driverNode == m_network->GetLocalNodeIndex()) return;
+
+            Socket::ConnectionID connID = m_network->GetPeerConnection(driverNode);
+            if (connID == Socket::c_invalidConnectionID) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "DispatchCoordinator: Cannot send result to driver\n");
+                return;
+            }
+
+            Socket::Packet pkt;
+            auto bodySize = static_cast(result.EstimateBufferSize());
+            pkt.Header().m_packetType = Socket::PacketType::DispatchResult;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = 0;
+            pkt.Header().m_bodyLength = bodySize;
+            pkt.AllocateBuffer(bodySize);
+            result.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            m_network->SendPacket(connID, std::move(pkt), nullptr);
+        }
+
+        // ---- Packet handlers (called by NetworkNode's server/client) ----
+
+        /// Handle an incoming dispatch command from the driver (worker side).
+        /// Runs the registered callback on a detached thread and, except for
+        /// Stop and Heartbeat, sends its result back to the driver. Commands
+        /// that are empty, unparsable, or arrive while no callback is set
+        /// are logged and dropped.
+        void HandleDispatchCommand(Socket::ConnectionID connID, Socket::Packet packet) {
+            if (packet.Header().m_bodyLength == 0) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "DispatchCoordinator: Empty DispatchCommand received\n");
+                return;
+            }
+
+            DispatchCommand cmd;
+            if (cmd.Read(packet.Body()) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "DispatchCoordinator: DispatchCommand parse failed\n");
+                return;
+            }
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "DispatchCoordinator: Received command type=%d id=%llu round=%u\n",
+                (int)cmd.m_type, (unsigned long long)cmd.m_dispatchId, cmd.m_round);
+
+            int localIdx = m_network->GetLocalNodeIndex();
+
+            // Copy the callback and register the in-flight dispatch in ONE
+            // critical section. Doing the two steps separately leaves a
+            // window in which ClearDispatchCallback() observes a zero count
+            // and returns while this handler is about to start a thread
+            // that still uses the callback.
+            DispatchCallback callback;
+            {
+                std::lock_guard<std::mutex> lock(m_activeDispatchMutex);
+                callback = m_dispatchCallback;
+                if (callback) m_activeDispatchCount++;
+            }
+            if (!callback) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "DispatchCoordinator: No callback set, ignoring command\n");
+                return;
+            }
+
+            auto self = this;
+            std::thread([self, callback, cmd, localIdx]() {
+                DispatchResult result = callback(cmd);
+                result.m_nodeIndex = localIdx;
+                result.m_dispatchId = cmd.m_dispatchId;
+                result.m_round = cmd.m_round;
+
+                if (cmd.m_type != DispatchCommand::Type::Stop &&
+                    cmd.m_type != DispatchCommand::Type::Heartbeat) {
+                    self->SendDispatchResult(result);
+                }
+
+                {
+                    std::lock_guard<std::mutex> lock(self->m_activeDispatchMutex);
+                    self->m_activeDispatchCount--;
+                    // Notify while the mutex is still held: as soon as it is
+                    // released the destructor may see the count reach zero
+                    // and destroy the condition variable, so touching it
+                    // afterwards would be a use-after-free.
+                    self->m_activeDispatchCV.notify_all();
+                }
+            }).detach();
+        }
+
+        /// Handle an incoming dispatch result from a worker (driver side).
+        void HandleDispatchResult(Socket::ConnectionID connID, Socket::Packet packet) {
+            if (packet.Header().m_bodyLength == 0) return; // empty body: dropped without logging
+            // Decode the result; a body that fails to parse is logged and dropped.
+            DispatchResult result;
+            if (result.Read(packet.Body()) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "DispatchCoordinator: DispatchResult parse failed\n");
+                return;
+            }
+            // Every parsed result is logged, including ones later rejected as late/expired.
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "DispatchCoordinator: Result id=%llu round=%u node=%d status=%d wallTime=%.3f\n",
+                (unsigned long long)result.m_dispatchId, result.m_round,
+                result.m_nodeIndex, (int)result.m_status, result.m_wallTime);
+            // Look up the pending dispatch under m_dispatchMutex; the shared_ptr copy keeps the state alive after the lock is dropped.
+            std::shared_ptr state;
+            {
+                std::lock_guard lock(m_dispatchMutex);
+                auto it = m_pendingDispatches.find(result.m_dispatchId);
+                if (it == m_pendingDispatches.end()) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "DispatchCoordinator: Result for unknown dispatch %llu (late/expired)\n",
+                        (unsigned long long)result.m_dispatchId);
+                    return;
+                }
+                state = it->second;
+            }
+            // Any status other than Success is tallied in the dispatch's error counter.
+            if (result.m_status != DispatchResult::Status::Success) {
+                state->errors++;
+            }
+            // Record the wall time and mark the reporting node as no longer pending.
+            {
+                std::lock_guard lock(state->mutex);
+                state->wallTimes.push_back(result.m_wallTime);
+                if (result.m_nodeIndex >= 0)
+                    state->pendingNodes.erase(result.m_nodeIndex);
+            }
+            // fetch_sub returns the previous value: 1 means this was the last outstanding result, so fulfil the promise.
+            if (state->remaining.fetch_sub(1) == 1) {
+                state->done.set_value();
+            }
+        }
+
+    private:
+        struct PendingDispatch { // bookkeeping for one broadcast that expects results
+            std::atomic remaining{0}; // results (or send failures) still outstanding
+            std::atomic errors{0}; // send failures plus non-Success results
+            std::promise done; // fulfilled when `remaining` drops to zero
+            std::mutex mutex; // guards wallTimes and pendingNodes
+            std::vector wallTimes; // one entry per result received
+            std::set pendingNodes; // nodes that haven't responded yet
+        };
+
+        PeerNetwork* m_network = nullptr; // non-owning; assigned by SetNetwork()
+        int m_localWorkerIndex = -1; // driver's worker node to skip in broadcasts
+        DispatchCallback m_dispatchCallback; // executes dispatch commands on the worker side
+        std::atomic m_nextDispatchId{1}; // next id handed out by BroadcastDispatchCommand()
+        std::mutex m_dispatchMutex; // guards m_pendingDispatches
+        std::unordered_map> m_pendingDispatches; // dispatchId -> state awaiting results
+        // In-flight dispatch threads: counted so ClearDispatchCallback() can wait for them to finish.
+        std::mutex m_activeDispatchMutex;
+        std::condition_variable m_activeDispatchCV;
+        int m_activeDispatchCount{0}; // modified under m_activeDispatchMutex
+    };
+
+} // namespace SPTAG::SPANN
diff --git a/AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h
b/AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h new file mode 100644 index 000000000..00b7bbdb6 --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/DispatcherNode.h @@ -0,0 +1,293 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "inc/Core/SPANN/Distributed/NetworkNode.h" + +namespace SPTAG::SPANN { + + /// Dispatcher node: manages the consistent hash ring and coordinates + /// external dispatch commands (Insert/Search/Stop) to worker nodes. + /// + /// The dispatcher does NOT perform search or posting operations. + /// It is a lightweight coordination point that: + /// - Accepts NodeRegister requests from workers + /// - Maintains the authoritative hash ring and broadcasts updates + /// - Tracks per-worker ACK status with retry + /// - Delegates BroadcastDispatchCommand / WaitForAllResults + class DispatcherNode : public NetworkNode { + public: + using DispatchCallback = DispatchCoordinator::DispatchCallback; + + /// Initialize the dispatcher with separate addresses. + /// Builds the full hash ring at startup (workers 1..N). + bool Initialize( + const std::pair& dispatcherAddr, + const std::vector>& workerAddrs, + int vnodeCount = 150) + { + // Build combined addr list: [dispatcher, worker0, worker1, ...] + std::vector> allAddrs; + allAddrs.push_back(dispatcherAddr); + allAddrs.insert(allAddrs.end(), workerAddrs.begin(), workerAddrs.end()); + + if (!InitializeNetwork(0, allAddrs, vnodeCount)) return false; + + // [Bug 30] Dispatcher has no local data shard; mark with -1. 
+ m_numDispatchNodes = 1; + m_numWorkerNodes = static_cast(workerAddrs.size()); + m_workerNodeIndex = -1; + + // Pre-build complete ring with all workers (internal indices 1..N) + int numWorkers = static_cast(workerAddrs.size()); + auto ring = std::make_shared(vnodeCount); + for (int i = 1; i <= numWorkers; i++) { + ring->AddNode(i); + } + std::atomic_store(&m_hashRing, + std::shared_ptr(std::move(ring))); + m_currentRingVersion.store(1); + + m_dispatch.SetNetwork(this); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: initialized with %d workers, ring v1\n", numWorkers); + return true; + } + + bool Start() { return StartNetwork(); } + + // ---- Dispatch protocol ---- + + /// Mark the driver's local worker node so broadcasts skip it. + void SetLocalWorkerIndex(int idx) { m_dispatch.SetLocalWorkerIndex(idx); } + + std::uint64_t BroadcastDispatchCommand(DispatchCommand::Type type, std::uint32_t round) { + return m_dispatch.BroadcastDispatchCommand(type, round); + } + + std::vector WaitForAllResults(std::uint64_t dispatchId, int timeoutSec = 300) { + return m_dispatch.WaitForAllResults(dispatchId, timeoutSec); + } + + void SetDispatchCallback(DispatchCallback cb) { + m_dispatch.SetDispatchCallback(std::move(cb)); + } + + void ClearDispatchCallback() { + m_dispatch.ClearDispatchCallback(); + } + + // ---- Heartbeat pump ---- + // + // Periodically broadcasts a Heartbeat dispatch to every remote worker. + // Workers use the heartbeat to detect driver failure / network + // partition and exit cleanly rather than relying on a fixed + // wall-clock receiver timeout. + // + // Idempotent: callable from any thread; second call without StopHeartbeat + // is a no-op. StopHeartbeat joins the thread; destructor calls it. 
+ + void StartHeartbeat(int intervalSec) { + if (intervalSec <= 0) return; + if (m_heartbeatThread.joinable()) return; + m_heartbeatStop.store(false); + m_heartbeatThread = std::thread([this, intervalSec]() { + std::uint32_t round = 0; + while (!m_heartbeatStop.load()) { + BroadcastDispatchCommand(DispatchCommand::Type::Heartbeat, round++); + for (int i = 0; i < intervalSec * 10 && !m_heartbeatStop.load(); i++) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + }); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: heartbeat pump started (interval=%ds)\n", intervalSec); + } + + void StopHeartbeat() { + if (!m_heartbeatThread.joinable()) return; + m_heartbeatStop.store(true); + m_heartbeatThread.join(); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: heartbeat pump stopped\n"); + } + + ~DispatcherNode() { + StopHeartbeat(); + } + + // ---- Ring management ---- + + bool AllWorkersAcked() const { + std::uint32_t currentVer = m_currentRingVersion.load(); + if (currentVer == 0) return false; + std::lock_guard lock(m_ackMutex); + int numNodes = static_cast(m_nodeAddrs.size()); + for (int i = 0; i < numNodes; i++) { + if (i == m_localNodeIndex) continue; + auto it = m_workerAckedVersion.find(i); + if (it == m_workerAckedVersion.end() || it->second < currentVer) return false; + } + return true; + } + + protected: + void RegisterServerHandlers(Socket::PacketHandlerMapPtr& handlers) override { + handlers->emplace(Socket::PacketType::NodeRegisterRequest, + [this](Socket::ConnectionID c, Socket::Packet p) { HandleNodeRegisterRequest(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::RingUpdateACK, + [this](Socket::ConnectionID c, Socket::Packet p) { HandleRingUpdateACK(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::DispatchCommand, + [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchCommand(c, std::move(p)); }); + handlers->emplace(Socket::PacketType::DispatchResult, + 
[this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); }); + } + + void RegisterClientHandlers(Socket::PacketHandlerMapPtr& handlers) override { + handlers->emplace(Socket::PacketType::DispatchResult, + [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); }); + } + + void BgProtocolStep() override { + if (m_currentRingVersion.load() > 0) { + RetryUnackedRingUpdates(); + } + } + + bool IsRingSettled() const override { + return AllWorkersAcked(); + } + + private: + void HandleNodeRegisterRequest(Socket::ConnectionID connID, Socket::Packet packet) { + NodeRegisterMsg msg; + if (!msg.Read(packet.Body())) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "DispatcherNode: Failed to parse NodeRegisterRequest\n"); + return; + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: NodeRegister from node %d (%s:%s, store=%s)\n", + msg.m_nodeIndex, msg.m_host.c_str(), msg.m_port.c_str(), msg.m_store.c_str()); + + // Ring is pre-built at startup, just broadcast current ring to the new connection + BroadcastRingUpdate(); + } + + void HandleRingUpdateACK(Socket::ConnectionID connID, Socket::Packet packet) { + RingUpdateACKMsg msg; + if (!msg.Read(packet.Body())) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "DispatcherNode: Failed to parse RingUpdateACK\n"); + return; + } + { + std::lock_guard lock(m_ackMutex); + auto& ver = m_workerAckedVersion[msg.m_nodeIndex]; + if (msg.m_ringVersion > ver) ver = msg.m_ringVersion; + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: RingUpdateACK from node %d (v%u)\n", + msg.m_nodeIndex, msg.m_ringVersion); + } + + void BroadcastRingUpdate() { + auto ring = std::atomic_load(&m_hashRing); + if (!ring) return; + + std::uint32_t version = m_currentRingVersion.load(); + RingUpdateMsg msg; + msg.m_ringVersion = version; + msg.m_vnodeCount = ring->GetVNodeCount(); + for (int idx : ring->GetNodes()) { + msg.m_nodeIndices.push_back(idx); 
+ } + + std::size_t bodySize = msg.EstimateBufferSize(); + int numNodes = static_cast(m_nodeAddrs.size()); + + for (int i = 0; i < numNodes; i++) { + if (i == m_localNodeIndex) continue; + auto peerConn = GetPeerConnection(i); + if (peerConn == Socket::c_invalidConnectionID) continue; + + Socket::Packet pkt; + pkt.Header().m_packetType = Socket::PacketType::RingUpdate; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + pkt.Header().m_bodyLength = static_cast(bodySize); + pkt.AllocateBuffer(static_cast(bodySize)); + msg.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + m_client->SendPacket(peerConn, std::move(pkt), nullptr); + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: Broadcast RingUpdate v%u (%d nodes)\n", + version, (int)msg.m_nodeIndices.size()); + } + + void RetryUnackedRingUpdates() { + auto ring = std::atomic_load(&m_hashRing); + if (!ring) return; + std::uint32_t currentVer = m_currentRingVersion.load(); + if (currentVer == 0) return; + + std::vector unacked; + { + std::lock_guard lock(m_ackMutex); + int numNodes = static_cast(m_nodeAddrs.size()); + for (int i = 0; i < numNodes; i++) { + if (i == m_localNodeIndex) continue; + auto it = m_workerAckedVersion.find(i); + if (it == m_workerAckedVersion.end() || it->second < currentVer) + unacked.push_back(i); + } + } + if (unacked.empty()) return; + + RingUpdateMsg msg; + msg.m_ringVersion = currentVer; + msg.m_vnodeCount = ring->GetVNodeCount(); + for (int idx : ring->GetNodes()) msg.m_nodeIndices.push_back(idx); + std::size_t bodySize = msg.EstimateBufferSize(); + + for (int nodeIdx : unacked) { + auto peerConn = GetPeerConnection(nodeIdx); + if (peerConn == Socket::c_invalidConnectionID) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: RetryUnackedRingUpdates skip node %d (no peer conn)\n", nodeIdx); + continue; + } + + Socket::Packet pkt; + 
pkt.Header().m_packetType = Socket::PacketType::RingUpdate; + pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok; + pkt.Header().m_connectionID = Socket::c_invalidConnectionID; + pkt.Header().m_resourceID = 0; + pkt.Header().m_bodyLength = static_cast(bodySize); + pkt.AllocateBuffer(static_cast(bodySize)); + msg.Write(pkt.Body()); + pkt.Header().WriteBuffer(pkt.HeaderBuffer()); + + m_client->SendPacket(peerConn, std::move(pkt), nullptr); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "DispatcherNode: Retried RingUpdate to node %d (connID=%u)\n", nodeIdx, peerConn); + } + } + + DispatchCoordinator m_dispatch; + std::atomic m_currentRingVersion{0}; + mutable std::mutex m_ackMutex; + std::unordered_map m_workerAckedVersion; + + std::thread m_heartbeatThread; + std::atomic m_heartbeatStop{false}; + }; + +} // namespace SPTAG::SPANN diff --git a/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h new file mode 100644 index 000000000..b4da82fcc --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/DistributedProtocol.h @@ -0,0 +1,651 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "inc/Core/Common.h" +#include "inc/Socket/SimpleSerialization.h" +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + /// Serializable request for remote Append operations sent between compute nodes. + /// MirrorVersion 1 added m_layer to disambiguate which ExtraDynamicSearcher on + /// the receiver side handles the request. Version 0 packets default m_layer=0. 
+    struct RemoteAppendRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 1; }
+
+        SizeType m_headID = 0;
+        std::string m_headVec;        // raw head vector bytes
+        std::int32_t m_appendNum = 0;
+        std::string m_appendPosting;  // serialized posting data
+        std::int32_t m_layer = 0;     // originating ExtraDynamicSearcher layer
+
+        /// Number of bytes Write() produces for the current field values.
+        std::size_t EstimateBufferSize() const {
+            const std::size_t versionBytes = sizeof(std::uint16_t) * 2;
+            const std::size_t headVecBytes = sizeof(std::uint32_t) + m_headVec.size();        // len-prefixed
+            const std::size_t postingBytes = sizeof(std::uint32_t) + m_appendPosting.size();  // len-prefixed
+            return versionBytes
+                 + sizeof(SizeType)       // headID
+                 + headVecBytes
+                 + sizeof(std::int32_t)   // appendNum
+                 + postingBytes
+                 + sizeof(std::int32_t);  // layer (mirrorVer >= 1)
+        }
+
+        /// Serialize versions then fields in declaration order; returns the
+        /// position one past the last byte written.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* out = p_buffer;
+            out = SimpleWriteBuffer(MajorVersion(), out);
+            out = SimpleWriteBuffer(MirrorVersion(), out);
+            out = SimpleWriteBuffer(m_headID, out);
+            out = SimpleWriteBuffer(m_headVec, out);
+            out = SimpleWriteBuffer(m_appendNum, out);
+            out = SimpleWriteBuffer(m_appendPosting, out);
+            out = SimpleWriteBuffer(m_layer, out);
+            return out;
+        }
+
+        /// Parse without an end-of-buffer bound (forwards a null end pointer).
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            const std::uint8_t* const noEnd = nullptr;
+            return Read(p_buffer, noEnd);
+        }
+
+        /// Parse from [p_buffer, p_bufEnd). Returns nullptr when the header
+        /// read yields nullptr or the major version differs; otherwise the
+        /// result of the last field read.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t major = 0, mirror = 0;
+            const std::uint8_t* in = SafeSimpleReadBuffer(p_buffer, p_bufEnd, major);
+            in = SafeSimpleReadBuffer(in, p_bufEnd, mirror);
+            if (in == nullptr || major != MajorVersion()) return nullptr;
+            in = SafeSimpleReadBuffer(in, p_bufEnd, m_headID);
+            in = SafeSimpleReadBuffer(in, p_bufEnd, m_headVec);
+            in = SafeSimpleReadBuffer(in, p_bufEnd, m_appendNum);
+            in = SafeSimpleReadBuffer(in, p_bufEnd, m_appendPosting);
+            if (mirror < 1) {
+                // Pre-layer senders: default to layer 0.
+                m_layer = 0;
+                return in;
+            }
+            return SafeSimpleReadBuffer(in, p_bufEnd, m_layer);
+        }
+    };
+
+    /// Response for remote Append operations.
+    struct RemoteAppendResponse {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        enum class Status : std::uint8_t { Success = 0, Failed = 1 };
+        Status m_status = Status::Success;
+
+        /// Two version words plus the one-byte status.
+        std::size_t EstimateBufferSize() const {
+            return 2 * sizeof(std::uint16_t) + sizeof(std::uint8_t);
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* out = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            out = SimpleWriteBuffer(MirrorVersion(), out);
+            return SimpleWriteBuffer(m_status, out);
+        }
+
+        /// Returns nullptr on major-version mismatch.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t major = 0, mirror = 0;
+            const std::uint8_t* in = SimpleReadBuffer(p_buffer, major);
+            in = SimpleReadBuffer(in, mirror);
+            if (major != MajorVersion()) return nullptr;
+            return SimpleReadBuffer(in, m_status);
+        }
+    };
+
+    /// Identifies a compute node target for routing decisions.
+    struct RouteTarget {
+        int nodeIndex = -1;
+        bool isLocal = true;
+    };
+
+    /// Batch of remote append requests sent to a single node in one round-trip.
+    struct BatchRemoteAppendRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        /// Item count as parsed by Read(). Write() ignores this field and
+        /// emits m_items.size() so header and payload can never disagree.
+        std::uint32_t m_count = 0;
+        std::vector<RemoteAppendRequest> m_items;
+
+        /// Bytes produced by Write(): versions + count + every item.
+        std::size_t EstimateBufferSize() const {
+            std::size_t size = sizeof(std::uint16_t) * 2;  // version
+            size += sizeof(std::uint32_t);                 // count
+            for (const auto& item : m_items) size += item.EstimateBufferSize();
+            return size;
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            const std::uint32_t count = static_cast<std::uint32_t>(m_items.size());
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(count, p_buffer);
+            for (const auto& item : m_items) p_buffer = item.Write(p_buffer);
+            return p_buffer;
+        }
+
+        /// Parse a batch. When `bodyLength` is non-zero it bounds every read
+        /// and is used to reject implausible counts before allocating; when 0
+        /// no bound is applied. On any failure the object is left empty
+        /// (m_items cleared, m_count == 0) and nullptr is returned.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer, std::uint32_t bodyLength = 0) {
+            using namespace Socket::SimpleSerialization;
+            const std::uint8_t* bufEnd = (bodyLength > 0) ? (p_buffer + bodyLength) : nullptr;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, majorVer);
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, mirrorVer);
+            if (p_buffer == nullptr || majorVer != MajorVersion()) return Fail();
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, m_count);
+            if (p_buffer == nullptr) return Fail();
+            // Reject obviously corrupt counts before allocating
+            if (bodyLength > 0 && m_count > bodyLength / 8) return Fail();
+            m_items.resize(m_count);
+            for (std::uint32_t i = 0; i < m_count; i++) {
+                if (bufEnd && p_buffer >= bufEnd) return Fail();
+                p_buffer = m_items[i].Read(p_buffer, bufEnd);
+                if (!p_buffer) return Fail();
+                if (bufEnd && p_buffer > bufEnd) return Fail();
+            }
+            return p_buffer;
+        }
+
+    private:
+        /// Reset to the empty state and signal a parse failure.
+        const std::uint8_t* Fail() {
+            m_items.clear();
+            m_count = 0;
+            return nullptr;
+        }
+    };
+
+    /// Response for batch remote append.
+    struct BatchRemoteAppendResponse {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        std::uint32_t m_successCount = 0;
+        std::uint32_t m_failCount = 0;
+
+        /// Two version words followed by the two counters.
+        std::size_t EstimateBufferSize() const {
+            return 2 * sizeof(std::uint16_t) + 2 * sizeof(std::uint32_t);
+        }
+
+        /// Emit versions, then success count, then fail count.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* out = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            out = SimpleWriteBuffer(MirrorVersion(), out);
+            out = SimpleWriteBuffer(m_successCount, out);
+            return SimpleWriteBuffer(m_failCount, out);
+        }
+
+        /// Returns nullptr when the major version does not match.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t major = 0, mirror = 0;
+            const std::uint8_t* in = SimpleReadBuffer(p_buffer, major);
+            in = SimpleReadBuffer(in, mirror);
+            if (major != MajorVersion()) return nullptr;
+            in = SimpleReadBuffer(in, m_successCount);
+            return SimpleReadBuffer(in, m_failCount);
+        }
+    };
+
+    /// Cross-node merge hint. Search-side trigger on node X observed that
+    /// posting `m_headID` (owned by the target node based on consistent-hash
+    /// ownership) is below the merge threshold. The receiver enqueues a
+    /// local MergeAsync; the local MergePostings logic decides whether the
+    /// posting really needs merging at execution time. Fire-and-forget: no
+    /// response packet, no retry queue. Multiple notifications for the same
+    /// head are dedup'd by m_mergeList on the receiver.
+    struct RemoteMergeRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        SizeType m_headID = 0;
+        std::int32_t m_layer = 0;
+
+        /// Versions + head id + layer.
+        std::size_t EstimateBufferSize() const {
+            const std::size_t header = 2 * sizeof(std::uint16_t);
+            const std::size_t payload = sizeof(SizeType) + sizeof(std::int32_t);
+            return header + payload;
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* out = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            out = SimpleWriteBuffer(MirrorVersion(), out);
+            out = SimpleWriteBuffer(m_headID, out);
+            return SimpleWriteBuffer(m_layer, out);
+        }
+
+        /// Bounded parse; nullptr when the header read yields nullptr or the
+        /// major version differs.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t major = 0, mirror = 0;
+            const std::uint8_t* in = SafeSimpleReadBuffer(p_buffer, p_bufEnd, major);
+            in = SafeSimpleReadBuffer(in, p_bufEnd, mirror);
+            if (in == nullptr || major != MajorVersion()) return nullptr;
+            in = SafeSimpleReadBuffer(in, p_bufEnd, m_headID);
+            return SafeSimpleReadBuffer(in, p_bufEnd, m_layer);
+        }
+    };
+
+    /// Batch of cross-node merge hints sent to a single owner node in one
+    /// fire-and-forget packet. Sender-side dedups by (layer, headID) so
+    /// each entry appears at most once per flush window.
+    struct BatchRemoteMergeRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        /// Item count as parsed by Read(). Write() ignores this field and
+        /// emits m_items.size() so header and payload can never disagree.
+        std::uint32_t m_count = 0;
+        std::vector<RemoteMergeRequest> m_items;
+
+        /// Bytes produced by Write(): versions + count + every item.
+        std::size_t EstimateBufferSize() const {
+            std::size_t size = sizeof(std::uint16_t) * 2;
+            size += sizeof(std::uint32_t);
+            for (const auto& item : m_items) size += item.EstimateBufferSize();
+            return size;
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            const std::uint32_t count = static_cast<std::uint32_t>(m_items.size());
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(count, p_buffer);
+            for (const auto& item : m_items) p_buffer = item.Write(p_buffer);
+            return p_buffer;
+        }
+
+        /// Parse a batch. A non-zero `bodyLength` bounds every read and lets
+        /// implausible counts be rejected before allocating; 0 means no bound.
+        /// On any failure the object is left empty (m_items cleared,
+        /// m_count == 0) and nullptr is returned.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer, std::uint32_t bodyLength = 0) {
+            using namespace Socket::SimpleSerialization;
+            const std::uint8_t* bufEnd = (bodyLength > 0) ? (p_buffer + bodyLength) : nullptr;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, majorVer);
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, mirrorVer);
+            if (p_buffer == nullptr || majorVer != MajorVersion()) return Fail();
+            p_buffer = SafeSimpleReadBuffer(p_buffer, bufEnd, m_count);
+            if (p_buffer == nullptr) return Fail();
+            if (bodyLength > 0 && m_count > bodyLength / 8) return Fail();
+            m_items.resize(m_count);
+            for (std::uint32_t i = 0; i < m_count; i++) {
+                if (bufEnd && p_buffer >= bufEnd) return Fail();
+                p_buffer = m_items[i].Read(p_buffer, bufEnd);
+                if (!p_buffer) return Fail();
+                if (bufEnd && p_buffer > bufEnd) return Fail();
+            }
+            return p_buffer;
+        }
+
+    private:
+        /// Reset to the empty state and signal a parse failure.
+        const std::uint8_t* Fail() {
+            m_items.clear();
+            m_count = 0;
+            return nullptr;
+        }
+    };
+
+    /// Entry in a head sync broadcast: one add or delete of a head node.
+    /// `m_layer` identifies the originating ExtraDynamicSearcher so the
+    /// receiver applies the entry to the matching layer's head index
+    /// (with multi-layer SPANN, layer 0 and layer 1 both broadcast head
+    /// add/delete; without the layer field every entry would be misrouted
+    /// to a single shared callback).
+    struct HeadSyncEntry {
+        enum class Op : std::uint8_t { Add = 0, Delete = 1 };
+        // Both fields now carry defaults: a default-constructed entry that was
+        // passed to Write() previously serialized indeterminate values.
+        Op op = Op::Add;
+        SizeType headVID = 0;
+        std::string headVector;     // only for Add; empty for Delete
+        std::int32_t m_layer = 0;   // originating ExtraDynamicSearcher layer
+
+        /// Bytes produced by Write() for the current field values.
+        std::size_t EstimateBufferSize() const {
+            return sizeof(std::uint8_t)     // op
+                 + sizeof(SizeType)         // headVID
+                 + sizeof(std::uint32_t)    // headVector length
+                 + headVector.size()
+                 + sizeof(std::int32_t);    // layer
+        }
+
+        /// Layout: op byte, headVID, u32 vector length, raw vector bytes, layer.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint8_t>(op), p_buffer);
+            p_buffer = SimpleWriteBuffer(headVID, p_buffer);
+            std::uint32_t vecLen = static_cast<std::uint32_t>(headVector.size());
+            p_buffer = SimpleWriteBuffer(vecLen, p_buffer);
+            if (vecLen > 0) {
+                memcpy(p_buffer, headVector.data(), vecLen);
+                p_buffer += vecLen;
+            }
+            p_buffer = SimpleWriteBuffer(m_layer, p_buffer);
+            return p_buffer;
+        }
+
+        /// Inverse of Write().
+        /// NOTE(review): no end-of-buffer bound is available here, so the
+        /// wire-supplied vecLen is trusted; callers should only pass buffers
+        /// whose length has been validated elsewhere.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t rawOp = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, rawOp);
+            op = static_cast<Op>(rawOp);
+            p_buffer = SimpleReadBuffer(p_buffer, headVID);
+            std::uint32_t vecLen = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, vecLen);
+            if (vecLen > 0) {
+                headVector.assign(reinterpret_cast<const char*>(p_buffer), vecLen);
+                p_buffer += vecLen;
+            } else {
+                headVector.clear();
+            }
+            p_buffer = SimpleReadBuffer(p_buffer, m_layer);
+            return p_buffer;
+        }
+    };
+
+    /// Dispatch command from driver to workers (replaces file-based barriers).
+    struct DispatchCommand {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        enum class Type : std::uint8_t { Search = 0, Insert = 1, Stop = 2, Heartbeat = 3 };
+        Type m_type = Type::Search;
+        std::uint64_t m_dispatchId = 0;  // unique ID from driver
+        std::uint32_t m_round = 0;       // search round or insert batch index
+
+        /// Versions + type byte + dispatch id + round.
+        std::size_t EstimateBufferSize() const {
+            return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t)
+                 + sizeof(std::uint64_t) + sizeof(std::uint32_t);
+        }
+
+        /// The type is written as its underlying one-byte value.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint8_t>(m_type), p_buffer);
+            p_buffer = SimpleWriteBuffer(m_dispatchId, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_round, p_buffer);
+            return p_buffer;
+        }
+
+        /// Returns nullptr on major-version mismatch. The type byte is cast
+        /// without range validation, so values outside the enumerators are
+        /// passed through to the caller.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);
+            if (majorVer != MajorVersion()) return nullptr;
+            std::uint8_t rawType = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, rawType);
+            m_type = static_cast<Type>(rawType);
+            p_buffer = SimpleReadBuffer(p_buffer, m_dispatchId);
+            p_buffer = SimpleReadBuffer(p_buffer, m_round);
+            return p_buffer;
+        }
+    };
+
+    /// Result from worker back to driver after executing a dispatch command.
+    struct DispatchResult {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 1; }
+
+        enum class Status : std::uint8_t { Success = 0, Failed = 1 };
+        Status m_status = Status::Success;
+        std::uint64_t m_dispatchId = 0;
+        std::uint32_t m_round = 0;
+        double m_wallTime = 0.0;
+        std::int32_t m_nodeIndex = -1;  // which worker sent this result
+
+        /// Versions + status byte + dispatch id + round + wall time + node index.
+        std::size_t EstimateBufferSize() const {
+            return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t)
+                 + sizeof(std::uint64_t) + sizeof(std::uint32_t) + sizeof(double)
+                 + sizeof(std::int32_t);
+        }
+
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint8_t>(m_status), p_buffer);
+            p_buffer = SimpleWriteBuffer(m_dispatchId, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_round, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_wallTime, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_nodeIndex, p_buffer);
+            return p_buffer;
+        }
+
+        /// Returns nullptr on major-version mismatch. Packets with mirror
+        /// version 0 carry no node index; m_nodeIndex is reset to -1 for them
+        /// so a reused object cannot report a stale sender.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);
+            if (majorVer != MajorVersion()) return nullptr;
+            std::uint8_t rawStatus = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, rawStatus);
+            m_status = static_cast<Status>(rawStatus);
+            p_buffer = SimpleReadBuffer(p_buffer, m_dispatchId);
+            p_buffer = SimpleReadBuffer(p_buffer, m_round);
+            p_buffer = SimpleReadBuffer(p_buffer, m_wallTime);
+            if (mirrorVer >= 1) {
+                p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndex);
+            } else {
+                m_nodeIndex = -1;
+            }
+            return p_buffer;
+        }
+    };
+
+    /// Request to lock/unlock a headID on its owner node (for cross-node Merge).
+    /// MirrorVersion 1 added m_layer so multi-layer setups dispatch to the
+    /// correct lock pool (each ExtraDynamicSearcher owns its own bucket flags).
+    struct RemoteLockRequest {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 1; }
+
+        enum class Op : std::uint8_t { Lock = 0, Unlock = 1 };
+        Op m_op = Op::Lock;
+        SizeType m_headID = 0;
+        std::int32_t m_layer = 0;
+
+        /// Versions + op byte + head id + layer.
+        std::size_t EstimateBufferSize() const {
+            return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t)
+                 + sizeof(SizeType) + sizeof(std::int32_t);
+        }
+
+        /// The op is written as its underlying one-byte value.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint8_t>(m_op), p_buffer);
+            p_buffer = SimpleWriteBuffer(m_headID, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_layer, p_buffer);
+            return p_buffer;
+        }
+
+        /// Returns nullptr on major-version mismatch. Mirror-version-0
+        /// packets have no layer field; m_layer is set to 0 for them.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);
+            if (majorVer != MajorVersion()) return nullptr;
+            std::uint8_t rawOp = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, rawOp);
+            m_op = static_cast<Op>(rawOp);
+            p_buffer = SimpleReadBuffer(p_buffer, m_headID);
+            if (mirrorVer >= 1) {
+                p_buffer = SimpleReadBuffer(p_buffer, m_layer);
+            } else {
+                m_layer = 0;
+            }
+            return p_buffer;
+        }
+    };
+
+    /// Response for remote lock operations.
+    struct RemoteLockResponse {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        enum class Status : std::uint8_t { Granted = 0, Denied = 1 };
+        Status m_status = Status::Granted;
+
+        /// Versions + one status byte.
+        std::size_t EstimateBufferSize() const {
+            return sizeof(std::uint16_t) * 2 + sizeof(std::uint8_t);
+        }
+
+        /// The status is written as its underlying one-byte value.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(static_cast<std::uint8_t>(m_status), p_buffer);
+            return p_buffer;
+        }
+
+        /// Returns nullptr on major-version mismatch.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);
+            if (majorVer != MajorVersion()) return nullptr;
+            std::uint8_t rawStatus = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, rawStatus);
+            m_status = static_cast<Status>(rawStatus);
+            return p_buffer;
+        }
+    };
+
+    /// Worker → dispatcher registration message.
+    struct NodeRegisterMsg {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        std::int32_t m_nodeIndex = 0;
+        std::string m_host;
+        std::string m_port;
+        std::string m_store;
+
+        /// Versions + node index + three length-prefixed strings.
+        std::size_t EstimateBufferSize() const {
+            const std::size_t lenPrefix = sizeof(std::uint32_t);
+            return sizeof(std::uint16_t) * 2
+                 + sizeof(std::int32_t)
+                 + (lenPrefix + m_host.size())
+                 + (lenPrefix + m_port.size())
+                 + (lenPrefix + m_store.size());
+        }
+
+        /// Emit versions, node index, then host, port and store in that order.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            std::uint8_t* out = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            out = SimpleWriteBuffer(MirrorVersion(), out);
+            out = SimpleWriteBuffer(m_nodeIndex, out);
+            out = SimpleWriteBuffer(m_host, out);
+            out = SimpleWriteBuffer(m_port, out);
+            return SimpleWriteBuffer(m_store, out);
+        }
+
+        /// Returns nullptr when the major version does not match.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t major = 0, mirror = 0;
+            const std::uint8_t* in = SimpleReadBuffer(p_buffer, major);
+            in = SimpleReadBuffer(in, mirror);
+            if (major != MajorVersion()) return nullptr;
+            in = SimpleReadBuffer(in, m_nodeIndex);
+            in = SimpleReadBuffer(in, m_host);
+            in = SimpleReadBuffer(in, m_port);
+            return SimpleReadBuffer(in, m_store);
+        }
+    };
+
+    /// Dispatcher → worker ring update (full node list, versioned).
+    struct RingUpdateMsg {
+        static constexpr std::uint16_t MajorVersion() { return 1; }
+        static constexpr std::uint16_t MirrorVersion() { return 0; }
+
+        std::uint32_t m_ringVersion = 0;
+        std::int32_t m_vnodeCount = 150;
+        std::vector<std::int32_t> m_nodeIndices;
+
+        /// Bytes produced by Write() for the current node list.
+        std::size_t EstimateBufferSize() const {
+            std::size_t size = 0;
+            size += sizeof(std::uint16_t) * 2;
+            size += sizeof(std::uint32_t);  // ringVersion
+            size += sizeof(std::int32_t);   // vnodeCount
+            size += sizeof(std::uint32_t);  // numNodes
+            size += sizeof(std::int32_t) * m_nodeIndices.size();
+            return size;
+        }
+
+        /// Layout: versions, ring version, vnode count, u32 node count, nodes.
+        std::uint8_t* Write(std::uint8_t* p_buffer) const {
+            using namespace Socket::SimpleSerialization;
+            p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer);
+            p_buffer = SimpleWriteBuffer(m_ringVersion, p_buffer);
+            p_buffer = SimpleWriteBuffer(m_vnodeCount, p_buffer);
+            std::uint32_t count = static_cast<std::uint32_t>(m_nodeIndices.size());
+            p_buffer = SimpleWriteBuffer(count, p_buffer);
+            for (auto idx : m_nodeIndices) {
+                p_buffer = SimpleWriteBuffer(idx, p_buffer);
+            }
+            return p_buffer;
+        }
+
+        /// Returns nullptr on major-version mismatch.
+        /// NOTE(review): `count` comes straight off the wire and no buffer
+        /// length is available here, so a corrupt packet can request a very
+        /// large resize and read past the body. Consider a bounded overload
+        /// like the batch request types use.
+        const std::uint8_t* Read(const std::uint8_t* p_buffer) {
+            using namespace Socket::SimpleSerialization;
+            std::uint16_t majorVer = 0, mirrorVer = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, majorVer);
+            p_buffer = SimpleReadBuffer(p_buffer, mirrorVer);
+            if (majorVer != MajorVersion()) return nullptr;
+            p_buffer = SimpleReadBuffer(p_buffer, m_ringVersion);
+            p_buffer = SimpleReadBuffer(p_buffer, m_vnodeCount);
+            std::uint32_t count = 0;
+            p_buffer = SimpleReadBuffer(p_buffer, count);
+            m_nodeIndices.resize(count);
+            for (std::uint32_t i = 0; i < count; i++) {
+                p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndices[i]);
+            }
+            return p_buffer;
+        }
+    };
+
+    /// Worker → dispatcher ACK for a ring update.
+ struct RingUpdateACKMsg { + static constexpr std::uint16_t MajorVersion() { return 1; } + static constexpr std::uint16_t MirrorVersion() { return 0; } + + std::int32_t m_nodeIndex = -1; + std::uint32_t m_ringVersion = 0; + + std::size_t EstimateBufferSize() const { + return sizeof(std::uint16_t) * 2 + sizeof(std::int32_t) + sizeof(std::uint32_t); + } + + std::uint8_t* Write(std::uint8_t* p_buffer) const { + using namespace Socket::SimpleSerialization; + p_buffer = SimpleWriteBuffer(MajorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(MirrorVersion(), p_buffer); + p_buffer = SimpleWriteBuffer(m_nodeIndex, p_buffer); + p_buffer = SimpleWriteBuffer(m_ringVersion, p_buffer); + return p_buffer; + } + + const std::uint8_t* Read(const std::uint8_t* p_buffer) { + using namespace Socket::SimpleSerialization; + std::uint16_t majorVer = 0, mirrorVer = 0; + p_buffer = SimpleReadBuffer(p_buffer, majorVer); + p_buffer = SimpleReadBuffer(p_buffer, mirrorVer); + if (majorVer != MajorVersion()) return nullptr; + p_buffer = SimpleReadBuffer(p_buffer, m_nodeIndex); + p_buffer = SimpleReadBuffer(p_buffer, m_ringVersion); + return p_buffer; + } + }; + +} // namespace SPTAG::SPANN diff --git a/AnnService/inc/Core/SPANN/Distributed/NetworkNode.h b/AnnService/inc/Core/SPANN/Distributed/NetworkNode.h new file mode 100644 index 000000000..4e11a4b08 --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/NetworkNode.h @@ -0,0 +1,319 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#ifndef _SPTAG_SPANN_NETWORKNODE_H_ +#define _SPTAG_SPANN_NETWORKNODE_H_ + +#include "inc/Core/SPANN/Distributed/DistributedProtocol.h" +#include "inc/Core/SPANN/Distributed/ConsistentHashRing.h" +#include "inc/Core/SPANN/Distributed/DispatchCoordinator.h" +#include "inc/Core/SPANN/Distributed/RemotePostingOps.h" +#include "inc/Socket/Client.h" +#include "inc/Socket/Server.h" +#include "inc/Socket/Packet.h" +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + /// Base class providing shared networking infrastructure for all + /// distributed node roles. Manages server/client sockets, peer + /// connections, consistent hash ring storage, and a background + /// connection maintenance thread. + /// + /// Subclasses override RegisterHandlers() to wire up their specific + /// packet handlers, and BgProtocolStep() / IsRingSettled() for + /// role-specific background work. + class NetworkNode : public DispatchCoordinator::PeerNetwork, + public RemotePostingOps::NetworkAccess { + public: + NetworkNode() + : m_enabled(false), m_localNodeIndex(-1) {} + + virtual ~NetworkNode() { + m_bgConnectStop.store(true); + if (m_bgConnectThread.joinable()) m_bgConnectThread.join(); + } + + /// Initialize shared networking state. + bool InitializeNetwork( + int localNodeIdx, + const std::vector>& nodeAddrs, + int vnodeCount = 150) + { + if (nodeAddrs.empty() || localNodeIdx < 0 || + localNodeIdx >= static_cast(nodeAddrs.size())) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "NetworkNode::Initialize invalid config: %d nodes, localIdx=%d\n", + (int)nodeAddrs.size(), localNodeIdx); + return false; + } + + m_localNodeIndex = localNodeIdx; + m_nodeAddrs = nodeAddrs; + m_vnodeCount = vnodeCount; + + // Start with empty hash ring + std::atomic_store(&m_hashRing, + std::shared_ptr( + std::make_shared(vnodeCount))); + + m_enabled = true; + return true; + } + + /// Start server + client + background connection thread. 
+        /// Subclasses must have called InitializeNetwork() first.
+        /// Each node listens on its own address from the combined address list.
+        ///
+        /// @return false when InitializeNetwork() has not succeeded; true otherwise.
+        bool StartNetwork() {
+            if (!m_enabled) return false;
+
+            // A previous call may have left the background thread joinable
+            // (still running, or finished but never joined). Assigning a new
+            // std::thread over a joinable one calls std::terminate, so stop
+            // and join it first.
+            if (m_bgConnectThread.joinable()) {
+                m_bgConnectStop.store(true);
+                m_bgConnectThread.join();
+            }
+
+            // Pre-size m_peerConnections BEFORE the server is started — the
+            // server's handler threads can dispatch packets immediately on
+            // bind, and inbound handlers (e.g. HandleRingUpdate ->
+            // SendRingUpdateACK) call GetPeerConnection which indexes into
+            // m_peerConnections. Resizing here closes a startup race that
+            // could segfault when an early peer (typically the dispatcher
+            // sending the initial RingUpdate) won the race.
+            m_peerConnections.resize(m_nodeAddrs.size(), Socket::c_invalidConnectionID);
+
+            // --- Client side ---
+            // Construct the Socket::Client BEFORE starting the
+            // server. Server handlers (notably HeadSync receiver / ring
+            // update) can fire as soon as the listening socket accepts a
+            // peer, and they may call ConnectToPeer → m_client->
+            // ConnectToServer. If m_client is still null at that point,
+            // the call dereferences a null unique_ptr and segfaults
+            // (Pre-build "All N connection attempts to node X failed"
+            // crash). Construct the client first so the handler path is
+            // safe before any socket can be accepted.
+            Socket::PacketHandlerMapPtr clientHandlers(new Socket::PacketHandlerMap);
+            RegisterClientHandlers(clientHandlers);
+
+            m_client.reset(new Socket::Client(clientHandlers, 8, 30));
+
+            // --- Server side ---
+            {
+                Socket::PacketHandlerMapPtr serverHandlers(new Socket::PacketHandlerMap);
+                RegisterServerHandlers(serverHandlers);
+
+                const auto& localAddr = m_nodeAddrs[m_localNodeIndex];
+                m_server.reset(new Socket::Server(
+                    localAddr.first, localAddr.second, serverHandlers, 8));
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "NetworkNode server listening on %s:%s\n",
+                    localAddr.first.c_str(), localAddr.second.c_str());
+            }
+
+            // --- Background thread ---
+            // Repeatedly tries to connect every unconnected peer, runs the
+            // role-specific protocol step, and exits once all peers are
+            // connected and the ring is settled. The delay between passes
+            // grows from 500 ms in 500 ms steps up to 5000 ms.
+            m_bgConnectStop.store(false);
+            m_bgConnectThread = std::thread([this]() {
+                int numNodes = static_cast<int>(m_nodeAddrs.size());
+                int delayMs = 500;
+                while (!m_bgConnectStop.load()) {
+                    bool allConnected = true;
+                    for (int i = 0; i < numNodes; i++) {
+                        if (i == m_localNodeIndex) continue;
+                        {
+                            std::lock_guard lock(m_connMutex);
+                            if (m_peerConnections[i] != Socket::c_invalidConnectionID)
+                                continue;
+                        }
+                        allConnected = false;
+                        ConnectToPeer(i, 1, 0);
+                    }
+
+                    BgProtocolStep();
+
+                    if (allConnected && IsRingSettled()) {
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                            "NetworkNode: All peers connected and ring synchronized\n");
+                        break;
+                    }
+                    std::this_thread::sleep_for(std::chrono::milliseconds(delayMs));
+                    delayMs = std::min(delayMs + 500, 5000);
+                }
+            });
+
+            return true;
+        }
+
+        // ---- PeerNetwork + NetworkAccess interface ----
+        //
+        // GetLocalNodeIndex() / GetNumNodes() use NETWORK-SLOT semantics:
+        // m_nodeAddrs is the flat address table indexed by internal slot
+        // (slot 0 = dispatcher, slots 1..N = workers). These are the
+        // values used for raw socket connections and dispatch routing.
+ // + // For COMPUTE-WORKER semantics (VID interleaving, version-map + // sizing, hash-ring partitioning), use GetNumWorkerNodes() / + // GetWorkerNodeIndex() instead — those exclude the dispatcher + // and use 0-indexed worker shard numbering. Mixing the two + // produces off-by-one shard math + // (AllocateGlobalVID maps to the wrong globalVID range). + + int GetLocalNodeIndex() const override { return m_localNodeIndex; } + + int GetNumNodes() const override { + return static_cast(m_nodeAddrs.size()); + } + + // ---- Compute-role accessors ---- + // + // These describe the LOGICAL cluster composition independent of + // the network slot layout. Subclasses populate the m_num*Nodes / + // m_workerNodeIndex fields during Initialize(). + // + // Use these (NOT GetNumNodes / GetLocalNodeIndex) for: + // * AllocateGlobalVID interleaving math + // * Version-map cross-node bound sizing + // * AddIDCapacity growth multiplier + // * Any "how many shards are storing user data?" question + + int GetNumWorkerNodes() const { return m_numWorkerNodes; } + int GetNumDispatchNodes() const { return m_numDispatchNodes; } + + /// 0-indexed compute-shard position for this node, or -1 if this + /// node is dispatcher-only (has no local data shard). 
+        int GetWorkerNodeIndex() const { return m_workerNodeIndex; }
+
+        /// Returns the connection ID for a peer slot, connecting on demand.
+        /// An existing valid ID is returned immediately; otherwise up to 5
+        /// connection attempts (starting at a 1000 ms back-off) are made.
+        /// Out-of-range slots, including any call made before
+        /// StartNetwork() sized the table, yield c_invalidConnectionID
+        /// instead of indexing past the end of m_peerConnections.
+        /// NOTE(review): two threads that both see an invalid slot will both
+        /// connect, and the later one overwrites the earlier ID — confirm
+        /// whether the extra connection is cleaned up elsewhere.
+        Socket::ConnectionID GetPeerConnection(int nodeIndex) override {
+            {
+                std::lock_guard lock(m_connMutex);
+                if (nodeIndex < 0 || (size_t)nodeIndex >= m_peerConnections.size())
+                    return Socket::c_invalidConnectionID;
+                if (m_peerConnections[nodeIndex] != Socket::c_invalidConnectionID)
+                    return m_peerConnections[nodeIndex];
+            }
+            if (ConnectToPeer(nodeIndex, 5, 1000)) {
+                std::lock_guard lock(m_connMutex);
+                return m_peerConnections[nodeIndex];
+            }
+            return Socket::c_invalidConnectionID;
+        }
+
+        /// Forwards a packet to the client socket. If the client has not
+        /// been created yet (StartNetwork() not called), the send is
+        /// reported as failed through the callback instead of dereferencing
+        /// a null pointer.
+        void SendPacket(Socket::ConnectionID connID, Socket::Packet&& pkt,
+                        std::function callback) override {
+            if (!m_client) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "NetworkNode::SendPacket called before StartNetwork\n");
+                if (callback) callback(false);
+                return;
+            }
+            m_client->SendPacket(connID, std::move(pkt), std::move(callback));
+        }
+
+        /// Marks a peer slot as disconnected so the next GetPeerConnection()
+        /// reconnects. Out-of-range slots are ignored.
+        void InvalidatePeerConnection(int nodeIndex) override {
+            std::lock_guard lock(m_connMutex);
+            if (nodeIndex < 0 || (size_t)nodeIndex >= m_peerConnections.size())
+                return;
+            m_peerConnections[nodeIndex] = Socket::c_invalidConnectionID;
+        }
+
+        /// Raw client/server pointers; null until StartNetwork() created them.
+        Socket::Client* GetClient() override { return m_client.get(); }
+        Socket::Server* GetServer() override { return m_server.get(); }
+
+        // ---- Shared accessors ----
+
+        /// True once InitializeNetwork() has succeeded.
+        bool IsEnabled() const { return m_enabled; }
+
+        /// Atomically loads the current hash ring pointer.
+        std::shared_ptr GetHashRing() const {
+            return std::atomic_load(&m_hashRing);
+        }
+
+        /// Atomically publishes a new hash ring pointer.
+        void SetHashRing(std::shared_ptr ring) {
+            std::atomic_store(&m_hashRing, std::move(ring));
+        }
+
+        /// Polls every 500 ms until every non-local peer slot holds a valid
+        /// connection ID or timeoutSec elapses. Does not connect by itself.
+        /// @return true when all peers are connected (or the node is
+        ///         disabled); false, after logging, on timeout.
+        bool WaitForAllPeersConnected(int timeoutSec = 120) {
+            if (!m_enabled) return true;
+            int numNodes = static_cast(m_nodeAddrs.size());
+            auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeoutSec);
+            while (std::chrono::steady_clock::now() < deadline) {
+                bool allConnected = true;
+                for (int i = 0; i < numNodes; i++) {
+                    if (i == m_localNodeIndex) continue;
+                    std::lock_guard lock(m_connMutex);
+                    if (m_peerConnections[i] == Socket::c_invalidConnectionID) {
+                        allConnected = false;
+                        break;
+                    }
+                }
+                if (allConnected) return true;
+                std::this_thread::sleep_for(std::chrono::milliseconds(500));
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                "NetworkNode: Timed out waiting for peer connections (%ds)\n", timeoutSec);
+            return false;
+        }
+
+        /// Connects to the peer in slot nodeIndex, retrying with a doubling
+        /// delay capped at 5 s, and stores the resulting connection ID.
+        /// @param nodeIndex      slot to connect to; the local slot is a no-op.
+        /// @param maxRetries     total number of attempts.
+        /// @param initialDelayMs sleep before the second attempt.
+        /// @return true for the local slot or on success; false for an
+        ///         invalid slot, a missing client, or when every attempt fails.
+        bool ConnectToPeer(int nodeIndex, int maxRetries = 10, int initialDelayMs = 500) {
+            if (nodeIndex == m_localNodeIndex) return true;
+            // A negative slot would index before the start of the tables.
+            if (nodeIndex < 0) return false;
+            // The client only exists after StartNetwork().
+            if (!m_client) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "NetworkNode::ConnectToPeer(%d) called before StartNetwork\n",
+                    nodeIndex);
+                return false;
+            }
+            std::pair addr;
+            {
+                std::lock_guard lock(m_connMutex);
+                if (nodeIndex >= static_cast(m_nodeAddrs.size())) return false;
+                addr = m_nodeAddrs[nodeIndex];
+            }
+            int delayMs = initialDelayMs;
+            for (int attempt = 1; attempt <= maxRetries; attempt++) {
+                // Start from Fail so a path that leaves ec untouched is never
+                // mistaken for a successful connect.
+                ErrorCode ec = ErrorCode::Fail;
+                auto connID = m_client->ConnectToServer(addr.first, addr.second, ec);
+                if (ec == ErrorCode::Success) {
+                    std::lock_guard lock(m_connMutex);
+                    m_peerConnections[nodeIndex] = connID;
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                        "NetworkNode[local=%d]: Connected to node %d (%s:%s), connID=%u (attempt %d)\n",
+                        m_localNodeIndex, nodeIndex, addr.first.c_str(), addr.second.c_str(), connID, attempt);
+                    return true;
+                }
+                if (attempt < maxRetries) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(delayMs));
+                    delayMs = std::min(delayMs * 2, 5000);
+                }
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                "NetworkNode: All %d connection attempts to node %d failed\n",
+                maxRetries, nodeIndex);
+            return false;
+        }
+
+    protected:
+        /// Subclasses register their packet handlers here.
+        virtual void RegisterServerHandlers(Socket::PacketHandlerMapPtr& handlers) = 0;
+        virtual void RegisterClientHandlers(Socket::PacketHandlerMapPtr& handlers) = 0;
+
+        /// Called each iteration of the bg thread for role-specific protocol work.
+        virtual void BgProtocolStep() {}
+
+        /// Return true when ring is fully synchronized for this node's role.
+        virtual bool IsRingSettled() const { return true; }
+
+        bool m_enabled;
+        int m_localNodeIndex;
+        int m_vnodeCount = 150;
+
+        // Compute-role accounting. Set by subclass Initialize().
+        // m_workerNodeIndex == -1 means this node has no local data shard
+        // (dispatcher-only role). See GetNumWorkerNodes() / GetWorkerNodeIndex()
+        // for the rationale on why these are separate from m_nodeAddrs.size().
+ int m_numWorkerNodes = 0; + int m_numDispatchNodes = 0; + int m_workerNodeIndex = -1; + + // Consistent hash ring (lock-free RCU: atomic_load to read, copy-on-write to modify) + std::shared_ptr m_hashRing; + std::mutex m_ringWriteMutex; + + // Node addresses + std::vector> m_nodeAddrs; + + // Networking + std::unique_ptr m_server; + std::unique_ptr m_client; + std::mutex m_connMutex; + std::vector m_peerConnections; + + // Background thread + std::thread m_bgConnectThread; + std::atomic m_bgConnectStop{false}; + }; + +} // namespace SPTAG::SPANN + +#endif // _SPTAG_SPANN_NETWORKNODE_H_ diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h new file mode 100644 index 000000000..577b91876 --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -0,0 +1,1325 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "inc/Core/SPANN/Distributed/DistributedProtocol.h" +#include "inc/Helper/ThreadPool.h" +#include "inc/Socket/Client.h" +#include "inc/Socket/Server.h" +#include "inc/Socket/Packet.h" +#include "inc/Socket/SimpleSerialization.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + // Per-thread hook so the SPDKThreadPool's pre-allocated ExtraWorkSpace + // (initialised once per worker thread, see SPDKThreadPool::initSPDK) can + // be reached from inside the AppendCallback lambda without changing the + // callback signature. BatchAppendItemJob::exec(workspace*, abort*) sets + // this before invoking the callback so the callback skips the per-item + // InitWorkSpace allocation / m_freeWorkSpaceIds churn that otherwise + // serialises 10k-item batches into ~130s on the receiver. 
+    inline thread_local void* tls_preallocAppendWorkSpace = nullptr;
+
+    /// Handles all node-to-node RPC mechanics for internal posting operations:
+    ///   - Append / BatchAppend (forward writes to the correct owner node)
+    ///   - HeadSync (broadcast head index changes to peers)
+    ///   - RemoteLock (cross-node locking for merge/split)
+    ///
+    /// This class owns the request/response matching state and serialization
+    /// logic. It is independent of routing decisions — WorkerNode decides
+    /// *where* to send, RemotePostingOps handles *how*.
+    class RemotePostingOps {
+    public:
+        using AppendCallback = std::function headVec,
+            int appendNum,
+            std::string& appendPosting)>;
+
+        using HeadSyncCallback = std::function;
+        using RemoteLockCallback = std::function;
+
+        /// Callback for cross-node merge: search on a peer node observed
+        /// that posting `headID` (which we own) looks underfull. The peer
+        /// sent a fire-and-forget MergeRequest to us; we just schedule the
+        /// local MergeAsync. Returns nothing; receiver-side m_mergeList
+        /// already dedupes repeated triggers, so dropped notifications
+        /// are recoverable on the next observation.
+        using MergeCallback = std::function;
+
+        /// Abstract interface for network access (implemented by NetworkNode).
+        class NetworkAccess {
+        public:
+            virtual ~NetworkAccess() = default;
+            virtual Socket::ConnectionID GetPeerConnection(int nodeIndex) = 0;
+            virtual void InvalidatePeerConnection(int nodeIndex) = 0;
+            virtual int GetLocalNodeIndex() const = 0;
+            virtual int GetNumNodes() const = 0;
+            virtual Socket::Client* GetClient() = 0;
+            virtual Socket::Server* GetServer() = 0;
+        };
+
+        /// Starts the HeadSync retry thread; the destructor stops it.
+        RemotePostingOps() {
+            StartHeadSyncRetryThread();
+        }
+
+        ~RemotePostingOps() {
+            StopHeadSyncRetryThread();
+        }
+
+        RemotePostingOps(const RemotePostingOps&) = delete;
+        RemotePostingOps& operator=(const RemotePostingOps&) = delete;
+
+        /// Stores the (non-owning) network access pointer used by all senders.
+        void SetNetwork(NetworkAccess* net) { m_net = net; }
+
+        // Inject the searcher's shared compute pool. Receiver-side BatchAppend
+        // work runs as Jobs on this pool so it shares a single bounded-
+        // concurrency budget with local Append/Split/Merge/Reassign (instead
+        // of a separate bg executor + transient std::threads which over-
+        // subscribed TiKV). Per-layer: each layer's ExtraDynamicSearcher owns
+        // its own m_splitThreadPool, so BatchAppend items dispatch by the
+        // request's m_layer to the matching pool. A single submitter would
+        // pile both layers' remote appends into whichever pool wired last.
+        //
+        // A negative layer is rejected: it cannot name a slot, and used as an
+        // index it would shrink m_jobSubmitters to zero and then write out of
+        // bounds.
+        using JobSubmitter = std::function;
+        void SetJobSubmitter(int layer, JobSubmitter submitter) {
+            if (RejectNegativeLayer("SetJobSubmitter", layer)) return;
+            std::unique_lock lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);
+            if (m_jobSubmitters.size() <= static_cast(layer)) {
+                m_jobSubmitters.resize(static_cast(layer) + 1);
+            }
+            m_jobSubmitters[layer] = std::move(submitter);
+        }
+
+        // Helper: logs and returns true when `layer` cannot be used as a
+        // registry index. EnsureLayerSlot_NoLock() silently ignores negative
+        // layers, so every setter must bail out itself before indexing.
+        static bool RejectNegativeLayer(const char* who, int layer) {
+            if (layer >= 0) return false;
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                "RemotePostingOps: %s ignored invalid layer %d\n", who, layer);
+            return true;
+        }
+
+        // Helper: ensure the per-layer registries are wide enough for `layer`.
+        // Caller must hold m_callbackLifetimeMutex in exclusive mode.
+        // Negative layers are ignored. m_callbackOwners holds atomics, so it
+        // is rebuilt element by element instead of resized.
+        void EnsureLayerSlot_NoLock(int layer) {
+            if (layer < 0) return;
+            const size_t needed = static_cast(layer) + 1;
+            if (m_appendCallbacks.size() < needed) m_appendCallbacks.resize(needed);
+            if (m_headSyncCallbacks.size() < needed) m_headSyncCallbacks.resize(needed);
+            if (m_remoteLockCallbacks.size() < needed) m_remoteLockCallbacks.resize(needed);
+            if (m_mergeCallbacks.size() < needed) m_mergeCallbacks.resize(needed);
+            if (m_callbackOwners.size() < needed) {
+                std::vector> grown(needed);
+                for (size_t i = 0; i < m_callbackOwners.size(); ++i) {
+                    grown[i].store(
+                        m_callbackOwners[i].load(std::memory_order_acquire),
+                        std::memory_order_release);
+                }
+                m_callbackOwners = std::move(grown);
+            }
+        }
+
+        // Per-layer callback registration. Each setter takes the lifetime
+        // mutex exclusively, grows the registries if needed and stores the
+        // callback; a negative layer is logged and ignored.
+        void SetAppendCallback(int layer, AppendCallback cb) {
+            if (RejectNegativeLayer("SetAppendCallback", layer)) return;
+            std::unique_lock lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);
+            m_appendCallbacks[layer] = std::move(cb);
+        }
+        void SetHeadSyncCallback(int layer, HeadSyncCallback cb) {
+            if (RejectNegativeLayer("SetHeadSyncCallback", layer)) return;
+            std::unique_lock lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);
+            m_headSyncCallbacks[layer] = std::move(cb);
+        }
+        void SetRemoteLockCallback(int layer, RemoteLockCallback cb) {
+            if (RejectNegativeLayer("SetRemoteLockCallback", layer)) return;
+            std::unique_lock lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);
+            m_remoteLockCallbacks[layer] = std::move(cb);
+        }
+        void SetMergeCallback(int layer, MergeCallback cb) {
+            if (RejectNegativeLayer("SetMergeCallback", layer)) return;
+            std::unique_lock lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);
+            m_mergeCallbacks[layer] = std::move(cb);
+        }
+
+        /// Atomically clear ALL callbacks (every layer) and wait for any in-flight
+        /// callback invocation to finish. Required before the owner of the captured
+        /// `this` pointer (e.g. ExtraDynamicSearcher) is destroyed, otherwise
+        /// the lambdas registered via SetXxxCallback would dereference a dangling
+        /// pointer.
+        /// NOTE(review): m_jobSubmitters is not cleared here; if a submitter
+        /// captures the same owner it would outlive this call — confirm.
+        void ClearCallbacks() {
+            std::unique_lock lk(m_callbackLifetimeMutex);
+            m_appendCallbacks.clear();
+            m_headSyncCallbacks.clear();
+            m_remoteLockCallbacks.clear();
+            m_mergeCallbacks.clear();
+            m_callbackOwners = std::vector>();
+        }
+
+        /// Claim ownership of the registered callbacks for a SPECIFIC layer.
+        /// Each ExtraDynamicSearcher owns its own layer slot; per-layer
+        /// ownership prevents one layer's destructor from wiping another
+        /// layer's still-valid callbacks (the original 1-layer design used a
+        /// single ownership token; with Layers>=2 each layer needs its own).
+        /// A negative layer is logged and ignored.
+        void ClaimCallbackOwnership(int layer, const void* owner) {
+            if (RejectNegativeLayer("ClaimCallbackOwnership", layer)) return;
+            std::unique_lock lk(m_callbackLifetimeMutex);
+            EnsureLayerSlot_NoLock(layer);
+            m_callbackOwners[layer].store(owner, std::memory_order_release);
+        }
+
+        /// Clear callbacks for `layer` ONLY if `owner` is the current registered
+        /// owner of that layer. Used by ExtraDynamicSearcher destructor: each
+        /// layer's destructor only clears its own slot. Returns true if cleared.
+        bool ClearCallbacksIfOwner(int layer, const void* owner) {
+            std::unique_lock lk(m_callbackLifetimeMutex);
+            if (layer < 0 || static_cast(layer) >= m_callbackOwners.size()) {
+                return false;
+            }
+            if (m_callbackOwners[layer].load(std::memory_order_acquire) != owner) {
+                return false;
+            }
+            m_appendCallbacks[layer] = nullptr;
+            m_headSyncCallbacks[layer] = nullptr;
+            m_remoteLockCallbacks[layer] = nullptr;
+            if (layer >= 0 && static_cast(layer) < m_mergeCallbacks.size()) {
+                m_mergeCallbacks[layer] = nullptr;
+            }
+            m_callbackOwners[layer].store(nullptr, std::memory_order_release);
+            return true;
+        }
+
+        // ----- internal callback lookup helpers (caller holds shared lock) -----
+        // Each returns a pointer to the registered callback for `layer`, or
+        // nullptr when the layer is out of range or its slot is empty.
+        const AppendCallback* LookupAppendCallback_Locked(int layer) const {
+            if (layer < 0 || static_cast(layer) >= m_appendCallbacks.size()) return nullptr;
+            const auto& cb = m_appendCallbacks[layer];
+            return cb ? &cb : nullptr;
+        }
+        const HeadSyncCallback* LookupHeadSyncCallback_Locked(int layer) const {
+            if (layer < 0 || static_cast(layer) >= m_headSyncCallbacks.size()) return nullptr;
+            const auto& cb = m_headSyncCallbacks[layer];
+            return cb ? &cb : nullptr;
+        }
+        const RemoteLockCallback* LookupRemoteLockCallback_Locked(int layer) const {
+            if (layer < 0 || static_cast(layer) >= m_remoteLockCallbacks.size()) return nullptr;
+            const auto& cb = m_remoteLockCallbacks[layer];
+            return cb ? &cb : nullptr;
+        }
+        // PutPosting/FetchPosting/DeletePosting RPCs lived here historically.
+        // With shared TiKV every node reads and writes the posting store
+        // directly (PD routes the key), so the cross-node scatter-gather
+        // and owner-callback round-trips are unnecessary.
+        const MergeCallback* LookupMergeCallback_Locked(int layer) const {
+            if (layer < 0 || static_cast(layer) >= m_mergeCallbacks.size()) return nullptr;
+            const auto& cb = m_mergeCallbacks[layer];
+            return cb ? &cb : nullptr;
+        }
+
+        // ==================================================================
+        // Append — single item, synchronous (waits for response)
+        // ==================================================================
+
+        /// Sends one append to `targetNodeIndex` and blocks up to 30 s for
+        /// the response.
+        /// @return the peer's result, or ErrorCode::Fail when no network is
+        ///         set, headVec is null, the peer is unreachable, or the
+        ///         wait times out.
+        ErrorCode SendRemoteAppend(
+            int targetNodeIndex,
+            int layer,
+            SizeType headID,
+            const std::shared_ptr& headVec,
+            int appendNum,
+            std::string& appendPosting)
+        {
+            // Both pointers are dereferenced below; fail cleanly instead.
+            if (!m_net || !headVec) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: SendRemoteAppend missing network or head vector for headID %lld\n",
+                    (std::int64_t)headID);
+                return ErrorCode::Fail;
+            }
+
+            Socket::ConnectionID connID = m_net->GetPeerConnection(targetNodeIndex);
+            if (connID == Socket::c_invalidConnectionID) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: Cannot connect to node %d for headID %lld\n",
+                    targetNodeIndex, (std::int64_t)headID);
+                return ErrorCode::Fail;
+            }
+
+            RemoteAppendRequest req;
+            req.m_layer = layer;
+            req.m_headID = headID;
+            req.m_headVec = *headVec;
+            req.m_appendNum = appendNum;
+            req.m_appendPosting = appendPosting;
+
+            Socket::ResourceID resID = m_nextResourceId.fetch_add(1);
+            auto [future, _] = CreatePendingResponse(resID);
+            (void)_;
+
+            Socket::Packet packet;
+            packet.Header().m_packetType = Socket::PacketType::AppendRequest;
+            packet.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            packet.Header().m_connectionID = Socket::c_invalidConnectionID;
+            packet.Header().m_resourceID = resID;
+
+            auto bodySize = static_cast(req.EstimateBufferSize());
+            packet.Header().m_bodyLength = bodySize;
+            packet.AllocateBuffer(bodySize);
+            req.Write(packet.Body());
+            packet.Header().WriteBuffer(packet.HeaderBuffer());
+
+            m_net->GetClient()->SendPacket(connID, std::move(packet),
+                MakeSendFailHandler(resID));
+
+            auto status = future.wait_for(std::chrono::seconds(30));
+            if (status == std::future_status::timeout) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: Timeout waiting for append response for headID %lld from node %d\n",
+                    (std::int64_t)headID, targetNodeIndex);
+                ErasePending(resID);
+                return ErrorCode::Fail;
+            }
+            return future.get();
+        }
+
+        // ==================================================================
+        // Append — batch, synchronous with retry
+        // ==================================================================
+
+        /// Sends `items` to `targetNodeIndex` in chunks of kChunkSize, one
+        /// synchronous RPC per chunk, stopping at the first failing chunk.
+        /// The requests are moved into a scratch vector for each RPC and
+        /// moved back afterwards, so `items` still holds every request when
+        /// this returns and a caller may retry or fall back after a failure.
+        /// Chunks sent before a failure have already been delivered.
+        ErrorCode SendBatchRemoteAppend(
+            int targetNodeIndex,
+            std::vector& items)
+        {
+            if (items.empty()) return ErrorCode::Success;
+
+            // Chunk the batch so a single RPC never exceeds kChunkSize items.
+            // Large batches (millions of items) cannot be processed by the
+            // receiver within a single timeout window, causing data loss
+            // when the request is dropped. Chunking keeps each RPC bounded.
+            // [v38] Reduced 50000 → 10000 to (a) shrink end-of-batch drain
+            //   tail (final chunk no longer 14s wide) and (b) let multiple
+            //   chunks pipeline on the receiver pool.
+            // [v43] Back to 50000 — v42 (10k) was throughput-best (906/s)
+            //   but during-insert p50 was 222ms; v43 (50k) trades throughput
+            //   (-22% → 704/s) for during-insert p50 (-36% → 141ms) and big
+            //   recovery in post-insert r1 QPS (47→85). v44 (100k) blew up
+            //   tail drain: a single 100k chunk took 116s on the receiver,
+            //   making end-of-batch drain run 40+ min (vs 8 min at 50k).
+            //   50k is the sweet spot.
+            // [v47] With shared-pool receiver (BatchAppendItemJob on
+            //   m_splitThreadPool), 50k chunks still occasionally exceed the
+            //   180s wait_for window under contention → "Timeout waiting for
+            //   batch response" + retries. Drop to 10k so each RPC's worst-case
+            //   receiver wall-clock is ~6× smaller and stays under the timeout.
+            // NOTE(review): the value in effect is 3000, not the 10k of the
+            //   [v47] entry; the history above was not updated for that change.
+            constexpr size_t kChunkSize = 3000;
+            const size_t total = items.size();
+            size_t offset = 0;
+            std::vector chunk;
+            chunk.reserve(std::min(kChunkSize, total));
+
+            while (offset < total) {
+                size_t end = std::min(offset + kChunkSize, total);
+                chunk.clear();
+                for (size_t i = offset; i < end; ++i) {
+                    chunk.push_back(std::move(items[i]));
+                }
+
+                ErrorCode chunkRet = SendBatchRemoteAppendChunk(targetNodeIndex, chunk);
+
+                // The chunk sender hands the requests back in `chunk` on every
+                // return path; return them to the caller's vector so it is not
+                // left holding moved-from (empty) requests.
+                if (chunk.size() == end - offset) {
+                    for (size_t i = offset; i < end; ++i) {
+                        items[i] = std::move(chunk[i - offset]);
+                    }
+                }
+
+                if (chunkRet != ErrorCode::Success) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: Chunk send failed to node %d (offset=%zu/%zu, chunk=%zu items)\n",
+                        targetNodeIndex, offset, total, end - offset);
+                    return chunkRet;
+                }
+                offset = end;
+            }
+            return ErrorCode::Success;
+        }
+
+    private:
+        /// Sends one chunk as a single BatchAppendRequest, waiting up to
+        /// 180 s for the response and making up to kMaxAttempts attempts.
+        /// `items` is moved into the request for serialization and moved
+        /// back before sending, so it is intact on every return path.
+        /// NOTE(review): after a timeout the same chunk is sent again; if the
+        /// first request was only slow, the receiver may see it twice —
+        /// confirm the receiver-side append tolerates duplicates.
+        ErrorCode SendBatchRemoteAppendChunk(
+            int targetNodeIndex,
+            std::vector& items)
+        {
+            if (items.empty()) return ErrorCode::Success;
+            if (!m_net) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: SendBatchRemoteAppendChunk called before SetNetwork\n");
+                return ErrorCode::Fail;
+            }
+
+            constexpr int kMaxAttempts = 3;
+            for (int attempt = 0; attempt < kMaxAttempts; attempt++) {
+                Socket::ConnectionID connID = m_net->GetPeerConnection(targetNodeIndex);
+                if (connID == Socket::c_invalidConnectionID) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: Cannot connect to node %d for batch (%d items, attempt %d)\n",
+                        targetNodeIndex, (int)items.size(), attempt + 1);
+                    if (attempt + 1 < kMaxAttempts) continue;
+                    return ErrorCode::Fail;
+                }
+
+                BatchRemoteAppendRequest batchReq;
+                batchReq.m_count = static_cast(items.size());
+                batchReq.m_items = std::move(items);
+
+                Socket::ResourceID resID = m_nextResourceId.fetch_add(1);
+                auto [future, _] = CreatePendingResponse(resID);
+                (void)_;
+
+                Socket::Packet packet;
+                packet.Header().m_packetType = Socket::PacketType::BatchAppendRequest;
+                packet.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+                packet.Header().m_connectionID = Socket::c_invalidConnectionID;
+                packet.Header().m_resourceID = resID;
+
+                auto bodySize = static_cast(batchReq.EstimateBufferSize());
+                packet.Header().m_bodyLength = bodySize;
+                packet.AllocateBuffer(bodySize);
+                batchReq.Write(packet.Body());
+                items = std::move(batchReq.m_items);  // restore for retry
+
+                packet.Header().WriteBuffer(packet.HeaderBuffer());
+
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Debug,
+                    "RemotePostingOps: Sending batch of %u appends to node %d (resID=%u, attempt=%d)\n",
+                    batchReq.m_count, targetNodeIndex, resID, attempt + 1);
+
+                auto waitStart = std::chrono::steady_clock::now();
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "RemotePostingOps: BatchAppendChunk -> node %d (resID=%u, attempt=%d, items=%u) wait_start\n",
+                    targetNodeIndex, resID, attempt + 1, batchReq.m_count);
+
+                m_net->GetClient()->SendPacket(connID, std::move(packet),
+                    MakeSendFailHandler(resID));
+
+                // Generous timeout: 50k items * (~10ms TiKV roundtrip / 16 worker threads)
+                // = ~31s typical; cap at 180s to allow for lock contention with merges/splits.
+                // NOTE(review): this estimate predates the smaller chunk size
+                // now used by SendBatchRemoteAppend.
+                auto status = future.wait_for(std::chrono::seconds(180));
+                auto waitMs = std::chrono::duration_cast(
+                    std::chrono::steady_clock::now() - waitStart).count();
+                if (status == std::future_status::timeout) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: Timeout waiting for batch response from node %d (chunk=%u items, attempt=%d, waited=%lldms)\n",
+                        targetNodeIndex, batchReq.m_count, attempt + 1, (long long)waitMs);
+                    ErasePending(resID);
+                    // Do NOT invalidate the connection on timeout — a slow
+                    // response is not a broken connection, and reconnecting
+                    // floods the worker's accept loop. Real connection errors
+                    // are signalled via MakeSendFailHandler (which sets the
+                    // promise to Fail, taking the "result != Success" path
+                    // below).
+                    if (attempt + 1 < kMaxAttempts) continue;
+                    return ErrorCode::Fail;
+                }
+
+                ErrorCode result = future.get();
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "RemotePostingOps: BatchAppendChunk <- node %d (resID=%u, attempt=%d, items=%u, waited=%lldms, result=%d)\n",
+                    targetNodeIndex, resID, attempt + 1, batchReq.m_count, (long long)waitMs, (int)result);
+                if (result == ErrorCode::Success) return ErrorCode::Success;
+
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: Batch to node %d failed (attempt %d), reconnecting...\n",
+                    targetNodeIndex, attempt + 1);
+                m_net->InvalidatePeerConnection(targetNodeIndex);
+            }
+            return ErrorCode::Fail;
+        }
+
+    public:
+
+        // ==================================================================
+        // HeadSync — fire-and-forget broadcast
+        // ==================================================================
+
+        /// Sends `entries` to every peer slot except the local one. Does
+        /// nothing when `entries` is empty or no network has been set.
+        void BroadcastHeadSync(const std::vector& entries) {
+            if (entries.empty()) return;
+            if (!m_net) return;
+
+            int numNodes = m_net->GetNumNodes();
+            int localIdx = m_net->GetLocalNodeIndex();
+
+            // Count once per peer for sent-entry totals.
+            std::uint64_t targetCount = 0;
+            for (int i = 0; i < numNodes; i++) {
+                if (i != localIdx) targetCount++;
+            }
+            m_headSyncBroadcastEntries.fetch_add(entries.size() * targetCount,
+                                                 std::memory_order_relaxed);
+
+            for (int i = 0; i < numNodes; i++) {
+                if (i == localIdx) continue;
+                // Pass a copy of `entries` per peer so each can be re-enqueued
+                // into its own retry backlog independently on send failure.
+                SendOneHeadSync(i, std::vector(entries),
+                                /*isRetry=*/false);
+            }
+        }
+
+        // Send a HeadSync packet to a single peer. On TCP-level send failure
+        // (success=false reported by the network stack), the entries are
+        // appended to the per-peer retry backlog so the background retry
+        // thread can re-attempt delivery. Counter increments are done
+        // best-effort once the SendPacket completion lambda fires.
+        // Parameters:
+        //   nodeIdx - peer slot to send to.
+        //   entries - HeadSync entries to serialize; taken by value because
+        //             ownership moves into the retry path on failure.
+        //   isRetry - true when called from the backlog drain; only affects
+        //             logging and the retry-succeeded counter.
+        // NOTE(review): m_net is dereferenced without a null check here,
+        // unlike DrainHeadSyncBacklog() — callers presumably guarantee
+        // SetNetwork() ran first; confirm.
+        void SendOneHeadSync(int nodeIdx,
+                             std::vector entries,
+                             bool isRetry)
+        {
+            if (entries.empty()) return;
+
+            Socket::ConnectionID connID = m_net->GetPeerConnection(nodeIdx);
+            if (connID == Socket::c_invalidConnectionID) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: HeadSync no connection to node %d (count=%zu, isRetry=%d)\n",
+                    nodeIdx, entries.size(), isRetry ? 1 : 0);
+                // No connection: park the entries in the peer's backlog.
+                EnqueueHeadSyncRetry(nodeIdx, std::move(entries));
+                return;
+            }
+
+            // Body layout: a 32-bit entry count followed by each entry.
+            size_t bodySize = sizeof(std::uint32_t);
+            for (const auto& e : entries) bodySize += e.EstimateBufferSize();
+
+            Socket::Packet pkt;
+            pkt.Header().m_packetType = Socket::PacketType::HeadSyncRequest;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            // Resource ID 0: no pending response is registered for this packet.
+            pkt.Header().m_resourceID = 0;
+            pkt.Header().m_bodyLength = static_cast(bodySize);
+            pkt.AllocateBuffer(static_cast(bodySize));
+
+            std::uint8_t* buf = pkt.Body();
+            buf = Socket::SimpleSerialization::SimpleWriteBuffer(
+                static_cast(entries.size()), buf);
+            for (const auto& e : entries) buf = e.Write(buf);
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            // Keep the entries alive in a shared_ptr so the completion lambda
+            // can re-enqueue them if the send fails.
+            const std::uint64_t sentCount = entries.size();
+            std::shared_ptr> entriesShared =
+                std::make_shared>(std::move(entries));
+            const bool wasRetry = isRetry;
+
+            // NOTE(review): the lambda captures `this`; it presumably must not
+            // fire after this object is destroyed — confirm against the
+            // client's shutdown order.
+            m_net->GetClient()->SendPacket(connID, std::move(pkt),
+                [this, nodeIdx, entriesShared, sentCount, wasRetry](bool success) {
+                    if (success) {
+                        m_headSyncBroadcastSendOK.fetch_add(sentCount,
+                            std::memory_order_relaxed);
+                        if (wasRetry) {
+                            m_headSyncRetrySucceeded.fetch_add(sentCount,
+                                std::memory_order_relaxed);
+                        }
+                    } else {
+                        m_headSyncBroadcastSendFail.fetch_add(sentCount,
+                            std::memory_order_relaxed);
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                            "RemotePostingOps: HeadSync send to node %d FAILED "
+                            "(count=%llu, isRetry=%d) — enqueueing for retry\n",
+                            nodeIdx,
+                            (unsigned long long)sentCount,
+                            wasRetry ? 1 : 0);
+                        // Drop the cached connection so the next send
+                        // reconnects, then queue the entries for retry.
+                        m_net->InvalidatePeerConnection(nodeIdx);
+                        EnqueueHeadSyncRetry(nodeIdx, std::move(*entriesShared));
+                    }
+                });
+        }
+
+        // Append `entries` to the retry backlog of peer `nodeIdx`.
+        // If the backlog would exceed HeadSyncBacklog::kMaxEntries, the WHOLE
+        // incoming batch is dropped (counted in m_headSyncRetryDropped and
+        // logged as an error); nothing already queued is evicted.
+        // NOTE(review): entries are pushed to the back of the queue while
+        // DrainHeadSyncBacklog() pops from the front, so a batch whose retry
+        // fails again lands behind entries queued in the meantime — confirm
+        // that the receiver does not depend on per-head ordering.
+        void EnqueueHeadSyncRetry(int nodeIdx, std::vector entries) {
+            if (entries.empty()) return;
+            auto backlog = GetOrCreateBacklog(nodeIdx);
+            std::lock_guard g(backlog->mu);
+            if (backlog->queue.size() + entries.size() > HeadSyncBacklog::kMaxEntries) {
+                std::uint64_t dropped = entries.size();
+                m_headSyncRetryDropped.fetch_add(dropped, std::memory_order_relaxed);
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: HeadSync retry queue full for node %d "
+                    "(queue=%zu, dropping=%llu) — index will diverge!\n",
+                    nodeIdx, backlog->queue.size(),
+                    (unsigned long long)dropped);
+                return;
+            }
+            for (auto& e : entries) backlog->queue.push_back(std::move(e));
+            // entries.size() is unchanged by moving the elements out.
+            m_headSyncRetryEnqueued.fetch_add(entries.size(),
+                                              std::memory_order_relaxed);
+        }
+
+        // Pull up to maxBatch entries from the per-peer backlog and re-send
+        // them. Called from the retry thread and on demand. Returns the
+        // total number of entries dispatched (including for retry-of-retry).
+ size_t DrainHeadSyncBacklog(size_t maxBatch = 1024) { + if (!m_net) return 0; + std::vector nodeIdxs; + { + std::shared_lock lk(m_headSyncBacklogsMu); + nodeIdxs.reserve(m_headSyncBacklogs.size()); + for (auto& kv : m_headSyncBacklogs) nodeIdxs.push_back(kv.first); + } + size_t dispatched = 0; + for (int nodeIdx : nodeIdxs) { + auto backlog = GetOrCreateBacklog(nodeIdx); + std::vector batch; + { + std::lock_guard g(backlog->mu); + if (backlog->queue.empty()) continue; + size_t bs = std::min(backlog->queue.size(), maxBatch); + batch.reserve(bs); + for (size_t i = 0; i < bs; i++) { + batch.push_back(std::move(backlog->queue.front())); + backlog->queue.pop_front(); + } + } + size_t bs = batch.size(); + SendOneHeadSync(nodeIdx, std::move(batch), /*isRetry=*/true); + dispatched += bs; + } + return dispatched; + } + + size_t GetHeadSyncBacklogSize() const { + size_t total = 0; + std::vector> snapshot; + { + std::shared_lock lk(m_headSyncBacklogsMu); + snapshot.reserve(m_headSyncBacklogs.size()); + for (auto& kv : m_headSyncBacklogs) snapshot.push_back(kv.second); + } + for (auto& b : snapshot) { + std::lock_guard g(b->mu); + total += b->queue.size(); + } + return total; + } + + // Best-effort log dump of HeadSync delivery counters. Use whenever a + // checkpoint is needed (start/end of insert phase, before query, on + // SaveIndex, etc.). + void DumpHeadSyncStats(const char* label) const { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "[HeadSync stats %s] broadcast_entries=%llu send_ok=%llu send_fail=%llu " + "recv_entries=%llu apply_add=%llu apply_del=%llu " + "retry_enqueued=%llu retry_succeeded=%llu retry_dropped=%llu " + "backlog_now=%zu\n", + label ? 
label : "", + (unsigned long long)m_headSyncBroadcastEntries.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncBroadcastSendOK.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncBroadcastSendFail.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncRecvEntries.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncApplyAdd.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncApplyDelete.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncRetryEnqueued.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncRetrySucceeded.load(std::memory_order_relaxed), + (unsigned long long)m_headSyncRetryDropped.load(std::memory_order_relaxed), + GetHeadSyncBacklogSize()); + } + + // Counters incremented by the receiver-side HandleHeadSyncRequest / + // AddHeadIndex callback. Public so the ExtraDynamicSearcher + // HeadSyncCallback lambda can bump them after applying each entry. + void NoteHeadSyncApplyAdd() { + m_headSyncApplyAdd.fetch_add(1, std::memory_order_relaxed); + } + void NoteHeadSyncApplyDelete() { + m_headSyncApplyDelete.fetch_add(1, std::memory_order_relaxed); + } + + // Best-effort log dump of cross-node merge-hint channel counters. + // Mirrors DumpHeadSyncStats: sender side tracks how many hints we + // broadcast (send_ok / send_fail); receiver side tracks how many + // hints we got and how many were dropped (callback missing). + void DumpMergeRequestStats(const char* label) const { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "[MergeHint stats %s] send_ok=%llu send_fail=%llu " + "recv_hints=%llu recv_dropped=%llu\n", + label ? 
label : "",
+                (unsigned long long)m_mergeBroadcastSendOK.load(std::memory_order_relaxed),
+                (unsigned long long)m_mergeBroadcastSendFail.load(std::memory_order_relaxed),
+                (unsigned long long)m_mergeRecvHints.load(std::memory_order_relaxed),
+                (unsigned long long)m_mergeRecvDropped.load(std::memory_order_relaxed));
+        }
+
+        // ==================================================================
+        // RemoteLock — synchronous request/response
+        // ==================================================================
+
+        /// Send a lock (lock == true) or unlock (lock == false) request for
+        /// headID on the given layer to peer nodeIndex and block until the
+        /// peer answers, the send fails, or the timeout below expires.
+        /// Returns true only when the resolved result is ErrorCode::Success;
+        /// a missing connection, send failure, or timeout all return false.
+        bool SendRemoteLock(int nodeIndex, int layer, SizeType headID, bool lock) {
+            // Upper bound on how long the caller is blocked waiting for the peer.
+            const auto kLockTimeout = std::chrono::milliseconds(5000);
+
+            Socket::ConnectionID connID = m_net->GetPeerConnection(nodeIndex);
+            if (connID == Socket::c_invalidConnectionID) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: Cannot send remote lock to node %d\n", nodeIndex);
+                return false;
+            }
+
+            RemoteLockRequest req;
+            req.m_op = lock ? RemoteLockRequest::Op::Lock : RemoteLockRequest::Op::Unlock;
+            req.m_headID = headID;
+            req.m_layer = layer;
+
+            // Register the pending promise before sending so a fast response
+            // (or an immediate send failure) always finds it.
+            Socket::ResourceID rid = m_nextResourceId.fetch_add(1);
+            auto [future, _] = CreatePendingResponse(rid);
+            (void)_;
+
+            Socket::Packet pkt;
+            auto bodySize = req.EstimateBufferSize();
+            pkt.Header().m_packetType = Socket::PacketType::RemoteLockRequest;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = rid;
+            pkt.Header().m_bodyLength = static_cast(bodySize);
+            pkt.AllocateBuffer(static_cast(bodySize));
+            req.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            m_net->GetClient()->SendPacket(connID, std::move(pkt),
+                MakeSendFailHandler(rid));
+
+            auto status = future.wait_for(kLockTimeout);
+            if (status != std::future_status::ready) {
+                ErasePending(rid);
+                // %lld requires long long; std::int64_t is not long long on
+                // every platform, so cast to the type the format names.
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: Lock timeout for headID %lld on node %d\n",
+                    (long long)headID, nodeIndex);
+                return false;
+            }
+            return future.get() == ErrorCode::Success;
+        }
+
+        // ==================================================================
+        // Inbound packet handlers (called by WorkerNode's server/client)
+        // ==================================================================
+
+        void HandleAppendRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            if (packet.Header().m_bodyLength == 0) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: Empty AppendRequest\n");
+                return;
+            }
+
+            if (Socket::c_invalidConnectionID == packet.Header().m_connectionID)
+                packet.Header().m_connectionID = connID;
+
+            RemoteAppendRequest req;
+            const std::uint8_t* body = packet.Body();
+            const std::uint8_t* bodyEnd = body + packet.Header().m_bodyLength;
+            if (req.Read(body, bodyEnd) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: AppendRequest version mismatch\n");
+                SendAppendResponse(packet, RemoteAppendResponse::Status::Failed);
+                return;
+            }
+
+            ErrorCode result = ErrorCode::Fail;
+            {
+                std::shared_lock cbLock(m_callbackLifetimeMutex);
+                const auto* cb = LookupAppendCallback_Locked(req.m_layer);
+                if (cb) {
+                    auto headVec = std::make_shared(std::move(req.m_headVec));
+                    result = (*cb)(
+                        req.m_headID, headVec, req.m_appendNum, req.m_appendPosting);
+                } else {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "RemotePostingOps: AppendRequest layer=%d has no callback registered\n",
+                        req.m_layer);
+                }
+            }
+
+            auto status = (result == ErrorCode::Success)
+                ?
RemoteAppendResponse::Status::Success
+                : RemoteAppendResponse::Status::Failed;
+            SendAppendResponse(packet, status);
+        }
+
+        /// Resolve the pending promise registered under the packet's
+        /// resource ID: Success only when the packet status is Ok, the body
+        /// parses, and the peer reported Status::Success.
+        void HandleAppendResponse(Socket::ConnectionID connID, Socket::Packet packet) {
+            Socket::ResourceID resID = packet.Header().m_resourceID;
+            auto promise = TakePendingResponse(resID);
+            if (!promise) return;
+
+            if (packet.Header().m_processStatus != Socket::PacketProcessStatus::Ok) {
+                promise->set_value(ErrorCode::Fail);
+                return;
+            }
+
+            RemoteAppendResponse resp;
+            if (resp.Read(packet.Body()) == nullptr) {
+                promise->set_value(ErrorCode::Fail);
+                return;
+            }
+
+            promise->set_value(
+                resp.m_status == RemoteAppendResponse::Status::Success
+                    ? ErrorCode::Success : ErrorCode::Fail);
+        }
+
+        /// Parse a batch of remote appends and run each item through the
+        /// per-layer append callback. Items are submitted as individual
+        /// BatchAppendItemJob instances; whichever party performs the last
+        /// decrement of the shared `remaining` counter sends the single
+        /// BatchAppendResponse carrying the success/fail totals.
+        void HandleBatchAppendRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            if (packet.Header().m_bodyLength == 0) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: Empty BatchAppendRequest\n");
+                return;
+            }
+
+            if (Socket::c_invalidConnectionID == packet.Header().m_connectionID)
+                packet.Header().m_connectionID = connID;
+
+            auto batchReq = std::make_shared();
+            if (batchReq->Read(packet.Body(), packet.Header().m_bodyLength) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: BatchAppendRequest parse failed\n");
+                SendBatchAppendResponse(packet, 0, 1);
+                return;
+            }
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Debug,
+                "RemotePostingOps: Received batch of %u appends\n", batchReq->m_count);
+
+            // Submit each item as a normal-priority Job to the searcher's
+            // shared compute pool. Pool workers run the local Append callback
+            // exactly like a local insert would. Last completion ACKs the
+            // sender. This puts remote work on the SAME concurrency budget
+            // as local Split/Merge/Reassign — eliminating the over-subscribed
+            // TiKV behaviour of the old separate bg executor + transient
+            // sub-worker threads.
+            auto packetPtr = std::make_shared(std::move(packet));
+            const size_t total = batchReq->m_items.size();
+            if (total == 0) {
+                SendBatchAppendResponse(*packetPtr, 0, 0);
+                return;
+            }
+            auto remaining = std::make_shared>(total);
+            auto successCount = std::make_shared>(0);
+            auto failCount = std::make_shared>(0);
+
+            if (m_jobSubmitters.empty()) {
+                // Fallback: process inline on the network thread. Should not
+                // happen once ExtraDynamicSearcher has wired its pool.
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: no job submitter wired; running BatchAppend synchronously\n");
+                std::shared_lock cbLock(m_callbackLifetimeMutex);
+                for (auto& req : batchReq->m_items) {
+                    ErrorCode r = ErrorCode::Fail;
+                    const auto* cb = LookupAppendCallback_Locked(req.m_layer);
+                    if (cb) {
+                        auto hv = std::make_shared(std::move(req.m_headVec));
+                        r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting);
+                    }
+                    (r == ErrorCode::Success ? *successCount : *failCount).fetch_add(1);
+                }
+                SendBatchAppendResponse(*packetPtr, successCount->load(), failCount->load());
+                return;
+            }
+
+            for (size_t i = 0; i < total; i++) {
+                auto* job = new BatchAppendItemJob(
+                    this, batchReq, i, remaining, successCount, failCount, packetPtr);
+                // Route to the per-layer searcher pool matching this item's
+                // m_layer so local Append/Split/Merge on layer N and remote
+                // appends targeting layer N share the same 16-thread budget.
+                // A single global submitter sent both layers' work into one
+                // pool, causing 35k+ queue depth on the receiver side.
+                int layer = batchReq->m_items[i].m_layer;
+                const JobSubmitter* sub = nullptr;
+                if (layer >= 0 && static_cast(layer) < m_jobSubmitters.size()
+                    && m_jobSubmitters[layer]) {
+                    sub = &m_jobSubmitters[layer];
+                } else {
+                    // Layer's pool not yet wired — fall back to whichever
+                    // submitter we have.
+                    for (auto& s : m_jobSubmitters) { if (s) { sub = &s; break; } }
+                }
+                // Normal priority. Per-layer routing (m_jobSubmitters[layer])
+                // already isolates layer-N append items from other layers'
+                // pools. High priority starved split entirely (split:N
+                // in_flight, 0 completed) because once all 16 worker threads
+                // are running long-tail append items, fresh high-prio appends
+                // keep cutting in front of split. Append throughput per chunk
+                // is limited by pool concurrency × per-item RMW; widen the
+                // pool (AppendThreadNum) instead of using priority hacks.
+                if (sub) {
+                    (*sub)(job, /*high=*/false);
+                } else {
+                    // No usable submitter (every slot is empty): the job will
+                    // never run, so count it as failed here. If this was the
+                    // last outstanding item, no job remains to send the ACK,
+                    // so send it now; otherwise the sender would wait until
+                    // its own timeout.
+                    delete job;
+                    failCount->fetch_add(1);
+                    if (remaining->fetch_sub(1) == 1) {
+                        SendBatchAppendResponse(
+                            *packetPtr, successCount->load(), failCount->load());
+                    }
+                }
+            }
+        }
+
+        /// Resolve the pending promise for a batch append: Success only when
+        /// the packet status is Ok, the body parses, and no item failed.
+        void HandleBatchAppendResponse(Socket::ConnectionID connID, Socket::Packet packet) {
+            Socket::ResourceID resID = packet.Header().m_resourceID;
+            auto promise = TakePendingResponse(resID);
+            if (!promise) return;
+
+            if (packet.Header().m_processStatus != Socket::PacketProcessStatus::Ok) {
+                promise->set_value(ErrorCode::Fail);
+                return;
+            }
+
+            BatchRemoteAppendResponse resp;
+            if (resp.Read(packet.Body()) == nullptr) {
+                promise->set_value(ErrorCode::Fail);
+                return;
+            }
+
+            promise->set_value(resp.m_failCount == 0 ?
ErrorCode::Success : ErrorCode::Fail);
+        }
+
+        /// Apply a batch of HeadSync entries received from a peer.
+        /// Body layout as read here: a uint32 entry count followed by that
+        /// many serialized HeadSyncEntry records. Each parsed entry is handed
+        /// to the callback registered for entry.m_layer; entries whose layer
+        /// has no callback are logged and skipped. No response is sent.
+        /// The callback-lifetime lock is held in shared mode for the whole
+        /// call, including every callback invocation.
+        void HandleHeadSyncRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            std::shared_lock cbLock(m_callbackLifetimeMutex);
+            if (m_headSyncCallbacks.empty()) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: HeadSyncRequest but no callbacks registered\n");
+                return;
+            }
+
+            // Validate the length BEFORE touching the body: reading the
+            // 4-byte count from a shorter (or empty) body would run past the
+            // end of the buffer.
+            std::uint32_t bodyLength = packet.Header().m_bodyLength;
+            if (bodyLength < sizeof(std::uint32_t)) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: HeadSyncRequest body too short (bodyLength=%u)\n",
+                    bodyLength);
+                return;
+            }
+
+            const std::uint8_t* buf = packet.Body();
+            const std::uint8_t* bufEnd = buf + bodyLength;
+            std::uint32_t entryCount = 0;
+            buf = Socket::SimpleSerialization::SimpleReadBuffer(buf, entryCount);
+
+            // Reject counts that cannot fit in the remaining bytes (the
+            // divisor 8 is the per-entry lower bound used by the original check).
+            if (entryCount > (bodyLength - sizeof(std::uint32_t)) / 8) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: HeadSyncRequest entryCount=%u exceeds bodyLength=%u\n",
+                    entryCount, bodyLength);
+                return;
+            }
+
+            for (std::uint32_t i = 0; i < entryCount; i++) {
+                if (buf >= bufEnd) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: HeadSync buffer overrun at entry %u/%u\n", i, entryCount);
+                    break;
+                }
+                HeadSyncEntry entry;
+                buf = entry.Read(buf);
+                if (!buf || buf > bufEnd) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                        "RemotePostingOps: HeadSync parse error at entry %u/%u\n", i, entryCount);
+                    break;
+                }
+                m_headSyncRecvEntries.fetch_add(1, std::memory_order_relaxed);
+                const auto* cb = LookupHeadSyncCallback_Locked(entry.m_layer);
+                if (cb) {
+                    (*cb)(entry);
+                } else {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "RemotePostingOps: HeadSyncEntry layer=%d has no callback registered (op=%d, vid=%d)\n",
+                        entry.m_layer, static_cast(entry.op), (int)entry.headVID);
+                }
+            }
+        }
+
+        // ==================================================================
+        // Merge — fire-and-forget cross-node hint
+        // ==================================================================
+
+        /// Send a batch of merge hints to one peer.
Fire-and-forget: no
+        /// response is expected and no retry queue is maintained. Receiver-
+        /// side m_mergeList dedups, and the owner discovers underfull
+        /// postings through its own paths (own search, own Append) if any
+        /// notification is dropped.
+        void SendBatchRemoteMerge(int targetNodeIndex,
+                                  const std::vector& items)
+        {
+            if (items.empty()) return;
+
+            Socket::ConnectionID connID = m_net->GetPeerConnection(targetNodeIndex);
+            if (connID == Socket::c_invalidConnectionID) {
+                m_mergeBroadcastSendFail.fetch_add(items.size(), std::memory_order_relaxed);
+                return;
+            }
+
+            BatchRemoteMergeRequest batch;
+            batch.m_count = static_cast(items.size());
+            batch.m_items = items;
+
+            Socket::Packet pkt;
+            pkt.Header().m_packetType = Socket::PacketType::MergeRequest;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = 0;
+
+            auto bodySize = static_cast(batch.EstimateBufferSize());
+            pkt.Header().m_bodyLength = bodySize;
+            pkt.AllocateBuffer(bodySize);
+            batch.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            const std::uint64_t sentCount = items.size();
+            m_net->GetClient()->SendPacket(connID, std::move(pkt),
+                [this, targetNodeIndex, sentCount](bool success) {
+                    if (success) {
+                        m_mergeBroadcastSendOK.fetch_add(sentCount, std::memory_order_relaxed);
+                    } else {
+                        m_mergeBroadcastSendFail.fetch_add(sentCount, std::memory_order_relaxed);
+                        m_net->InvalidatePeerConnection(targetNodeIndex);
+                    }
+                });
+        }
+
+        /// Receive a batch of merge hints and forward each head ID to the
+        /// merge callback of its layer. Hints for a layer without a callback
+        /// are counted in m_mergeRecvDropped. No response is sent.
+        void HandleMergeRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            (void)connID;
+            BatchRemoteMergeRequest batch;
+            if (batch.Read(packet.Body(), packet.Header().m_bodyLength) == nullptr) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: MergeRequest parse failed (bodyLength=%u)\n",
+                    packet.Header().m_bodyLength);
+                return;
+            }
+
+            std::shared_lock cbLock(m_callbackLifetimeMutex);
+            for (const auto& item : batch.m_items) {
+                const auto* cb = LookupMergeCallback_Locked(item.m_layer);
+                if (cb) {
+                    (*cb)(item.m_headID);
+                    m_mergeRecvHints.fetch_add(1, std::memory_order_relaxed);
+                } else {
+                    m_mergeRecvDropped.fetch_add(1, std::memory_order_relaxed);
+                }
+            }
+        }
+
+        /// Serve a peer's lock/unlock request and always answer it.
+        /// The reply is Granted only when the request parses, a callback is
+        /// registered for req.m_layer, and that callback returns true; every
+        /// other outcome (including a parse failure) replies Denied so the
+        /// requester in SendRemoteLock is not left waiting for its timeout.
+        void HandleRemoteLockRequest(Socket::ConnectionID connID, Socket::Packet packet) {
+            RemoteLockResponse resp;
+            resp.m_status = RemoteLockResponse::Status::Denied;
+
+            RemoteLockRequest req;
+            if (req.Read(packet.Body()) == nullptr) {
+                // Fall through and reply Denied: the resource ID lives in the
+                // header, so the requester can still be matched and released.
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                    "RemotePostingOps: Failed to parse RemoteLockRequest\n");
+            } else {
+                std::shared_lock cbLock(m_callbackLifetimeMutex);
+                const auto* cb = LookupRemoteLockCallback_Locked(req.m_layer);
+                if (cb) {
+                    bool isLock = (req.m_op == RemoteLockRequest::Op::Lock);
+                    bool success = (*cb)(req.m_headID, isLock);
+                    if (success) resp.m_status = RemoteLockResponse::Status::Granted;
+                } else {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "RemotePostingOps: RemoteLockRequest layer=%d has no callback registered\n",
+                        req.m_layer);
+                }
+            }
+
+            Socket::Packet ret;
+            auto bodySize = resp.EstimateBufferSize();
+            ret.Header().m_packetType = Socket::PacketType::RemoteLockResponse;
+            ret.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            ret.Header().m_connectionID = connID;
+            ret.Header().m_resourceID = packet.Header().m_resourceID;
+            ret.Header().m_bodyLength = static_cast(bodySize);
+            ret.AllocateBuffer(static_cast(bodySize));
+            resp.Write(ret.Body());
+            ret.Header().WriteBuffer(ret.HeaderBuffer());
+
+            m_net->GetServer()->SendPacket(connID, std::move(ret), nullptr);
+        }
+
+        /// Resolve the pending promise of a SendRemoteLock call: Success only
+        /// when the packet status is Ok, the body parses, and the peer
+        /// granted the request.
+        void HandleRemoteLockResponse(Socket::ConnectionID connID, Socket::Packet packet) {
+            Socket::ResourceID rid = packet.Header().m_resourceID;
+            auto promise = TakePendingResponse(rid);
+            if (!promise) return;
+
+            // Same status gate as HandleAppendResponse / HandleBatchAppendResponse.
+            if (packet.Header().m_processStatus != Socket::PacketProcessStatus::Ok) {
+                promise->set_value(ErrorCode::Fail);
+                return;
+            }
+
+            RemoteLockResponse resp;
+            if (resp.Read(packet.Body()) == nullptr) {
+                promise->set_value(ErrorCode::Fail);
+                return;
+            }
+
+            promise->set_value(resp.m_status == RemoteLockResponse::Status::Granted
+                ? ErrorCode::Success : ErrorCode::Fail);
+        }
+
+        // ---- Response matching helpers ----
+
+        /// Register a promise under resID and return its future. The bool in
+        /// the returned pair is always true.
+        std::pair, bool> CreatePendingResponse(Socket::ResourceID resID) {
+            std::promise promise;
+            auto future = promise.get_future();
+            std::lock_guard lock(m_pendingMutex);
+            m_pendingResponses.emplace(resID, std::move(promise));
+            return {std::move(future), true};
+        }
+
+        /// Drop the pending entry for resID, if any (used after a timeout).
+        void ErasePending(Socket::ResourceID resID) {
+            std::lock_guard lock(m_pendingMutex);
+            m_pendingResponses.erase(resID);
+        }
+
+        /// Take a pending promise out of the map (returns nullptr if not found).
+        std::unique_ptr> TakePendingResponse(Socket::ResourceID resID) {
+            std::lock_guard lock(m_pendingMutex);
+            auto it = m_pendingResponses.find(resID);
+            if (it == m_pendingResponses.end()) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: Response for unknown resourceID %u\n", resID);
+                return nullptr;
+            }
+            auto p = std::make_unique>(std::move(it->second));
+            m_pendingResponses.erase(it);
+            return p;
+        }
+
+        /// Create a send-failure callback that resolves the pending promise.
+        std::function MakeSendFailHandler(Socket::ResourceID resID) {
+            // The returned callback does nothing on success. On failure it
+            // looks up resID under m_pendingMutex and, if the entry is still
+            // present, resolves it with ErrorCode::Fail and removes it.
+            return [resID, this](bool success) {
+                if (!success) {
+                    std::lock_guard lock(m_pendingMutex);
+                    auto it = m_pendingResponses.find(resID);
+                    if (it != m_pendingResponses.end()) {
+                        it->second.set_value(ErrorCode::Fail);
+                        m_pendingResponses.erase(it);
+                    }
+                }
+            };
+        }
+
+        /// Build an AppendResponse carrying `status` and send it through the
+        /// server endpoint. The reply reuses srcPacket's connection ID and
+        /// resource ID; no send-completion callback is registered (nullptr).
+        void SendAppendResponse(Socket::Packet& srcPacket, RemoteAppendResponse::Status status) {
+            RemoteAppendResponse resp;
+            resp.m_status = status;
+
+            Socket::Packet ret;
+            ret.Header().m_packetType = Socket::PacketType::AppendResponse;
+            ret.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            ret.Header().m_connectionID = srcPacket.Header().m_connectionID;
+            ret.Header().m_resourceID = srcPacket.Header().m_resourceID;
+
+            auto bodySize = static_cast(resp.EstimateBufferSize());
+            ret.Header().m_bodyLength = bodySize;
+            ret.AllocateBuffer(bodySize);
+            resp.Write(ret.Body());
+            // Serialize the header last, after m_bodyLength has been set.
+            ret.Header().WriteBuffer(ret.HeaderBuffer());
+
+            m_net->GetServer()->SendPacket(srcPacket.Header().m_connectionID, std::move(ret), nullptr);
+        }
+
+        /// Build a BatchAppendResponse carrying the success/fail totals and
+        /// send it through the server endpoint. The reply reuses srcPacket's
+        /// connection ID and resource ID; no send-completion callback is
+        /// registered (nullptr).
+        void SendBatchAppendResponse(Socket::Packet& srcPacket,
+                                     std::uint32_t successCount, std::uint32_t failCount) {
+            BatchRemoteAppendResponse resp;
+            resp.m_successCount = successCount;
+            resp.m_failCount = failCount;
+
+            Socket::Packet ret;
+            ret.Header().m_packetType = Socket::PacketType::BatchAppendResponse;
+            ret.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            ret.Header().m_connectionID = srcPacket.Header().m_connectionID;
+            ret.Header().m_resourceID = srcPacket.Header().m_resourceID;
+
+            auto bodySize = static_cast(resp.EstimateBufferSize());
+            ret.Header().m_bodyLength = bodySize;
+            ret.AllocateBuffer(bodySize);
+            resp.Write(ret.Body());
+            // Serialize the header last, after m_bodyLength has been set.
+            ret.Header().WriteBuffer(ret.HeaderBuffer());
+
+            m_net->GetServer()->SendPacket(srcPacket.Header().m_connectionID, std::move(ret), nullptr);
+        }
+
+        // ==================================================================
+        // [Bug 26] Background
executor — slow-lane for batch RPC handlers (retired)
+        // ==================================================================
+        //
+        // Original problem: the network server thread pool has only 8 threads
+        // (NetworkNode.h). HandleBatchAppendRequest used to do heavy TiKV work
+        // (fan out to 4 sub-workers and join), each call tying up its
+        // network thread for tens of seconds during inserts.
+        // Once 4–8 such handlers ran concurrently, every network thread was
+        // blocked and latency-sensitive RPCs (HeadSync, RemoteLock) could not be
+        // serviced.
+        //
+        // Original fix (since retired, see below): parse on the network thread (fast), then enqueue the heavy
+        // work onto a dedicated background thread pool and return. The
+        // network thread immediately became available for other RPCs.
+        // The background worker eventually sent the response itself.
+        //
+        // Sizing rationale of the retired executor:
+        //   - Threads defaulted to 8: matched the network pool so we never
+        //     under-utilized CPU even if every network thread was parsing a
+        //     batch. Was tunable via env SPTAG_BG_EXEC_THREADS.
+        //   - Queue cap defaulted to 256: plenty of headroom for typical bursts;
+        //     when full, it fell back to synchronous execution to preserve
+        //     correctness rather than dropping requests.
+
+        // Background executor removed: BatchAppend now runs as per-item Jobs on
+        // the owning layer's searcher compute pool via SetJobSubmitter() so it
+        // shares that pool's concurrency budget with local Split/Merge/Reassign
+        // (normal priority; high priority starved Split). See HandleBatchAppendRequest.
+
+        // ==================================================================
+        // HeadSync retry thread — periodic best-effort drain of per-peer
+        // backlogs that were populated by failed BroadcastHeadSync sends.
+        //
+        // Why: BroadcastHeadSync is fire-and-forget by design (we don't
+        // want to block the layer-1 split path on a slow peer).
When the
+        // TCP send completion reports failure, we previously dropped the
+        // entries forever and the peer's headIndex / m_pSamples diverged,
+        // causing the receiver's BKTree to miss heads at search time and
+        // recall to collapse on later batches. The retry queue + this
+        // thread make HeadSync delivery reliable best-effort.
+        // ==================================================================
+
+        struct HeadSyncBacklog {
+            std::mutex mu;
+            std::deque queue;
+            // Matches m_addCountForRebuild scale per peer. If we ever hit
+            // this we log + drop (fall back to manual reconcile).
+            static constexpr size_t kMaxEntries = 1u << 18;  // 262144
+        };
+
+        /// Start the background retry thread. The drain interval defaults to
+        /// 500 ms and can be overridden with SPTAG_HEADSYNC_RETRY_INTERVAL_MS
+        /// (values below 50 are raised to 50; unparsable values are ignored).
+        /// Calling this while the thread is already running is a no-op.
+        void StartHeadSyncRetryThread() {
+            // Assigning to a joinable std::thread calls std::terminate, so a
+            // second Start without an intervening Stop must not reach the
+            // assignment below.
+            if (m_headSyncRetryThread.joinable()) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                    "RemotePostingOps: HeadSync retry thread already running\n");
+                return;
+            }
+
+            const char* envIntervalMs = std::getenv("SPTAG_HEADSYNC_RETRY_INTERVAL_MS");
+            int intervalMs = 500;
+            if (envIntervalMs) {
+                try { intervalMs = std::max(50, std::stoi(envIntervalMs)); } catch (...) {}
+            }
+            m_headSyncRetryIntervalMs = intervalMs;
+            m_headSyncRetryStop.store(false, std::memory_order_release);
+            m_headSyncRetryThread = std::thread([this]() { HeadSyncRetryLoop(); });
+        }
+
+        /// Ask the retry thread to stop and wait for it to finish. Because
+        /// the loop checks the stop flag only after each sleep, this can
+        /// block for up to one retry interval plus the final drain passes.
+        void StopHeadSyncRetryThread() {
+            m_headSyncRetryStop.store(true, std::memory_order_release);
+            if (m_headSyncRetryThread.joinable()) m_headSyncRetryThread.join();
+        }
+
+        /// Body of the retry thread: drain the backlog once per interval
+        /// until stopped, make up to five final drain passes, then dump the
+        /// HeadSync and merge-hint counters if any activity was recorded.
+        void HeadSyncRetryLoop() {
+            using namespace std::chrono;
+            while (!m_headSyncRetryStop.load(std::memory_order_acquire)) {
+                std::this_thread::sleep_for(milliseconds(m_headSyncRetryIntervalMs));
+                if (m_net) DrainHeadSyncBacklog();
+            }
+            // Final drain pass to give the network a chance to flush.
+            for (int i = 0; i < 5 && m_net; i++) {
+                size_t dispatched = DrainHeadSyncBacklog();
+                if (dispatched == 0) break;
+                std::this_thread::sleep_for(milliseconds(200));
+            }
+            if (m_headSyncBroadcastEntries.load(std::memory_order_relaxed) > 0
+                || m_headSyncRecvEntries.load(std::memory_order_relaxed) > 0) {
+                DumpHeadSyncStats("shutdown");
+            }
+            // Include the failure/drop counters in the condition: a run in
+            // which every hint failed to send or was dropped is exactly the
+            // case where the dump is needed.
+            if (m_mergeBroadcastSendOK.load(std::memory_order_relaxed) > 0
+                || m_mergeBroadcastSendFail.load(std::memory_order_relaxed) > 0
+                || m_mergeRecvHints.load(std::memory_order_relaxed) > 0
+                || m_mergeRecvDropped.load(std::memory_order_relaxed) > 0) {
+                DumpMergeRequestStats("shutdown");
+            }
+        }
+
+        /// Return the backlog for nodeIdx, creating it on first use. The
+        /// lookup takes the map lock in shared mode; only a miss upgrades to
+        /// an exclusive lock, where the slot is re-checked before creation.
+        std::shared_ptr GetOrCreateBacklog(int nodeIdx) {
+            {
+                std::shared_lock lk(m_headSyncBacklogsMu);
+                auto it = m_headSyncBacklogs.find(nodeIdx);
+                if (it != m_headSyncBacklogs.end()) return it->second;
+            }
+            std::unique_lock lk(m_headSyncBacklogsMu);
+            auto& slot = m_headSyncBacklogs[nodeIdx];
+            if (!slot) slot = std::make_shared();
+            return slot;
+        }
+
+        // ---- State ----
+
+        NetworkAccess* m_net = nullptr;
+
+        // Per-layer callback registries. Indexed by ExtraDynamicSearcher layer
+        // (m_layer at the call site). Resized lazily by SetXxxCallback. The
+        // empty/null entry at layer 0 is preserved so a single-layer caller
+        // (legacy or test) without explicit Set keeps the no-op default.
+        //
+        // The shared-callback design existed because the original SPANN had
+        // a single ExtraDynamicSearcher (Layers=1). With Layers>=2, each
+        // layer's lambda captures its own `this` (hence m_layer) and dispatch
+        // by request.m_layer is required to avoid routing layer-0 events to
+        // layer-1's storage and vice versa.
+        std::vector m_appendCallbacks;
+        std::vector m_headSyncCallbacks;
+        std::vector m_remoteLockCallbacks;
+        std::vector m_mergeCallbacks;
+
+        // Per-layer ownership tokens.
Each ExtraDynamicSearcher claims its + // layer slot at SetWorker time and releases it on destruction; this + // prevents earlier-layer destructors from wiping a later-layer's + // callbacks (the original ClaimCallbackOwnership purpose, now + // applied per-layer instead of globally). + std::vector> m_callbackOwners; + + // Guards the lifetime of the captured `this` inside the callbacks. + // Held in shared mode by every callback invocation site, and in + // exclusive mode by ClearCallbacks() / SetXxxCallback() so that + // (re)assigning a callback can never race with an in-flight invocation. + mutable std::shared_timed_mutex m_callbackLifetimeMutex; + + std::atomic m_nextResourceId{1}; + std::mutex m_pendingMutex; + std::unordered_map> m_pendingResponses; + + // Per-item Job: each remote append request becomes one Job submitted + // to the searcher's shared SPDKThreadPool. The last completing Job + // ACKs the sender. Identical to how a local insert thread would call + // Append; the only difference is the request originated on a peer. 
+        class BatchAppendItemJob : public Helper::ThreadPool::Job {
+        public:
+            // ops:          owning RemotePostingOps (raw pointer; used for the
+            //               callback lookup and for sending the response).
+            // batchReq:     the parsed batch, shared by all jobs of that batch.
+            // index:        position of this job's item inside batchReq->m_items.
+            // remaining:    shared count of items not yet finished.
+            // successCount / failCount: shared per-batch result totals.
+            // replyPacket:  the request packet, passed to SendBatchAppendResponse
+            //               by whichever job finishes last.
+            BatchAppendItemJob(RemotePostingOps* ops,
+                               std::shared_ptr batchReq,
+                               size_t index,
+                               std::shared_ptr> remaining,
+                               std::shared_ptr> successCount,
+                               std::shared_ptr> failCount,
+                               std::shared_ptr replyPacket)
+                : m_ops(ops), m_batchReq(std::move(batchReq)), m_index(index),
+                  m_remaining(std::move(remaining)),
+                  m_success(std::move(successCount)),
+                  m_fail(std::move(failCount)),
+                  m_replyPacket(std::move(replyPacket)) {}
+
+            void exec(IAbortOperation*) override { run(); }
+            // Workspace variant: publish the supplied workspace through
+            // tls_preallocAppendWorkSpace for the duration of run(), then
+            // restore the previous value.
+            void exec(void* workspace, IAbortOperation*) override {
+                void* prev = tls_preallocAppendWorkSpace;
+                tls_preallocAppendWorkSpace = workspace;
+                run();
+                tls_preallocAppendWorkSpace = prev;
+            }
+
+        private:
+            // Run this item's append callback and record the outcome; the
+            // job whose decrement takes m_remaining from 1 to 0 sends the
+            // batch response.
+            void run() {
+                {
+                    // Hold the callback-lifetime lock in shared mode while the
+                    // callback is looked up and invoked.
+                    std::shared_lock cbLock(m_ops->m_callbackLifetimeMutex);
+                    auto& req = m_batchReq->m_items[m_index];
+                    // Stays Fail when no callback is registered for the layer.
+                    ErrorCode r = ErrorCode::Fail;
+                    const auto* cb = m_ops->LookupAppendCallback_Locked(req.m_layer);
+                    if (cb) {
+                        auto hv = std::make_shared(std::move(req.m_headVec));
+                        r = (*cb)(req.m_headID, hv, req.m_appendNum, req.m_appendPosting);
+                    }
+                    if (r == ErrorCode::Success) m_success->fetch_add(1);
+                    else m_fail->fetch_add(1);
+                }
+                // fetch_sub returns the previous value, so == 1 means this
+                // job was the last one outstanding.
+                if (m_remaining->fetch_sub(1) == 1) {
+                    m_ops->SendBatchAppendResponse(
+                        *m_replyPacket, m_success->load(), m_fail->load());
+                }
+            }
+
+            RemotePostingOps* m_ops;
+            std::shared_ptr m_batchReq;
+            size_t m_index;
+            std::shared_ptr> m_remaining;
+            std::shared_ptr> m_success;
+            std::shared_ptr> m_fail;
+            std::shared_ptr m_replyPacket;
+        };
+
+        // [Bug 26 retired] bg executor removed — see HandleBatchAppendRequest.
+        // m_bgWorkers etc were replaced by per-layer job submission into the
+        // searcher's shared SPDKThreadPool via m_jobSubmitters[layer].
+        std::vector m_jobSubmitters;
+
+        // HeadSync delivery diagnostics + retry queue (v33). Counters give
+        // observability for sender/receiver gaps; per-peer backlogs +
+        // retry thread make broadcast reliable best-effort.
+ std::atomic m_headSyncBroadcastEntries{0}; + std::atomic m_headSyncBroadcastSendOK{0}; + std::atomic m_headSyncBroadcastSendFail{0}; + std::atomic m_headSyncRecvEntries{0}; + std::atomic m_headSyncApplyAdd{0}; + std::atomic m_headSyncApplyDelete{0}; + std::atomic m_headSyncRetryEnqueued{0}; + std::atomic m_headSyncRetrySucceeded{0}; + std::atomic m_headSyncRetryDropped{0}; + + // Cross-node merge hint counters. No retry queue: dropped + // notifications are recoverable since the owner discovers underfull + // postings via its own paths too. + std::atomic m_mergeBroadcastSendOK{0}; + std::atomic m_mergeBroadcastSendFail{0}; + std::atomic m_mergeRecvHints{0}; + std::atomic m_mergeRecvDropped{0}; + + mutable std::shared_timed_mutex m_headSyncBacklogsMu; + std::unordered_map> m_headSyncBacklogs; + std::thread m_headSyncRetryThread; + std::atomic m_headSyncRetryStop{false}; + int m_headSyncRetryIntervalMs{500}; + }; + +} // namespace SPTAG::SPANN diff --git a/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h new file mode 100644 index 000000000..8af906fcc --- /dev/null +++ b/AnnService/inc/Core/SPANN/Distributed/WorkerNode.h @@ -0,0 +1,616 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_SPANN_WORKERNODE_H_ +#define _SPTAG_SPANN_WORKERNODE_H_ + +#include "inc/Core/SPANN/Distributed/NetworkNode.h" +#include "inc/Helper/KeyValueIO.h" +#include "inc/Helper/CommonHelper.h" +#include "inc/Socket/SimpleSerialization.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace SPTAG::SPANN { + + /// Distributed compute worker node. 
+ /// + /// Responsibilities: + /// - Route headIDs to owner nodes via consistent hash ring + /// - Queue and flush remote appends (batched RPC) + /// - HeadSync broadcast and remote locking + /// - Register with dispatcher and receive ring updates + /// - Handle incoming dispatch commands from the driver + class WorkerNode : public NetworkNode { + public: + using AppendCallback = RemotePostingOps::AppendCallback; + using DispatchCallback = DispatchCoordinator::DispatchCallback; + using HeadSyncCallback = RemotePostingOps::HeadSyncCallback; + using RemoteLockCallback = RemotePostingOps::RemoteLockCallback; + + /// Initialize with separate dispatcher/worker/store addresses. + /// workerIndex is 0-based (0 = driver/local, 1+ = remote). + /// Internal node index = workerIndex + 1 (0 is reserved for dispatcher). + bool Initialize( + std::shared_ptr p_db, + int workerIndex, + const std::pair& dispatcherAddr, + const std::vector>& workerAddrs, + const std::vector& storeAddrs, + int vnodeCount = 150) + { + if (storeAddrs.empty()) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "WorkerNode::Initialize: storeAddrs is empty\n"); + return false; + } + + // Build combined addr list: [dispatcher, worker0, worker1, ...] + std::vector> allAddrs; + allAddrs.push_back(dispatcherAddr); + allAddrs.insert(allAddrs.end(), workerAddrs.begin(), workerAddrs.end()); + + int internalIdx = workerIndex + 1; // 0 = dispatcher, 1..N = workers + if (!InitializeNetwork(internalIdx, allAddrs, vnodeCount)) return false; + + // [Bug 30] Populate compute-role fields so callers can ask + // "how many data shards?" / "which shard am I?" without + // accidentally including the dispatcher slot. 
+ m_numDispatchNodes = 1; + m_numWorkerNodes = static_cast(workerAddrs.size()); + m_workerNodeIndex = workerIndex; + + m_db = p_db; + m_nodeStores = storeAddrs; + + // Build store → node list mapping (worker internal indices 1..N) + int numWorkers = static_cast(workerAddrs.size()); + int numStores = static_cast(storeAddrs.size()); + for (int wi = 0; wi < numWorkers; wi++) { + int storeIdx = wi % numStores; + m_storeToNodes[storeAddrs[storeIdx]].push_back(wi + 1); + } + for (auto& [store, nodes] : m_storeToNodes) { + std::string nodeList; + for (int n : nodes) { nodeList += std::to_string(n) + " "; } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "WorkerNode: store %s → nodes [%s]\n", store.c_str(), nodeList.c_str()); + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "WorkerNode: initialized (workerIndex=%d, internalIdx=%d, %d stores, %d vnodes/node)\n", + workerIndex, internalIdx, numStores, vnodeCount); + + m_dispatch.SetNetwork(this); + m_remoteOps.SetNetwork(this); + + return true; + } + + public: + bool Start() { return StartNetwork(); } + + // ---- Callbacks ---- + // + // ExtraDynamicSearcher passes its m_layer when binding callbacks so + // that with multi-layer SPANN (Layers >= 2) each layer has its own + // captured `this` and request dispatch on the receiver side routes by + // request.m_layer. + + void SetAppendCallback(int layer, AppendCallback cb) { m_remoteOps.SetAppendCallback(layer, std::move(cb)); } + void SetHeadSyncCallback(int layer, HeadSyncCallback cb) { m_remoteOps.SetHeadSyncCallback(layer, std::move(cb)); } + void SetRemoteLockCallback(int layer, RemoteLockCallback cb) { m_remoteOps.SetRemoteLockCallback(layer, std::move(cb)); } + // Inject the searcher's shared compute pool so receiver-side + // BatchAppend work runs there (high-priority Jobs) instead of in a + // separate executor. Idempotent: safe to call multiple times. 
+        /// Forwards the job submitter for `layer` to m_remoteOps.
+        void SetJobSubmitter(int layer, RemotePostingOps::JobSubmitter s) {
+            m_remoteOps.SetJobSubmitter(layer, std::move(s));
+        }
+        /// Atomically clear all RPC callbacks (every layer) and wait for any
+        /// in-flight invocation to finish.
+        void ClearCallbacks() {
+            m_remoteOps.ClearCallbacks();
+        }
+        /// Per-layer ownership API used by ExtraDynamicSearcher to avoid having
+        /// one layer's destructor wipe another layer's still-active callbacks.
+        /// SetWorker calls ClaimCallbackOwnership(m_layer, this) before
+        /// registering; the destructor calls ClearCallbacksIfOwner(m_layer, this).
+        void ClaimCallbackOwnership(int layer, const void* owner) {
+            m_remoteOps.ClaimCallbackOwnership(layer, owner);
+        }
+        /// Returns whatever m_remoteOps.ClearCallbacksIfOwner reports for
+        /// (layer, owner).
+        bool ClearCallbacksIfOwner(int layer, const void* owner) {
+            return m_remoteOps.ClearCallbacksIfOwner(layer, owner);
+        }
+        /// Dispatch callback setters; both forward to m_dispatch.
+        void SetDispatchCallback(DispatchCallback cb) { m_dispatch.SetDispatchCallback(std::move(cb)); }
+        void ClearDispatchCallback() { m_dispatch.ClearDispatchCallback(); }
+
+        // ---- Routing ----
+
+        /// Resolves the node that owns `headID`.
+        ///
+        /// The result defaults to the local node. It stays local when routing
+        /// is disabled (m_enabled false) or when the hash ring is missing or
+        /// has at most one node; otherwise the ring's GetOwner(headID) decides.
+        /// Each call bumps exactly one m_routeStats counter
+        /// (disabled / local / remote).
+        RouteTarget GetOwner(SizeType headID) {
+            RouteTarget target;
+            target.isLocal = true;
+            target.nodeIndex = m_localNodeIndex;
+
+            if (!m_enabled) {
+                m_routeStats.disabled++;
+                return target;
+            }
+            {
+                // Take a snapshot of the ring pointer; it is replaced with
+                // std::atomic_store elsewhere in this class.
+                auto ring = std::atomic_load(&m_hashRing);
+                if (!ring || ring->NodeCount() <= 1) {
+                    m_routeStats.local++;
+                    return target;
+                }
+                target.nodeIndex = ring->GetOwner(headID);
+            }
+            target.isLocal = (target.nodeIndex == m_localNodeIndex);
+            if (target.isLocal) m_routeStats.local++;
+            else m_routeStats.remote++;
+            return target;
+        }
+
+        /// Logs the m_routeStats counters at Info level. `context` is inserted
+        /// verbatim right after "WorkerNode stats" in the message.
+        void LogRouteStats(const char* context = "") {
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "WorkerNode stats%s: local=%d remote=%d disabled=%d keyMiss=%d noMapping=%d\n",
+                context, (int)m_routeStats.local, (int)m_routeStats.remote,
+                (int)m_routeStats.disabled, (int)m_routeStats.keyMiss,
+                (int)m_routeStats.noMapping);
+        }
+
+        /// Resets every m_routeStats counter to zero. The five stores are
+        /// independent, so a concurrent reader may observe a partial reset.
+        void ResetRouteStats() {
+            m_routeStats.local.store(0);
+            m_routeStats.remote.store(0);
+            m_routeStats.disabled.store(0);
+            m_routeStats.keyMiss.store(0);
+            m_routeStats.noMapping.store(0);
+        }
+
+        // ---- Remote posting ops ----
+
+        /// Thin forwarder to m_remoteOps.SendRemoteAppend. Unlike
+        /// BroadcastHeadSync / SendRemoteLock below, it does not check
+        /// m_enabled.
+        ErrorCode SendRemoteAppend(int targetNodeIndex, int layer, SizeType headID,
+                                   const std::shared_ptr& headVec, int appendNum,
+                                   std::string& appendPosting)
+        {
+            return m_remoteOps.SendRemoteAppend(targetNodeIndex, layer, headID, headVec, appendNum, appendPosting);
+        }
+
+        /// Thin forwarder to m_remoteOps.SendBatchRemoteAppend; `items` is
+        /// passed by non-const reference.
+        ErrorCode SendBatchRemoteAppend(int targetNodeIndex, std::vector& items) {
+            return m_remoteOps.SendBatchRemoteAppend(targetNodeIndex, items);
+        }
+
+        /// Broadcasts head-index updates through m_remoteOps; silently does
+        /// nothing when routing is disabled.
+        void BroadcastHeadSync(const std::vector& entries) {
+            if (!m_enabled) return;
+            m_remoteOps.BroadcastHeadSync(entries);
+        }
+
+        // v33: expose HeadSync delivery diagnostics + retry queue.
+        void DumpHeadSyncStats(const char* label) const {
+            m_remoteOps.DumpHeadSyncStats(label);
+        }
+        // Cross-node merge-hint channel diagnostics.
+        void DumpMergeRequestStats(const char* label) const {
+            m_remoteOps.DumpMergeRequestStats(label);
+        }
+        size_t GetHeadSyncBacklogSize() const {
+            return m_remoteOps.GetHeadSyncBacklogSize();
+        }
+        size_t DrainHeadSyncBacklog(size_t maxBatch = 1024) {
+            return m_remoteOps.DrainHeadSyncBacklog(maxBatch);
+        }
+        void NoteHeadSyncApplyAdd() {
+            m_remoteOps.NoteHeadSyncApplyAdd();
+        }
+        void NoteHeadSyncApplyDelete() {
+            m_remoteOps.NoteHeadSyncApplyDelete();
+        }
+
+        /// Forwards a lock (`lock` true) or unlock (`lock` false) request for
+        /// `headID` to `nodeIndex`. Returns false without sending anything when
+        /// routing is disabled.
+        bool SendRemoteLock(int nodeIndex, int layer, SizeType headID, bool lock) {
+            if (!m_enabled) return false;
+            return m_remoteOps.SendRemoteLock(nodeIndex, layer, headID, lock);
+        }
+
+        void SetMergeCallback(int layer, RemotePostingOps::MergeCallback cb) {
+            m_remoteOps.SetMergeCallback(layer, std::move(cb));
+        }
+
+        // ---- Append queue ----
+
+        /// Queues `req` for `nodeIndex` and, when the queue is large enough,
+        /// hands the accumulated batch to a detached sender thread.
+        ///
+        /// Under m_appendQueueMutex the request is pushed and m_remoteQueueSize
+        /// incremented. If the per-node queue has reached kAutoFlushThreshold
+        /// and fewer than kMaxInflightPerNode senders are active for the node,
+        /// the queue is swapped out and one in-flight slot is reserved;
+        /// otherwise the call returns right after queueing.
+        ///
+        /// NOTE(review): a batch that fails to send is logged and then cleared
+        /// by the sender loop; it is not re-queued, and the failure is not
+        /// visible to FlushRemoteAppends' return value. Confirm that dropping
+        /// these appends is acceptable.
+        /// NOTE(review): the detached thread captures `this`; nothing in this
+        /// function keeps the object alive. Presumably callers drain via
+        /// FlushRemoteAppends before destruction. TODO confirm.
+        void QueueRemoteAppend(int nodeIndex, RemoteAppendRequest req) {
+            std::vector toFlush;
+            bool didReserveSlot = false;
+            {
+                std::lock_guard lock(m_appendQueueMutex);
+                auto& q = m_appendQueue[nodeIndex];
+                q.push_back(std::move(req));
+                m_remoteQueueSize.fetch_add(1, std::memory_order_relaxed);
+                // [PERF] Auto-flush per node once we have a full chunk worth
+                // (kAutoFlushThreshold items). Without this, every remote
+                // append accumulates until end-of-batch FlushRemoteAppends —
+                // which then sends hundreds of thousands of items serially
+                // (10k chunks * ~3s/chunk) AFTER all insert compute is done.
+                // Auto-flushing while inserts keep running overlaps the
+                // network with CPU and drops end-of-batch tail latency.
+                //
+                // [v38] Allow up to kMaxInflightPerNode concurrent in-flight
+                // chunks per node so a producer burst (split fan-out, reassign
+                // wave) can saturate the receiver's bg-executor pool instead of
+                // queueing up serially behind a single per-node mutex.
+                if (q.size() >= kAutoFlushThreshold
+                    && m_perNodeInflight[nodeIndex] < kMaxInflightPerNode) {
+                    toFlush.swap(q);
+                    m_remoteQueueSize.fetch_sub(toFlush.size(), std::memory_order_relaxed);
+                    ++m_perNodeInflight[nodeIndex];
+                    didReserveSlot = true;
+                }
+            }
+            if (!didReserveSlot) return;
+
+            // Fire-and-forget async send. After the initial chunk completes,
+            // the same thread loops to pick up any further accumulation so we
+            // avoid thread-spawn churn while keeping per-node concurrency at
+            // kMaxInflightPerNode. Order across batches is best-effort: the
+            // receiver runs 8 worker threads on each chunk that already
+            // interleave items within a chunk, so cross-chunk ordering adds
+            // no extra correctness risk for the per-posting RMW path.
+            // The batch lives in a shared_ptr so the lambda can own it by copy.
+            auto items = std::make_shared>(std::move(toFlush));
+            m_inflightAppendFlushes.fetch_add(1, std::memory_order_relaxed);
+            std::thread([this, nodeIndex, items]() {
+                while (true) {
+                    ErrorCode ret = SendBatchRemoteAppend(nodeIndex, *items);
+                    if (ret != ErrorCode::Success) {
+                        SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                            "QueueRemoteAppend auto-flush: batch to node %d failed (%zu items)\n",
+                            nodeIndex, items->size());
+                    }
+                    items->clear();
+                    {
+                        std::lock_guard lock(m_appendQueueMutex);
+                        auto it = m_appendQueue.find(nodeIndex);
+                        // Stop (and give the in-flight slot back) once less
+                        // than a full chunk is waiting; the residue is left
+                        // in the queue.
+                        if (it == m_appendQueue.end()
+                            || it->second.size() < kAutoFlushThreshold) {
+                            --m_perNodeInflight[nodeIndex];
+                            break;
+                        }
+                        items->swap(it->second);
+                        m_remoteQueueSize.fetch_sub(items->size(),
+                                                    std::memory_order_relaxed);
+                    }
+                }
+                m_inflightAppendFlushes.fetch_sub(1, std::memory_order_relaxed);
+            }).detach();
+        }
+
+        /// Number of requests currently counted as queued (relaxed load of
+        /// m_remoteQueueSize).
+        size_t GetRemoteQueueSize() const {
+            return m_remoteQueueSize.load(std::memory_order_relaxed);
+        }
+
+        /// Sends everything left in the append queue, one thread per target
+        /// node, and returns ErrorCode::Fail if any of those sends failed,
+        /// ErrorCode::Success otherwise.
+        ErrorCode FlushRemoteAppends() {
+            // Drain the queue under m_flushMutex so concurrent flush callers
+            // serialize. Loop in case items get queued mid-send. This avoids
+            // the thundering-herd of 100+ concurrent FlushRemoteAppends calls
+            // (one per split worker) overwhelming the remote node's tiny
+            // (8-thread, 256-connection-pool) network server.
+            std::lock_guard flushGuard(m_flushMutex);
+
+            // Wait for any in-flight async auto-flushes triggered by
+            // QueueRemoteAppend (>= kAutoFlushThreshold) to drain so the
+            // residue we send below is the actual tail. Callers invoke
+            // FlushRemoteAppends after all producers (AddIndex / split /
+            // reassign) have quiesced, so no new auto-flushes will start
+            // here.
+            // NOTE(review): this is a 20 ms polling wait with no timeout, and
+            // the counter is read/written with memory_order_relaxed, so the
+            // load by itself does not order this thread after the sender's
+            // work. Confirm nothing below depends on that ordering.
+            while (m_inflightAppendFlushes.load(std::memory_order_relaxed) > 0) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(20));
+            }
+
+            int errors = 0;
+            // NOTE(review): `iterations` is incremented below but never read.
+            int iterations = 0;
+            while (true) {
+                std::unordered_map> toSend;
+                {
+                    std::lock_guard lock(m_appendQueueMutex);
+                    if (m_appendQueue.empty()) break;
+                    toSend.swap(m_appendQueue);
+                    m_remoteQueueSize.store(0, std::memory_order_relaxed);
+                }
+                if (toSend.empty()) break;
+                ++iterations;
+
+                std::atomic iterErrors{0};
+                std::vector threads;
+                // NOTE(review): the lambda below captures the structured
+                // bindings `nodeIdx` and `items`, which is only well-formed
+                // from C++20 on. Confirm the project's language level.
+                for (auto& [nodeIdx, items] : toSend) {
+                    if (items.empty()) continue;
+                    threads.emplace_back([this, &iterErrors, nodeIdx, &items]() {
+                        // Per-node mutex serializes against any straggler
+                        // auto-flush still in flight for this node.
+                        // NOTE(review): the auto-flush thread started by
+                        // QueueRemoteAppend does not take this mutex, so as
+                        // written it only serializes senders that go through
+                        // GetPerNodeAppendFlushMutex.
+                        std::mutex& nodeMtx = GetPerNodeAppendFlushMutex(nodeIdx);
+                        std::lock_guard nlock(nodeMtx);
+                        ErrorCode ret = SendBatchRemoteAppend(nodeIdx, items);
+                        if (ret != ErrorCode::Success) {
+                            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                                "FlushRemoteAppends: batch to node %d failed (%d items)\n",
+                                nodeIdx, (int)items.size());
+                            iterErrors++;
+                        }
+                    });
+                }
+                // All sender threads are joined before `toSend` and
+                // `iterErrors` go out of scope, so the by-reference captures
+                // stay valid.
+                for (auto& t : threads) t.join();
+                errors += iterErrors.load();
+            }
+            return errors > 0 ? ErrorCode::Fail : ErrorCode::Success;
+        }
+
+        // ---- Cross-node merge hint queue ----
+        //
+        // Search-side fire-and-forget notifications: node X sees posting H
+        // underfull, where H is owned by Y. We dedup (layer, headID) within
+        // a flush window and batch-send to Y in one packet. The receiver's
+        // m_mergeList dedups on top of this, so an occasional dropped or
+        // duplicated notification only costs a few cycles.
+        /// Records a merge hint for (layer, headID) destined for `nodeIndex`.
+        ///
+        /// Hints are deduplicated per target in m_mergeQueue using a packed
+        /// 64-bit key (layer in the high 32 bits, headID in the low 32 bits).
+        /// When a target's bucket reaches kMergeAutoFlushThreshold the whole
+        /// bucket is converted to RemoteMergeRequest items, cleared, and sent
+        /// after the mutex is released.
+        ///
+        /// NOTE(review): the result of SendBatchRemoteMerge (if it returns
+        /// one) is not inspected here.
+        void QueueRemoteMerge(int nodeIndex, int layer, SizeType headID) {
+            std::vector toFlush;
+            {
+                std::lock_guard lock(m_mergeQueueMutex);
+                std::int64_t key = (static_cast(layer) << 32)
+                                 | static_cast(headID);
+                auto& bucket = m_mergeQueue[nodeIndex];
+                if (!bucket.insert(key).second) return;  // already pending
+                m_mergeQueueSize.fetch_add(1, std::memory_order_relaxed);
+
+                if (bucket.size() >= kMergeAutoFlushThreshold) {
+                    toFlush.reserve(bucket.size());
+                    for (std::int64_t k : bucket) {
+                        // Unpack the key: high half is the layer, low half
+                        // the head ID.
+                        RemoteMergeRequest req;
+                        req.m_layer = static_cast(k >> 32);
+                        req.m_headID = static_cast(static_cast(k & 0xFFFFFFFF));
+                        toFlush.push_back(std::move(req));
+                    }
+                    m_mergeQueueSize.fetch_sub(bucket.size(), std::memory_order_relaxed);
+                    bucket.clear();
+                }
+            }
+            if (!toFlush.empty()) {
+                m_remoteOps.SendBatchRemoteMerge(nodeIndex, toFlush);
+            }
+        }
+
+        /// Sends every pending merge hint, one batch per target node, and
+        /// empties m_mergeQueue.
+        ///
+        /// NOTE(review): this always returns ErrorCode::Success; the outcome
+        /// of the individual SendBatchRemoteMerge calls is discarded.
+        ErrorCode FlushRemoteMerges() {
+            std::unordered_map> toSend;
+            {
+                std::lock_guard lock(m_mergeQueueMutex);
+                if (m_mergeQueue.empty()) return ErrorCode::Success;
+                for (auto& [nodeIdx, bucket] : m_mergeQueue) {
+                    auto& vec = toSend[nodeIdx];
+                    vec.reserve(bucket.size());
+                    for (std::int64_t k : bucket) {
+                        RemoteMergeRequest req;
+                        req.m_layer = static_cast(k >> 32);
+                        req.m_headID = static_cast(static_cast(k & 0xFFFFFFFF));
+                        vec.push_back(std::move(req));
+                    }
+                }
+                m_mergeQueue.clear();
+                m_mergeQueueSize.store(0, std::memory_order_relaxed);
+            }
+            // Network sends happen outside the mutex.
+            for (auto& [nodeIdx, items] : toSend) {
+                if (!items.empty()) m_remoteOps.SendBatchRemoteMerge(nodeIdx, items);
+            }
+            return ErrorCode::Success;
+        }
+
+        // ---- Ring protocol (worker side) ----
+
+        /// Polls every 200 ms until the hash ring exists and has at least one
+        /// node. Returns true as soon as that holds, or logs an error and
+        /// returns false once `timeoutSec` seconds have elapsed.
+        bool WaitForRing(int timeoutSec = 120) {
+            auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeoutSec);
+            while (std::chrono::steady_clock::now() < deadline) {
+                auto ring = std::atomic_load(&m_hashRing);
+                if (ring && ring->NodeCount() > 0) return true;
+                std::this_thread::sleep_for(std::chrono::milliseconds(200));
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Error,
+                "WorkerNode: Timed out waiting for ring (%ds)\n", timeoutSec);
+            return false;
+        }
+
+        // ---- Data members (public for ExtraDynamicSearcher access) ----
+
+        std::shared_ptr m_db;
+        std::vector m_nodeStores;
+        std::unordered_map> m_storeToNodes;
+
+        // Routing counters. local / remote / disabled are bumped by GetOwner;
+        // keyMiss and noMapping are only read and reset in the code visible
+        // here.
+        struct RouteStats {
+            std::atomic local{0};
+            std::atomic remote{0};
+            std::atomic disabled{0};
+            std::atomic keyMiss{0};
+            std::atomic noMapping{0};
+        } m_routeStats;
+
+    protected:
+        /// Registers the server-side packet handlers. Posting requests go to
+        /// m_remoteOps, dispatch packets to m_dispatch, and RingUpdate to
+        /// HandleRingUpdate on this object.
+        void RegisterServerHandlers(Socket::PacketHandlerMapPtr& handlers) override {
+            handlers->emplace(Socket::PacketType::AppendRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleAppendRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::BatchAppendRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleBatchAppendRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::HeadSyncRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleHeadSyncRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::RemoteLockRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleRemoteLockRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::MergeRequest,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleMergeRequest(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::DispatchCommand,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchCommand(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::DispatchResult,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::RingUpdate,
+                [this](Socket::ConnectionID c, Socket::Packet p) { HandleRingUpdate(c, std::move(p)); });
+        }
+
+        /// Registers the client-side handlers for response packets.
+        /// DispatchResult is registered here as well as on the server side.
+        void RegisterClientHandlers(Socket::PacketHandlerMapPtr& handlers) override {
+            handlers->emplace(Socket::PacketType::AppendResponse,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleAppendResponse(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::BatchAppendResponse,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleBatchAppendResponse(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::RemoteLockResponse,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_remoteOps.HandleRemoteLockResponse(c, std::move(p)); });
+            handlers->emplace(Socket::PacketType::DispatchResult,
+                [this](Socket::ConnectionID c, Socket::Packet p) { m_dispatch.HandleDispatchResult(c, std::move(p)); });
+        }
+
+        /// One background step: while the ring is still empty, re-send
+        /// NodeRegister, but only if a connection to the dispatcher is
+        /// recorded in m_peerConnections.
+        void BgProtocolStep() override {
+            // Keep sending NodeRegister until ring is populated
+            auto ring = std::atomic_load(&m_hashRing);
+            if (!ring || ring->NodeCount() == 0) {
+                Socket::ConnectionID connID = Socket::c_invalidConnectionID;
+                {
+                    std::lock_guard lock(m_connMutex);
+                    if (m_dispatcherNodeIndex < (int)m_peerConnections.size())
+                        connID = m_peerConnections[m_dispatcherNodeIndex];
+                }
+                // The ID read here is only used as a "connected?" test;
+                // SendNodeRegister looks the connection up again itself.
+                if (connID != Socket::c_invalidConnectionID) {
+                    SendNodeRegister();
+                }
+            }
+        }
+
+        /// True once a ring with at least one node has been installed.
+        bool IsRingSettled() const override {
+            auto ring = std::atomic_load(&m_hashRing);
+            return ring && ring->NodeCount() > 0;
+        }
+
+    private:
+        /// Builds a NodeRegisterRequest packet describing this node (index,
+        /// host, port, store) and sends it to the dispatcher if a connection
+        /// is available; otherwise the packet is dropped silently.
+        void SendNodeRegister() {
+            NodeRegisterMsg msg;
+            msg.m_nodeIndex = m_localNodeIndex;
+            msg.m_host = m_nodeAddrs[m_localNodeIndex].first;
+            msg.m_port = m_nodeAddrs[m_localNodeIndex].second;
+            // Worker's 0-based index = m_localNodeIndex - 1 (since 0 is dispatcher)
+            // NOTE(review): if m_localNodeIndex were ever 0, workerIdx would be
+            // -1 and `workerIdx % numStores` stays negative in C++, so the
+            // m_nodeStores lookup below would be out of bounds. Confirm this
+            // is never reached with index 0, or guard it.
+            int workerIdx = m_localNodeIndex - 1;
+            int numStores = static_cast(m_nodeStores.size());
+            msg.m_store = (numStores > 0) ? m_nodeStores[workerIdx % numStores] : "";
+
+            std::size_t bodySize = msg.EstimateBufferSize();
+            Socket::Packet pkt;
+            pkt.Header().m_packetType = Socket::PacketType::NodeRegisterRequest;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = 0;
+            pkt.Header().m_bodyLength = static_cast(bodySize);
+            pkt.AllocateBuffer(static_cast(bodySize));
+            msg.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            auto connID = GetPeerConnection(m_dispatcherNodeIndex);
+            if (connID != Socket::c_invalidConnectionID) {
+                m_client->SendPacket(connID, std::move(pkt), nullptr);
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                    "WorkerNode: Sent NodeRegister (node %d) to dispatcher\n", m_localNodeIndex);
+            }
+        }
+
+        /// Handles a RingUpdate packet: parses it, builds a fresh ring from
+        /// msg.m_nodeIndices, publishes it with std::atomic_store while
+        /// holding m_ringWriteMutex, then acknowledges the ring version.
+        /// A packet that fails to parse is logged and ignored (no ACK).
+        ///
+        /// NOTE(review): the `connID` parameter is not used in this function.
+        void HandleRingUpdate(Socket::ConnectionID connID, Socket::Packet packet) {
+            RingUpdateMsg msg;
+            if (!msg.Read(packet.Body())) {
+                SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "WorkerNode: Failed to parse RingUpdate\n");
+                return;
+            }
+
+            auto newRing = std::make_shared(msg.m_vnodeCount);
+            for (auto idx : msg.m_nodeIndices) {
+                newRing->AddNode(idx);
+            }
+            {
+                std::lock_guard guard(m_ringWriteMutex);
+                std::atomic_store(&m_hashRing,
+                    std::shared_ptr(std::move(newRing)));
+            }
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "WorkerNode: Ring updated — %d nodes (v%u)\n",
+                (int)msg.m_nodeIndices.size(), msg.m_ringVersion);
+
+            SendRingUpdateACK(msg.m_ringVersion);
+        }
+
+        /// Sends a RingUpdateACK for `ringVersion` to the dispatcher; does
+        /// nothing when there is no dispatcher connection.
+        void SendRingUpdateACK(std::uint32_t ringVersion) {
+            RingUpdateACKMsg msg;
+            msg.m_nodeIndex = m_localNodeIndex;
+            msg.m_ringVersion = ringVersion;
+
+            std::size_t bodySize = msg.EstimateBufferSize();
+            Socket::Packet pkt;
+            pkt.Header().m_packetType = Socket::PacketType::RingUpdateACK;
+            pkt.Header().m_processStatus = Socket::PacketProcessStatus::Ok;
+            pkt.Header().m_connectionID = Socket::c_invalidConnectionID;
+            pkt.Header().m_resourceID = 0;
+            pkt.Header().m_bodyLength = static_cast(bodySize);
+            pkt.AllocateBuffer(static_cast(bodySize));
+            msg.Write(pkt.Body());
+            pkt.Header().WriteBuffer(pkt.HeaderBuffer());
+
+            auto connID = GetPeerConnection(m_dispatcherNodeIndex);
+            if (connID != Socket::c_invalidConnectionID) {
+                m_client->SendPacket(connID, std::move(pkt), nullptr);
+            }
+        }
+
+        int m_dispatcherNodeIndex = 0;
+        RemotePostingOps m_remoteOps;
+        DispatchCoordinator m_dispatch;
+
+        mutable std::mutex m_appendQueueMutex;
+        std::unordered_map> m_appendQueue;
+        std::atomic m_remoteQueueSize{0};
+        // Serializes concurrent FlushRemoteAppends() callers so we don't open
+        // hundreds of simultaneous RPC streams to the remote worker (which has
+        // only 8 server threads / 256 connection slots). With this mutex, only
+        // one thread sends at a time; concurrent callers either wait for the
+        // current flush to finish or contribute their items to the queue.
+        std::mutex m_flushMutex;
+
+        // Per-node mutex used by end-of-batch FlushRemoteAppends so concurrent
+        // sends to the SAME node from the final-drain path remain ordered.
+        // Auto-flushes (QueueRemoteAppend) instead use m_perNodeInflight to
+        // cap concurrency at kMaxInflightPerNode per node.
+        std::mutex m_perNodeAppendFlushMutexMapLock;
+        std::unordered_map> m_perNodeAppendFlushMutex;
+        std::atomic m_inflightAppendFlushes{0};
+        std::unordered_map m_perNodeInflight;  // guarded by m_appendQueueMutex
+        static constexpr size_t kAutoFlushThreshold = 50000;
+        static constexpr int kMaxInflightPerNode = 4;
+
+        /// Returns the mutex associated with `nodeIndex`, creating it on first
+        /// use. The map itself is protected by m_perNodeAppendFlushMutexMapLock;
+        /// entries are heap-allocated, so the returned reference is not
+        /// affected by later insertions into the map.
+        std::mutex& GetPerNodeAppendFlushMutex(int nodeIndex) {
+            std::lock_guard lk(m_perNodeAppendFlushMutexMapLock);
+            auto it = m_perNodeAppendFlushMutex.find(nodeIndex);
+            if (it == m_perNodeAppendFlushMutex.end()) {
+                auto ins = m_perNodeAppendFlushMutex.emplace(
+                    nodeIndex, std::make_unique());
+                return *ins.first->second;
+            }
+            return *it->second;
+        }
+
+        // Cross-node merge hint queue.
Per-target dedup set of packed + // (layer << 32 | headID) values; QueueRemoteMerge inserts and + // auto-flushes when the per-target bucket reaches threshold. + mutable std::mutex m_mergeQueueMutex; + std::unordered_map> m_mergeQueue; + std::atomic m_mergeQueueSize{0}; + // Merge hints are non-urgent (best-effort optimization). A larger + // bucket trades a small amount of latency for much better dedup and + // network batching. End-of-batch FlushRemoteMerges() guarantees no + // hint is permanently dropped. + static constexpr size_t kMergeAutoFlushThreshold = 8192; + }; + +} // namespace SPTAG::SPANN + +#endif // _SPTAG_SPANN_WORKERNODE_H_ diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index fe3d306a1..29129bdb4 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -19,6 +19,7 @@ #include "inc/Core/Common/LocalVersionMap.h" #include "inc/Core/Common/TiKVVersionMap.h" #include "ExtraFileController.h" +#include "Distributed/WorkerNode.h" #include #include #include @@ -207,15 +208,29 @@ namespace SPTAG::SPANN { }; private: + std::atomic m_workspaceCount = 0; + std::shared_ptr db; + WorkerNode* m_worker = nullptr; // externally owned, set via SetWorker() + + public: + // Expose the underlying KV handle so a standalone WorkerNode can be wired to the + // same DB this searcher already opened, instead of opening a second one. + std::shared_ptr GetDB() const { return db; } + private: SPANN::Index* m_headIndex; std::unique_ptr m_versionMap; Options* m_opt; int m_layer; + SizeType m_initialVectorSize = 0; // vector count at build time (before inserts) COMMON::FineGrainedRWLock m_rwLocks; + // Per-bucket flags for remote (cross-node) locking. 
+ static constexpr int kRemoteLockPoolSize = 32767; + std::unique_ptr[]> m_remoteBucketLocked; + IndexStats m_stat; std::shared_ptr m_wal; @@ -339,9 +354,247 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Posting size limit: %d, search limit: %f, merge threshold: %d\n", m_postingSizeLimit, p_opt.m_latencyLimit, m_mergeThreshold); SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "[CONFIG] layer=%d DistributedVersionMap=%s SearchCheckVersionMapOnlyLayer0=%s UseMultiChunkPosting=%s PostingPageLimit=%d\n", layer, p_opt.m_distributedVersionMap ? "true" : "false", p_opt.m_searchCheckVersionMapOnlyLayer0 ? "true" : "false", p_opt.m_useMultiChunkPosting ? "true" : "false", p_opt.m_postingPageLimit); + + // Initialize per-bucket remote lock flags + m_remoteBucketLocked.reset(new std::atomic[kRemoteLockPoolSize + 1]{}); + } + + ~ExtraDynamicSearcher() { + if (m_worker) { + m_worker->ClearCallbacksIfOwner(m_layer, this); + m_worker = nullptr; + } + } + + int GetNumWorkerNodes() const { + if (m_worker && m_worker->IsEnabled()) { + return std::max(1, m_worker->GetNumWorkerNodes()); + } + return 1; + } + + int GetWorkerNodeIndex() const { + if (m_worker && m_worker->IsEnabled()) { + int idx = m_worker->GetWorkerNodeIndex(); + return idx >= 0 ? idx : 0; + } + return 0; + } + + // Stripe globalVID across worker nodes (only for vectors added after build). + SizeType AllocateGlobalVID(SizeType localVID) const override { + int numWorkers = GetNumWorkerNodes(); + if (numWorkers <= 1 || localVID < m_initialVectorSize) return localVID; + return m_initialVectorSize + (localVID - m_initialVectorSize) * numWorkers + GetWorkerNodeIndex(); + } + + // Idempotent: wires the receiver's BatchAppend Jobs onto our shared + // SPDKThreadPool. Called both after pool creation and from + // SetWorker(); whichever happens last actually binds the submitter. 
+        void WireJobSubmitterIfReady() {
+            // Nothing to bind until both the worker and the split pool exist.
+            if (!m_worker || !m_splitThreadPool) return;
+            // Copy the pool handle so the submitter lambda owns its own copy
+            // instead of reading m_splitThreadPool through `this`.
+            auto pool = m_splitThreadPool;
+            m_worker->SetJobSubmitter(m_layer,
+                [pool](Helper::ThreadPool::Job* j, bool high) {
+                    if (high) pool->add_high(j);
+                    else pool->add(j);
+                });
+        }
+
+        /// Set the external WorkerNode pointer and bind all callbacks
+        /// (append, head-sync, remote-lock, merge-hint) at THIS instance's m_layer.
+        ///
+        /// Passing nullptr only clears m_worker; no callbacks are touched in
+        /// that case. The append, remote-lock and merge callbacks capture
+        /// `this`, so they must be cleared before this object is destroyed
+        /// (the destructor does so via ClearCallbacksIfOwner).
+        void SetWorker(WorkerNode* router) override {
+            m_worker = router;
+            if (!m_worker) return;
+
+            WireJobSubmitterIfReady();
+
+            // Claim ownership so the matching destructor's IfOwner check
+            // clears the right slot if/when we are deleted (multi-layer SPANN
+            // each layer has its own slot keyed by m_layer).
+            m_worker->ClaimCallbackOwnership(m_layer, this);
+
+            // Append callback: routes incoming remote appends to local Append()
+            m_worker->SetAppendCallback(m_layer,
+                [this](SizeType headID, std::shared_ptr headVec,
+                       int appendNum, std::string& appendPosting) -> ErrorCode {
+                    // Reuse SPDKThreadPool's per-worker pre-allocated workspace
+                    // when called from BatchAppendItemJob on m_splitThreadPool.
+                    ExtraWorkSpace localWorkSpace;
+                    ExtraWorkSpace* ws = static_cast(tls_preallocAppendWorkSpace);
+                    if (!ws) {
+                        m_headIndex->InitWorkSpace(&localWorkSpace);
+                        ws = &localWorkSpace;
+                    }
+                    // If the head is unknown locally and the sender shipped its
+                    // vector, add the head first. The dimension is derived
+                    // from the byte length of headVec.
+                    // NOTE(review): the result of AddHeadIndex is not checked
+                    // here.
+                    bool wasMissing = !m_headIndex->ContainSample(headID, m_layer + 1);
+                    if (wasMissing && headVec && !headVec->empty()) {
+                        DimensionType dim = static_cast(
+                            headVec->size() / sizeof(ValueType));
+                        m_headIndex->AddHeadIndex(headVec->data(), headID, 0,
+                                                  dim, m_layer + 1, ws);
+                    }
+
+                    // Mirror sender's version map for the records we're about
+                    // to persist so MergePostings + SearchIndex don't drop
+                    // them as "stale". See HEAD git history for rationale.
+                    {
+                        const uint8_t* basePtr = reinterpret_cast(appendPosting.data());
+                        size_t totalRec = appendPosting.size() / m_vectorInfoSize;
+                        EnsureVersionMapCoversPosting(basePtr, totalRec, "AppendCallback", headID);
+
+                        // Count() is read after the call above, so it reflects
+                        // any extension that call performed.
+                        const SizeType localCount = m_versionMap->Count();
+                        std::vector batchVids;
+                        std::vector batchVers;
+                        batchVids.reserve(totalRec);
+                        batchVers.reserve(totalRec);
+                        for (size_t i = 0; i < totalRec; ++i) {
+                            // Each record is read as a SizeType VID at offset 0
+                            // followed by one version byte.
+                            const uint8_t* p = basePtr + i * m_vectorInfoSize;
+                            SizeType vid = *reinterpret_cast(p);
+                            uint8_t recVer = *(p + sizeof(SizeType));
+                            if (vid < 0 || vid >= localCount) continue;
+                            // 0xfe is skipped on both sides; presumably it
+                            // marks a deleted entry. TODO confirm.
+                            if (recVer == 0xfe) continue;
+                            uint8_t curVer = m_versionMap->GetVersion(vid);
+                            if (curVer == 0xfe) continue;
+                            if (curVer == recVer) continue;
+                            batchVids.push_back(vid);
+                            batchVers.push_back(recVer);
+                        }
+                        if (!batchVids.empty()) {
+                            m_versionMap->SetVersionBatch(batchVids, batchVers);
+                        }
+                    }
+                    return Append(ws, headID, appendNum, appendPosting, 0);
+                });
+
+            // Head sync callback: apply head index updates from peers
+            // Captures plain copies (not `this`) of the head index, layer and
+            // worker pointers as they are at bind time.
+            auto* headIndex = m_headIndex;
+            int layer = m_layer;
+            auto* worker = m_worker;
+            m_worker->SetHeadSyncCallback(m_layer, [headIndex, layer, worker](const HeadSyncEntry& entry) {
+                if (entry.op == HeadSyncEntry::Op::Add) {
+                    headIndex->AddHeadIndex(entry.headVector.data(), entry.headVID, 0,
+                        static_cast(entry.headVector.size() / sizeof(ValueType)),
+                        layer + 1, nullptr);
+                    if (worker) worker->NoteHeadSyncApplyAdd();
+                } else {
+                    // Every op other than Add is treated as a delete.
+                    headIndex->DeleteIndex(entry.headVID, layer + 1);
+                    if (worker) worker->NoteHeadSyncApplyDelete();
+                }
+            });
+
+            // Remote lock callback: per-bucket atomic flags
+            // Lock: set the bucket flag (fails if already set), then probe the
+            // local RW lock with try_lock and release it immediately; the flag,
+            // not the RW lock, is what stays held for the remote caller.
+            // NOTE(review): the unlock branch clears the flag unconditionally,
+            // and the flag is per bucket rather than per headID, so head IDs
+            // that share a bucket share one flag. Confirm that is intended.
+            m_worker->SetRemoteLockCallback(m_layer, [this](SizeType headID, bool lock) -> bool {
+                unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast(headID));
+                if (lock) {
+                    bool expected = false;
+                    if (!m_remoteBucketLocked[bucket].compare_exchange_strong(expected, true)) {
+                        return false;
+                    }
+                    if (!m_rwLocks[headID].try_lock()) {
+                        m_remoteBucketLocked[bucket].store(false);
+                        return false;
+                    }
+                    m_rwLocks[headID].unlock();
+                    return true;
+                } else {
+                    m_remoteBucketLocked[bucket].store(false);
+                    return true;
+                }
+            });
+
+            // Cross-node merge hint callback
+            m_worker->SetMergeCallback(m_layer, [this](SizeType headID) {
+                MergeAsync(headID);
+            });
+
+            SPTAGLIB_LOG(Helper::LogLevel::LL_Info,
+                "WorkerNode bound to ExtraDynamicSearcher (layer %d)\n", m_layer);
         }
-        ~ExtraDynamicSearcher() {}
+        // Owner-side wait for any in-flight remote lock on this bucket.
+        // Returns immediately when there is no enabled worker or the bucket
+        // flag is clear. Otherwise polls the flag every 1 ms and gives up
+        // with a warning after kMaxRemoteBucketWaitMs, so the caller proceeds
+        // even though the flag may still be set.
+        void WaitForRemoteBucketUnlocked(SizeType headID) const {
+            if (!m_worker || !m_worker->IsEnabled()) return;
+            unsigned bucket = COMMON::FineGrainedRWLock::BucketIndex(static_cast(headID));
+            if (!m_remoteBucketLocked[bucket].load(std::memory_order_acquire)) return;
+            constexpr int kMaxRemoteBucketWaitMs = 5000;
+            auto deadline = std::chrono::steady_clock::now()
+                          + std::chrono::milliseconds(kMaxRemoteBucketWaitMs);
+            while (m_remoteBucketLocked[bucket].load(std::memory_order_acquire)) {
+                if (std::chrono::steady_clock::now() > deadline) {
+                    SPTAGLIB_LOG(Helper::LogLevel::LL_Warning,
+                        "WaitForRemoteBucketUnlocked: headID=%lld bucket=%u stuck for %d ms, proceeding\n",
+                        (std::int64_t)headID, bucket, kMaxRemoteBucketWaitMs);
+                    return;
+                }
+                std::this_thread::sleep_for(std::chrono::milliseconds(1));
+            }
+        }
+
+        // Pack and enqueue a RemoteAppendRequest for an already-resolved
+        // remote owner. headVecBytes may be nullptr when the caller has no
+        // centroid bytes (plain Append into an existing head).
+        // The request carries this searcher's m_layer. When headVecBytes is
+        // given, m_vectorDataSize is passed to assign() as the length, so the
+        // buffer must hold at least that many bytes.
+        // NOTE(review): m_worker is dereferenced without a null check here;
+        // TryRouteRemoteAppend checks it before calling, other callers must
+        // do the same.
+        void EnqueueRemoteAppend(int nodeIndex,
+                                 SizeType headID,
+                                 int appendNum,
+                                 std::string posting,
+                                 const void* headVecBytes = nullptr) {
+            RemoteAppendRequest req;
+            req.m_headID = headID;
+            req.m_layer = m_layer;
+            if (headVecBytes != nullptr) {
+                req.m_headVec.assign(static_cast(headVecBytes),
+                                     m_vectorDataSize);
+            }
+            req.m_appendNum = appendNum;
+            // `posting` was taken by value, so moving it does not touch the
+            // caller's string.
+            req.m_appendPosting = std::move(posting);
+            m_worker->QueueRemoteAppend(nodeIndex, std::move(req));
+        }
+
+        // If headID is owned by a remote node, queue the append for that
+        // node and return true; otherwise return false (caller continues
+        // with local write logic).
+        // Also returns false when there is no enabled worker or when this
+        // searcher is not layer 0. A true result only means the request was
+        // queued; it is sent later by the worker's append queue.
+        bool TryRouteRemoteAppend(SizeType headID,
+                                  int appendNum,
+                                  std::string posting,
+                                  const void* headVecBytes = nullptr) {
+            if (!m_worker || !m_worker->IsEnabled()) return false;
+            // Only the outer (head) layer participates in the owner-ring
+            // route. Inner layers (m_layer > 0) hold per-node-local state
+            // (no shared head VID space, no cross-node TiKV key naming
+            // contract), so each node services its own inner layer
+            // independently. Without this gate inner-layer appends would
+            // also dispatch RPCs that the receiver can't meaningfully
+            // apply.
+            if (m_layer != 0) return false;
+            auto target = m_worker->GetOwner(headID);
+            if (target.isLocal) return false;
+            EnqueueRemoteAppend(target.nodeIndex, headID, appendNum,
+                                std::move(posting), headVecBytes);
+            return true;
+        }
+
+        // Validate (and lazily extend) the local version map so that
+        // every (vid, ver) tuple in a posting we are about to write is
+        // representable. Without this, remote-originated postings carrying
+        // VIDs above our current Count() get dropped silently.
+ void EnsureVersionMapCoversPosting(const uint8_t* p_basePtr, size_t p_totalRec, + const char* p_caller, SizeType p_headID) { + const SizeType localCount = m_versionMap->Count(); + SizeType maxVid = -1; + for (size_t i = 0; i < p_totalRec; ++i) { + const uint8_t* p = p_basePtr + i * m_vectorInfoSize; + SizeType vid = *reinterpret_cast(p); + if (vid > maxVid) maxVid = vid; + } + if (maxVid >= localCount) { + SizeType need = maxVid + 1 - localCount; + m_versionMap->AddBatch(need); + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, + "%s: extended local versionMap by %lld (head=%lld maxVid=%lld localCount=%lld)\n", + p_caller, (std::int64_t)need, (std::int64_t)p_headID, + (std::int64_t)maxVid, (std::int64_t)localCount); + } + } virtual bool Available() override { @@ -419,7 +672,12 @@ namespace SPTAG::SPANN { virtual ErrorCode AddIDCapacity(SizeType capa, bool deleted) override { - return m_versionMap->AddBatch(capa, deleted); + // Distributed: grow the version map by the FULL batch size + // (capa * numWorkers), not just this node's slice. Stripe formula + // in AllocateGlobalVID produces globalVIDs up to + // m_initialVectorSize + insertCount * numWorkers. + int numWorkers = GetNumWorkerNodes(); + return m_versionMap->AddBatch(capa * numWorkers, deleted); } SPANN::Index* GetHeadIndex() const { return m_headIndex; } @@ -616,6 +874,23 @@ namespace SPTAG::SPANN { double elapsedMSeconds; uint64_t splitPostingVectors = 0; uint64_t splitNewHeadCount = 0; + + // Only the OWNER of headID should run Split. Remote-issued + // splits get dropped early so we don't mutate a posting that + // doesn't live on this node. + if (m_worker && m_worker->IsEnabled()) { + auto target = m_worker->GetOwner(headID); + if (!target.isLocal) { + std::unique_lock tmplock(m_splitListLock); + m_splitList.unsafe_erase(headID); + return ErrorCode::Success; + } + } + + // Owner-side: wait for any in-flight remote-initiated lock on + // this bucket to release the advisory flag before we mutate. 
+ WaitForRemoteBucketUnlocked(headID); + { std::unique_lock lock(m_rwLocks[headID], std::defer_lock); if (requirelock) { @@ -838,6 +1113,17 @@ namespace SPTAG::SPANN { //SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Split: new head VID %lld already exists in head index. Do merging...\n", (std::int64_t)(newHeadVID)); m_stat.m_splitExistingHeadMergeCount.fetch_add(1, std::memory_order_relaxed); + // If newHeadVID's owner is a remote node, route + // the new posting via RemoteAppend; the owner + // will merge it into the existing posting list. + if (TryRouteRemoteAppend( + newHeadVID, + (int)(newPostingLists[k].size() / m_vectorInfoSize), + newPostingLists[k], + args.centers + k * args._D)) { + if (m_rwLocks.hash_func(newHeadVID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); + continue; + } std::string mergedPostingList; std::set vectorIdSet; @@ -925,20 +1211,36 @@ namespace SPTAG::SPANN { SplitAsync(newHeadVID, currentLength); } } else { - auto splitPutBegin = std::chrono::high_resolution_clock::now(); - if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID)); - return ret; + // If newHeadVID's owner is a remote node, route + // the initial posting via RemoteAppend so it + // ends up in the owner's TiKV. We still add the + // head locally and rely on BroadcastHeadSync + // (after this loop) to spread the head index + // update to all nodes. The receiver's + // AppendCallback materializes the head if its + // HeadSync hasn't arrived yet. 
+ bool remoteCreated = TryRouteRemoteAppend( + newHeadVID, + (int)(newPostingLists[k].size() / m_vectorInfoSize), + newPostingLists[k], + args.centers + k * args._D); + + if (!remoteCreated) { + auto splitPutBegin = std::chrono::high_resolution_clock::now(); + if ((ret=db->Put(DBKey(newHeadVID), newPostingLists[k], MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to add new posting %lld\n", (std::int64_t)(newHeadVID)); + return ret; + } + CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting"); + auto splitPutEnd = std::chrono::high_resolution_clock::now(); + elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); + m_stat.m_putCost += elapsedMSeconds; } - CheckCentroid(newHeadVID, newPostingLists[k], "Split-NewPosting"); - auto splitPutEnd = std::chrono::high_resolution_clock::now(); - elapsedMSeconds = std::chrono::duration_cast(splitPutEnd - splitPutBegin).count(); - m_stat.m_putCost += elapsedMSeconds; auto updateHeadBegin = std::chrono::high_resolution_clock::now(); if ((ret = m_headIndex->AddHeadIndex(args.centers + k * args._D, newHeadVID, version, m_opt->m_dim, m_layer + 1, p_exWorkSpace)) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to update head index %lld\n", (std::int64_t)(newHeadVID)); - if (db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { + if (!remoteCreated && db->Delete(DBKey(newHeadVID)) != ErrorCode::Success) { SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete gc posting %lld\n", (std::int64_t)(newHeadVID)); } return ret; @@ -962,6 +1264,35 @@ namespace SPTAG::SPANN { } } + // Broadcast HeadSync to peer nodes when the head update lands + // in our local BKT (in-memory, per-compute). Lower-layer head + // adds that resolve to m_extraSearchers[m_layer+1]->AddIndex + // already write to shared TiKV so re-broadcasting them would + // only duplicate. 
+ if (m_worker && m_worker->IsEnabled() + && m_headIndex->GetDiskIndex(m_layer + 1) == nullptr) { + std::vector headSyncEntries; + for (int k = 0; k < 2; k++) { + if (args.counts[k] == 0 || (int)newHeadsID.size() <= k) continue; + HeadSyncEntry entry; + entry.op = HeadSyncEntry::Op::Add; + entry.headVID = newHeadsID[k]; + entry.m_layer = m_layer; + entry.headVector.assign(args.centers + k * args._D, args.centers + k * args._D + m_vectorDataSize); + headSyncEntries.push_back(std::move(entry)); + } + if (!theSameHead) { + HeadSyncEntry entry; + entry.op = HeadSyncEntry::Op::Delete; + entry.headVID = headID; + entry.m_layer = m_layer; + headSyncEntries.push_back(std::move(entry)); + } + if (!headSyncEntries.empty()) { + m_worker->BroadcastHeadSync(headSyncEntries); + } + } + { std::unique_lock tmplock(m_splitListLock); //SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"erase: %d\n", headID); @@ -1003,6 +1334,18 @@ namespace SPTAG::SPANN { ErrorCode MergePostings(ExtraWorkSpace *p_exWorkSpace, SizeType headID) { + // The owner runs its own merge passes. Skip when this head is + // owned by another node — we'd just be racing the owner. 
+ if (m_worker && m_worker->IsEnabled()) { + auto target = m_worker->GetOwner(headID); + if (!target.isLocal) { + std::unique_lock tmplock(m_mergeListLock); + m_mergeList.unsafe_erase(headID); + return ErrorCode::Success; + } + } + WaitForRemoteBucketUnlocked(headID); + std::unique_lock lock(m_rwLocks[headID]); if (!m_headIndex->ContainSample(headID, m_layer + 1)) { @@ -1102,23 +1445,61 @@ namespace SPTAG::SPANN { int deletedLength = 0; { std::unique_lock anotherLock(m_rwLocks[queryResult->VID], std::defer_lock); - // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Locked: %d, to be lock: %d\n", headID, queryResult->VID); - if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) { - if (!anotherLock.try_lock()) { - auto* curJob = new MergeAsyncJob(this, headID, nullptr); - // Re-queue counts as a new submission; matched by the - // m_mergeJobsInFlight-- / m_totalMergeCompleted++ in - // MergeAsyncJob::exec(). Without these increments - // m_mergeJobsInFlight underflows to a huge uint64 - // and m_totalMergeCompleted exceeds m_totalMergeSubmitted. - m_mergeJobsInFlight++; - m_totalMergeSubmitted++; - m_splitThreadPool->add(curJob); - return ErrorCode::Success; + + // RAII guard for the advisory remote bucket lock. + struct RemoteLockGuard { + WorkerNode* router = nullptr; + int nodeIndex = -1; + int layer = 0; + SizeType headID = -1; + bool active = false; + ~RemoteLockGuard() { if (active && router) router->SendRemoteLock(nodeIndex, layer, headID, false); } + void release() { active = false; } + } remoteLockGuard; + + bool isRemoteCandidate = false; + int remoteNodeIndex = -1; + if (m_worker && m_worker->IsEnabled()) { + auto target = m_worker->GetOwner(queryResult->VID); + if (!target.isLocal) { + isRemoteCandidate = true; + remoteNodeIndex = target.nodeIndex; + if (!m_worker->SendRemoteLock(remoteNodeIndex, m_layer, queryResult->VID, true)) { + // Remote owner busy; skip this candidate. 
+ continue; + } + remoteLockGuard.router = m_worker; + remoteLockGuard.nodeIndex = remoteNodeIndex; + remoteLockGuard.layer = m_layer; + remoteLockGuard.headID = queryResult->VID; + remoteLockGuard.active = true; } } - if (!m_headIndex->ContainSample(queryResult->VID, m_layer + 1)) continue; + + if (!isRemoteCandidate) { + // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Locked: %d, to be lock: %d\n", headID, queryResult->VID); + if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) { + if (!anotherLock.try_lock()) { + auto* curJob = new MergeAsyncJob(this, headID, nullptr); + // Re-queue counts as a new submission; matched by the + // m_mergeJobsInFlight-- / m_totalMergeCompleted++ in + // MergeAsyncJob::exec(). Without these increments + // m_mergeJobsInFlight underflows to a huge uint64 + // and m_totalMergeCompleted exceeds m_totalMergeSubmitted. + m_mergeJobsInFlight++; + m_totalMergeSubmitted++; + m_splitThreadPool->add(curJob); + return ErrorCode::Success; + } + } + if (!m_headIndex->ContainSample(queryResult->VID, m_layer + 1)) continue; + } + if ((ret=db->Get(DBKey(queryResult->VID), &nextPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if (isRemoteCandidate) { + // Stale fetch on remote side; skip and let next round retry. + continue; + } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to get to be merged posting: %lld, get size:%d\n", (std::int64_t)(queryResult->VID), (int)(nextPostingList.size())); @@ -1143,6 +1524,14 @@ namespace SPTAG::SPANN { nextLength++; } if (resultVec == nullptr) { + if (isRemoteCandidate) { + // Stale fetch / version skew on remote side. Skip + // and let the next merge round retry. + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: remote candidate %lld has no head record in fetched posting, skipping\n", + (std::int64_t)(queryResult->VID)); + continue; + } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail: cannot find another head vector in posting! 
headID:%lld\n", (std::int64_t)(queryResult->VID)); return ErrorCode::Fail; } @@ -1158,11 +1547,25 @@ namespace SPTAG::SPANN { return ret; } CheckCentroid(headID, mergedPostingList, "MergePostings-currentLength >= nextLength"); - m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); - if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success) - { - SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(queryResult->VID)); - return ret; + if (isRemoteCandidate) { + // Survivor is local; delete remote loser first + // (so we don't have duplicate VID across nodes), + // then drop local head-index entry. + if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success + && ret != ErrorCode::Key_NotFound) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: remote-loser Delete(%lld) failed; survivor %lld is durable\n", + (std::int64_t)queryResult->VID, (std::int64_t)headID); + return ret; + } + m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); + } else { + m_headIndex->DeleteIndex(queryResult->VID, m_layer + 1); + if ((ret=db->Delete(DBKey(queryResult->VID))) != ErrorCode::Success) + { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(queryResult->VID)); + return ret; + } } nextHeadID = headID; nextHeadVec = headVec; @@ -1175,6 +1578,12 @@ namespace SPTAG::SPANN { mergedPostingList += *resultVec; } if ((ret=db->Put(DBKey(queryResult->VID), mergedPostingList, MaxTimeout, &(p_exWorkSpace->m_diskRequests))) != ErrorCode::Success) { + if (isRemoteCandidate) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: remote-survivor Put(%lld) failed; no state mutated, next round will retry\n", + (std::int64_t)queryResult->VID); + return ret; + } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "MergePostings fail to override posting %lld after merge\n", (std::int64_t)(queryResult->VID)); return ret; } @@ -1182,6 +1591,12 @@ namespace 
SPTAG::SPANN { m_headIndex->DeleteIndex(headID, m_layer + 1); if ((ret = db->Delete(DBKey(headID))) != ErrorCode::Success) { + if (isRemoteCandidate) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, + "MergePostings: local-loser Delete(%lld) failed; remote survivor %lld is durable\n", + (std::int64_t)headID, (std::int64_t)queryResult->VID); + return ret; + } SPTAGLIB_LOG(Helper::LogLevel::LL_Error, "Fail to delete old posting %lld in Merge\n", (std::int64_t)(headID)); return ret; } @@ -1191,7 +1606,15 @@ namespace SPTAG::SPANN { deletedPostingList = &currentPostingList; deletedLength = currentLength; } - if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); + if (isRemoteCandidate) { + // Release advisory remote lock before reassign below. + if (remoteLockGuard.active) { + remoteLockGuard.router->SendRemoteLock( + remoteLockGuard.nodeIndex, remoteLockGuard.layer, + remoteLockGuard.headID, false); + remoteLockGuard.release(); + } + } else if (m_rwLocks.hash_func(queryResult->VID) != m_rwLocks.hash_func(headID)) anotherLock.unlock(); } // SPTAGLIB_LOG(Helper::LogLevel::LL_Info,"Release: %d, Release: %d\n", headID, queryResult->VID); @@ -1553,6 +1976,38 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Error!, headID :%lld, appendNum:%d\n", (std::int64_t)headID, appendNum); } + // If this head is owned by a remote node, route the append via + // QueueRemoteAppend instead of touching local TiKV. appendNum is + // captured BEFORE std::move(appendPosting) to avoid use-after-move. + // If the batch carries the head's own self-entry (VID == headID), + // forward its vector bytes so the receiver can materialize the + // head index before the BroadcastHeadSync arrives. See the + // matching scan in BatchAppend() for rationale.
+ { + const uint8_t* basePtr = + reinterpret_cast(appendPosting.data()); + const void* headVecBytes = nullptr; + for (int i = 0; i < appendNum; ++i) { + const uint8_t* p = basePtr + i * m_vectorInfoSize; + SizeType vid = *reinterpret_cast(p); + if (vid == headID) { + headVecBytes = p + m_metaDataSize; + break; + } + } + if (TryRouteRemoteAppend(headID, appendNum, appendPosting, headVecBytes)) { + if (!reassignThreshold) { + m_totalAppendCount++; + m_stat.m_appendTaskNum++; + } + return ErrorCode::Success; + } + } + + // If a remote initiator is currently holding the advisory lock + // on this bucket, wait it out before we touch the posting. + WaitForRemoteBucketUnlocked(headID); + checkDeleted: if (!m_headIndex->ContainSample(headID, m_layer + 1)) { for (int i = 0; i < appendNum; i++) @@ -1684,6 +2139,41 @@ namespace SPTAG::SPANN { auto appendIt = headAppends.find(headID); if (appendIt == headAppends.end()) continue; + // Owner gate: forward heads owned by a remote node via the + // batched RemoteAppend queue. Local heads fall through to + // the standard MultiMerge path below. Without this hook, + // every node writes to every head's TiKV key and the owner + // ring is ignored (no remote RPC, no route stats). + // + // Pass headVecBytes when this batch carries the head's own + // self-entry (VID == headID). During Build-time seed the + // receiver may not yet have the head index entry; without + // headVecBytes its AppendCallback can't materialize the head + // and falls into the ReassignAsync redirect path, dropping + // the self-entry from the posting and later causing + // "MergePostings fail: cannot find head vector in posting!". 
+ { + const std::string& posting = appendIt->second; + const uint8_t* basePtr = + reinterpret_cast(posting.data()); + size_t totalRec = posting.size() / m_vectorInfoSize; + const void* headVecBytes = nullptr; + for (size_t i = 0; i < totalRec; ++i) { + const uint8_t* p = basePtr + i * m_vectorInfoSize; + SizeType vid = *reinterpret_cast(p); + if (vid == headID) { + headVecBytes = p + m_metaDataSize; + break; + } + } + if (TryRouteRemoteAppend(headID, + (int)(posting.size() / m_vectorInfoSize), + posting, + headVecBytes)) { + continue; + } + } + std::unique_lock headLock(m_rwLocks[headID]); if (!m_headIndex->ContainSample(headID, m_layer + 1)) { @@ -1788,6 +2278,10 @@ namespace SPTAG::SPANN { //LOG(Helper::LogLevel::LL_Info, "Reassign: oldVID:%d, replicaCount:%d, candidateNum:%d, dist0:%f\n", oldVID, replicaCount, i, selections[0].distance); for (int i = 0; i < replicaCount && m_versionMap->GetVersion(VID) == version; i++) { //LOG(Helper::LogLevel::LL_Info, "Reassign: headID :%d, oldVID:%d, newVID:%d, posting length: %d, dist: %f, string size: %d\n", headID, oldVID, VID, m_postingSizes[headID].load(), selections[i].distance, newPart.size()); + if (TryRouteRemoteAppend(selections[i].VID, 1, *vectorInfo, + selections[i].Vec.Data())) { + continue; + } // [FIX H3] use reassignThreshold=0 so that an oversized // target posting triggers SplitAsync (not a synchronous // Split on this worker thread). 
This matches the @@ -1813,6 +2307,7 @@ namespace SPTAG::SPANN { bool LoadIndex(Options& p_opt) override { m_opt = &p_opt; + m_initialVectorSize = p_opt.m_vectorSize; // initial count for VID stripe SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "DataBlockSize: %d, Capacity: %d\n", m_opt->m_datasetRowsInBlock, m_opt->m_datasetCapacity); std::string versionmapPath = m_opt->m_indexDirectory + FolderSep + m_opt->m_deleteIDFile + "_" + std::to_string(m_layer); if (m_opt->m_recovery) { @@ -1901,13 +2396,33 @@ namespace SPTAG::SPANN { } if (m_opt->m_update) { if (m_splitThreadPool == nullptr) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); - - m_splitThreadPool = std::make_shared(); - m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); - //m_reassignThreadPool = std::make_shared(); - //m_reassignThreadPool->initSPDK(m_opt->m_reassignThreadNum, this); - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization\n"); + // Only layer 0 participates in the shared-pool slot: + // it both adopts (if a sibling published first) and + // publishes (so the WorkerNode receiver and any later + // layer-0 instance can reuse the same threads). + // Inner layers (m_layer > 0) always create their own + // pool, matching qianxi's per-instance pool design. 
+ if (m_layer == 0 && m_headIndex) { + auto shared = m_headIndex->GetSharedSplitPool(); + if (shared) { + m_splitThreadPool = std::static_pointer_cast(shared); + } + } + if (m_splitThreadPool == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); + + m_splitThreadPool = std::make_shared(); + m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); + //m_reassignThreadPool = std::make_shared(); + //m_reassignThreadPool->initSPDK(m_opt->m_reassignThreadNum, this); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization\n"); + if (m_layer == 0 && m_headIndex) m_headIndex->SetSharedSplitPool(m_splitThreadPool); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: adopted shared split pool from sibling layer\n"); + } + // Pool is now ready: re-attempt wiring the worker's job + // submitter (may have been set before pool was alive). + WireJobSubmitterIfReady(); } if (m_opt->m_enableWAL && !m_opt->m_persistentBufferPath.empty()) { @@ -2345,6 +2860,7 @@ namespace SPTAG::SPANN { { auto fullVectors = p_reader->GetVectorSet(); fullCount = fullVectors->Count(); + m_initialVectorSize = fullCount; // remember bulk-build count for stripe formula m_metaDataSize = sizeof(SizeType) + sizeof(uint8_t); m_vectorDataSize = fullVectors->PerVectorDataSize(); m_vectorInfoSize = m_vectorDataSize + m_metaDataSize; @@ -2556,10 +3072,20 @@ namespace SPTAG::SPANN { if (m_opt->m_update && !m_opt->m_allowZeroReplica && zeroReplicaCount > 0) { - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); - m_splitThreadPool = std::make_shared(); - m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization, zeroReplicaCount:%zu\n", zeroReplicaCount); + if (m_splitThreadPool == 
nullptr && m_layer == 0 && m_headIndex) { + auto shared = m_headIndex->GetSharedSplitPool(); + if (shared) { + m_splitThreadPool = std::static_pointer_cast(shared); + } + } + if (m_splitThreadPool == nullptr) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: initialize thread pools, append: %d, reassign %d\n", m_opt->m_appendThreadNum, m_opt->m_reassignThreadNum); + m_splitThreadPool = std::make_shared(); + m_splitThreadPool->initSPDK(m_opt->m_appendThreadNum, this); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "SPFresh: finish initialization, zeroReplicaCount:%zu\n", zeroReplicaCount); + if (m_layer == 0 && m_headIndex) m_headIndex->SetSharedSplitPool(m_splitThreadPool); + } + WireJobSubmitterIfReady(); uint32_t splitNumBeforeZeroReplica = m_stat.m_splitNum; uint32_t reassignNumBeforeZeroReplica = m_stat.m_reAssignNum; @@ -2834,6 +3360,16 @@ namespace SPTAG::SPANN { return ErrorCode::VectorNotFound; } + ErrorCode FlushRemoteAppends() { + if (m_worker && m_worker->IsEnabled()) { + ErrorCode ret = m_worker->FlushRemoteAppends(); + m_worker->LogRouteStats(" (batch flush)"); + m_worker->ResetRouteStats(); + return ret; + } + return ErrorCode::Success; + } + bool AllFinished() { if (!m_splitThreadPool) return true; diff --git a/AnnService/inc/Core/SPANN/ExtraTiKVController.h b/AnnService/inc/Core/SPANN/ExtraTiKVController.h index d7528d479..0541eaad1 100644 --- a/AnnService/inc/Core/SPANN/ExtraTiKVController.h +++ b/AnnService/inc/Core/SPANN/ExtraTiKVController.h @@ -12,6 +12,7 @@ #include "kvproto/tikvpb.grpc.pb.h" #include "kvproto/kvrpcpb.pb.h" #include "kvproto/metapb.pb.h" +#include "kvproto/pdpb.pb.h" #include "kvproto/pdpb.grpc.pb.h" #include diff --git a/AnnService/inc/Core/SPANN/IExtraSearcher.h b/AnnService/inc/Core/SPANN/IExtraSearcher.h index 554b02421..ec8d8bf95 100644 --- a/AnnService/inc/Core/SPANN/IExtraSearcher.h +++ b/AnnService/inc/Core/SPANN/IExtraSearcher.h @@ -22,6 +22,11 @@ namespace SPTAG { namespace SPANN { + // Forward declaration; the only 
IExtraSearcher API that touches WorkerNode + // is the SetWorker() hook below. Concrete searchers that care + // (ExtraDynamicSearcher) include the full header and override. + class WorkerNode; + struct SearchStats { SearchStats() @@ -589,6 +594,11 @@ namespace SPTAG { SizeType p_begin) { return ErrorCode::Undefined; } virtual ErrorCode DeleteIndex(SizeType p_id) { return ErrorCode::Undefined; } + // Allocate globalVID to this node's BKT counter. + // ExtraDynamicSearcher overrides this with + // the stripe formula when m_worker is enabled. + virtual SizeType AllocateGlobalVID(SizeType p_localVID) const { return p_localVID; } + virtual SizeType GetNumSamples() const = 0; virtual bool ContainSample(const SizeType idx) const @@ -624,6 +634,11 @@ namespace SPTAG { return ErrorCode::Undefined; } + // Bind a routing worker (no-op by default). ExtraDynamicSearcher + // overrides this to install the cross-node append + put + + // fetch-postings callbacks. ExtraStaticSearcher etc. ignore it. + virtual void SetWorker(WorkerNode* /*worker*/) {} + virtual bool AllFinished() { return false; } virtual void GetDBStats() { return; } virtual int64_t GetNumBlocks() { return 0; } @@ -640,6 +655,8 @@ namespace SPTAG { } virtual ErrorCode Checkpoint(std::string prefix) { return ErrorCode::Success; } + + virtual void InitWorkSpace(ExtraWorkSpace* p_exWorkSpace, bool clear = false) {} }; } // SPANN } // SPTAG diff --git a/AnnService/inc/Core/SPANN/Index.h b/AnnService/inc/Core/SPANN/Index.h index 5479d2d42..255043a58 100644 --- a/AnnService/inc/Core/SPANN/Index.h +++ b/AnnService/inc/Core/SPANN/Index.h @@ -47,6 +47,11 @@ namespace SPTAG template class SPANNResultIterator; + // Forward-declare so Index can hold/forward a WorkerNode pointer + // without dragging in the full Distributed/WorkerNode.h header (and + // thus its boost-asio + grpc transitive deps) into Index.h. 
+ class WorkerNode; + template class Index; template @@ -63,6 +68,12 @@ namespace SPTAG std::vector> m_extraSearchers; std::unique_ptr> m_workSpaceFactory; + // Routing worker bound BEFORE BuildIndex so that + // ExtraDynamicSearcher::WriteDownAllPostingToDB and other build + // hooks see a non-null m_worker as each layer's searcher is + // emplaced. SPFreshTest sets this in BuildOnly+Distributed mode. + WorkerNode* m_pendingWorker = nullptr; + Options m_options; std::function m_fComputeDistance; @@ -85,6 +96,14 @@ namespace SPTAG std::shared_ptr> m_freeWorkSpaceIds; std::atomic m_workspaceCount = 0; + // Split/append thread pool slot published through the Index. + // ExtraDynamicSearcher fills it lazily: only the layer-0 searcher + // adopts an already-published pool or publishes the one it + // creates; inner layers (m_layer > 0) always build their own + // pool, so sharing does not extend across layers. + mutable std::mutex m_sharedSplitPoolMutex; + std::shared_ptr m_sharedSplitPool; + public: Index() { @@ -124,6 +143,27 @@ namespace SPTAG inline std::shared_ptr GetDiskIndex(int layer = 0) { if (layer < m_extraSearchers.size()) return m_extraSearchers[layer]; else return nullptr; } inline Options* GetOptions() { return &m_options; } + // Bind a routing worker. Forwards to all currently-existing + // extraSearchers and remembers the pointer so newly-emplaced + // searchers (created during BuildIndexInternalLayer) also pick + // it up. Pass nullptr to detach.
+ void SetWorker(WorkerNode* worker) { + m_pendingWorker = worker; + for (auto& searcher : m_extraSearchers) { + if (searcher) searcher->SetWorker(worker); + } + } + inline WorkerNode* GetPendingWorker() const { return m_pendingWorker; } + + inline std::shared_ptr GetSharedSplitPool() const { + std::lock_guard lk(m_sharedSplitPoolMutex); + return m_sharedSplitPool; + } + inline void SetSharedSplitPool(std::shared_ptr pool) { + std::lock_guard lk(m_sharedSplitPoolMutex); + m_sharedSplitPool = std::move(pool); + } + inline SizeType GetNumSamples() const { return GetNumSamples(0); } inline SizeType GetNumSamples(int layer) const { if (layer < m_extraSearchers.size()) return m_extraSearchers[layer]->GetNumSamples(); else return m_topIndex->GetNumSamples(); } inline DimensionType GetFeatureDim() const { return m_topIndex->GetFeatureDim(); } diff --git a/AnnService/inc/Core/VectorIndex.h b/AnnService/inc/Core/VectorIndex.h index a25bf1e63..62e2ca843 100644 --- a/AnnService/inc/Core/VectorIndex.h +++ b/AnnService/inc/Core/VectorIndex.h @@ -5,6 +5,7 @@ #define _SPTAG_VECTORINDEX_H_ #include +#include #include "Common.h" #include "Common/WorkSpace.h" #include "inc/Helper/DiskIO.h" @@ -160,6 +161,14 @@ class VectorIndex static ErrorCode LoadIndex(const std::string& p_loaderFilePath, std::shared_ptr& p_vectorIndex); + /// LoadIndex with config overrides applied between LoadIndexConfig and LoadIndexData, + /// so settings such as TiKVPDAddresses take effect before the underlying KV connection + /// is constructed. Override keys may be section-qualified ("Section.Param"); unqualified + /// keys default to the "BuildSSDIndex" section. 
+ static ErrorCode LoadIndex(const std::string& p_loaderFilePath, + const std::map& p_paramOverrides, + std::shared_ptr& p_vectorIndex); + static ErrorCode LoadIndexFromFile(const std::string& p_file, std::shared_ptr& p_vectorIndex); static ErrorCode LoadIndex(const std::string& p_config, const std::vector& p_indexBlobs, std::shared_ptr& p_vectorIndex); diff --git a/AnnService/inc/Helper/KeyValueIO.h b/AnnService/inc/Helper/KeyValueIO.h index a7c3c25b8..9d7c1e2a3 100644 --- a/AnnService/inc/Helper/KeyValueIO.h +++ b/AnnService/inc/Helper/KeyValueIO.h @@ -34,6 +34,20 @@ namespace SPTAG virtual ErrorCode Put(const SizeType key, const std::string& value, const std::chrono::microseconds& timeout, std::vector* reqs) = 0; + // Batched writes/deletes. Default implementations return Undefined so that + // backends without native batching (RocksDB, FileIO) can ignore them. + // TiKVIO overrides these to issue a single batched RPC per region group, + // which dramatically reduces the number of synchronous gRPC round-trips + // when callers (e.g. SPANN AddIndex Phase 2 / PutPostingToDB) want to + // commit several keys at once. 
+ virtual ErrorCode MultiPut(const std::vector& keys, + const std::vector& values, + const std::chrono::microseconds& timeout, + std::vector* reqs) { return ErrorCode::Undefined; } + + virtual ErrorCode MultiDelete(const std::vector& keys, + const std::chrono::microseconds& timeout) { return ErrorCode::Undefined; } + virtual ErrorCode Merge(const SizeType key, const std::string &value, const std::chrono::microseconds &timeout, std::vector *reqs, int& size) = 0; diff --git a/AnnService/inc/Helper/ThreadPool.h b/AnnService/inc/Helper/ThreadPool.h index 01c82e2a7..a351a75c8 100644 --- a/AnnService/inc/Helper/ThreadPool.h +++ b/AnnService/inc/Helper/ThreadPool.h @@ -5,7 +5,7 @@ #define _SPTAG_HELPER_THREADPOOL_H_ #include -#include +#include #include #include #include @@ -78,28 +78,42 @@ namespace SPTAG { { std::lock_guard lock(m_lock); - m_jobs.push_back(j); + m_jobs.push(j); } m_cond.notify_one(); } - void addfront(Job* j) + // High-priority push: jobs in m_highJobs always run before m_jobs. + // Used by the distributed receiver to let inbound BatchAppend RPC + // work jump ahead of local Split/Merge/Reassign so the sender + // (driver) doesn't time out waiting for the chunk ack while the + // local pool drains long-running rebalance work. + void add_high(Job* j) { { std::lock_guard lock(m_lock); - m_jobs.push_front(j); + m_highJobs.push(j); } m_cond.notify_one(); } + // Alias kept for compatibility with code that calls addfront() + // (e.g., split-async path). Same semantics as add_high. 
+ void addfront(Job* j) { add_high(j); } + bool get(Job*& j) { std::unique_lock lock(m_lock); - while (m_jobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock); + while (m_jobs.empty() && m_highJobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock); if (!m_abort.ShouldAbort()) { - j = m_jobs.front(); + if (!m_highJobs.empty()) { + j = m_highJobs.front(); + m_highJobs.pop(); + } else { + j = m_jobs.front(); + m_jobs.pop(); + } currentJobs++; - m_jobs.pop_front(); return true; } return false; @@ -108,7 +122,7 @@ namespace SPTAG size_t jobsize() { std::lock_guard lock(m_lock); - return m_jobs.size(); + return m_jobs.size() + m_highJobs.size(); } inline uint32_t runningJobs() { return currentJobs; } @@ -122,7 +136,8 @@ namespace SPTAG protected: std::atomic_uint32_t currentJobs{ 0 }; - std::deque m_jobs; + std::queue m_jobs; + std::queue m_highJobs; Abort m_abort; std::mutex m_lock; std::condition_variable m_cond; diff --git a/AnnService/inc/Socket/ConnectionManager.h b/AnnService/inc/Socket/ConnectionManager.h index e487c6105..0c199ecb1 100644 --- a/AnnService/inc/Socket/ConnectionManager.h +++ b/AnnService/inc/Socket/ConnectionManager.h @@ -41,7 +41,11 @@ class ConnectionManager : public std::enable_shared_from_this inline static std::uint32_t GetPosition(ConnectionID p_connectionID); private: - static constexpr std::uint32_t c_connectionPoolSize = 1 << 8; + // Bumped from 1<<8 (256) to 1<<12 (4096) to avoid silently dropping new + // connections when reconnect storms (e.g., from concurrent FlushRemoteAppends + // timeouts) saturate the pool. Each ConnectionItem is small; 4096 slots is + // ~64KB per ConnectionManager, which is negligible. 
+ static constexpr std::uint32_t c_connectionPoolSize = 1 << 12; static constexpr std::uint32_t c_connectionPoolMask = c_connectionPoolSize - 1; diff --git a/AnnService/inc/Socket/Packet.h b/AnnService/inc/Socket/Packet.h index 8c99b09fe..6d8c1d146 100644 --- a/AnnService/inc/Socket/Packet.h +++ b/AnnService/inc/Socket/Packet.h @@ -27,13 +27,47 @@ enum class PacketType : std::uint8_t SearchRequest = 0x03, + AppendRequest = 0x04, + + BatchAppendRequest = 0x05, + + HeadSyncRequest = 0x07, + + RemoteLockRequest = 0x08, + + DispatchCommand = 0x09, + + NodeRegisterRequest = 0x0A, + + RingUpdate = 0x0B, + + RingUpdateACK = 0x0C, + + // Cross-node merge hint. Search on node X observes posting H is + // underfull, but H is owned by node Y. X sends MergeRequest to Y so + // Y can schedule its own MergeAsync(H). Fire-and-forget (no response + // packet): the receiver's MergeAsync already dedups via m_mergeList, + // a lost notification just means Y discovers H underfull via some + // other path (own search, own Append, explicit RefineIndex). + MergeRequest = 0x11, + ResponseMask = 0x80, + NodeRegisterResponse = ResponseMask | NodeRegisterRequest, + HeartbeatResponse = ResponseMask | HeartbeatRequest, RegisterResponse = ResponseMask | RegisterRequest, - SearchResponse = ResponseMask | SearchRequest + SearchResponse = ResponseMask | SearchRequest, + + AppendResponse = ResponseMask | AppendRequest, + + BatchAppendResponse = ResponseMask | BatchAppendRequest, + + RemoteLockResponse = ResponseMask | RemoteLockRequest, + + DispatchResult = ResponseMask | DispatchCommand, }; diff --git a/AnnService/inc/Socket/SimpleSerialization.h b/AnnService/inc/Socket/SimpleSerialization.h index 6da925625..e0b8141dd 100644 --- a/AnnService/inc/Socket/SimpleSerialization.h +++ b/AnnService/inc/Socket/SimpleSerialization.h @@ -82,6 +82,58 @@ namespace SimpleSerialization } + /// Bounds-checked variants of SimpleReadBuffer. + /// All return nullptr if a read would overrun [p_buffer, p_bufEnd). 
+ /// A nullptr p_buffer yields nullptr: the scalar overload leaves p_val unchanged, the string/ByteArray overloads clear it first; a nullptr p_bufEnd disables the bounds check. + template + inline const std::uint8_t* + SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, T& p_val) + { + static_assert(std::is_fundamental::value || std::is_enum::value, + "Only applied for fundanmental type."); + + if (p_buffer == nullptr) return nullptr; + if (p_bufEnd != nullptr && static_cast(p_bufEnd - p_buffer) < sizeof(T)) return nullptr; + p_val = *(reinterpret_cast(p_buffer)); + return p_buffer + sizeof(T); + } + + + inline const std::uint8_t* + SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, std::string& p_val) + { + p_val.clear(); + if (p_buffer == nullptr) return nullptr; + std::uint32_t len = 0; + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, len); + if (p_buffer == nullptr) return nullptr; + if (len > 0) + { + if (p_bufEnd != nullptr && static_cast(p_bufEnd - p_buffer) < len) return nullptr; + p_val.assign(reinterpret_cast(p_buffer), len); + } + return p_buffer + len; + } + + + inline const std::uint8_t* + SafeSimpleReadBuffer(const std::uint8_t* p_buffer, const std::uint8_t* p_bufEnd, ByteArray& p_val) + { + p_val.Clear(); + if (p_buffer == nullptr) return nullptr; + std::uint32_t len = 0; + p_buffer = SafeSimpleReadBuffer(p_buffer, p_bufEnd, len); + if (p_buffer == nullptr) return nullptr; + if (len > 0) + { + if (p_bufEnd != nullptr && static_cast(p_bufEnd - p_buffer) < len) return nullptr; + p_val = ByteArray::Alloc(len); + std::memcpy(p_val.Data(), p_buffer, len); + } + return p_buffer + len; + } + + + template<> inline std::size_t EstimateBufferSize(const std::string& p_val) diff --git a/AnnService/src/Core/SPANN/ExtraFileController.cpp b/AnnService/src/Core/SPANN/ExtraFileController.cpp index 24c839455..b5db83822 100644 --- a/AnnService/src/Core/SPANN/ExtraFileController.cpp +++ b/AnnService/src/Core/SPANN/ExtraFileController.cpp @@ -25,7 +25,7 @@ bool
FileIO::BlockController::Initialize(SPANN::Options &p_opt, int p_layer) #ifndef _MSC_VER O_RDWR | O_DIRECT, numblocks, 2, 2, max(p_opt.m_ioThreads, (2 * max(p_opt.m_searchThreadNum, p_opt.m_iSSDNumberOfThreads) + - (p_opt.m_layers + 1) * (p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum))), + p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum)), ((std::uint64_t)p_opt.m_startFileSize) << 30 #else GENERIC_READ | GENERIC_WRITE, numblocks, 2, 2, diff --git a/AnnService/src/Core/SPANN/SPANNIndex.cpp b/AnnService/src/Core/SPANN/SPANNIndex.cpp index f3f83dca6..38ea1c72d 100644 --- a/AnnService/src/Core/SPANN/SPANNIndex.cpp +++ b/AnnService/src/Core/SPANN/SPANNIndex.cpp @@ -1227,6 +1227,15 @@ template ErrorCode Index::BuildIndexInternalLayer(std::shared_pt m_extraSearchers.emplace_back(std::make_shared>(m_options, m_extraSearchers.size(), this, m_db)); } + // Hand the routing worker (if any) to the freshly-created searcher + // before BuildIndex runs. Build itself no longer routes postings + // (shared TiKV cluster — the driver writes straight to TiKV and PD + // routes each key to the owning store), but other build-time hooks + // that consult m_worker still benefit from seeing a non-null value. + if (m_pendingWorker) { + m_extraSearchers.back()->SetWorker(m_pendingWorker); + } + { std::shared_ptr ptr = SPTAG::f_createIO(); if (ptr == nullptr || @@ -1862,7 +1871,74 @@ ErrorCode Index::AddIndex(const void *p_data, SizeType p_vectorNum, Dimension } workSpace->m_deduper.clear(); workSpace->m_postingIDs.clear(); - return m_extraSearchers[0]->AddIndex(workSpace.get(), vectorSet, begin); + + // Use multiple threads for RNGSelection + Append when vector count is large enough. 
+ // Each thread fetch_add's one vector and calls ExtraDynamicSearcher::AddIndex with a + // single-vector view, so AppendBatchAsync flushes per-vector and pipelines with the + // worker side rather than queuing the whole batch behind a single huge flush. + if (p_vectorNum > 1 && m_options.m_iSSDNumberOfThreads > 1) { + int numThreads = std::min((int)p_vectorNum, m_options.m_iSSDNumberOfThreads); + std::atomic_int nextVec{0}; + std::atomic globalError{ErrorCode::Success}; + int printStep = std::max(1, p_vectorNum / 50); + + auto worker = [&](bool isFirst) { + std::unique_ptr ws; + ExtraWorkSpace* wsPtr; + if (isFirst) { + wsPtr = workSpace.get(); + } else { + ws = m_workSpaceFactory->GetWorkSpace(); + if (!ws) { + ws.reset(new ExtraWorkSpace()); + InitWorkSpace(ws.get(), false); + } else { + InitWorkSpace(ws.get(), true); + } + ws->m_deduper.clear(); + ws->m_postingIDs.clear(); + wsPtr = ws.get(); + } + + while (globalError.load(std::memory_order_relaxed) == ErrorCode::Success) { + int v = nextVec.fetch_add(1); + if (v >= p_vectorNum) break; + + if (v % printStep == 0) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "AddIndex bulk: %d/%d (%.1f%%)\n", + v, p_vectorNum, v * 100.0 / p_vectorNum); + GetDBStat(); + } + + std::shared_ptr singleVec = std::make_shared( + ByteArray((std::uint8_t*)vectorSet->GetVector(v), + sizeof(T) * p_dimension, false), + GetEnumValueType(), p_dimension, 1); + ErrorCode ret = m_extraSearchers[0]->AddIndex(wsPtr, singleVec, + m_extraSearchers[0]->AllocateGlobalVID(begin + v)); + if (ret != ErrorCode::Success) { + globalError.store(ret, std::memory_order_relaxed); + } + } + + if (!isFirst && ws) { + m_workSpaceFactory->ReturnWorkSpace(std::move(ws)); + } + }; + + std::vector threads; + threads.reserve(numThreads - 1); + for (int t = 1; t < numThreads; t++) { + threads.emplace_back(worker, false); + } + worker(true); + for (auto& t : threads) t.join(); + + return globalError.load(); + } + + return m_extraSearchers[0]->AddIndex(workSpace.get(), 
vectorSet, + m_extraSearchers[0]->AllocateGlobalVID(begin)); } template diff --git a/AnnService/src/Core/VectorIndex.cpp b/AnnService/src/Core/VectorIndex.cpp index 2f8ebfd13..35bcaf585 100644 --- a/AnnService/src/Core/VectorIndex.cpp +++ b/AnnService/src/Core/VectorIndex.cpp @@ -793,6 +793,14 @@ std::shared_ptr VectorIndex::CreateInstance(IndexAlgoType p_algo, V } ErrorCode VectorIndex::LoadIndex(const std::string &p_loaderFilePath, std::shared_ptr &p_vectorIndex) +{ + static const std::map emptyOverrides; + return LoadIndex(p_loaderFilePath, emptyOverrides, p_vectorIndex); +} + +ErrorCode VectorIndex::LoadIndex(const std::string &p_loaderFilePath, + const std::map &p_paramOverrides, + std::shared_ptr &p_vectorIndex) { std::string folderPath(p_loaderFilePath); if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep) @@ -816,6 +824,23 @@ ErrorCode VectorIndex::LoadIndex(const std::string &p_loaderFilePath, std::share if ((ret = p_vectorIndex->LoadIndexConfig(iniReader)) != ErrorCode::Success) return ret; + // Apply param overrides AFTER LoadIndexConfig but BEFORE LoadIndexData, so that + // settings like TiKVPDAddresses are reflected in m_options before the KV connection + // is constructed inside LoadIndexData -> PrepareDB. 
+ for (const auto &kv : p_paramOverrides) + { + const std::string &key = kv.first; + const std::string &val = kv.second; + auto dotPos = key.find('.'); + if (dotPos != std::string::npos) { + std::string section = key.substr(0, dotPos); + std::string param = key.substr(dotPos + 1); + p_vectorIndex->SetParameter(param.c_str(), val.c_str(), section.c_str()); + } else { + p_vectorIndex->SetParameter(key.c_str(), val.c_str(), "BuildSSDIndex"); + } + } + std::shared_ptr> indexfiles = p_vectorIndex->GetIndexFiles(); if (iniReader.DoesSectionExist("MetaData")) { diff --git a/AnnService/src/Socket/Connection.cpp b/AnnService/src/Socket/Connection.cpp index 150889d2f..444c7afb0 100644 --- a/AnnService/src/Socket/Connection.cpp +++ b/AnnService/src/Socket/Connection.cpp @@ -26,10 +26,19 @@ Connection::Connection(ConnectionID p_connectionID, boost::asio::ip::tcp::socket void Connection::Start() { - SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Start, local: %u, remote: %s:%u\n", - static_cast(m_socket.local_endpoint().port()), - m_socket.remote_endpoint().address().to_string().c_str(), - static_cast(m_socket.remote_endpoint().port())); + boost::system::error_code epEc; + auto localEp = m_socket.local_endpoint(epEc); + auto remoteEp = m_socket.remote_endpoint(epEc); + if (!epEc) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Start, local: %u, remote: %s:%u\n", + static_cast(localEp.port()), + remoteEp.address().to_string().c_str(), + static_cast(remoteEp.port())); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Connection Start, socket not connected: %s\n", + epEc.message().c_str()); + return; + } if (!m_stopped.exchange(false)) { @@ -42,10 +51,15 @@ void Connection::Start() void Connection::Stop() { - SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Stop, local: %u, remote: %s:%u\n", - static_cast(m_socket.local_endpoint().port()), - m_socket.remote_endpoint().address().to_string().c_str(), - static_cast(m_socket.remote_endpoint().port())); + 
boost::system::error_code epEc; + auto localEp = m_socket.local_endpoint(epEc); + auto remoteEp = m_socket.remote_endpoint(epEc); + if (!epEc) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "Connection Stop, local: %u, remote: %s:%u\n", + static_cast(localEp.port()), + remoteEp.address().to_string().c_str(), + static_cast(remoteEp.port())); + } if (m_stopped.exchange(true)) { diff --git a/AnnService/src/Socket/Server.cpp b/AnnService/src/Socket/Server.cpp index 9781bf1d4..8be0682c6 100644 --- a/AnnService/src/Socket/Server.cpp +++ b/AnnService/src/Socket/Server.cpp @@ -26,7 +26,7 @@ Server::Server(const std::string &p_address, const std::string &p_port, const Pa boost::asio::ip::tcp::endpoint endpoint = *(endPoints.begin()); m_acceptor.open(endpoint.protocol()); - m_acceptor.set_option(boost::asio::ip::tcp::acceptor::reuse_address(false)); + m_acceptor.set_option(boost::asio::ip::tcp::acceptor::reuse_address(true)); m_acceptor.bind(endpoint, errCode); if (errCode) diff --git a/Test/CMakeLists.txt b/Test/CMakeLists.txt index 52f4168a9..27bdeebb5 100644 --- a/Test/CMakeLists.txt +++ b/Test/CMakeLists.txt @@ -24,7 +24,7 @@ if (NOT LIBRARYONLY) file(GLOB TEST_HDR_FILES ${PROJECT_SOURCE_DIR}/Test/inc/Test.h) file(GLOB TEST_SRC_FILES ${PROJECT_SOURCE_DIR}/Test/src/*.cpp) add_executable(SPTAGTest ${TEST_SRC_FILES} ${TEST_HDR_FILES}) - target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES}) + target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle) install(TARGETS SPTAGTest RUNTIME DESTINATION bin diff --git a/Test/inc/TestDataGenerator.h b/Test/inc/TestDataGenerator.h index 5820c8422..9f958f43d 100644 --- a/Test/inc/TestDataGenerator.h +++ b/Test/inc/TestDataGenerator.h @@ -29,7 +29,20 @@ namespace TestUtils { static std::shared_ptr LoadMetadataSet(const std::string pmetaset, const std::string pmetaidx, SPTAG::SizeType 
start = 0, SPTAG::SizeType count = -1); - static float EvaluateRecall(const std::vector &res, std::shared_ptr &truth, int recallK, int k, int batch, int totalbatches); + // Compute recall against truth file. + // + // Distributed (per-node) recall: when each node only owns a SUBSET of + // the global query set, pass the global query count and this node's + // query offset so the truth row indexing is computed in global terms. + // The truth file is laid out as: + // [iter=0 VIDs for queries 0..Q-1] [iter=1 VIDs ...] ... + // [iter=0 dists for queries 0..Q-1] [iter=1 dists ...] ... + // where Q is the GLOBAL query count, NOT res.size(). With the legacy + // res.size()-based formula, distributed batches > 0 read the wrong + // rows (off by Q-myCount), giving near-random recall that's noise. + // totalQueries=-1 (default) preserves the legacy single-node formula. + static float EvaluateRecall(const std::vector &res, std::shared_ptr &truth, int recallK, int k, int batch, int totalbatches, + int totalQueries = -1, int queryOffset = 0); void RunBatches(std::shared_ptr &vecset, std::shared_ptr &metaset, std::shared_ptr &addvecset, std::shared_ptr &addmetaset, diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp index 95c1fc4d5..9ab420db9 100644 --- a/Test/src/SPFreshTest.cpp +++ b/Test/src/SPFreshTest.cpp @@ -5,6 +5,10 @@ #include "inc/Core/Common/DistanceUtils.h" #include "inc/Core/Common/QueryResultSet.h" #include "inc/Core/SPANN/Index.h" +#include "inc/Core/SPANN/Distributed/WorkerNode.h" +#include "inc/Core/SPANN/Distributed/DispatcherNode.h" +#include "inc/Core/SPANN/ExtraDynamicSearcher.h" +#include "inc/Core/SPANN/ExtraTiKVController.h" #include "inc/Core/SPANN/SPANNResultIterator.h" #include "inc/Core/VectorIndex.h" #include "inc/Core/Common/IQuantizer.h" @@ -17,10 +21,13 @@ #include "inc/Test.h" #include "inc/TestDataGenerator.h" +#include #include #include +#include #include #include +#include #include #include #include @@ -55,6 +62,181 @@ static 
__attribute__((constructor)) void install_segfault_handler() { using namespace SPTAG; +// --------------------------------------------------------------------------- +// Stride sharding (a.k.a. odd/even sharding) experiment +// --------------------------------------------------------------------------- +// When the env var SPFRESH_SHARD_STRIDE is set to "1"/"true", each node, instead +// of inserting a contiguous slice [n*B/N, (n+1)*B/N) of the per-iteration batch, +// inserts the strided rows {n, n+N, n+2*N, ...} where n=nodeIndex, N=numNodes. +// This breaks any spatial structure in the input dataset (e.g. SIFT files that +// are roughly sorted by visual feature), letting us check whether the layer-0 +// split skew (driver 71 vs worker 2 in v18) is caused by contiguous slicing +// landing similar vectors on the same node and overflowing a small set of heads. +// +// The total number of vectors inserted across all nodes per iteration is the +// same; only the assignment changes. Recall measurement still works because +// the dataset and ground truth are unchanged — only insert routing differs. +static bool IsStrideShardEnabled() { + const char* e = std::getenv("SPFRESH_SHARD_STRIDE"); + if (!e) return false; + std::string v(e); + return v == "1" || v == "true" || v == "TRUE" || v == "yes"; +} + +// Compute count of indices i in [0, total) with (i % stride) == offset. +static SizeType StrideCount(SizeType total, int stride, int offset) { + if (stride <= 1) return total; + if (offset < 0 || offset >= stride) return 0; + if (total <= offset) return 0; + return (total - 1 - offset) / stride + 1; +} + +// Build a strided sub-VectorSet by copying every `stride`-th vector starting +// at `offset` into a contiguous packed ByteArray. Returns a BasicVectorSet. 
+static std::shared_ptr ExtractStridedVectors( + const std::shared_ptr& full, int stride, int offset) +{ + if (!full) return nullptr; + SizeType totalCount = full->Count(); + SizeType outCount = StrideCount(totalCount, stride, offset); + auto vt = full->GetValueType(); + auto dim = full->Dimension(); + size_t perVecSize = full->PerVectorDataSize(); + if (outCount <= 0) { + return std::make_shared(ByteArray::Alloc(0), vt, dim, 0); + } + ByteArray buf = ByteArray::Alloc(static_cast(outCount) * perVecSize); + for (SizeType i = 0; i < outCount; ++i) { + SizeType srcIdx = static_cast(offset) + i * static_cast(stride); + std::memcpy(buf.Data() + static_cast(i) * perVecSize, + full->GetVector(srcIdx), + perVecSize); + } + return std::make_shared(buf, vt, dim, outCount); +} + +// Build a strided sub-MetadataSet. Two-pass: first compute offsets, then copy. +static std::shared_ptr ExtractStridedMetadata( + const std::shared_ptr& full, int stride, int offset) +{ + if (!full) return nullptr; + SizeType totalCount = full->Count(); + SizeType outCount = StrideCount(totalCount, stride, offset); + if (outCount <= 0) { + ByteArray emptyMeta = ByteArray::Alloc(0); + ByteArray offBuf = ByteArray::Alloc(sizeof(std::uint64_t)); + *reinterpret_cast(offBuf.Data()) = 0ULL; + return std::make_shared(emptyMeta, offBuf, 0); + } + std::vector offsets(static_cast(outCount) + 1, 0ULL); + std::uint64_t total = 0; + for (SizeType i = 0; i < outCount; ++i) { + SizeType srcIdx = static_cast(offset) + i * static_cast(stride); + ByteArray meta = full->GetMetadata(srcIdx); + offsets[i] = total; + total += meta.Length(); + } + offsets[outCount] = total; + ByteArray metaBuf = ByteArray::Alloc(total > 0 ? 
total : 1); + for (SizeType i = 0; i < outCount; ++i) { + SizeType srcIdx = static_cast(offset) + i * static_cast(stride); + ByteArray meta = full->GetMetadata(srcIdx); + if (meta.Length() > 0) { + std::memcpy(metaBuf.Data() + offsets[i], meta.Data(), meta.Length()); + } + } + ByteArray offBuf = ByteArray::Alloc((static_cast(outCount) + 1) * sizeof(std::uint64_t)); + std::memcpy(offBuf.Data(), offsets.data(), offsets.size() * sizeof(std::uint64_t)); + return std::make_shared(metaBuf, offBuf, outCount); +} + +// Helper: parse "host:port,host:port,..." into vector of pairs. +static std::vector> ParseNodeAddrs(const std::string& addrStr) { + std::vector> result; + auto parts = Helper::StrUtils::SplitString(addrStr, ","); + for (auto& part : parts) { + auto hp = Helper::StrUtils::SplitString(part, ":"); + if (hp.size() == 2) result.emplace_back(hp[0], hp[1]); + } + return result; +} + +// Helper: bind a WorkerNode to ALL ExtraDynamicSearcher layers inside a VectorIndex. +// Calls SetWorker() which wires up append, head-sync, and remote-lock callbacks. +// All layers must have the worker bound so that AddIDCapacity (called per-layer) sees +// the correct numNodes and grows each layer's TiKVVersionMap to cover the full global +// VID space (capa * numNodes), not just this node's slice. +template +static void BindWorkerToIndex(SPANN::WorkerNode* worker, std::shared_ptr& index) { + auto* spannIndex = dynamic_cast*>(index.get()); + if (!spannIndex) return; + for (int layer = 0; ; ++layer) { + auto diskIndex = spannIndex->GetDiskIndex(layer); + if (!diskIndex) break; + auto* searcher = dynamic_cast*>(diskIndex.get()); + if (searcher) searcher->SetWorker(worker); + } +} + +// Helper: same as BindWorkerToIndex but takes a raw SPANN::Index* directly +// (for sites that have already extracted the spannIndex pointer). 
+template +static void BindWorkerToAllLayers(SPANN::WorkerNode* worker, SPANN::Index* spannIndex) { + if (!spannIndex) return; + for (int layer = 0; ; ++layer) { + auto diskIndex = spannIndex->GetDiskIndex(layer); + if (!diskIndex) break; + auto* searcher = dynamic_cast*>(diskIndex.get()); + if (searcher) searcher->SetWorker(worker); + } +} + +// Configuration for distributed mode, read from [Distributed] ini section. +struct DistributedConfig { + bool enabled = false; + int workerIndex = 0; // 0-based: 0 = driver (dispatcher + worker 0), 1+ = remote worker + std::string dispatcherAddr; // "host:port" + std::string workerAddrs; // "host:port,host:port,..." + std::string storeAddrs; // "addr,addr,..." + std::string pdAddrs; // "host:port,host:port,..." (per-worker PD) + + // Number of workers (for query/insert partitioning) + int GetNumWorkers() const { + if (!enabled || workerAddrs.empty()) return 1; + return (int)std::count(workerAddrs.begin(), workerAddrs.end(), ',') + 1; + } + + // Parse dispatcher address into host:port pair + std::pair GetDispatcherAddr() const { + auto hp = Helper::StrUtils::SplitString(dispatcherAddr, ":"); + if (hp.size() == 2) return {hp[0], hp[1]}; + return {"", ""}; + } + + // Get PD address for this worker (falls back to global TiKVPDAddresses) + std::string GetLocalPDAddr() const { + if (pdAddrs.empty()) return ""; + auto addrs = Helper::StrUtils::SplitString(pdAddrs, ","); + if (workerIndex < (int)addrs.size()) return addrs[workerIndex]; + return addrs[0]; + } + + static DistributedConfig FromIni(Helper::IniReader& ini) { + DistributedConfig cfg; + cfg.enabled = ini.GetParameter("Distributed", "Enabled", false); + cfg.dispatcherAddr = ini.GetParameter("Distributed", "DispatcherAddr", std::string("")); + cfg.workerAddrs = ini.GetParameter("Distributed", "WorkerAddrs", std::string("")); + cfg.storeAddrs = ini.GetParameter("Distributed", "StoreAddrs", std::string("")); + cfg.pdAddrs = ini.GetParameter("Distributed", "PDAddrs", 
std::string("")); + + // Worker index from env var (0 = driver, 1+ = remote worker) + const char* wiEnv = std::getenv("WORKER_INDEX"); + cfg.workerIndex = wiEnv ? std::atoi(wiEnv) : 0; + + return cfg; + } +}; + namespace SPFreshTest { SizeType N = 10000; @@ -306,13 +488,17 @@ std::shared_ptr BuildIndex(const std::string &outDirectory, std::sh template std::shared_ptr BuildLargeIndex(const std::string &outDirectory, std::string &pvecset, - std::string& pmetaset, std::string& pmetaidx, Helper::IniReader& iniReader, const std::string &distMethod = "L2", + std::string& pmetaset, std::string& pmetaidx, const std::string &distMethod = "L2", int searchthread = 2, int insertthread = 2, int layers = 1, - std::shared_ptr quantizer = nullptr, std::string quantizerFilePath = "quantizer.bin") + std::shared_ptr quantizer = nullptr, std::string quantizerFilePath = "quantizer.bin", + const std::map& ssdOverrides = {}, + bool ssdOnly = false, + SPANN::WorkerNode* p_worker = nullptr) { auto vecIndex = VectorIndex::CreateInstance(IndexAlgoType::SPANN, GetEnumValueType()); int maxthreads = std::thread::hardware_concurrency(); int postingLimit = 4 * sizeof(T); + remove((outDirectory + FolderSep + "ssdmapping_0_postings").c_str()); std::string configuration = R"( [Base] DistCalcMethod=)" + distMethod + R"( @@ -399,15 +585,29 @@ std::shared_ptr BuildLargeIndex(const std::string &outDirectory, st } } - for (const auto &sec : sections) + // Apply overrides (e.g., Storage, TiKV settings, SelectHead/BuildHead params) + for (const auto &[key, val] : ssdOverrides) { - auto params = iniReader.GetParameters(sec.c_str()); - for (const auto &[key, val] : params) - { - vecIndex->SetParameter(key.c_str(), val.c_str(), sec.c_str()); + // Keys prefixed with "SectionName." 
are routed to the corresponding section + auto dotPos = key.find('.'); + if (dotPos != std::string::npos) { + std::string section = key.substr(0, dotPos); + std::string param = key.substr(dotPos + 1); + vecIndex->SetParameter(param.c_str(), val.c_str(), section.c_str()); + } else { + vecIndex->SetParameter(key.c_str(), val.c_str(), "BuildSSDIndex"); } } + // SSD-only mode: skip SelectHead and BuildHead, resume from specified layer + if (ssdOnly) + { + // Allow explicit ResumeLayer from config/overrides; otherwise default to layer 0 + // (rebuild SSD for all layers, reusing existing head indexes) + int resumeLayer = 0; + vecIndex->SetParameter("ResumeLayer", std::to_string(resumeLayer).c_str(), "BuildSSDIndex"); + } + if (quantizer) { vecIndex->SetParameter("QuantizerFilePath", quantizerFilePath.c_str(), "Base"); @@ -415,6 +615,20 @@ std::shared_ptr BuildLargeIndex(const std::string &outDirectory, st vecIndex->SetQuantizerADC(false); vecIndex->SetParameter("Dim", std::to_string(quantizer->GetNumSubvectors()).c_str(), "Base"); } + + // Bind a routing worker (if any) to the freshly-created SSD searcher + // before BuildIndex runs. Build itself does not route postings any more + // (shared TiKV cluster — driver writes directly), so in buildOnly mode + // the workerPtr will simply be nullptr and this block is a no-op. 
+ if (p_worker) { + if (auto* spannIdx = dynamic_cast*>(vecIndex.get())) { + spannIdx->SetWorker(p_worker); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "BuildLargeIndex: bound routing worker (numNodes=%d)\n", + p_worker->GetNumNodes()); + } + } + auto buildStatus = vecIndex->BuildIndex(); if (buildStatus != ErrorCode::Success) return nullptr; @@ -452,9 +666,19 @@ float Search(std::shared_ptr &vecIndex, std::shared_ptr return TestUtils::TestDataGenerator::EvaluateRecall(results, truth, k, k, batch, totalbatches); } +template +double ExecutePartitionedSearch(VectorIndex* index, + std::shared_ptr& queryset, + int myStart, int myCount, + int searchK, int numThreads, + std::vector& results, + std::vector* latenciesOut, + std::vector* statsOut); + template void InsertVectors(SPANN::Index *p_index, int insertThreads, int step, - std::shared_ptr addset, std::shared_ptr &metaset, int searchThreads = 0, std::shared_ptr queryset = nullptr, int numQueries = 0, int k = 5, std::ostream* benchmarkData = nullptr, int start = 0) + std::shared_ptr addset, std::shared_ptr &metaset, int searchThreads = 0, std::shared_ptr queryset = nullptr, int numQueries = 0, int k = 5, std::ostream* benchmarkData = nullptr, int start = 0, + SPANN::WorkerNode* router = nullptr) { p_index->ForceCompaction(); p_index->GetDBStat(); @@ -462,8 +686,15 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step std::vector threads; int printstep = step / 50; + + // Bulk path: single AddIndex call amortizes remote-append RPCs into one AppendBatchAsync. + // Per-vector RNGSelection is parallelized inside ExtraDynamicSearcher::AddIndex so we + // keep insertThreads-way parallelism while saving N-1 RPCs. 
+ bool useBulk = (router && router->GetNumNodes() > 1); + + // Per-vector insert (original path): each thread grabs one vector at a time std::atomic_size_t vectorsSent(start); - auto func = [&]() { + auto perVecFunc = [&]() { size_t index = start; while (true) { @@ -500,43 +731,48 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step } }; - if (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr) { - std::vector latencies(numQueries); - std::vector results(numQueries); - std::vector duration(searchThreads); - - for (int i = 0; i < numQueries; i++) + // Bulk insert (router path): single call, parallelism inside SPANNIndex::AddIndex + auto bulkFunc = [&]() { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "InsertVectors: bulk AddIndex for %d vectors (router enabled)\n", step); + ErrorCode ret = p_index->AddIndex(addset->GetVector((SizeType)start), step, addset->Dimension(), metaset, true); + if (ret != ErrorCode::Success) { - results[i] = QueryResult((const ValueType *)queryset->GetVector(i), k, false); + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "AddIndex bulk failed. 
start:%d count:%d Dim:%d Error:%d\n", + start, step, addset->Dimension(), static_cast(ret)); } + BOOST_REQUIRE(ret == ErrorCode::Success); + }; - std::atomic_size_t queriesSent(0); - auto search = [&](int tid) { - auto s1 = std::chrono::high_resolution_clock::now(); - size_t qid; - while ((qid = queriesSent.fetch_add(1)) < numQueries) - { - auto t1 = std::chrono::high_resolution_clock::now(); - p_index->SearchIndex(results[qid]); - auto t2 = std::chrono::high_resolution_clock::now(); - latencies[qid] = std::chrono::duration_cast(t2 - t1).count() / 1000.0f; - } - auto s2 = std::chrono::high_resolution_clock::now(); - duration[tid] = std::chrono::duration_cast(s2 - s1).count() / 1000.0f; - }; + std::function func; + int insertThreadCount; + if (useBulk) { + func = bulkFunc; + insertThreadCount = 1; + } else { + func = perVecFunc; + insertThreadCount = insertThreads; + } + + if (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr) { + std::vector latencies; + std::vector results; + double searchWallSeconds = 0.0; - for (int j = 0; j < insertThreads; j++) + for (int j = 0; j < insertThreadCount; j++) { threads.emplace_back(func); } - for (int j = 0; j < searchThreads; j++) - { - threads.emplace_back(search, j); - } + std::thread searchThread([&]() { + searchWallSeconds = ExecutePartitionedSearch( + p_index, queryset, /*myStart=*/0, numQueries, k, searchThreads, + results, &latencies, /*statsOut=*/nullptr); + }); for (auto &thread : threads) { thread.join(); } + searchThread.join(); // Calculate statistics float mean = 0, minLat = (std::numeric_limits::max)(), maxLat = 0; @@ -553,10 +789,7 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step float p90 = latencies[static_cast(numQueries * 0.90)]; float p95 = latencies[static_cast(numQueries * 0.95)]; float p99 = latencies[static_cast(numQueries * 0.99)]; - float maxBatchLatency = 1e-6; - for (int i = 0; i < searchThreads; i++) - if (maxBatchLatency < duration[i]) 
maxBatchLatency = duration[i]; - float qps = numQueries / maxBatchLatency; + float qps = numQueries / std::max(static_cast(searchWallSeconds), 1e-6f); *benchmarkData << " \"numQueries\": " << numQueries << ",\n"; *benchmarkData << " \"meanLatency\": " << mean << ",\n"; @@ -567,6 +800,17 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step *benchmarkData << " \"minLatency\": " << minLat << ",\n"; *benchmarkData << " \"maxLatency\": " << maxLat << ",\n"; *benchmarkData << " \"qps\": " << qps << ",\n"; + } else { + // No search-during-insert path: just run the insert threads. + // (Used by worker dispatch and any caller that doesn't need stats.) + for (int j = 0; j < insertThreadCount; j++) + { + threads.emplace_back(func); + } + for (auto &thread : threads) + { + thread.join(); + } } auto barrierStart = std::chrono::high_resolution_clock::now(); size_t barrierPolls = 0; @@ -587,72 +831,82 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step } + + + template void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ptr &queryset, std::shared_ptr &truth, const std::string &truthPath, SizeType baseVectorCount, int topK, int searchK, int numThreads, int numQueries, int batches, int totalbatches, - std::ostream &benchmarkData, std::string prefix = "") + std::ostream &benchmarkData, std::string prefix = "", + int nodeIndex = 0, SPANN::WorkerNode* router = nullptr, + SPANN::DispatcherNode* dispatcher = nullptr) { - // Benchmark: Query performance with detailed latency stats - std::vector latencies(numQueries); - std::atomic_size_t queriesSent(0); - std::vector results(numQueries); - std::vector searchStats(numQueries); - auto* spannIndex = dynamic_cast*>(index.get()); - - for (int i = 0; i < numQueries; i++) - { - results[i] = QueryResult((const T *)queryset->GetVector(i), searchK, false); + // Use hash ring node count (workers only) for partitioning, not GetNumNodes() (includes dispatcher) + auto ring = (router && 
router->IsEnabled()) ? router->GetHashRing() : nullptr; + int nodeCount = ring ? static_cast(ring->NodeCount()) : 1; + bool distributed = (dispatcher != nullptr && router != nullptr && router->IsEnabled() && nodeCount > 1); + + // Determine this node's query range (balanced contiguous partition) + int myStart = 0, myCount = numQueries; + if (distributed) { + myStart = (int)((long long)nodeIndex * numQueries / nodeCount); + int myEnd = (int)((long long)(nodeIndex + 1) * numQueries / nodeCount); + myCount = myEnd - myStart; } - std::vector threads; - threads.reserve(numThreads); - - auto batchStart = std::chrono::high_resolution_clock::now(); - - for (int i = 0; i < numThreads; i++) - { - threads.emplace_back([&]() { - size_t qid; - while ((qid = queriesSent.fetch_add(1)) < numQueries) - { - auto t1 = std::chrono::high_resolution_clock::now(); - if (spannIndex != nullptr) - { - spannIndex->SearchIndex(results[qid], &searchStats[qid]); - } - else - { - index->SearchIndex(results[qid]); - } - auto t2 = std::chrono::high_resolution_clock::now(); - latencies[qid] = std::chrono::duration_cast(t2 - t1).count() / 1000.0f; - } - }); + // Dispatch search command to all workers via TCP (distributed only) + std::int64_t dispatchId = -1; + int round = 0; + if (distributed) { + static std::atomic s_searchRound{0}; + round = s_searchRound.fetch_add(1); + dispatchId = dispatcher->BroadcastDispatchCommand( + SPANN::DispatchCommand::Type::Search, static_cast(round)); } - for (auto &thread : threads) - thread.join(); + // Run this node's share of queries. 
+ std::vector results; + std::vector latencies; + std::vector searchStats; + double localWallTime = ExecutePartitionedSearch( + index.get(), queryset, myStart, myCount, searchK, numThreads, + results, &latencies, &searchStats); + float batchLatency = static_cast(localWallTime); + auto* spannIndex = dynamic_cast*>(index.get()); - auto batchEnd = std::chrono::high_resolution_clock::now(); - float batchLatency = - std::chrono::duration_cast(batchEnd - batchStart).count() / 1000000.0f; + if (distributed) { + // Driver also runs searches against its local node, so it can have + // outgoing merge hints queued. Drain before we move on. + if (router) { + router->FlushRemoteMerges(); + } + // Collect worker timings via TCP; QPS is governed by the slowest node. + auto workerTimes = dispatcher->WaitForAllResults(dispatchId, 300); + for (double wt : workerTimes) { + batchLatency = std::max(batchLatency, static_cast(wt)); + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "BenchmarkQueryPerformance round %d: local=%.1fms (%d queries), max=%.1fms, QPS=%.1f\n", + round, localWallTime * 1000, myCount, batchLatency * 1000, numQueries / batchLatency); + } - // Calculate statistics + // Calculate statistics (from this node's queries) + int statsCount = myCount; float mean = 0, minLat = (std::numeric_limits::max)(), maxLat = 0; - for (int i = 0; i < numQueries; i++) + for (int i = 0; i < statsCount; i++) { mean += latencies[i]; minLat = (std::min)(minLat, latencies[i]); maxLat = (std::max)(maxLat, latencies[i]); } - mean /= numQueries; + mean /= statsCount; std::sort(latencies.begin(), latencies.end()); - float p50 = latencies[static_cast(numQueries * 0.50)]; - float p90 = latencies[static_cast(numQueries * 0.90)]; - float p95 = latencies[static_cast(numQueries * 0.95)]; - float p99 = latencies[static_cast(numQueries * 0.99)]; + float p50 = latencies[static_cast(statsCount * 0.50)]; + float p90 = latencies[static_cast(statsCount * 0.90)]; + float p95 = latencies[static_cast(statsCount * 
0.95)]; + float p99 = latencies[static_cast(statsCount * 0.99)]; float qps = numQueries / batchLatency; BOOST_TEST_MESSAGE(" Queries: " << numQueries); @@ -749,7 +1003,7 @@ void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ benchmarkData << prefix << " },\n"; } - // Recall evaluation (if truth file provided) + // Recall evaluation if (!truth || truthPath.empty() || truthPath == "none") { BOOST_TEST_MESSAGE(" Recall evaluation skipped (no truth data)"); @@ -760,7 +1014,13 @@ void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ BOOST_TEST_MESSAGE("Checking for truth file: " << truthPath); std::shared_ptr pvecset, paddvecset; - float avgRecall = TestUtils::TestDataGenerator::EvaluateRecall(results, truth, topK, searchK, batches, totalbatches); + // In distributed mode, this node only searched queries [myStart, myStart+myCount). + // Pass the global query count and this node's offset so EvaluateRecall indexes + // the truth file in global terms (BATCH > 0 reads the wrong truth rows otherwise). + int recallTotalQueries = distributed ? numQueries : -1; + int recallQueryOffset = distributed ? myStart : 0; + float avgRecall = TestUtils::TestDataGenerator::EvaluateRecall(results, truth, topK, searchK, batches, totalbatches, + recallTotalQueries, recallQueryOffset); BOOST_TEST_MESSAGE(" Recall" << topK << "@" << searchK << " = " << (avgRecall * 100.0f) << "%"); BOOST_TEST_MESSAGE(" (Evaluated on " << numQueries << " queries against base vectors)"); benchmarkData << std::fixed << std::setprecision(4); @@ -772,6 +1032,115 @@ void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ benchmarkData << prefix << " }"; } +// Run [myStart, myStart+myCount) queries against `index` using `numThreads` workers. +// Returns wall time in seconds. Fills `results` and (when non-null) per-query +// `latenciesOut` (ms) and `statsOut` (SPANN SearchStats). 
When `statsOut` is +// non-null and the index is a SPANN index, the stats overload of SearchIndex +// is used; otherwise the plain SearchIndex path runs. +template +double ExecutePartitionedSearch(VectorIndex* index, + std::shared_ptr& queryset, + int myStart, int myCount, + int searchK, int numThreads, + std::vector& results, + std::vector* latenciesOut, + std::vector* statsOut) +{ + auto* spannIndex = dynamic_cast*>(index); + bool useStats = (statsOut != nullptr && spannIndex != nullptr); + + results.resize(myCount); + for (int i = 0; i < myCount; i++) { + results[i] = QueryResult((const T*)queryset->GetVector(myStart + i), searchK, false); + } + if (useStats) statsOut->assign(myCount, SPANN::SearchStats()); + if (latenciesOut) latenciesOut->assign(myCount, 0.0f); + + std::atomic_size_t queriesSent(0); + int nThreads = std::min(numThreads, std::max(myCount, 1)); + std::vector threads; + threads.reserve(nThreads); + + auto t0 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < nThreads; i++) { + threads.emplace_back([&]() { + size_t qid; + while ((qid = queriesSent.fetch_add(1)) < static_cast(myCount)) { + auto t1 = std::chrono::high_resolution_clock::now(); + if (useStats) { + spannIndex->SearchIndex(results[qid], &(*statsOut)[qid]); + } else if (spannIndex != nullptr) { + spannIndex->SearchIndex(results[qid]); + } else { + index->SearchIndex(results[qid]); + } + auto t2 = std::chrono::high_resolution_clock::now(); + if (latenciesOut) { + (*latenciesOut)[qid] = + std::chrono::duration_cast(t2 - t1).count() / 1000.0f; + } + } + }); + } + for (auto& t : threads) t.join(); + auto t3 = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(t3 - t0).count() / 1000000.0; +} + +ErrorCode QuantizeVectors(const std::shared_ptr& quantizer, + const std::shared_ptr& source, + ByteArray& dest); + +template +void LoadAndInsertBatch(SPANN::Index* spannIndex, + const std::string& paddset, + const std::string& paddmeta, + const std::string& 
paddmetaidx, + int dimension, + int insertStart, int loadCount, int perNodeBatch, + bool strideShard, int numNodes, int nodeIndex, + int numInsertThreads, + SPANN::WorkerNode* router, + std::shared_ptr quantizer, + int searchDuringInsertThreads, + std::shared_ptr queryset, + int numQueries, int searchK, + std::ostream* benchmarkData, + const char* logPrefix) +{ + auto addset = TestUtils::TestDataGenerator::LoadVectorSet(paddset, dimension, insertStart, loadCount); + if (quantizer) { + auto addFloat = ConvertToFloatVectorSet(addset); + BOOST_REQUIRE(addFloat != nullptr); + ByteArray quantizedAddBytes = + ByteArray::Alloc((size_t)addFloat->Count() * (size_t)(quantizer->GetNumSubvectors())); + BOOST_REQUIRE(QuantizeVectors(quantizer, addFloat, quantizedAddBytes) == ErrorCode::Success); + addset = std::make_shared(quantizedAddBytes, + VectorValueType::UInt8, + quantizer->GetNumSubvectors(), + addFloat->Count()); + } + auto addmetaset = TestUtils::TestDataGenerator::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, loadCount); + if (strideShard) { + addset = ExtractStridedVectors(addset, numNodes, nodeIndex); + addmetaset = ExtractStridedMetadata(addmetaset, numNodes, nodeIndex); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "%s: stride-shard batchStart=%d loadCount=%d -> kept=%d (every %d-th, offset=%d)\n", + logPrefix, insertStart, loadCount, + (int)(addset ? 
addset->Count() : 0), numNodes, nodeIndex); + } + InsertVectors(spannIndex, numInsertThreads, perNodeBatch, + addset, addmetaset, + searchDuringInsertThreads, queryset, numQueries, searchK, + benchmarkData, 0, router); + if (router) { + router->FlushRemoteAppends(); + router->FlushRemoteMerges(); + router->LogRouteStats(" (batch flush)"); + router->ResetRouteStats(); + } +} + template void LogCheckpointLayerStats(const std::shared_ptr& index, int layers, int currentBatch, int totalBatches) { @@ -836,9 +1205,13 @@ ErrorCode QuantizeVectors(const std::shared_ptr& quantizer, template void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, const std::string &truthPath, DistCalcMethod distMethod, const std::string &indexPath, int dimension, int baseVectorCount, - int insertVectorCount, int deleteVectorCount, int batches, int topK, int numSearchThreads, int numInsertThreads, int numSearchDuringInsertThreads, int numQueries, Helper::IniReader& iniReader, + int insertVectorCount, int deleteVectorCount, int batches, int topK, int numSearchThreads, int numInsertThreads, int numSearchDuringInsertThreads, int numQueries, const std::string &outputFile = "output.json", const bool rebuild = true, const int resume = -1, - const std::string &quantizerFilePath = std::string(""), int quantizedDim = 0, int layers = 1) + const std::string &quantizerFilePath = std::string(""), int quantizedDim = 0, int layers = 1, + const std::map& ssdOverrides = {}, + bool rebuildSsdOnly = false, + bool buildOnly = false, + const DistributedConfig& distCfg = {}) { int oldM = M, oldK = K, oldN = N, oldQueries = queries; N = baseVectorCount; @@ -849,6 +1222,27 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c int insertBatchSize = insertVectorCount / max(batches, 1); int deleteBatchSize = deleteVectorCount / max(batches, 1); + // Use distributed config for multi-node partitioning + int nodeIndex = distCfg.workerIndex; + int numNodes = 
distCfg.GetNumWorkers(); + bool strideShard = IsStrideShardEnabled() && numNodes > 1; + int myInsertStart, myInsertEnd, perNodeBatch; + if (strideShard) { + // Stride mode: each node loads the FULL per-iter batch then keeps rows + // where (rowIdx % numNodes) == nodeIndex. myInsertStart/End span the + // full batch; perNodeBatch is the count of strided rows. + myInsertStart = 0; + myInsertEnd = insertBatchSize; + perNodeBatch = static_cast(StrideCount(insertBatchSize, numNodes, nodeIndex)); + } else { + myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; + myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; + perNodeBatch = myInsertEnd - myInsertStart; + } + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "RunBenchmark: nodeIndex=%d numNodes=%d insertBatchSize=%d myInsertStart=%d myInsertEnd=%d perNodeBatch=%d strideShard=%d\n", + nodeIndex, numNodes, insertBatchSize, myInsertStart, myInsertEnd, perNodeBatch, strideShard ? 1 : 0); + // Variables to collect JSON output data std::ostringstream tmpbenchmark; @@ -902,12 +1296,78 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c jsonFile << " \"results\": {\n"; int SearchK = enableQuantization? topK * 4 : topK; + // Distributed routing: dispatcher + local worker (driver node is both) + std::unique_ptr dispatcher; + std::unique_ptr worker; + SPANN::WorkerNode* workerPtr = nullptr; // convenience alias std::shared_ptr index; std::shared_ptr quantizer; - + + // Distributed setup: when running a non-buildOnly distributed benchmark + // (i.e. the search/insert run phase), create the dispatcher + worker0 + // so the driver can broadcast the hash ring and accept remote callbacks. + // BuildOnly mode skips this entirely — build runs single-node and writes + // straight to the shared TiKV cluster (PD routes each key to the owning + // store), so no dispatcher / worker plumbing is needed for the build + // path. 
+ if (distCfg.enabled && !buildOnly) { + auto dispAddr = distCfg.GetDispatcherAddr(); + auto workerAddrs = ParseNodeAddrs(distCfg.workerAddrs); + auto storeAddrs = Helper::StrUtils::SplitString(distCfg.storeAddrs, ","); + + dispatcher.reset(new SPANN::DispatcherNode()); + BOOST_REQUIRE_MESSAGE(dispatcher->Initialize(dispAddr, workerAddrs), + "DispatcherNode initialization failed (build-phase setup)"); + BOOST_REQUIRE(dispatcher->Start()); + + worker.reset(new SPANN::WorkerNode()); + // Pre-build: pass nullptr DB. After BuildIndex, swap in the real DB + // via SetDB() (or rebuild the worker on top of it for run mode). + BOOST_REQUIRE_MESSAGE( + worker->Initialize(nullptr, 0, dispAddr, workerAddrs, storeAddrs), + "WorkerNode initialization failed (build-phase setup)"); + BOOST_REQUIRE(worker->Start()); + workerPtr = worker.get(); + + dispatcher->SetLocalWorkerIndex(worker->GetLocalNodeIndex()); + worker->SetHashRing(dispatcher->GetHashRing()); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "Pre-build: waiting for all peer connections...\n"); + BOOST_REQUIRE_MESSAGE(dispatcher->WaitForAllPeersConnected(180), + "Timed out waiting for peer connections (build-phase)"); + + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(180); + while (std::chrono::steady_clock::now() < deadline) { + if (dispatcher->AllWorkersAcked()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } + BOOST_REQUIRE_MESSAGE(dispatcher->AllWorkersAcked(), + "Timed out waiting for workers to ACK ring (build-phase)"); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "Pre-build: all %d workers connected and ring synchronized\n", numNodes); + + // Start heartbeat pump so remote workers can detect driver failure + // and exit cleanly instead of relying on a fixed wall-clock receiver + // timeout. Worker side enforces HeartbeatTimeoutSec (default 180s). + // Interval is fixed at 30s; six missed pings before worker bails. 
+ dispatcher->StartHeartbeat(30); + } + // Build initial index BOOST_TEST_MESSAGE("\n=== Building Index ==="); - if (rebuild || !direxists(indexPath.c_str())) { + if (rebuild || rebuildSsdOnly || !direxists(indexPath.c_str())) { + if (!rebuildSsdOnly) { + // Allow empty or non-existent directories; block only if index files already exist + if (direxists(indexPath.c_str()) && fileexists((indexPath + FolderSep + "indexloader.ini").c_str())) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Error, + "Index directory '%s' already exists with index files. Refusing to delete. " + "Remove it manually or use RebuildSSDOnly=true to resume.\n", + indexPath.c_str()); + BOOST_FAIL("Index directory already exists: " + indexPath); + return; + } + } auto buildstart = std::chrono::high_resolution_clock::now(); if (enableQuantization) @@ -932,13 +1392,13 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c quantizedBase->Save(pquanvecset); } - index = BuildLargeIndex(indexPath, pquanvecset, pmeta, pmetaidx, iniReader, dist, numSearchThreads, numInsertThreads, layers, quantizer, "quantizer.bin"); + index = BuildLargeIndex(indexPath, pquanvecset, pmeta, pmetaidx, dist, numSearchThreads, numInsertThreads, layers, quantizer, "quantizer.bin", ssdOverrides, rebuildSsdOnly, workerPtr); BOOST_REQUIRE(index != nullptr); index->SetQuantizerADC(true); } else { - index = BuildLargeIndex(indexPath, pvecset, pmeta, pmetaidx, iniReader, dist, numSearchThreads, numInsertThreads, layers); + index = BuildLargeIndex(indexPath, pvecset, pmeta, pmetaidx, dist, numSearchThreads, numInsertThreads, layers, nullptr, "quantizer.bin", ssdOverrides, rebuildSsdOnly, workerPtr); BOOST_REQUIRE(index != nullptr); } @@ -954,6 +1414,23 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c BOOST_REQUIRE(index != nullptr); } + // Set up distributed routing for RUN mode if configured. 
+ // (Build-phase needs no dispatcher/worker; the run-phase dispatcher+worker + // were created in the pre-build block above.) The driver node is both + // dispatcher (ring management) and worker 0 (compute). + if (distCfg.enabled && !buildOnly) { + // Bind worker to ALL searcher layers (wires append + headsync + lock + fetch callbacks). + // Every layer must see the worker so AddIDCapacity grows each layer's + // version map by capa * numNodes (not just capa). + auto* spannIndex = dynamic_cast*>(index.get()); + BOOST_REQUIRE(spannIndex != nullptr); + BindWorkerToAllLayers(workerPtr, spannIndex); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "Run mode: worker bound to all %d layers\n", + (int)spannIndex->GetOptions()->m_layers); + } + auto queryset = TestUtils::TestDataGenerator::LoadVectorSet(pqueryset, M); BOOST_REQUIRE(queryset != nullptr); @@ -973,32 +1450,50 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c truth = TestUtils::TestDataGenerator::LoadVectorSet(ptruth, K); } - // Benchmark 0: Query performance before insertions (round 1 — cold cache) - BOOST_TEST_MESSAGE("\n=== Benchmark 0: Query Before Insertions (Round 1) ==="); - BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, - numSearchThreads, numQueries, 0, batches, tmpbenchmark); - jsonFile << " \"benchmark0_query_before_insert\": "; - BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, - numSearchThreads, numQueries, 0, batches, jsonFile); - jsonFile << ",\n"; - jsonFile.flush(); - - // Benchmark 0b: Query performance before insertions (round 2 — warm cache) - BOOST_TEST_MESSAGE("\n=== Benchmark 0b: Query Before Insertions (Round 2) ==="); - BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, - numSearchThreads, numQueries, 0, batches, tmpbenchmark); - jsonFile << " \"benchmark0b_query_before_insert_round2\": "; - BenchmarkQueryPerformance(index, 
queryset, truth, truthPath, baseVectorCount, topK, SearchK, - numSearchThreads, numQueries, 0, batches, jsonFile); - jsonFile << ",\n"; - jsonFile.flush(); + // Benchmark 0/0b: query performance before insertions. Skip in BuildOnly + // mode (no point measuring queries when we're about to exit; queries also + // require workers to be running for distributed scatter-gather). + if (!buildOnly) { + // Benchmark 0: Query performance before insertions (round 1 — cold cache) + BOOST_TEST_MESSAGE("\n=== Benchmark 0: Query Before Insertions (Round 1) ==="); + BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, + numSearchThreads, numQueries, 0, batches, tmpbenchmark, "", + nodeIndex, workerPtr, dispatcher.get()); + jsonFile << " \"benchmark0_query_before_insert\": "; + BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, + numSearchThreads, numQueries, 0, batches, jsonFile, "", + nodeIndex, workerPtr, dispatcher.get()); + jsonFile << ",\n"; + jsonFile.flush(); + + // Benchmark 0b: Query performance before insertions (round 2 — warm cache) + BOOST_TEST_MESSAGE("\n=== Benchmark 0b: Query Before Insertions (Round 2) ==="); + BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, + numSearchThreads, numQueries, 0, batches, tmpbenchmark, "", + nodeIndex, workerPtr, dispatcher.get()); + jsonFile << " \"benchmark0b_query_before_insert_round2\": "; + BenchmarkQueryPerformance(index, queryset, truth, truthPath, baseVectorCount, topK, SearchK, + numSearchThreads, numQueries, 0, batches, jsonFile, "", + nodeIndex, workerPtr, dispatcher.get()); + jsonFile << ",\n"; + jsonFile.flush(); + } else { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "BuildOnly=true: skipping Benchmark 0/0b query rounds\n"); + jsonFile << " \"benchmark0_query_before_insert\": {},\n"; + jsonFile << " \"benchmark0b_query_before_insert_round2\": {},\n"; + jsonFile.flush(); + } 
BOOST_REQUIRE(index->SaveIndex(indexPath) == ErrorCode::Success); index = nullptr; // Benchmark 1: Insert performance - if (insertBatchSize > 0) + if (buildOnly) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "BuildOnly=true: skipping insert batches, index saved to %s\n", indexPath.c_str()); + jsonFile << " \"benchmark1_insert\": {}\n"; + } + else if (insertBatchSize > 0) { BOOST_TEST_MESSAGE("\n=== Benchmark 1: Insert Performance ==="); { @@ -1076,31 +1571,53 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Cloned index from %s to %s, check:%d, time: %f seconds\n", prevPath.c_str(), clonePath.c_str(), (int)(cloneret == ErrorCode::Success), seconds); - int insertStart = iter * insertBatchSize; + // Re-bind the worker to ALL layers of the new cloned index's searchers + // (every layer must see the worker so AddIDCapacity grows each layer's + // version map by capa * numNodes). + if (workerPtr) { + BindWorkerToIndex(workerPtr, cloneIndex); + } + + // Dispatch insert command to workers via TCP + std::uint64_t insertDispatchId = 0; + if (dispatcher && numNodes > 1) { + insertDispatchId = dispatcher->BroadcastDispatchCommand( + SPANN::DispatchCommand::Type::Insert, static_cast(iter)); + } + + // Each node inserts its partition. Default mode: contiguous slice + // [iter*batchSize + myInsertStart, +perNodeBatch). Stride mode: + // every numNodes-th row of the full batch starting at nodeIndex + // (loads full batch then filters down to perNodeBatch rows). + int insertStart = iter * insertBatchSize + myInsertStart; + int loadCount = strideShard ? 
insertBatchSize : perNodeBatch; { - std::shared_ptr addset = TestUtils::TestDataGenerator::LoadVectorSet(paddset, M, insertStart, insertBatchSize); - ByteArray quantizedAddBytes; - if (enableQuantization) { - auto addFloat = ConvertToFloatVectorSet(addset); - BOOST_REQUIRE(addFloat != nullptr); - quantizedAddBytes = ByteArray::Alloc((size_t)addFloat->Count() * (size_t)(quantizer->GetNumSubvectors())); - BOOST_REQUIRE(QuantizeVectors(quantizer, addFloat, quantizedAddBytes) == ErrorCode::Success); - addset = std::make_shared(quantizedAddBytes, - VectorValueType::UInt8, - quantizer->GetNumSubvectors(), - addFloat->Count()); - } - std::shared_ptr addmetaset = TestUtils::TestDataGenerator::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, insertBatchSize); + std::string driverTag = "RunBenchmark iter=" + std::to_string(iter); start = std::chrono::high_resolution_clock::now(); - InsertVectors(static_cast *>(cloneIndex.get()), numInsertThreads, insertBatchSize, - addset, addmetaset, numSearchDuringInsertThreads, queryset, numQueries, SearchK, &jsonFile, 0); - end = std::chrono::high_resolution_clock::now(); + LoadAndInsertBatch(static_cast*>(cloneIndex.get()), + paddset, paddmeta, paddmetaidx, M, + insertStart, loadCount, perNodeBatch, + strideShard, numNodes, nodeIndex, + numInsertThreads, workerPtr, + enableQuantization ? quantizer : nullptr, + numSearchDuringInsertThreads, queryset, + numQueries, SearchK, &jsonFile, + driverTag.c_str()); } + + // Wait for all worker nodes to finish this batch via TCP. 
+ if (insertDispatchId > 0) { + auto workerTimes = dispatcher->WaitForAllResults(insertDispatchId, 7200); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Driver: all %d workers finished batch %d\n", + (int)workerTimes.size(), iter + 1); + } + + end = std::chrono::high_resolution_clock::now(); seconds = std::chrono::duration_cast(end - start).count() / 1000000.0f; double throughput = insertBatchSize / seconds; - BOOST_TEST_MESSAGE(" Inserted: " << insertBatchSize << " vectors"); + BOOST_TEST_MESSAGE(" Inserted: " << insertBatchSize << " vectors (" << perNodeBatch << " local)"); BOOST_TEST_MESSAGE(" Time: " << seconds << " seconds"); BOOST_TEST_MESSAGE(" Throughput: " << throughput << " vectors/sec"); @@ -1164,17 +1681,21 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c BOOST_TEST_MESSAGE("\n=== Benchmark 2: Query After Insertions and Deletions ==="); jsonFile << " \"search\":"; BenchmarkQueryPerformance(cloneIndex, queryset, truth, truthPath, baseVectorCount, topK, SearchK, numSearchThreads, - numQueries, iter + 1, batches, tmpbenchmark, " "); + numQueries, iter + 1, batches, tmpbenchmark, " ", + nodeIndex, workerPtr, dispatcher.get()); BenchmarkQueryPerformance(cloneIndex, queryset, truth, truthPath, baseVectorCount, - topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, " "); + topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, " ", + nodeIndex, workerPtr, dispatcher.get()); jsonFile << ",\n"; BOOST_TEST_MESSAGE("\n=== Benchmark 2b: Query After Insertions and Deletions (Round 2) ==="); jsonFile << " \"search_round2\":"; BenchmarkQueryPerformance(cloneIndex, queryset, truth, truthPath, baseVectorCount, topK, SearchK, numSearchThreads, - numQueries, iter + 1, batches, tmpbenchmark, " "); + numQueries, iter + 1, batches, tmpbenchmark, " ", + nodeIndex, workerPtr, dispatcher.get()); BenchmarkQueryPerformance(cloneIndex, queryset, truth, truthPath, baseVectorCount, - topK, SearchK, 
numSearchThreads, numQueries, iter + 1, batches, jsonFile, " "); + topK, SearchK, numSearchThreads, numQueries, iter + 1, batches, jsonFile, " ", + nodeIndex, workerPtr, dispatcher.get()); jsonFile << ",\n"; start = std::chrono::high_resolution_clock::now(); @@ -1223,6 +1744,18 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c jsonFile << "}\n"; jsonFile.close(); + // Stop workers in distributed mode + if (dispatcher && numNodes > 1) { + // Stop the heartbeat pump first so we don't race a stray Heartbeat + // packet against the Stop dispatch on the same connection. + dispatcher->StopHeartbeat(); + auto dispatchId = dispatcher->BroadcastDispatchCommand(SPANN::DispatchCommand::Type::Stop, 0); + // Wait briefly for ACKs so workers exit cleanly before the driver + // tears down the network (which would force-kill in-flight RPCs). + dispatcher->WaitForAllResults(dispatchId, 60); + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Driver: sent Stop command to all workers\n"); + } + M = oldM; K = oldK; N = oldN; @@ -2198,6 +2731,14 @@ BOOST_AUTO_TEST_CASE(IterativeSearchPerf) std::filesystem::remove_all("original_index"); } +// Forward declaration +template +void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount, + int insertVectorCount, int batches, int topK, int numSearchThreads, + int numInsertThreads, int numQueries, VectorValueType valueType, + const std::map& ssdOverrides, + const DistributedConfig& distCfg, int workerTimeout); + BOOST_AUTO_TEST_CASE(BenchmarkFromConfig) { using namespace SPFreshTest; @@ -2245,14 +2786,59 @@ BOOST_AUTO_TEST_CASE(BenchmarkFromConfig) int topK = iniReader.GetParameter("Benchmark", "TopK", 10); int numSearchThreads = iniReader.GetParameter("Benchmark", "NumSearchThreads", 8); int numInsertThreads = iniReader.GetParameter("Benchmark", "NumInsertThreads", 8); - int appendThreadNum = iniReader.GetParameter("Benchmark", "AppendThreadNum", 0); int numSearchDuringInsertThreads = 
iniReader.GetParameter("Benchmark", "NumSearchDuringInsertThreads", 1); + int appendThreadNum = iniReader.GetParameter("Benchmark", "AppendThreadNum", 0); int numQueries = iniReader.GetParameter("Benchmark", "NumQueries", 1000); int layers = iniReader.GetParameter("Benchmark", "Layers", 1); DistCalcMethod distMethod = iniReader.GetParameter("Benchmark", "DistMethod", DistCalcMethod::L2); - bool rebuild = (iniReader.GetParameter("Benchmark", "Rebuild", true) || iniReader.GetParameter("Benchmark", "RebuildSSDOnly", false)); + bool rebuild = iniReader.GetParameter("Benchmark", "Rebuild", true); + bool rebuildSsdOnly = iniReader.GetParameter("Benchmark", "RebuildSSDOnly", false); + bool buildOnly = iniReader.GetParameter("Benchmark", "BuildOnly", false); int resume = iniReader.GetParameter("Benchmark", "Resume", -1); + // Read storage backend overrides for BuildSSDIndex + std::map ssdOverrides; + std::string storage = iniReader.GetParameter("Benchmark", "Storage", std::string("")); + if (!storage.empty()) { + ssdOverrides["Storage"] = storage; + } + std::string tikvKeyPrefix = iniReader.GetParameter("Benchmark", "TiKVKeyPrefix", std::string("")); + if (!tikvKeyPrefix.empty()) { + ssdOverrides["TiKVKeyPrefix"] = tikvKeyPrefix; + } + if (appendThreadNum > 0) { + ssdOverrides["AppendThreadNum"] = std::to_string(appendThreadNum); + } + + // Pass through any [BuildSSDIndex] section params from the ini as overrides + auto buildSSDParams = iniReader.GetParameters("BuildSSDIndex"); + for (const auto &[key, val] : buildSSDParams) { + ssdOverrides[key] = val; + } + + // Read distributed config from [Distributed] section + auto distCfg = DistributedConfig::FromIni(iniReader); + + // Shared TiKV raft cluster: every compute node connects to the FULL PD + // endpoint list. The TiKV client uses PD-raft to route reads/writes to + // whichever store owns the region, so any compute can access any posting. 
+ if (!distCfg.pdAddrs.empty()) { + ssdOverrides["TiKVPDAddresses"] = distCfg.pdAddrs; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "Using PD address: %s (workerIndex=%d)\n", + distCfg.pdAddrs.c_str(), distCfg.workerIndex); + } + + // Pass through [SelectHead] and [BuildHead] params as overrides too + auto selectHeadParams = iniReader.GetParameters("SelectHead"); + for (const auto &[key, val] : selectHeadParams) { + ssdOverrides["SelectHead." + key] = val; + } + auto buildHeadParams = iniReader.GetParameters("BuildHead"); + for (const auto &[key, val] : buildHeadParams) { + ssdOverrides["BuildHead." + key] = val; + } + BOOST_TEST_MESSAGE("=== Benchmark Configuration ==="); BOOST_TEST_MESSAGE("Vector Path: " << vectorPath); BOOST_TEST_MESSAGE("Query Path: " << queryPath); @@ -2273,31 +2859,224 @@ BOOST_AUTO_TEST_CASE(BenchmarkFromConfig) BOOST_TEST_MESSAGE("QuantizedDim: " << quantizedDim); } + // Worker node path: if distributed and workerIndex > 0, run as remote worker and return + if (distCfg.enabled && distCfg.workerIndex > 0) { + int workerTimeout = iniReader.GetParameter("Benchmark", "WorkerTimeout", 3600); + BOOST_TEST_MESSAGE("Running as worker node " << distCfg.workerIndex); + if (valueType == VectorValueType::Float) + RunWorker(indexPath, dimension, baseVectorCount, insertVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, valueType, ssdOverrides, distCfg, workerTimeout); + else if (valueType == VectorValueType::Int8) + RunWorker(indexPath, dimension, baseVectorCount, insertVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, valueType, ssdOverrides, distCfg, workerTimeout); + else if (valueType == VectorValueType::UInt8) + RunWorker(indexPath, dimension, baseVectorCount, insertVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numQueries, valueType, ssdOverrides, distCfg, workerTimeout); + return; + } + // Get output file path from environment variable or use default const char *outputPath = 
std::getenv("BENCHMARK_OUTPUT"); std::string outputFile = outputPath ? std::string(outputPath) : "output.json"; BOOST_TEST_MESSAGE("Output File: " << outputFile); - // Dispatch to appropriate type + // Driver path (nodeIndex == 0 or single-node mode) if (valueType == VectorValueType::Float) { RunBenchmark(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount, - insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, - outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers); + insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, outputFile, + rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly, buildOnly, distCfg); } else if (valueType == VectorValueType::Int8) { RunBenchmark(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount, - insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, - outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers); + insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, + outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly, buildOnly, distCfg); } else if (valueType == VectorValueType::UInt8) { RunBenchmark(vectorPath, queryPath, truthPath, distMethod, indexPath, dimension, baseVectorCount, - insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, iniReader, - outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers); + insertVectorCount, deleteVectorCount, batchNum, topK, numSearchThreads, numInsertThreads, numSearchDuringInsertThreads, numQueries, + 
outputFile, rebuild, resume, quantizerFilePath, quantizedDim, layers, ssdOverrides, rebuildSsdOnly, buildOnly, distCfg); + } +} + +/// Worker node path for distributed benchmark (nodeIndex > 0). +/// Loads a pre-built head index, connects to TiKV, starts WorkerNode, +/// and waits for TCP dispatch commands from the driver node. +template +void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount, + int insertVectorCount, int batches, int topK, int numSearchThreads, + int numInsertThreads, int numQueries, VectorValueType valueType, + const std::map& ssdOverrides, + const DistributedConfig& distCfg, int workerTimeout) +{ + int oldN = N, oldM = M, oldK = K, oldQ = queries; + N = baseVectorCount; M = dimension; K = topK; queries = numQueries; + + int nodeIndex = distCfg.workerIndex; + int numNodes = distCfg.GetNumWorkers(); + int insertBatchSize = insertVectorCount / std::max(batches, 1); + bool strideShard = IsStrideShardEnabled() && numNodes > 1; + int myInsertStart, myInsertEnd, perNodeBatch; + if (strideShard) { + myInsertStart = 0; + myInsertEnd = insertBatchSize; + perNodeBatch = static_cast(StrideCount(insertBatchSize, numNodes, nodeIndex)); + } else { + myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; + myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; + perNodeBatch = myInsertEnd - myInsertStart; + } + + BOOST_TEST_MESSAGE("Worker node " << nodeIndex << ": Loading index from " << indexPath); + std::shared_ptr index; + // IMPORTANT: Pass ssdOverrides through LoadIndex so that worker-specific settings + // (especially TiKVPDAddresses pointing at this worker's local PD) are applied + // BEFORE the underlying TiKV connection is constructed in PrepareDB. Without this, + // the worker would inherit the driver's PD address from the saved indexloader.ini + // and route every KV write back to the driver's TiKV instead of its own. 
+ BOOST_REQUIRE(VectorIndex::LoadIndex(indexPath, ssdOverrides, index) == ErrorCode::Success); + BOOST_REQUIRE(index != nullptr); + + // Create WorkerNode + auto dispAddr = distCfg.GetDispatcherAddr(); + auto workerAddrs = ParseNodeAddrs(distCfg.workerAddrs); + auto storeAddrs = Helper::StrUtils::SplitString(distCfg.storeAddrs, ","); + + auto* spannIndex = dynamic_cast*>(index.get()); + BOOST_REQUIRE_MESSAGE(spannIndex != nullptr, "Failed to cast to SPANN::Index"); + auto diskIndex = spannIndex->GetDiskIndex(0); + BOOST_REQUIRE(diskIndex != nullptr); + auto* searcher = dynamic_cast*>(diskIndex.get()); + BOOST_REQUIRE(searcher != nullptr); + auto workerDb = searcher->GetDB(); + BOOST_REQUIRE_MESSAGE(workerDb != nullptr, "Worker: could not extract db from index"); + + SPANN::WorkerNode workerNode; + BOOST_REQUIRE_MESSAGE(workerNode.Initialize(workerDb, nodeIndex, dispAddr, workerAddrs, storeAddrs), + "WorkerNode initialization failed"); + BOOST_REQUIRE(workerNode.Start()); + auto* router = &workerNode; + + // Bind worker to ALL searcher layers (every layer must see the worker so + // AddIDCapacity grows each layer's version map by capa * numNodes). + BindWorkerToAllLayers(router, spannIndex); + + // Wait for ring from dispatcher + BOOST_REQUIRE_MESSAGE(router->WaitForRing(120), + "Worker: Timed out waiting for ring from dispatcher"); + + BOOST_TEST_MESSAGE("Worker " << nodeIndex << ": Ready, numNodes=" << numNodes + << " perNodeBatch=" << perNodeBatch); + + // Build data file names + std::string typeStr = Helper::Convert::ConvertToString(valueType); + std::string paddset = "perftest_addvector.bin." + typeStr + "_" + std::to_string(insertVectorCount) + "_" + std::to_string(dimension); + std::string paddmeta = "perftest_addmeta.bin." + std::to_string(baseVectorCount) + "_" + std::to_string(insertVectorCount); + std::string paddmetaidx = "perftest_addmetaidx.bin." 
+ std::to_string(baseVectorCount) + "_" + std::to_string(insertVectorCount); + + // Load query set + int searchK = topK; + std::string pqueryset = "perftest_query.bin." + typeStr + "_" + std::to_string(numQueries) + "_" + std::to_string(dimension); + auto queryset = TestUtils::TestDataGenerator::LoadVectorSet(pqueryset, dimension); + BOOST_REQUIRE_MESSAGE(queryset != nullptr, "Worker: Failed to load query set from " << pqueryset); + + // Register dispatch callback + std::promise stopPromise; + auto stopFuture = stopPromise.get_future(); + std::once_flag stopOnce; + + router->SetDispatchCallback([&](const SPANN::DispatchCommand& cmd) -> SPANN::DispatchResult { + SPANN::DispatchResult result; + result.m_dispatchId = cmd.m_dispatchId; + result.m_round = cmd.m_round; + + if (cmd.m_type == SPANN::DispatchCommand::Type::Stop) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Stop command received\n", nodeIndex); + std::call_once(stopOnce, [&]() { stopPromise.set_value(); }); + result.m_status = SPANN::DispatchResult::Status::Success; + return result; + } + + if (cmd.m_type == SPANN::DispatchCommand::Type::Heartbeat) { + // Driver sends a Heartbeat every HeartbeatIntervalSec; the result + // is dropped by DispatchCoordinator. Acknowledge silently so we + // don't log noise every 30s during the insert phase. 
+ result.m_status = SPANN::DispatchResult::Status::Success; + return result; + } + + if (cmd.m_type == SPANN::DispatchCommand::Type::Search) { + int myStart = (int)((long long)nodeIndex * numQueries / numNodes); + int myEnd = (int)((long long)(nodeIndex + 1) * numQueries / numNodes); + int myCount = myEnd - myStart; + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Search round %u - %d queries [%d, %d)\n", + nodeIndex, cmd.m_round, myCount, myStart, myEnd); + + std::vector results; + double wallTime = ExecutePartitionedSearch( + index.get(), queryset, myStart, myCount, searchK, + std::min(numSearchThreads, myCount), + results, /*latenciesOut=*/nullptr, /*statsOut=*/nullptr); + + // Drain merge hints accumulated during this search round. + // Search-side AsyncMergeInSearch on remote-owned heads enqueues + // notifications via QueueRemoteMerge; auto-flush only fires when + // a per-target bucket reaches kMergeAutoFlushThreshold, so the + // tail of every round (and any sparse rounds) needs an explicit + // drain to guarantee no hint is dropped. + router->FlushRemoteMerges(); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Search round %u done - %.1fms\n", + nodeIndex, cmd.m_round, wallTime * 1000); + result.m_status = SPANN::DispatchResult::Status::Success; + result.m_wallTime = wallTime; + return result; + } + + if (cmd.m_type == SPANN::DispatchCommand::Type::Insert) { + int insertStart = cmd.m_round * insertBatchSize + myInsertStart; + int loadCount = strideShard ? insertBatchSize : perNodeBatch; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u - inserting %d vectors (offset %d, strideShard=%d)\n", + nodeIndex, cmd.m_round + 1, perNodeBatch, insertStart, strideShard ? 
1 : 0); + + auto t1 = std::chrono::high_resolution_clock::now(); + std::string workerTag = + "Worker " + std::to_string(nodeIndex) + " batch=" + std::to_string(cmd.m_round + 1); + LoadAndInsertBatch(spannIndex, paddset, paddmeta, paddmetaidx, dimension, + insertStart, loadCount, perNodeBatch, + strideShard, numNodes, nodeIndex, + numInsertThreads, router, + /*quantizer=*/nullptr, + /*searchDuringInsertThreads=*/0, + /*queryset=*/nullptr, + /*numQueries=*/0, /*searchK=*/5, + /*benchmarkData=*/nullptr, + workerTag.c_str()); + auto t2 = std::chrono::high_resolution_clock::now(); + double secs = std::chrono::duration_cast(t2 - t1).count() / 1000000.0; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u done - %d vectors in %.2f s (%.1f vec/s)\n", + nodeIndex, cmd.m_round + 1, perNodeBatch, secs, perNodeBatch / secs); + + result.m_status = SPANN::DispatchResult::Status::Success; + result.m_wallTime = secs; + return result; + } + + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Worker %d: Unknown command type %d\n", + nodeIndex, (int)cmd.m_type); + result.m_status = SPANN::DispatchResult::Status::Failed; + return result; + }); + + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Waiting for dispatch commands\n", nodeIndex); + + auto status = stopFuture.wait_for(std::chrono::seconds(workerTimeout)); + if (status == std::future_status::timeout) { + SPTAGLIB_LOG(Helper::LogLevel::LL_Warning, "Worker %d: Timeout after %ds\n", nodeIndex, workerTimeout); } - //std::filesystem::remove_all(indexPath); + router->ClearDispatchCallback(); + N = oldN; M = oldM; K = oldK; queries = oldQ; + BOOST_TEST_MESSAGE("Worker " << nodeIndex << ": Shutting down"); } BOOST_AUTO_TEST_SUITE_END() diff --git a/Test/src/TestDataGenerator.cpp b/Test/src/TestDataGenerator.cpp index cb3318548..c32f19e0a 100644 --- a/Test/src/TestDataGenerator.cpp +++ b/Test/src/TestDataGenerator.cpp @@ -274,7 +274,8 @@ void TestDataGenerator::GenerateBatchTruth(const std::string &filename, std:: } template 
-float TestDataGenerator::EvaluateRecall(const std::vector &res, std::shared_ptr &truth, int recallK, int k, int batch, int totalbatches) +float TestDataGenerator::EvaluateRecall(const std::vector &res, std::shared_ptr &truth, int recallK, int k, int batch, int totalbatches, + int totalQueries, int queryOffset) { if (!truth) { @@ -285,14 +286,17 @@ float TestDataGenerator::EvaluateRecall(const std::vector recallK = min(recallK, static_cast(truth->Dimension())); float totalRecall = 0.0f; float eps = 1e-4f; - SizeType distbase = truth->Count() - (totalbatches + 1) * res.size(); + // Use global queryCount when caller provides it (distributed path); otherwise + // assume single-node where res.size() IS the global query count. + SizeType queryCount = (totalQueries > 0) ? static_cast(totalQueries) : static_cast(res.size()); + SizeType distbase = truth->Count() - (totalbatches + 1) * queryCount; for (SizeType i = 0; i < res.size(); ++i) { - const SizeType *truthNN = reinterpret_cast(truth->GetData()) + batch * res.size() + i; + const SizeType *truthNN = reinterpret_cast(truth->GetVector(batch * queryCount + queryOffset + i)); float *truthD = nullptr; if (truth->Count() > distbase) { - truthD = reinterpret_cast(truth->GetVector(distbase + batch * res.size() + i)); + truthD = reinterpret_cast(truth->GetVector(distbase + batch * queryCount + queryOffset + i)); } for (int j = 0; j < recallK; ++j) { diff --git a/Test/src/main.cpp b/Test/src/main.cpp index c1a5cde60..ab8d1342c 100644 --- a/Test/src/main.cpp +++ b/Test/src/main.cpp @@ -7,9 +7,7 @@ #include #include -#ifdef TIKV #include -#endif using namespace boost::unit_test; @@ -38,9 +36,8 @@ struct GlobalFixture // adds GraphCycles bookkeeping under a global spinlock on every Lock(); // observed to consume ~12% CPU under high worker-thread parallelism in // gRPC client paths (perf-recorded 2026-05-06). 
-#ifdef TIKV - absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); -#endif + absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); + SPTAGVisitor visitor; traverse_test_tree(framework::master_test_suite(), visitor, false); } diff --git a/benchmark.ini b/benchmark.ini new file mode 100644 index 000000000..e2b400767 --- /dev/null +++ b/benchmark.ini @@ -0,0 +1,19 @@ +[Benchmark] +VectorPath=sift1b/base.100M.u8bin +QueryPath=sift1b/query.public.10K.u8bin +TruthPath=none +IndexPath=proidx/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=10000 +InsertVectorCount=10000 +DeleteVectorCount=0 +BatchNum=10 +TopK=5 +NumThreads=8 +NumQueries=100 +DistMethod=L2 +Rebuild=true +Resume=-1 +QuantizerFilePath=quantizer.bin +QuantizedDim=64 diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md new file mode 100644 index 000000000..1f24bc865 --- /dev/null +++ b/evaluation/distributed/README.md @@ -0,0 +1,294 @@ +# Distributed Benchmark Evaluation — Insert Dominant + +Multi-machine SPTAG SPANN distributed benchmark for an **insert-dominant** workload +(1M base + 10M inserts in 10 batches, with concurrent search-during-insert) on +SIFT1B. Each physical node runs its own independent PD + TiKV (no shared Raft +replication — see "TiKV deployment model" below). + +## Files in this folder + +| File | Purpose | +| --- | --- | +| `configs/benchmark_insert_dominant_template.ini` | Benchmark template; `run_distributed.sh` fills `IndexPath`, `TiKVPDAddresses`, `TiKVKeyPrefix`, and `[Distributed]` from `cluster.conf`. | +| `run_distributed.sh` | Orchestrator: `deploy` / `start-tikv` / `run` / `bench` / `stop-tikv` / `cleanup`. | +| `README.md` | This file. 
| + +## Architecture + +``` + ┌──────────────┐ + │ Driver │ (node 0) + │ RunBenchmark│ + │ + Router │ + └──┬───┬───┬──┘ + TCP Dispatch│ │ │ + ┌────────┘ │ └────────┐ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ Worker 1 │ │ Worker 2 │ │ Worker N │ + │ + Router│ │ + Router│ │ + Router│ + └────┬─────┘ └────┬─────┘ └────┬─────┘ + │ │ │ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ TiKV 1 │ │ TiKV 2 │ │ TiKV N │ (one PD + one TiKV per node) + └──────────┘ └──────────┘ └──────────┘ +``` + +- **Driver** (node 0): Builds the index, sends Search/Insert/Stop commands via TCP dispatch. +- **Workers** (nodes 1..N): Receive commands, execute their shard locally, report results back. +- **TiKV (per node)**: Each node runs its own independent PD + TiKV pair. Postings + for a head live on the node that owns that head's hash partition. +- **PostingRouter**: Hash-based head routing, remote append, head sync, dispatch protocol. + +## TiKV deployment model + +Unlike a single-machine multi-docker TiKV (3 PD + 3 TiKV behind 127.0.0.1 ports +22791-3 / 20161-3 sharing one Raft cluster), in this multi-machine setup **each +node runs its own isolated PD + TiKV pair** under host networking. Heads are +routed to nodes by hash, and each node's TiKV stores only its own shard. There +is no Raft replication between nodes (no cross-node region quorum), which is +intentional for insert-dominated benchmarks where Raft log overhead would dominate. + +Per-node ports (defaults from `cluster.conf`): + +| Service | Port | Notes | +| --- | --- | --- | +| PD client | `2379` | Local app uses `<node-ip>:2379`. | +| PD peer | `2380` | Inter-PD; isolated cluster of 1 PD per node. | +| TiKV client | `20161` | The node-local SPTAG worker connects here. | +| Router | `30001+` | TCP dispatch / posting routing between nodes. | + +## Prerequisites + +- `Release/SPTAGTest` built with TiKV support on the driver node: + ```bash + cd /path/to/SPTAG # repository root + cd ThirdParty/kvproto && ./generate_cpp.sh && cd ../.. 
+ mkdir -p Release && cd Release + cmake .. -DTIKV=ON -DTBB=ON -DCMAKE_BUILD_TYPE=Release -DGPU=OFF + cmake --build . --target SPTAGTest -j$(nproc) + ``` + *Note: building the full project may fail on the Java wrapper (`JAVASPTAGFileIO`) + due to a pre-existing `FileIOInterface.h` signature mismatch — the `SPTAGTest` + target alone is sufficient.* +- Passwordless SSH from driver to every other node (configure `ssh_key` in `cluster.conf`). +- Docker installed on every node (TiKV/PD run as containers in host network mode). +- Same dataset path on every node (default `/mnt/nvme/sift1b/`): + - `/mnt/nvme/sift1b/bigann_base.u8bin` (1B × 128 × u8) + - `/mnt/nvme/sift1b/query.10K.u8bin` +- Same fast-storage path for index + TiKV data on every node (`data_dir` in `cluster.conf`, + default `/mnt/nvme`). + +## Step 1 — Cluster config + +```bash +cp evaluation/distributed/cluster.conf.example cluster.conf +vim cluster.conf +``` + +Example: + +```ini +[cluster] +ssh_user=superbench +sptag_dir=/home/superbench/zhangt/SPTAG +data_dir=/mnt/nvme +tikv_version=v7.5.1 +pd_version=v7.5.1 + +[nodes] +# host router_port +10.0.1.1 30001 # driver (always first) +10.0.1.2 30002 # worker 1 +10.0.1.3 30003 # worker 2 + +[tikv] +# host pd_client pd_peer tikv_port +10.0.1.1 2379 2380 20161 +10.0.1.2 2379 2380 20161 +10.0.1.3 2379 2380 20161 +``` + +`run_distributed.sh` reads this file to fill the template's `[Distributed]`, +`TiKVPDAddresses`, `IndexPath`, and `TiKVKeyPrefix` automatically. + +## Step 2 — Deploy + +```bash +./evaluation/distributed/run_distributed.sh deploy cluster.conf +``` + +This rsyncs `Release/SPTAGTest` (and required shared libs) to every node and +ensures the per-node TiKV / PD data directories exist under `data_dir`. + +## Step 3 — Start TiKV (per-node, independent) + +```bash +./evaluation/distributed/run_distributed.sh start-tikv cluster.conf +``` + +This starts one PD + one TiKV per node in host-network containers. 
Single-replica +placement (`max-replicas=1`) is set so we measure benchmark performance without +3-way Raft replication. + +Health check (run on driver, repeat per node): + +```bash +for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do + curl -s "http://$ip:2379/pd/api/v1/stores" \ + | python3 -c 'import json,sys; print([s["store"]["state_name"] for s in json.load(sys.stdin)["stores"]])' +done +# Each node should report ['Up']. +``` + +### Pre-split & scatter (optional but recommended) + +For the insert-dominant workload to spread region writes evenly across regions +within a node's TiKV, pre-split the keyspace at boundaries derived from +`DBKey(headID) = MaxID*layer + headID` little-endian byte 0. The TiKV raw key is +`TiKVKeyPrefix + "_" + uint32_le(DBKey)`; for multi-chunk it appends `\x00` / +`\x02` for chunk / count keys, but we split *only* on the head-key prefix so all +chunk and count variants for a head share a region. Boundaries used: `0x02, 0x04, +…, 0xfe` (127 split points → 128 regions). 
+ +Driver-side helper (each PD is independent, so run per node): + +```bash +PREFIX="bench_insert_dominant_3node" # keep in sync with KEY_PREFIX in run_distributed.sh +for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do + PD="http://$ip:2379" + PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v7.5.1 -u "$PD") + python3 - "$PREFIX" "${PDCTL[@]}" <<'PY' +import json, subprocess, sys +prefix = sys.argv[1].encode() + b'_' +pdctl = sys.argv[2:] +def run(args): return subprocess.check_output(pdctl + args, text=True) +def region_for(hex_key): return json.loads(run(['region', 'key', '--format=hex', hex_key]))['id'] +for b in range(2, 256, 2): + key = (prefix + bytes([b, 0, 0, 0])).hex() + rid = region_for(key) + run(['operator', 'add', 'split-region', str(rid), '--policy=usekey', '--keys', key]) +for r in json.loads(run(['region', 'scan']))['regions']: + run(['operator', 'add', 'scatter-region', str(r['id'])]) +PY +done +``` + +Skip this on the very first run if you don't have load skew — `start-tikv` works +without it. For 1B-scale insert-dominant runs on a single node it materially +reduces head-region hot-spotting. + +## Step 4 — Run the benchmark + +```bash +# Single scale, explicit node count (driver + (N-1) workers): +./evaluation/distributed/run_distributed.sh run cluster.conf insert_dominant 3 + +# Or sweep 1-node baseline + N-node distributed for one or more scales: +./evaluation/distributed/run_distributed.sh bench cluster.conf insert_dominant +``` + +What `run` does: + +1. **Build** (driver only): driver builds the index locally with router + *disabled* (`Rebuild=true`, no `[Router]`). Output goes to `…_n0/spann_index`. +2. **Distribute**: rsync head index + perftest files from driver to each worker. +3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` and + the per-node ini (router enabled, `Rebuild=false`). +4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. 
The + driver dispatches Insert / Search commands across batches via TCP. +5. **Collect**: driver sends Stop, joins worker logs into `benchmark_logs/`. + +Useful environment overrides (see header of `run_distributed.sh`): + +- `NOCACHE=1` — disable TiKV block cache, OS pagecache, and `VersionCacheMaxChunks`. +- `BUILD_WITH_CACHE=1` — build with caches, then drop caches before search/insert (NOCACHE only). +- `SKIP_TIKV_SWAP=1` — when using `BUILD_WITH_CACHE`, skip the destructive TiKV + container restart that has corrupted recall at 100M scale. +- `SKIP_SAVE_LOAD=1` — skip post-build SaveIndex / per-batch Load+Clone+Save (NOCACHE only). +- `SKIP_HEAD_BUILD=1` — reuse existing HeadIndex if present (RebuildSSDOnly). + +## Step 5 — Stop / cleanup + +```bash +./evaluation/distributed/run_distributed.sh stop-tikv cluster.conf +./evaluation/distributed/run_distributed.sh cleanup cluster.conf # remove deployed files +``` + +## Key knobs in `benchmark_insert_dominant_template.ini` + +| Key | Value | Meaning | +| --- | --- | --- | +| `BaseVectorCount` | 1_000_000 | Initial index build size. | +| `InsertVectorCount` / `BatchNum` | 10_000_000 / 10 | 10 batches × 1M inserts. | +| `NumSearchThreads` | 4 | Threads for the standalone post-batch query benchmark. | +| `NumInsertThreads` | 16 | Threads driving `AddIndex` calls on the driver. | +| `AppendThreadNum` | 144 | Async append worker pool size — overprovisioned (≈3× cores) because each thread is I/O-bound on TiKV RPCs, so high concurrency increases in-flight RPCs. | +| `NumSearchDuringInsertThreads` | 1 | Concurrent search threads while inserting (continuous loop, ~1s sleep per query). | +| `NumQueries` | 200 | Size of the rotating query pool (in-insert search loops over it). | +| `WorkerTimeout` | 14400 | Seconds a worker waits for the driver before exiting. | +| `Storage` / `TiKVKeyPrefix` / `TiKVPDAddresses` | `TIKVIO` / filled / filled | Filled by `run_distributed.sh` from `cluster.conf`. 
| +| `Layers` | 2 | SPANN multi-layer head. | +| `BuildSSDIndex.UseMultiChunkPosting` | false | Single-key posting layout (one TiKV value per head). | +| `BuildSSDIndex.PostingPageLimit` | 8 | Posting page limit; runtime cap is logged as ~246 vectors. | +| `BuildSSDIndex.PostingCountCacheCapacity` | 1_000_000 | Posting-count cache capacity. | +| `BuildSSDIndex.DistributedVersionMap` | true | Use TiKV-backed distributed version map. | +| `BuildSSDIndex.ReassignK` | 64 | Split/reassign target fanout knob. | +| `BuildSSDIndex.AsyncMergeInSearch` | true | Async merge during search. | +| `BuildSSDIndex.VersionCacheMaxChunks` | 100_000 | Local version-chunk cache (set ≤0 to disable). | +| `BuildSSDIndex.LatencyLimit` | 100 | ms latency cap fed to SPANN. | +| `BuildSSDIndex.MaxCheck` | 8192 | Max posting checks per query. | +| `BuildSSDIndex.SearchInternalResultNum` | 64 | Internal candidate count during search. | + +## Output JSON structure (per batch) + +For each insert batch, `output.json/results.benchmark1_insert.batch_N` contains: + +- `Load timeSeconds` / `Load vectorCount` — reload of previous batch. +- `Clone timeSeconds`. +- In-insert concurrent search stats (continuous-loop variant): + `numQueries` (actual count issued), `meanLatency`, `p50/p90/p95/p99`, `qps`, + `batch barrier waitSeconds`. +- `inserted`, `insert timeSeconds`, `insert throughput`. +- `search` and `search_round2` — standalone `BenchmarkQueryPerformance` results + against the post-batch index (cold + warm), independent of the in-insert numbers. +- `save timeSeconds`. + +Pre-insert baseline lives at `results.benchmark0_query_before_insert` and +`results.benchmark0b_query_before_insert_round2`. + +## Dispatch Protocol + +The TCP dispatch protocol replaces file-based barriers. 
Communication flows through +PostingRouter's existing TCP transport: + +| Packet | Direction | Purpose | +|--------|-----------|---------| +| `DispatchCommand (0x09)` | Driver → Worker | Search/Insert/Stop with `dispatchId` + round. | +| `DispatchResult (0x89)` | Worker → Driver | Status + wallTime for aggregation. | + +- **Search**: Driver broadcasts to workers, runs local queries in parallel, collects + wall times for percentile stats. +- **Insert**: Driver broadcasts batch index, workers insert their shard, driver + waits for all to finish. +- **Stop**: Driver sends at end of benchmark; workers exit gracefully. + +Each command has a unique `dispatchId` (monotonic uint64) to avoid round collisions +between search and insert operations. + +## Troubleshooting + +- **Workers don't connect**: confirm `RouterNodeAddrs` ports (default 30001+) are + reachable between every pair of nodes — the router uses TCP with 2 io_context + threads. +- **TiKV timeout**: ensure each node's PD `advertise-client-urls` use a reachable + IP (not 127.0.0.1) — `start-tikv` sets this from `cluster.conf`. Check + `docker logs sptag-pd-0` on the affected node. +- **Worker exits prematurely**: check the worker logs in `benchmark_logs/`. + Common causes: TiKV not ready, index path mismatch, router connection failure. +- **Build fails on Java wrapper**: pre-existing issue unrelated to the benchmark. + Build only what's needed: + ```bash + cmake --build . --target SPTAGTest -j$(nproc) + ``` diff --git a/evaluation/distributed/configs/benchmark_100m_1node.ini b/evaluation/distributed/configs/benchmark_100m_1node.ini new file mode 100644 index 000000000..42ec07f49 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_100m_1node.ini @@ -0,0 +1,71 @@ +; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). +; 100× larger base index than insert_dominant. 
Tests how the system behaves when +; the head index is large (~tens of millions of heads on layer 0) and the insert +; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +; +; Notes for 100M-scale operation: +; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; +; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the +; HeadIndex on disk is intact. +; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. +; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; +; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. +; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need +; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_100m_1node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=99000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=bench100m_1node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=10000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 
+WorkerAddrs=10.11.0.7:30011 +StoreAddrs=10.11.0.7:20171 +PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_100m_2node.ini b/evaluation/distributed/configs/benchmark_100m_2node.ini new file mode 100644 index 000000000..01b9c3e81 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_100m_2node.ini @@ -0,0 +1,71 @@ +; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). +; 100× larger base index than insert_dominant. Tests how the system behaves when +; the head index is large (~tens of millions of heads on layer 0) and the insert +; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +; +; Notes for 100M-scale operation: +; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; +; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the +; HeadIndex on disk is intact. +; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. +; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; +; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. +; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need +; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). 
+[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_100m_2node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=99000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=false +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=bench100m_2node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=10000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 +StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 +PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_100m_template.ini b/evaluation/distributed/configs/benchmark_100m_template.ini new file mode 100644 index 000000000..4a69f39a4 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_100m_template.ini @@ -0,0 +1,71 @@ +; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). +; 100× larger base index than insert_dominant. Tests how the system behaves when +; the head index is large (~tens of millions of heads on layer 0) and the insert +; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). 
+; +; Notes for 100M-scale operation: +; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; +; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the +; HeadIndex on disk is intact. +; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. +; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; +; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. +; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need +; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=PLACEHOLDER +ValueType=UInt8 +Dimension=128 +BaseVectorCount=99000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=PLACEHOLDER + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=10000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=PLACEHOLDER +WorkerAddrs=PLACEHOLDER +StoreAddrs=PLACEHOLDER +PDAddrs=PLACEHOLDER diff --git a/evaluation/distributed/configs/benchmark_10m_1node.ini b/evaluation/distributed/configs/benchmark_10m_1node.ini new file mode 100644 index 000000000..56dbd9088 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_10m_1node.ini @@ -0,0 +1,62 @@ +; 10m: 9M base + 1M insert (insert is ~10% 
of base, "growing-index" workload). +; 10× larger base index than insert_dominant, 10× smaller than 100m. +; Useful for validating scaling between 1M and 100M without paying the +; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset +; (truncated to 10M of the 1B available). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_10m_1node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=9000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=bench10m_1node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011 +StoreAddrs=10.11.0.7:20171 +PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_10m_2node.ini b/evaluation/distributed/configs/benchmark_10m_2node.ini new file mode 100644 index 000000000..4ed317ac3 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_10m_2node.ini @@ -0,0 +1,62 @@ +; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload). +; 10× larger base index than insert_dominant, 10× smaller than 100m. 
+; Useful for validating scaling between 1M and 100M without paying the +; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset +; (truncated to 10M of the 1B available). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_10m_2node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=9000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=false +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=bench10m_2node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 +StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 +PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_10m_template.ini b/evaluation/distributed/configs/benchmark_10m_template.ini new file mode 100644 index 000000000..f40203559 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_10m_template.ini @@ -0,0 +1,62 @@ +; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload). +; 10× larger base index than insert_dominant, 10× smaller than 100m. 
+; Useful for validating scaling between 1M and 100M without paying the +; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset +; (truncated to 10M of the 1B available). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=PLACEHOLDER +ValueType=UInt8 +Dimension=128 +BaseVectorCount=9000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=PLACEHOLDER + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=1000000 +AsyncRpcMaxInflight=512 + +[Distributed] +Enabled=true +DispatcherAddr=PLACEHOLDER +WorkerAddrs=PLACEHOLDER +StoreAddrs=PLACEHOLDER +PDAddrs=PLACEHOLDER diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini new file mode 100644 index 000000000..30fe77bbe --- /dev/null +++ b/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini @@ -0,0 +1,58 @@ +; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. +; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). 
+; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). +[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_insert_dominant_1node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=1000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=benchinsert_dominant_1node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=100000 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011 +StoreAddrs=10.11.0.7:20171 +PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini new file mode 100644 index 000000000..d45870b50 --- /dev/null +++ b/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini @@ -0,0 +1,58 @@ +; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. +; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). 
+[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=/mnt/nvme/proidx_insert_dominant_2node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=1000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=false +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=benchinsert_dominant_2node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=100000 + +[Distributed] +Enabled=true +DispatcherAddr=10.11.0.7:30001 +WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 +StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 +PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini new file mode 100644 index 000000000..a8050732d --- /dev/null +++ b/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini @@ -0,0 +1,59 @@ +; insert_dominant: 1M base + 10M insert (10× scale-up) with concurrent search-during-insert. +; Tests how the index handles insertion-dominated workloads where insertion volume +; is much larger than the initial baseline. Layers=2, L2 distance, SIFT1B dataset. +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). 
+[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/data/sift1b/base.1B.u8bin +QueryPath=/mnt/data/sift1b/query.public.10K.u8bin +TruthPath=truth +IndexPath=/mnt/md0/proidx_insert_dominant_3node/spann_index +ValueType=UInt8 +Dimension=128 +BaseVectorCount=1000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=false +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=benchinsert_dominant_3node + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=100000 + +[Distributed] +Enabled=true +DispatcherAddr=172.27.0.4:30001 +WorkerAddrs=172.27.0.4:30011,172.27.0.5:30002,172.27.0.6:30003 +StoreAddrs=172.27.0.4:20171,172.27.0.5:20171,172.27.0.6:20171 +PDAddrs=172.27.0.4:23791,172.27.0.5:23791,172.27.0.6:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_template.ini b/evaluation/distributed/configs/benchmark_insert_dominant_template.ini new file mode 100644 index 000000000..f8085c03b --- /dev/null +++ b/evaluation/distributed/configs/benchmark_insert_dominant_template.ini @@ -0,0 +1,58 @@ +; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. +; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). +; +; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from +; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). 
+[Benchmark] +WorkerTimeout=14400 +VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin +QueryPath=/mnt/nvme/sift1b/query.10K.u8bin +TruthPath=truth +IndexPath=PLACEHOLDER +ValueType=UInt8 +Dimension=128 +BaseVectorCount=1000000 +InsertVectorCount=1000000 +DeleteVectorCount=0 +BatchNum=1 +TopK=5 +NumSearchThreads=4 +NumInsertThreads=4 +AppendThreadNum=16 +NumSearchDuringInsertThreads=1 +NumQueries=200 +DistMethod=L2 +Rebuild=true +BuildOnly=false +Resume=-1 +Layers=2 + +Storage=TIKVIO +TiKVPDAddresses=PLACEHOLDER +TiKVKeyPrefix=PLACEHOLDER + +[SelectHead] +ParallelBKTBuild=true + +[BuildHead] +ParallelBKTBuild=true + +[BuildSSDIndex] +LatencyLimit=100 +MaxCheck=8192 +SearchInternalResultNum=64 +UseMultiChunkPosting=false +PostingPageLimit=8 +PostingCountCacheCapacity=1000000 +SearchCheckVersionMapOnlyLayer0=true +DistributedVersionMap=true +ReassignK=64 +AsyncMergeInSearch=true +VersionCacheMaxChunks=100000 + +[Distributed] +Enabled=true +DispatcherAddr=PLACEHOLDER +WorkerAddrs=PLACEHOLDER +StoreAddrs=PLACEHOLDER +PDAddrs=PLACEHOLDER diff --git a/evaluation/distributed/configs/cluster_2node.conf b/evaluation/distributed/configs/cluster_2node.conf new file mode 100644 index 000000000..f94500487 --- /dev/null +++ b/evaluation/distributed/configs/cluster_2node.conf @@ -0,0 +1,31 @@ +# 2-node cluster: driver/worker0 on dev-000003 (10.11.0.7), +# worker1 on dev-000006 (10.11.0.10). +# On 000006, /mnt/nvme is symlinked to /mnt_ssd/data7/sptag-bench (data lives on data7 NVMe). +# +# Cluster mode: SHARED TiKV raft cluster. Both PDs form one raft group; both +# TiKVs share the same cluster (max-replicas=1, so each region lives on +# exactly one store and PD routes reads to it). Compute nodes are stateless +# TiKV clients — no cross-compute fetch RPCs during RNGSelection. +[cluster] +ssh_user=superbench +ssh_key=/home/superbench/.ssh/id_rsa +sptag_dir=/home/superbench/zhangt/SPTAG +data_dir=/mnt/nvme +tikv_version=v8.5.1 +pd_version=v8.5.1 +# Image refs (optional). 
Defaults: +# tikv_image=${helper_image} (tikv-server ${tikv_version} is bind-mounted from bin_dir) +# pd_image=${helper_image} (pd-server ${pd_version} is bind-mounted from bin_dir) +# helper_image=mcr.microsoft.com/mirror/docker/library/ubuntu:22.04 +# Override here to use different registries / replace with pingcap/* etc. + +[nodes] +# host router_port +# node 0 (driver) router_port must differ from dispatcher port (hardcoded 30001). +10.11.0.7 30011 +10.11.0.10 30002 + +[tikv] +# host pd_client_port pd_peer_port tikv_port +10.11.0.7 23791 23801 20171 +10.11.0.10 23791 23801 20171 diff --git a/evaluation/distributed/configs/cluster_3node.conf b/evaluation/distributed/configs/cluster_3node.conf new file mode 100644 index 000000000..ff2ba8af4 --- /dev/null +++ b/evaluation/distributed/configs/cluster_3node.conf @@ -0,0 +1,34 @@ +# 3-node cluster: driver/worker0 on 172.27.0.4, +# worker1 on 172.27.0.5 (20.92.202.166), +# worker2 on 172.27.0.6 (20.5.138.158). +# Data lives on /mnt/md0 (NVMe RAID0, ~11T per node). +# +# Cluster mode: SHARED TiKV raft cluster. All PDs form one raft group; all +# TiKVs share the same cluster (max-replicas=1, so each region lives on +# exactly one store and PD routes reads to it). Compute nodes are stateless +# TiKV clients — no cross-compute fetch RPCs during RNGSelection. +[cluster] +ssh_user=azureuser +ssh_key=/home/azureuser/.ssh/id_rsa +sptag_dir=/home/azureuser/zhangt/SPTAG +data_dir=/mnt/md0 +tikv_version=v8.5.1 +pd_version=v8.5.1 +# Image refs (optional). Defaults: +# tikv_image=${helper_image} (tikv-server ${tikv_version} is bind-mounted from bin_dir) +# pd_image=${helper_image} (pd-server ${pd_version} is bind-mounted from bin_dir) +# helper_image=mcr.microsoft.com/mirror/docker/library/ubuntu:22.04 +# Override here to use different registries / replace with pingcap/* etc. + +[nodes] +# host router_port +# node 0 (driver) router_port must differ from dispatcher port (hardcoded 30001).
+172.27.0.4 30011 +172.27.0.5 30002 +172.27.0.6 30003 + +[tikv] +# host pd_client_port pd_peer_port tikv_port +172.27.0.4 23791 23801 20171 +172.27.0.5 23791 23801 20171 +172.27.0.6 23791 23801 20171 diff --git a/evaluation/distributed/configs/tikv.toml b/evaluation/distributed/configs/tikv.toml new file mode 100755 index 000000000..4ba5282c0 --- /dev/null +++ b/evaluation/distributed/configs/tikv.toml @@ -0,0 +1,74 @@ +memory-usage-limit = "80GB" + +[server] +# v41: 16 → 32 to handle higher concurrent gRPC streams. 96-core host has +# plenty of headroom; previous setting was a default-y stab in the dark. +grpc-concurrency = 32 +grpc-memory-pool-quota = "16GB" + +[raftstore] +region-max-size = "512MB" +region-split-size = "384MB" +region-max-keys = 5120000 +region-split-keys = 3840000 +# v41: 4 → 32. apply-pool is the path raft-log → RocksDB writes go through. +# At 32 concurrent RMW ops per store (4 local insert + 16 receiver sub-workers +# + 4 search + 4 search-during-insert + misc), a 4-thread apply pool meant +# ~8× queue depth, which is the primary write-amp source we observed +# (TiKV at 7/96 cores while ops are still queueing). +apply-pool-size = 32 +# v41: 4 → 16. store-pool routes raft messages between peers and to apply. +store-pool-size = 16 +# v41: batch up raft entries per fsync. If we're disk-fsync bound (likely), +# this directly amortizes the sync cost. +raft-write-batch-size = "1MB" + +[storage] +reserve-space = "1GB" +# v41: 4 (default) → 16. KV scheduler is the front-end before raftstore. +scheduler-worker-pool-size = 16 + +[storage.block-cache] +capacity = "60GB" + +# v41: new section. Read pool default = 0.8×CPU = 76 on 96-core host, which +# would let reads steal CPU from writes. Cap at 32 to leave room for write +# path. Min 8 ensures reads stay responsive under light load. 
+[readpool.unified] +max-thread-count = 32 +min-thread-count = 8 + +[rocksdb] +max-background-jobs = 32 +max-sub-compactions = 8 +# v41: 8 dedicated flush threads (subset of max-background-jobs). Reduces +# the chance that compaction monopolizes background-jobs and starves flushes. +max-background-flushes = 8 +rate-bytes-per-sec = "0" + +[rocksdb.defaultcf] +# v41: 512MB → 1GB. Bigger memtable means fewer flushes (and thus fewer L0 +# files), reducing the chance of slowdown/stop write triggers under burst. +write-buffer-size = "1GB" +# v41: 5 → 8. More memtables = more headroom before flush back-pressure. +max-write-buffer-number = 8 +min-write-buffer-number-to-merge = 2 +level0-file-num-compaction-trigger = 12 +# v41: 28 → 40, 40 → 60. Loosen the L0 stall thresholds so bursts have more +# slack. With 10K-item chunks (v39+) we generate more small writes than v38 +# did, so we hit slowdown more often. +level0-slowdown-writes-trigger = 40 +level0-stop-writes-trigger = 60 +max-bytes-for-level-base = "2GB" +compression-per-level = ["no", "no", "no", "lz4", "lz4", "zstd", "zstd"] +target-file-size-base = "128MB" + +[rocksdb.writecf] +write-buffer-size = "128MB" +max-write-buffer-number = 5 + +[coprocessor] +region-max-size = "512MB" +region-split-size = "384MB" +region-max-keys = 5120000 +region-split-keys = 3840000 diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh new file mode 100755 index 000000000..c383a7eed --- /dev/null +++ b/evaluation/distributed/run_distributed.sh @@ -0,0 +1,1364 @@ +#!/bin/bash +# Multi-machine distributed benchmark orchestrator for SPTAG. 
+# +# Usage: +# ./run_distributed.sh deploy Deploy binary + data to all nodes +# ./run_distributed.sh setup-bins Download tikv-server / pd-server to every node +# ./run_distributed.sh start-tikv [node_count] Start independent TiKV/PD instances +# ./run_distributed.sh stop-tikv [node_count] Stop TiKV/PD instances +# ./run_distributed.sh run Run benchmark +# ./run_distributed.sh bench [scale...] Run 1-node + N-node for each scale +# ./run_distributed.sh cleanup Remove deployed files from remote nodes +# +# Environment variables: +# NOCACHE=1 Disable all caches (TiKV block cache, OS page cache, VersionCache) +# BUILD_WITH_CACHE=1 (only with NOCACHE=1) Use cached TiKV+VersionCache during the +# build phase, then restart TiKV with nocache config and drop all +# OS caches before the search/insert phase. Useful for large scales +# (e.g. 100M) where building under nocache is impractical. +# SKIP_TIKV_SWAP=1 (only with BUILD_WITH_CACHE=1) Skip the TiKV container restart. +# Drop OS caches and rely on VersionCache=0 INI overrides for "nocache" +# semantics. Avoids docker rm -f corruption that has destroyed recall +# at 100M scale; TiKV block cache stays warm but contains mostly recent +# build writes (random search reads largely miss it anyway). +# SKIP_SAVE_LOAD=1 (only with NOCACHE=1) Bypass the post-build SaveIndex / per-batch +# LoadIndex / Clone / SaveIndex cycles. For 1-node, build+search+insert +# run in a single SPTAGTest process, dropping OS pagecache after build. +# For 2-node, the build phase skips the broken final SaveIndex (relies +# on the index files written during BuildLargeIndex). Required at 100M +# scale where SaveIndex's "wait for all background jobs to finish" loop +# never terminates and risks a gRPC SEGFAULT after several hours. +# VersionCache cannot be reset mid-process so it stays warm from build. +# SKIP_HEAD_BUILD=1 Reuse existing HeadIndex if present (RebuildSSDOnly). Falls back to +# full build if HeadIndex is missing. 
+# +# Prerequisites: +# - Passwordless SSH from driver to all nodes (configure ssh_key in cluster.conf) +# - Docker installed on all nodes (for TiKV) +# - cluster.conf configured (see cluster.conf.example) +# +# The driver (first node in [nodes]) orchestrates everything. +# Compute nodes share a single TiKV raft cluster: all PDs join one raft group, +# all TiKVs point to all PDs, max-replicas=1 (no replication, each region on +# exactly one store). With 2 nodes this gives 2 PDs + 2 TiKV stores in one +# cluster; any compute can read any posting via PD-routed TiKV calls, so the +# distributed routing layer no longer needs to forward reads between computes. + +set -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +LOGDIR="$(cd "$SCRIPT_DIR/../.." && pwd)/benchmark_logs" +mkdir -p "$LOGDIR" + +# ─── Config Parsing ─── + +declare -a NODE_HOSTS NODE_ROUTER_PORTS +declare -a TIKV_HOSTS TIKV_PD_CLIENT_PORTS TIKV_PD_PEER_PORTS TIKV_PORTS +declare SSH_USER SPTAG_DIR DATA_DIR TIKV_VERSION PD_VERSION SSH_KEY +declare TIKV_IMAGE PD_IMAGE HELPER_IMAGE BIN_DIR MIRROR +TOTAL_NODES=0 + +parse_config() { + local CONF="$1" + if [ ! 
-f "$CONF" ]; then + echo "ERROR: Config file not found: $CONF" + exit 1 + fi + + local SECTION="" + + while IFS= read -r line || [ -n "$line" ]; do + # Strip comments and whitespace + line="${line%%#*}" + line="$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')" + [ -z "$line" ] && continue + + # Section header + if [[ "$line" =~ ^\[(.+)\]$ ]]; then + SECTION="${BASH_REMATCH[1]}" + continue + fi + + case "$SECTION" in + cluster) + local key="${line%%=*}" + local val="${line#*=}" + case "$key" in + ssh_user) SSH_USER="$val" ;; + sptag_dir) SPTAG_DIR="$val" ;; + data_dir) DATA_DIR="$val" ;; + tikv_version) TIKV_VERSION="$val" ;; + pd_version) PD_VERSION="$val" ;; + tikv_image) TIKV_IMAGE="$val" ;; + pd_image) PD_IMAGE="$val" ;; + helper_image) HELPER_IMAGE="$val" ;; + bin_dir) BIN_DIR="$val" ;; + mirror) MIRROR="$val" ;; + ssh_key) SSH_KEY="$val" ;; + esac + ;; + nodes) + read -r host rport <<< "$line" + NODE_HOSTS+=("$host") + NODE_ROUTER_PORTS+=("$rport") + ;; + tikv) + read -r host pd_client pd_peer tikv_port <<< "$line" + TIKV_HOSTS+=("$host") + TIKV_PD_CLIENT_PORTS+=("$pd_client") + TIKV_PD_PEER_PORTS+=("$pd_peer") + TIKV_PORTS+=("$tikv_port") + ;; + esac + done < "$CONF" + + # Defaults + SSH_USER="${SSH_USER:-$(whoami)}" + TIKV_VERSION="${TIKV_VERSION:-v8.5.1}" + PD_VERSION="${PD_VERSION:-v8.5.1}" + # Single image used for ALL containers (PD, TiKV, helper). Stock MCR + # ubuntu:22.04 — never modified, never layered, so security scanners see + # only the MCR base image. TiKV / PD binaries are downloaded to the host + # at $BIN_DIR by `setup-bins` and bind-mounted into the container. + HELPER_IMAGE="${HELPER_IMAGE:-mcr.microsoft.com/mirror/docker/library/ubuntu:22.04}" + TIKV_IMAGE="${TIKV_IMAGE:-${HELPER_IMAGE}}" + PD_IMAGE="${PD_IMAGE:-${HELPER_IMAGE}}" + # Host path on every node where tikv-server / pd-server live. Populated + # by `setup-bins`. Mounted read-only into containers as /sptag-bin. 
+ BIN_DIR="${BIN_DIR:-${SPTAG_DIR}/evaluation/distributed/bin}" + MIRROR="${MIRROR:-https://tiup-mirrors.pingcap.com}" + + # Expand ~ in ssh_key path + if [ -n "$SSH_KEY" ]; then + SSH_KEY="${SSH_KEY/#\~/$HOME}" + fi + + TOTAL_NODES=${#NODE_HOSTS[@]} + + if [ "$TOTAL_NODES" -lt 1 ]; then + echo "ERROR: No compute nodes defined in [nodes]" + exit 1 + fi + if [ ${#TIKV_HOSTS[@]} -lt 1 ]; then + echo "ERROR: No TiKV instances defined in [tikv]" + exit 1 + fi + + echo "Cluster config loaded:" + echo " Compute nodes: $TOTAL_NODES (driver: ${NODE_HOSTS[0]})" + echo " TiKV instances: ${#TIKV_HOSTS[@]}" + echo " SSH user: $SSH_USER" + echo " SSH key: ${SSH_KEY:-(none)}" + echo " SPTAG dir: $SPTAG_DIR" + echo " Data dir: $DATA_DIR" +} + +# ─── SSH Helpers ─── + +# Build SSH options string (key + host checking) +_ssh_opts() { + local opts="-o StrictHostKeyChecking=no -o ConnectTimeout=10" + if [ -n "$SSH_KEY" ]; then + opts+=" -i $SSH_KEY" + fi + echo "$opts" +} + +# Run command on remote host (or locally if it's the driver) +remote_exec() { + local host="$1"; shift + if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ] || [ "$host" = "127.0.0.1" ]; then + eval "$@" + else + ssh $(_ssh_opts) "$SSH_USER@$host" "$@" + fi +} + +# rsync files to remote host +remote_sync() { + local host="$1" + local src="$2" + local dst="$3" + if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ]; then + # Local copy — skip if same path + if [ "$(realpath "$src")" != "$(realpath "$dst")" ]; then + rsync -az --progress "$src" "$dst" + fi + else + rsync -az --progress -e "ssh $(_ssh_opts)" "$src" "$SSH_USER@$host:$dst" + fi +} + +# ─── Deploy ─── + +cmd_deploy() { + echo "" + echo "=== Deploying SPTAG to ${#NODE_HOSTS[@]} nodes ===" + echo "" + + # Validate SSH connectivity + for host in "${NODE_HOSTS[@]}"; do + if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi + echo -n " Checking SSH to $host... 
" + if remote_exec "$host" "echo ok" >/dev/null 2>&1; then + echo "OK" + else + echo "FAILED" + echo "ERROR: Cannot SSH to $SSH_USER@$host" + exit 1 + fi + done + + # Deploy binary to all remote nodes + echo "" + echo "Deploying binary..." + local BINARY="$SPTAG_DIR/Release/SPTAGTest" + if [ ! -f "$BINARY" ]; then + echo "ERROR: Binary not found: $BINARY (run cmake build first)" + exit 1 + fi + + for host in "${NODE_HOSTS[@]}"; do + if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi + echo " → $host:$SPTAG_DIR/Release/" + remote_exec "$host" "mkdir -p $SPTAG_DIR/Release" + remote_sync "$host" "$BINARY" "$SPTAG_DIR/Release/SPTAGTest" + # Also deploy any shared libraries + if ls "$SPTAG_DIR/Release/"*.so 2>/dev/null; then + remote_sync "$host" "$SPTAG_DIR/Release/*.so" "$SPTAG_DIR/Release/" + fi + # Deploy bundled runtime libs (boost 1.73 / abseil / tbb / libstdc++) + # used by SPTAGTest. Not committed; produced locally on the driver. + if [ -d "$SPTAG_DIR/Release/runtime_libs" ]; then + remote_exec "$host" "mkdir -p $SPTAG_DIR/Release/runtime_libs" + rsync -az -e "ssh $(_ssh_opts)" \ + "$SPTAG_DIR/Release/runtime_libs/" \ + "$SSH_USER@$host:$SPTAG_DIR/Release/runtime_libs/" + fi + done + + # Deploy data files (perftest_* vectors, queries) + echo "" + echo "Deploying data files..." + for host in "${NODE_HOSTS[@]}"; do + if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi + echo " → $host:$SPTAG_DIR/ (perftest_* files)" + remote_exec "$host" "mkdir -p $SPTAG_DIR" + rsync -az --progress \ + --include='perftest_*' --exclude='*' \ + -e "ssh $(_ssh_opts)" \ + "$SPTAG_DIR/" "$SSH_USER@$host:$SPTAG_DIR/" + done + + echo "" + echo "Deploy complete." +} + +# ─── TiKV/PD Binary Setup ─── + +setup_bins_one_host() { + # Ensure tikv-server / pd-server are present at $BIN_DIR on $1. + # Downloads from $MIRROR if missing or version mismatch. Idempotent. 
+ local host="$1" + local cmd + # shellcheck disable=SC2016 + cmd='set -e + mkdir -p "'"$BIN_DIR"'" + cd "'"$BIN_DIR"'" + need_tikv=1 + if [ -x tikv-server ] && ./tikv-server --version 2>/dev/null | grep -q "Release Version:[[:space:]]*'"${TIKV_VERSION#v}"'"; then + need_tikv=0 + fi + if [ "$need_tikv" = "1" ]; then + echo " Downloading tikv-'"${TIKV_VERSION}"'..." + curl -fsSL "'"${MIRROR}"'/tikv-'"${TIKV_VERSION}"'-linux-amd64.tar.gz" | tar -xz + chmod +x tikv-server + else + echo " tikv-'"${TIKV_VERSION}"' already present" + fi + need_pd=1 + if [ -x pd-server ] && ./pd-server --version 2>/dev/null | grep -q "Release Version:[[:space:]]*'"${PD_VERSION}"'"; then + need_pd=0 + fi + if [ "$need_pd" = "1" ]; then + echo " Downloading pd-'"${PD_VERSION}"'..." + curl -fsSL "'"${MIRROR}"'/pd-'"${PD_VERSION}"'-linux-amd64.tar.gz" | tar -xz + chmod +x pd-server pd-ctl pd-recover 2>/dev/null || true + else + echo " pd-'"${PD_VERSION}"' already present" + fi' + + if [ "$host" = "${NODE_HOSTS[0]}" ] || [ "$host" = "localhost" ] || [ "$host" = "127.0.0.1" ]; then + bash -c "$cmd" + else + remote_exec "$host" "$cmd" + fi +} + +cmd_setup_bins() { + # Download tikv-server + pd-server to ${BIN_DIR} on every distinct host + # used by the cluster (compute nodes ∪ tikv nodes). Idempotent. + echo "" + echo "=== Setting up TiKV/PD binaries ===" + echo " BIN_DIR : $BIN_DIR" + echo " TIKV : $TIKV_VERSION" + echo " PD : $PD_VERSION" + echo " MIRROR : $MIRROR" + + declare -A seen + local -a hosts=() + local h + for h in "${NODE_HOSTS[@]}" "${TIKV_HOSTS[@]}"; do + if [ -z "${seen[$h]:-}" ]; then + seen[$h]=1 + hosts+=("$h") + fi + done + + for h in "${hosts[@]}"; do + echo "" + echo "→ $h" + setup_bins_one_host "$h" + done + + echo "" + echo "Binary setup complete." +} + +# ─── TiKV Management (Independent Mode) ─── + + +tikv_start() { + # Start the first PD+TiKV pairs. + # + # node_count == 1: standalone PD + TiKV (1-node benchmarks). 
+ # node_count >= 2: SHARED raft cluster — all PDs join one raft group, + # all TiKVs point to all PDs. max-replicas=1 so each + # region lives on exactly one store; PD routes reads + # to whichever store has the region. + local node_count="${1:-${#TIKV_HOSTS[@]}}" + echo "" + if [ "$node_count" -le 1 ]; then + echo "=== Starting 1 standalone TiKV instance ===" + else + echo "=== Starting $node_count-node SHARED TiKV raft cluster ===" + fi + + # Ensure binaries are present on every host that will run a container. + # Cheap if already there (version-grep, no download). + local i_host + for (( i_host=0; i_host/dev/null | tr -d '[:space:]') + fi + if [ "$present" != "yes" ]; then + echo " → $h: binaries missing, running setup-bins" + setup_bins_one_host "$h" + fi + done + + # Build the initial-cluster string used by every PD. + # For 1-node it's a single-member raft; for N>=2 every PD lists all members. + local initial_cluster="" + for (( i=0; i= 2 they form a raft group. + echo "Starting PD instances (initial-cluster=${initial_cluster})..." + for (( i=0; i/dev/null; \ + docker run -d --name sptag-pd-$i --net host \ + -v $DATA_DIR/tikv-data/pd-$i:/data \ + -v ${BIN_DIR}:/sptag-bin:ro \ + --entrypoint /sptag-bin/pd-server \ + ${PD_IMAGE} \ + --name=${pd_name} \ + --data-dir=/data \ + --client-urls=http://0.0.0.0:${client_port} \ + --advertise-client-urls=http://${host}:${client_port} \ + --peer-urls=http://0.0.0.0:${peer_port} \ + --advertise-peer-urls=http://${host}:${peer_port} \ + --initial-cluster=${initial_cluster}" + done + + echo "Waiting for PD raft to form..." + sleep 5 + + # Wait until every PD reports the expected member count (raft quorum up). 
+ for (( i=0; i/dev/null \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(len(d.get('members',[])))" 2>/dev/null || echo 0) + if [ "$members" -ge "$node_count" ]; then + echo " PD $i ($host:$pd_port) healthy (members=${members})" + break + fi + if [ "$attempt" -eq 60 ]; then + echo " ERROR: PD $i ($host:$pd_port) only sees ${members}/${node_count} members after 60s" + return 1 + fi + sleep 1 + done + done + + # NOTE: max-replicas is configured AFTER TiKV starts (see below). Setting + # placement rules requires cluster bootstrap, which only happens once a + # TiKV store joins. Before bootstrap, /pd/api/v1/config/rule returns 500 + # ErrNotBootstrapped. We rely on the fact that no data is written until + # SPTAGTest connects (which happens after this function returns), so the + # brief window where bootstrap uses default max-replicas=3 is harmless. + + # Start TiKV instances pointing at the shared PD endpoints. + echo "Starting TiKV instances (pd-endpoints=${pd_endpoints})..." + for (( i=0; i/dev/null; \ + docker run -d --name sptag-tikv-$i --net host \ + --ulimit nofile=1048576:1048576 \ + -v $DATA_DIR/tikv-data/tikv-$i:/data \ + -v $DATA_DIR/tikv-data/conf:/conf \ + -v ${BIN_DIR}:/sptag-bin:ro \ + --entrypoint /sptag-bin/tikv-server \ + ${TIKV_IMAGE} \ + --config=/conf/tikv.toml \ + --addr=0.0.0.0:${tikv_port} \ + --advertise-addr=${host}:${tikv_port} \ + --data-dir=/data \ + --pd-endpoints=${pd_endpoints}" + done + + echo "Waiting for TiKV stores to register..." + sleep 5 + + # All stores show up in PD's store list (any PD works — they share state). 
+ local pd_host="${TIKV_HOSTS[0]}" + local pd_port_first="${TIKV_PD_CLIENT_PORTS[0]}" + for attempt in $(seq 1 60); do + local store_count + store_count=$(curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/stores" 2>/dev/null \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('count',0))" 2>/dev/null || echo 0) + if [ "$store_count" -ge "$node_count" ]; then + echo " All ${store_count} TiKV stores registered" + break + fi + if [ "$attempt" -eq 60 ]; then + echo " WARNING: only ${store_count}/${node_count} TiKV stores registered after 60s" + fi + sleep 1 + done + + # Set max-replicas=1 on the shared cluster, NOW that cluster is bootstrapped. + # + # PD v6+ defaults to enable-placement-rules=true. The authoritative source + # for replica count is then the default placement rule, NOT the legacy + # max-replicas config. /config POST auto-syncs to the rule but is racy; + # we explicitly POST the rule too. Both endpoints require bootstrap. + # Bug seen v45: skipping this caused 30%+ of a 1-node run to execute with + # max-replicas=3 → PD endlessly tried to schedule replicas onto 1 store + # → constant region state changes → gRPC Deadline / region_error storm. + echo "Setting max-replicas=1 (default placement rule)..." 
+ local target_replicas=1 + local mr_ok=0 + for attempt in $(seq 1 30); do + curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config" \ + -X POST -d "{\"max-replicas\": ${target_replicas}}" >/dev/null 2>&1 || true + curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config/rule" \ + -X POST -d "{\"group_id\":\"pd\",\"id\":\"default\",\"start_key\":\"\",\"end_key\":\"\",\"role\":\"voter\",\"count\":${target_replicas}}" \ + >/dev/null 2>&1 || true + sleep 1 + local got_cfg + got_cfg=$(curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config/replicate" 2>/dev/null \ + | python3 -c 'import sys,json;print(json.load(sys.stdin).get("max-replicas"))' 2>/dev/null) + local got_rule + got_rule=$(curl -sf "http://${pd_host}:${pd_port_first}/pd/api/v1/config/rule/pd/default" 2>/dev/null \ + | python3 -c 'import sys,json;print(json.load(sys.stdin).get("count"))' 2>/dev/null) + if [ "$got_cfg" = "$target_replicas" ] && [ "$got_rule" = "$target_replicas" ]; then + echo " max-replicas=${target_replicas} set (attempt $attempt, config & rule verified)" + mr_ok=1 + break + fi + sleep 1 + done + if [ "$mr_ok" != "1" ]; then + echo " ERROR: Failed to set max-replicas=${target_replicas} after 30 attempts. Aborting." >&2 + return 1 + fi + + echo "TiKV cluster started ($node_count node(s))." +} + +tikv_stop() { + # Stop the first TiKV+PD instances. + local node_count="${1:-${#TIKV_HOSTS[@]}}" + echo "" + echo "=== Stopping $node_count TiKV instances ===" + + for (( i=0; i/dev/null || true" + done + + echo "TiKV instances stopped." +} + +tikv_switch_to_nocache() { + # Restart TiKV containers (NOT PD) with the nocache config, so that the search + # and insert phases use cold block cache. Data on disk is preserved because we + # reuse the same data-dir; PD keeps the cluster metadata. + local node_count="${1:-${#TIKV_HOSTS[@]}}" + if [[ ! 
-f "$SCRIPT_DIR/configs/tikv_nocache.toml" ]]; then + echo " ERROR: configs/tikv_nocache.toml not found; cannot switch to nocache" + return 1 + fi + echo "" + echo "=== Restarting $node_count TiKV instances with tikv_nocache.toml ===" + + # Reconstruct the shared pd-endpoints list (same as tikv_start). + local pd_endpoints="" + for (( i=0; i/dev/null; \ + docker rm -f sptag-tikv-$i 2>/dev/null; \ + docker run -d --name sptag-tikv-$i --net host \ + --ulimit nofile=1048576:1048576 \ + -v $DATA_DIR/tikv-data/tikv-$i:/data \ + -v $DATA_DIR/tikv-data/conf:/conf \ + -v ${BIN_DIR}:/sptag-bin:ro \ + --entrypoint /sptag-bin/tikv-server \ + ${TIKV_IMAGE} \ + --config=/conf/tikv.toml \ + --addr=0.0.0.0:${tikv_port} \ + --advertise-addr=${host}:${tikv_port} \ + --data-dir=/data \ + --pd-endpoints=${pd_endpoints}" + done + + echo "Waiting for TiKV stores to re-register..." + sleep 5 + local pd_host_first="${TIKV_HOSTS[0]}" + local pd_port_first="${TIKV_PD_CLIENT_PORTS[0]}" + for attempt in $(seq 1 60); do + local store_count + store_count=$(curl -sf "http://${pd_host_first}:${pd_port_first}/pd/api/v1/stores" 2>/dev/null \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('count',0))" 2>/dev/null || echo 0) + if [ "$store_count" -ge "$node_count" ]; then + echo " All ${store_count} TiKV stores re-registered" + break + fi + if [ "$attempt" -eq 60 ]; then + echo " WARNING: only ${store_count}/${node_count} stores re-registered after 60s" + fi + sleep 1 + done + echo "TiKV switched to nocache mode." +} + +tikv_clean() { + # Clean TiKV data for the first instances. 
+ local node_count="${1:-${#TIKV_HOSTS[@]}}" + echo "" + echo "=== Cleaning TiKV data ($node_count instances) ===" + + for (( i=0; i/dev/null || true" + done +} + +# Legacy wrappers for the main case block +cmd_start_tikv() { tikv_start "${1:-${#TIKV_HOSTS[@]}}"; } +cmd_stop_tikv() { tikv_stop "${1:-${#TIKV_HOSTS[@]}}"; } + +# ─── Cache Management ─── + +drop_all_caches() { + # Drop OS page cache + dentries/inodes on the first nodes. + # This may take 30-60s per node if there are many dirty pages. + local node_count="${1:-1}" + if [[ "${SKIP_DROP_CACHES:-0}" == "1" ]]; then + echo "[SKIP_DROP_CACHES=1] skipping OS page-cache drop on $node_count node(s)" + return 0 + fi + echo "Dropping OS page cache on $node_count node(s) (timeout 10s per node)..." + for (( i=0; i /proc/sys/vm/drop_caches'" && echo "done" || echo "timeout/failed (non-fatal)" + done + echo "Cache drop complete." +} + +# ─── INI Generation ─── + +generate_ini() { + # Generate a benchmark INI from a template, filling in [Distributed] fields. + # Usage: generate_ini [overrides...] 
+ local SCALE="$1" + local NODE_COUNT="$2" + shift 2 + + local IDX_PATH="$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index" + local KEY_PREFIX="bench${SCALE}_${NODE_COUNT}node" + + # Build comma-separated address lists from the first node_count entries + local dispatcher_addr="${NODE_HOSTS[0]}:30001" + local worker_addrs="" store_addrs="" pd_addrs="" + for (( i=0; i&2 + return 1 + fi + + local OUT="$SCRIPT_DIR/configs/benchmark_${SCALE}_${NODE_COUNT}node.ini" + cp "$BASE_INI" "$OUT" + + # Fill in placeholder fields + sed -i "s|^IndexPath=.*|IndexPath=${IDX_PATH}|" "$OUT" + sed -i "s|^TiKVKeyPrefix=.*|TiKVKeyPrefix=${KEY_PREFIX}|" "$OUT" + sed -i "s|^DispatcherAddr=.*|DispatcherAddr=${dispatcher_addr}|" "$OUT" + sed -i "s|^WorkerAddrs=.*|WorkerAddrs=${worker_addrs}|" "$OUT" + sed -i "s|^StoreAddrs=.*|StoreAddrs=${store_addrs}|" "$OUT" + sed -i "s|^PDAddrs=.*|PDAddrs=${pd_addrs}|" "$OUT" + + # Apply extra overrides (key=value pairs) + for override in "$@"; do + local key="${override%%=*}" + local val="${override#*=}" + if grep -q "^${key}=" "$OUT"; then + sed -i "s|^${key}=.*|${key}=${val}|" "$OUT" + else + # Append to [Benchmark] section + sed -i "/^\[Benchmark\]/a ${key}=${val}" "$OUT" + fi + done + + echo "$OUT" +} + +# ─── Worker Management ─── + +WORKER_SSH_PIDS=() + +start_remote_worker() { + # Start a worker on a remote node. Returns immediately; worker runs in background. + local NODE_IDX="$1" + local INI="$2" + local SCALE="$3" + local NODE_COUNT="$4" + local host="${NODE_HOSTS[$NODE_IDX]}" + local LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${NODE_IDX}.log" + + # Copy INI + binary to remote + remote_sync "$host" "$INI" "$SPTAG_DIR/worker_n${NODE_IDX}.ini" + + # Start worker via SSH (foreground on remote, background locally). + # Use `ssh -n` to redirect stdin from /dev/null so SSH doesn't try to + # acquire a TTY when the parent script runs under `nohup`. 
Without -n, + # the SSH client sometimes silently re-points fd1 → /dev/null and fd2 + # → a deleted /tmp file, dropping the worker log. + ssh -n $(_ssh_opts) "$SSH_USER@$host" \ + "cd $SPTAG_DIR && LD_LIBRARY_PATH=$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH:-} \ + WORKER_INDEX=${NODE_IDX} BENCHMARK_CONFIG=worker_n${NODE_IDX}.ini \ + SPFRESH_SHARD_STRIDE=${SPFRESH_SHARD_STRIDE:-0} \ + ./Release/SPTAGTest --run_test=SPFreshTest/BenchmarkFromConfig 2>&1" \ + "$LOG" 2>&1 & + local ssh_pid=$! + WORKER_SSH_PIDS+=($ssh_pid) + echo " Worker n${NODE_IDX} on $host (SSH PID: $ssh_pid, log: $LOG)" +} + +wait_workers_ready() { + local SCALE="$1" + local NODE_COUNT="$2" + local TIMEOUT=120 + + echo "Waiting for ${#WORKER_SSH_PIDS[@]} workers to be ready..." + for attempt in $(seq 1 $TIMEOUT); do + local all_ready=true + for i in $(seq 1 $((NODE_COUNT - 1))); do + local LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${i}.log" + if ! grep -q "Worker.*[Rr]eady\|Waiting for dispatch" "$LOG" 2>/dev/null; then + all_ready=false + fi + done + if $all_ready; then + echo " All workers ready (${attempt}s)" + return 0 + fi + # Check if any worker SSH process died + for idx in "${!WORKER_SSH_PIDS[@]}"; do + if ! kill -0 "${WORKER_SSH_PIDS[$idx]}" 2>/dev/null; then + echo " ERROR: Worker SSH PID ${WORKER_SSH_PIDS[$idx]} exited prematurely" + return 1 + fi + done + sleep 1 + done + echo " WARNING: Not all workers ready after ${TIMEOUT}s" + return 1 +} + +stop_remote_workers() { + # Wait for workers to self-exit (driver sends TCP Stop), then force-kill. + local TIMEOUT=${1:-30} + if [ ${#WORKER_SSH_PIDS[@]} -eq 0 ]; then return; fi + + echo "Waiting for ${#WORKER_SSH_PIDS[@]} remote workers to exit (${TIMEOUT}s timeout)..." 
+ for pid in "${WORKER_SSH_PIDS[@]}"; do + local elapsed=0 + while kill -0 "$pid" 2>/dev/null && [ $elapsed -lt $TIMEOUT ]; do + sleep 1 + elapsed=$((elapsed + 1)) + done + if kill -0 "$pid" 2>/dev/null; then + echo " WARNING: SSH PID $pid still alive, force killing" + kill -9 "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + else + echo " Worker (SSH PID $pid) exited gracefully" + fi + done + WORKER_SSH_PIDS=() +} + +# Watchdog: detect driver death (segfault, OOM, SIGKILL by oom_killer, ...) +# and tear down remote workers so they don't linger forever. +# The C++ heartbeat watchdog inside the worker is the primary defense (bounded +# at HeartbeatTimeoutSec, default 180s). This shell watchdog is a faster +# secondary path: as soon as the driver PID is gone we (a) kill the local SSH +# wrappers and (b) `pkill` the remote SPTAGTest processes. +DRIVER_WATCHDOG_PID="" + +start_driver_watchdog() { + local DRIVER_PID="$1" + local NODE_COUNT="$2" + if [ "$NODE_COUNT" -lt 2 ]; then return; fi + if [ ${#WORKER_SSH_PIDS[@]} -eq 0 ]; then return; fi + + # Snapshot what we need before backgrounding (subshell forks current env). + local _ssh_pids="${WORKER_SSH_PIDS[*]}" + local _hosts=() + for (( i=1; i/dev/null; do + sleep 5 + done + echo "[watchdog] Driver PID $DRIVER_PID is gone; tearing down remote workers" >&2 + for pid in $_ssh_pids; do + kill -TERM "$pid" 2>/dev/null || true + done + for host in $_hosts_str; do + ssh -n $_ssh_opts_str "$_ssh_user@$host" \ + "pkill -TERM -f 'SPTAGTest.*BenchmarkFromConfig' 2>/dev/null; \ + sleep 5; \ + pkill -KILL -f 'SPTAGTest.*BenchmarkFromConfig' 2>/dev/null; true" \ + /dev/null 2>&1 || true + done + for pid in $_ssh_pids; do + kill -0 "$pid" 2>/dev/null && kill -KILL "$pid" 2>/dev/null || true + done + ) & + DRIVER_WATCHDOG_PID=$! 
+ echo " Driver watchdog started (PID: $DRIVER_WATCHDOG_PID, monitoring driver $DRIVER_PID)" +} + +stop_driver_watchdog() { + if [ -n "$DRIVER_WATCHDOG_PID" ] && kill -0 "$DRIVER_WATCHDOG_PID" 2>/dev/null; then + kill -TERM "$DRIVER_WATCHDOG_PID" 2>/dev/null || true + wait "$DRIVER_WATCHDOG_PID" 2>/dev/null || true + fi + DRIVER_WATCHDOG_PID="" +} + +# ─── Benchmark Run ─── + +distribute_head_index() { + # Copy the head index from driver to all worker nodes. + local SCALE="$1" + local NODE_COUNT="$2" + local SRC="$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index" + + echo "Distributing head index to $((NODE_COUNT - 1)) workers..." + for (( i=1; i +resolve_build_mode() { + local SCALE="$1" NODE_COUNT="$2" + local IDX_DIR="$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index" + local HEAD_DIR="$IDX_DIR/HeadIndex" + + BUILD_MODE_OVERRIDES=() + if [[ "${SKIP_HEAD_BUILD:-0}" == "1" ]] && [ -d "$HEAD_DIR" ] && [ -n "$(ls -A "$HEAD_DIR" 2>/dev/null)" ]; then + echo "HeadIndex found at $HEAD_DIR — using RebuildSSDOnly (skip SelectHead+BuildHead)" + BUILD_MODE_OVERRIDES=("RebuildSSDOnly=true") + else + if [[ "${SKIP_HEAD_BUILD:-0}" == "1" ]]; then + echo "SKIP_HEAD_BUILD=1 but HeadIndex not found at $HEAD_DIR — falling back to full build" + fi + BUILD_MODE_OVERRIDES=("Rebuild=true") + fi +} + +cmd_run() { + local SCALE="$1" + local NODE_COUNT="$2" + if [ -z "$SCALE" ] || [ -z "$NODE_COUNT" ]; then + echo "Usage: $0 run " + exit 1 + fi + + local BINARY="$SPTAG_DIR/Release/SPTAGTest" + + echo "" + echo "═══════════════════════════════════════════════════" + echo " ${SCALE}: ${NODE_COUNT}-node benchmark${NOCACHE:+ [NOCACHE]}" + echo " Start: $(date)" + echo "═══════════════════════════════════════════════════" + + if [ "$NODE_COUNT" -eq 1 ]; then + # ─── Single-node flow ─── + echo "" + echo "--- Phase 0: Prepare TiKV (1 instance) ---" + tikv_stop 1 + tikv_clean 1 + if ! tikv_start 1; then + echo "ERROR: tikv_start failed; aborting benchmark." 
>&2 + return 1 + fi + + # Resolve build mode before cleaning (SKIP_HEAD_BUILD needs existing dir) + resolve_build_mode "$SCALE" "$NODE_COUNT" + + if [[ " ${BUILD_MODE_OVERRIDES[*]} " != *"RebuildSSDOnly=true"* ]]; then + # Full build: clean old index dir + rm -rf "$DATA_DIR/proidx_${SCALE}_1node" + fi + mkdir -p "$DATA_DIR/proidx_${SCALE}_1node" + + if [[ "${NOCACHE:-0}" == "1" ]]; then + # NOCACHE: Split into build + cache-drop + search + local BUILD_VERSIONCACHE_OVERRIDES=("VersionCacheTTLMs=0" "VersionCacheMaxChunks=0") + if [[ "${BUILD_WITH_CACHE:-0}" == "1" ]]; then + # Build phase keeps caches enabled; the run phase below switches to nocache + BUILD_VERSIONCACHE_OVERRIDES=() + echo "" + echo "--- Phase 1: Build only (BUILD_WITH_CACHE=1, caches enabled) ---" + else + echo "" + echo "--- Phase 1: Build only (NOCACHE) ---" + fi + + if [[ "${SKIP_SAVE_LOAD:-0}" == "1" ]]; then + # Single-process flow: build + search + insert in one SPTAGTest invocation. + # SkipSaveLoadCycles=true bypasses the broken post-build SaveIndex and per-batch + # Load/Clone/Save. SPTAGTest itself drops OS pagecache after build, before query. 
+ echo "[SKIP_SAVE_LOAD=1] running build + search + insert in a single SPTAGTest process" + local SINGLE_INI + SINGLE_INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}" \ + "SkipSaveLoadCycles=true" "${BUILD_VERSIONCACHE_OVERRIDES[@]}") || exit 1 + + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$SINGLE_INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \ + "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ + | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log" + + echo "Done: $(date)" + tikv_stop 1 + return 0 + fi + + local BUILD_INI + BUILD_INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}" "BuildOnly=true" "${BUILD_VERSIONCACHE_OVERRIDES[@]}") || exit 1 + + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$BUILD_INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_1node_build.json" \ + "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ + | tee "$LOGDIR/benchmark_${SCALE}_1node_build.log" + + echo "Build done: $(date)" + + if [[ "${BUILD_WITH_CACHE:-0}" == "1" && "${SKIP_TIKV_SWAP:-0}" != "1" ]]; then + echo "" + echo "--- Phase 1.4: Switch TiKV to nocache config ---" + tikv_switch_to_nocache 1 + elif [[ "${SKIP_TIKV_SWAP:-0}" == "1" ]]; then + echo "[SKIP_TIKV_SWAP=1] keeping TiKV containers running; relying on drop_caches + VersionCache=0" + fi + + echo "" + echo "--- Phase 1.5: Drop all caches (NOCACHE) ---" + drop_all_caches 1 + + echo "" + echo "--- Phase 2: Search+Insert (cold cache) ---" + local RUN_INI + RUN_INI=$(generate_ini "$SCALE" 1 "Rebuild=false" "VersionCacheTTLMs=0" "VersionCacheMaxChunks=0") || exit 1 + + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \ + "$BINARY" 
--run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ + | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log" + else + echo "" + echo "--- Phase 1: Single-node run ---" + local INI + INI=$(generate_ini "$SCALE" 1 "${BUILD_MODE_OVERRIDES[@]}") || exit 1 + + echo "Starting driver on ${NODE_HOSTS[0]}..." + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_1node.json" \ + "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig 2>&1 ) \ + | tee "$LOGDIR/benchmark_${SCALE}_1node_driver.log" + fi + + echo "Done: $(date)" + tikv_stop 1 + else + # ─── Multi-node flow ─── + echo "" + echo "--- Phase 0: Prepare TiKV ($NODE_COUNT instances) ---" + tikv_stop "$NODE_COUNT" + tikv_clean "$NODE_COUNT" + if ! tikv_start "$NODE_COUNT"; then + echo "ERROR: tikv_start failed; aborting benchmark." >&2 + return 1 + fi + + # --- Phase 1: Build index on driver --- + echo "" + echo "--- Phase 1: Build index on driver ---" + local BUILD_INI + local NOCACHE_OVERRIDES=() + local BUILD_NOCACHE_OVERRIDES=() + if [[ "${NOCACHE:-0}" == "1" ]]; then + NOCACHE_OVERRIDES=("VersionCacheTTLMs=0" "VersionCacheMaxChunks=0" "WorkerTimeout=14400") + if [[ "${BUILD_WITH_CACHE:-0}" == "1" ]]; then + # Build with cache, only run phase is nocache + BUILD_NOCACHE_OVERRIDES=() + echo "[BUILD_WITH_CACHE=1] build phase keeps caches; will switch before run phase" + else + BUILD_NOCACHE_OVERRIDES=("${NOCACHE_OVERRIDES[@]}") + fi + fi + + # Resolve build mode before cleaning (SKIP_HEAD_BUILD needs existing dir) + resolve_build_mode "$SCALE" "$NODE_COUNT" + + if [[ " ${BUILD_MODE_OVERRIDES[*]} " != *"RebuildSSDOnly=true"* ]]; then + # Full build: clean old index dirs on all nodes + for (( i=0; i "$BUILD_LOG" 2>&1 & + local BUILD_PID=$! 
+ echo " Driver build PID: $BUILD_PID" + + # Shell-side watchdog: if the driver dies unexpectedly (segfault, OOM, + # SIGKILL) we want a fast failure path rather than hanging forever. + WORKER_SSH_PIDS=() + start_driver_watchdog "$BUILD_PID" "$NODE_COUNT" + + # Wait for the driver build to finish + echo " Waiting for driver build to complete..." + wait "$BUILD_PID" + local BUILD_RC=$? + echo "Driver build done (exit=$BUILD_RC): $(date)" + stop_driver_watchdog + + if [[ $BUILD_RC -ne 0 ]] || grep -q "===== SEGFAULT" "$BUILD_LOG"; then + echo "" + echo "ERROR: Build phase failed (exit=$BUILD_RC, segfault=$(grep -c '===== SEGFAULT' "$BUILD_LOG"))" + echo "Refusing to proceed to run phase with broken build state." + echo "Tail of build log:" + tail -30 "$BUILD_LOG" + tikv_stop "$NODE_COUNT" + exit 1 + fi + + echo "Build done: $(date)" + + # --- Phase 2: Distribute data --- + echo "" + echo "--- Phase 2: Distribute head index + data ---" + rm -f "$DATA_DIR/proidx_${SCALE}_${NODE_COUNT}node/spann_index/checkpoint.txt" + + distribute_head_index "$SCALE" "$NODE_COUNT" + distribute_perftest_files "$NODE_COUNT" + + # Sync SPTAGTest binary + bundled runtime libs to all workers so + # they pick up the latest compiled changes. (cmd_deploy is a separate + # subcommand; without this step a stale binary on the worker silently + # diverges from the driver.) + echo "" + echo "Syncing SPTAGTest binary + runtime_libs to workers..." + for host in "${NODE_HOSTS[@]}"; do + if [ "$host" = "${NODE_HOSTS[0]}" ]; then continue; fi + remote_exec "$host" "mkdir -p $SPTAG_DIR/Release" + remote_sync "$host" "$SPTAG_DIR/Release/SPTAGTest" "$SPTAG_DIR/Release/SPTAGTest" + if [ -d "$SPTAG_DIR/Release/runtime_libs" ]; then + remote_exec "$host" "mkdir -p $SPTAG_DIR/Release/runtime_libs" + rsync -az -e "ssh $(_ssh_opts)" \ + "$SPTAG_DIR/Release/runtime_libs/" \ + "$SSH_USER@$host:$SPTAG_DIR/Release/runtime_libs/" + fi + done + + # Binary already pushed; nothing else to do here. 
+ + # --- Phase 3: Start driver first (contains dispatcher), then workers --- + echo "" + + # Drop caches if NOCACHE mode + if [[ "${NOCACHE:-0}" == "1" ]]; then + if [[ "${BUILD_WITH_CACHE:-0}" == "1" && "${SKIP_TIKV_SWAP:-0}" != "1" ]]; then + echo "--- Phase 2.4: Switch TiKV to nocache config ---" + tikv_switch_to_nocache "$NODE_COUNT" + elif [[ "${SKIP_TIKV_SWAP:-0}" == "1" ]]; then + echo "[SKIP_TIKV_SWAP=1] keeping TiKV containers running; relying on drop_caches + VersionCache=0" + fi + echo "--- Phase 2.5: Drop all caches (NOCACHE) ---" + drop_all_caches "$NODE_COUNT" + fi + + echo "--- Phase 3: Distributed run ---" + + local RUN_INI + RUN_INI=$(generate_ini "$SCALE" "$NODE_COUNT" "Rebuild=false" "${NOCACHE_OVERRIDES[@]}") || exit 1 + + # Start driver in background first — it contains the dispatcher that + # workers need to connect to for ring registration. + local DRIVER_LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_driver.log" + echo "Starting driver (dispatcher+worker0) on ${NODE_HOSTS[0]}..." + ( cd "$SPTAG_DIR" && LD_LIBRARY_PATH="$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" BENCHMARK_CONFIG="$RUN_INI" \ + BENCHMARK_OUTPUT="output_${SCALE}_${NODE_COUNT}node.json" \ + "$BINARY" --run_test=SPFreshTest/BenchmarkFromConfig ) \ + > "$DRIVER_LOG" 2>&1 & + local DRIVER_PID=$! + echo " Driver PID: $DRIVER_PID" + + # Wait for dispatcher to start listening before launching workers + local DISP_PORT=30001 + echo " Waiting for dispatcher to listen on port $DISP_PORT..." + for attempt in $(seq 1 60); do + if ss -tlnp 2>/dev/null | grep -q ":${DISP_PORT} " || \ + netstat -tlnp 2>/dev/null | grep -q ":${DISP_PORT} "; then + echo " Dispatcher listening (${attempt}s)" + break + fi + if ! 
kill -0 "$DRIVER_PID" 2>/dev/null; then + echo " ERROR: Driver exited prematurely" + cat "$DRIVER_LOG" + return 1 + fi + if [ "$attempt" -eq 60 ]; then + echo " WARNING: Dispatcher not detected on port $DISP_PORT after 60s, proceeding anyway" + fi + sleep 1 + done + + # Now start remote workers — they can connect to the dispatcher + WORKER_SSH_PIDS=() + for (( i=1; i/dev/null || true + done + + tikv_stop "$NODE_COUNT" + fi + + echo "" + echo "═══════════════════════════════════════════════════" + echo " ${SCALE} ${NODE_COUNT}-node done: $(date)" + echo " Results: output_${SCALE}_${NODE_COUNT}node.json" + echo " Logs: $LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_*.log" + echo "═══════════════════════════════════════════════════" +} + +cmd_bench() { + # Run 1-node baseline + N-node distributed for each specified scale. + # Usage: cmd_bench [scale...] + # Special scale "all" expands to all scales with templates in configs/. + local scales=() + for arg in "$@"; do + if [ "$arg" = "all" ]; then + for tmpl in "$SCRIPT_DIR"/configs/benchmark_*_template.ini; do + local name + name="$(basename "$tmpl")" + name="${name#benchmark_}" + name="${name%_template.ini}" + scales+=("$name") + done + else + scales+=("$arg") + fi + done + + if [ ${#scales[@]} -eq 0 ]; then + echo "Usage: $0 bench [scale...] 
| all" + echo "Available scales:" + for tmpl in "$SCRIPT_DIR"/configs/benchmark_*_template.ini; do + local name + name="$(basename "$tmpl")" + name="${name#benchmark_}" + name="${name%_template.ini}" + echo " $name" + done + exit 1 + fi + + echo "" + echo "═══════════════════════════════════════════════════" + echo " Benchmark suite: ${scales[*]}" + echo " Cluster: $TOTAL_NODES nodes" + echo " Start: $(date)" + echo "═══════════════════════════════════════════════════" + + for scale in "${scales[@]}"; do + echo "" + echo "▶▶▶ Scale: $scale — 1-node baseline" + cmd_run "$scale" 1 + + if [ "$TOTAL_NODES" -gt 1 ]; then + echo "" + echo "▶▶▶ Scale: $scale — ${TOTAL_NODES}-node distributed" + cmd_run "$scale" "$TOTAL_NODES" + else + echo " (Skipping multi-node: cluster has only 1 node)" + fi + done + + echo "" + echo "═══════════════════════════════════════════════════" + echo " Benchmark suite complete: $(date)" + echo "═══════════════════════════════════════════════════" +} + +# ─── Cleanup ─── + +cmd_cleanup() { + echo "" + echo "=== Cleaning up remote nodes ===" + + for i in $(seq 1 $((${#NODE_HOSTS[@]} - 1))); do + local host="${NODE_HOSTS[$i]}" + echo " Cleaning $host..." + remote_exec "$host" "rm -rf $SPTAG_DIR/Release/SPTAGTest $SPTAG_DIR/perftest_* $SPTAG_DIR/worker_*.ini" + # Clean index directories + remote_exec "$host" "rm -rf $DATA_DIR/proidx_*" + done + echo "Cleanup complete." +} + +# ─── Main ─── + +CMD="$1" +CONF="$2" + +if [ -z "$CMD" ] || [ -z "$CONF" ]; then + echo "Usage: $0 [args...]" + echo "" + echo "Commands:" + echo " deploy Deploy binary and data to all nodes" + echo " start-tikv Start independent TiKV/PD instances" + echo " stop-tikv Stop TiKV/PD instances" + echo " run Run benchmark: $0 run cluster.conf " + echo " bench Run full benchmark suite: $0 bench cluster.conf [scale...] 
| all" + echo " cleanup Remove deployed files from remote nodes" + exit 1 +fi + +parse_config "$CONF" + +# Trap for cleanup on interrupt +trap 'echo ""; echo "Interrupted!"; stop_driver_watchdog; stop_remote_workers 5; cmd_stop_tikv; exit 1' INT TERM + +case "$CMD" in + deploy) + cmd_deploy + ;; + setup-bins) + cmd_setup_bins + ;; + start-tikv) + cmd_start_tikv "${3:-}" + ;; + stop-tikv) + cmd_stop_tikv "${3:-}" + ;; + run) + cmd_run "$3" "$4" + ;; + bench) + shift 2 # skip cmd and conf + cmd_bench "$@" + ;; + cleanup) + cmd_cleanup + ;; + *) + echo "Unknown command: $CMD" + echo "Valid commands: deploy, setup-bins, start-tikv, stop-tikv, run, bench, cleanup" + exit 1 + ;; +esac From 418674711afefef9a7548136618940061343f0de Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 07:21:04 +0000 Subject: [PATCH 02/12] Fix unneede diff --- .gitignore | 3 +- Test/src/main.cpp | 5 +- benchmark.ini | 19 ----- .../configs/benchmark_100m_1node.ini | 71 ------------------- .../configs/benchmark_100m_2node.ini | 71 ------------------- .../configs/benchmark_10m_1node.ini | 62 ---------------- .../configs/benchmark_10m_2node.ini | 62 ---------------- .../benchmark_insert_dominant_1node.ini | 58 --------------- .../benchmark_insert_dominant_2node.ini | 58 --------------- .../benchmark_insert_dominant_3node.ini | 59 --------------- 10 files changed, 5 insertions(+), 463 deletions(-) delete mode 100644 benchmark.ini delete mode 100644 evaluation/distributed/configs/benchmark_100m_1node.ini delete mode 100644 evaluation/distributed/configs/benchmark_100m_2node.ini delete mode 100644 evaluation/distributed/configs/benchmark_10m_1node.ini delete mode 100644 evaluation/distributed/configs/benchmark_10m_2node.ini delete mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_1node.ini delete mode 100644 evaluation/distributed/configs/benchmark_insert_dominant_2node.ini delete mode 100644 
evaluation/distributed/configs/benchmark_insert_dominant_3node.ini diff --git a/.gitignore b/.gitignore index e3dc9796a..190ca29d3 100644 --- a/.gitignore +++ b/.gitignore @@ -464,5 +464,4 @@ FodyWeavers.xsd *.sln.iml # SPTAG benchmark generated artifacts -/perftest_* -/evaluation/2026-04-23/output_distributed_hostname_*.json +*perftest_* diff --git a/Test/src/main.cpp b/Test/src/main.cpp index ab8d1342c..49ca39950 100644 --- a/Test/src/main.cpp +++ b/Test/src/main.cpp @@ -7,7 +7,9 @@ #include #include +#ifdef TIKV #include +#endif using namespace boost::unit_test; @@ -36,8 +38,9 @@ struct GlobalFixture // adds GraphCycles bookkeeping under a global spinlock on every Lock(); // observed to consume ~12% CPU under high worker-thread parallelism in // gRPC client paths (perf-recorded 2026-05-06). +#ifdef TIKV absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); - +#endif SPTAGVisitor visitor; traverse_test_tree(framework::master_test_suite(), visitor, false); } diff --git a/benchmark.ini b/benchmark.ini deleted file mode 100644 index e2b400767..000000000 --- a/benchmark.ini +++ /dev/null @@ -1,19 +0,0 @@ -[Benchmark] -VectorPath=sift1b/base.100M.u8bin -QueryPath=sift1b/query.public.10K.u8bin -TruthPath=none -IndexPath=proidx/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=10000 -InsertVectorCount=10000 -DeleteVectorCount=0 -BatchNum=10 -TopK=5 -NumThreads=8 -NumQueries=100 -DistMethod=L2 -Rebuild=true -Resume=-1 -QuantizerFilePath=quantizer.bin -QuantizedDim=64 diff --git a/evaluation/distributed/configs/benchmark_100m_1node.ini b/evaluation/distributed/configs/benchmark_100m_1node.ini deleted file mode 100644 index 42ec07f49..000000000 --- a/evaluation/distributed/configs/benchmark_100m_1node.ini +++ /dev/null @@ -1,71 +0,0 @@ -; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). -; 100× larger base index than insert_dominant. 
Tests how the system behaves when -; the head index is large (~tens of millions of heads on layer 0) and the insert -; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). -; -; Notes for 100M-scale operation: -; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; -; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the -; HeadIndex on disk is intact. -; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. -; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; -; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. -; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need -; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). -[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_100m_1node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=99000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=true -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=bench100m_1node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=10000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=1000000 -AsyncRpcMaxInflight=512 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 
-WorkerAddrs=10.11.0.7:30011 -StoreAddrs=10.11.0.7:20171 -PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_100m_2node.ini b/evaluation/distributed/configs/benchmark_100m_2node.ini deleted file mode 100644 index 01b9c3e81..000000000 --- a/evaluation/distributed/configs/benchmark_100m_2node.ini +++ /dev/null @@ -1,71 +0,0 @@ -; 100m: 99M base + 1M insert (insert is ~1% of base, "freshness / steady-state" workload). -; 100× larger base index than insert_dominant. Tests how the system behaves when -; the head index is large (~tens of millions of heads on layer 0) and the insert -; rate is moderate. Layers=2, L2 distance, SIFT1B dataset. -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). -; -; Notes for 100M-scale operation: -; - First run MUST build the index (Rebuild=true). Build of 99M base takes hours; -; reuse with Rebuild=false on subsequent runs and SKIP_HEAD_BUILD=1 if the -; HeadIndex on disk is intact. -; - Truth (top-5 over 99M) is recomputed at start each run; expect ~minutes. -; - SaveIndex at 100M has been observed to hang in BG-job-drain on some hosts; -; use SKIP_SAVE_LOAD=1 when iterating to bypass the per-batch save/load cycle. -; - TiKV data will grow to ~50-100GB per store at this scale; both nodes need -; plenty of NVMe headroom (verified: driver has 6.2T, worker has 691G). 
-[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_100m_2node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=99000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=false -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=bench100m_2node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=10000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=1000000 -AsyncRpcMaxInflight=512 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 -StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 -PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_10m_1node.ini b/evaluation/distributed/configs/benchmark_10m_1node.ini deleted file mode 100644 index 56dbd9088..000000000 --- a/evaluation/distributed/configs/benchmark_10m_1node.ini +++ /dev/null @@ -1,62 +0,0 @@ -; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload). -; 10× larger base index than insert_dominant, 10× smaller than 100m. -; Useful for validating scaling between 1M and 100M without paying the -; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset -; (truncated to 10M of the 1B available). -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). 
-[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_10m_1node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=9000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=true -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=bench10m_1node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=1000000 -AsyncRpcMaxInflight=512 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011 -StoreAddrs=10.11.0.7:20171 -PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_10m_2node.ini b/evaluation/distributed/configs/benchmark_10m_2node.ini deleted file mode 100644 index 4ed317ac3..000000000 --- a/evaluation/distributed/configs/benchmark_10m_2node.ini +++ /dev/null @@ -1,62 +0,0 @@ -; 10m: 9M base + 1M insert (insert is ~10% of base, "growing-index" workload). -; 10× larger base index than insert_dominant, 10× smaller than 100m. -; Useful for validating scaling between 1M and 100M without paying the -; multi-hour build cost of 100m. Layers=2, L2 distance, SIFT1B dataset -; (truncated to 10M of the 1B available). -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). 
-[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_10m_2node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=9000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=false -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=bench10m_2node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=1000000 -AsyncRpcMaxInflight=512 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 -StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 -PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini deleted file mode 100644 index 30fe77bbe..000000000 --- a/evaluation/distributed/configs/benchmark_insert_dominant_1node.ini +++ /dev/null @@ -1,58 +0,0 @@ -; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. -; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). 
-[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_insert_dominant_1node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=1000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=true -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=benchinsert_dominant_1node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=100000 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011 -StoreAddrs=10.11.0.7:20171 -PDAddrs=10.11.0.7:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini deleted file mode 100644 index d45870b50..000000000 --- a/evaluation/distributed/configs/benchmark_insert_dominant_2node.ini +++ /dev/null @@ -1,58 +0,0 @@ -; insert_dominant: 1M base + 1M insert with concurrent search-during-insert. -; Layers=2, L2 distance, SIFT1B dataset (truncated to 1M). -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). 
-[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/nvme/sift1b/bigann_base.u8bin -QueryPath=/mnt/nvme/sift1b/query.10K.u8bin -TruthPath=truth -IndexPath=/mnt/nvme/proidx_insert_dominant_2node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=1000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=false -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=benchinsert_dominant_2node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=100000 - -[Distributed] -Enabled=true -DispatcherAddr=10.11.0.7:30001 -WorkerAddrs=10.11.0.7:30011,10.11.0.10:30002 -StoreAddrs=10.11.0.7:20171,10.11.0.10:20171 -PDAddrs=10.11.0.7:23791,10.11.0.10:23791 diff --git a/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini b/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini deleted file mode 100644 index a8050732d..000000000 --- a/evaluation/distributed/configs/benchmark_insert_dominant_3node.ini +++ /dev/null @@ -1,59 +0,0 @@ -; insert_dominant: 1M base + 10M insert (10× scale-up) with concurrent search-during-insert. -; Tests how the index handles insertion-dominated workloads where insertion volume -; is much larger than the initial baseline. Layers=2, L2 distance, SIFT1B dataset. -; -; Multi-machine deployment: run_distributed.sh fills PLACEHOLDER fields from -; cluster.conf (IndexPath, TiKVKeyPrefix, TiKVPDAddresses, [Distributed] addrs). 
-[Benchmark] -WorkerTimeout=14400 -VectorPath=/mnt/data/sift1b/base.1B.u8bin -QueryPath=/mnt/data/sift1b/query.public.10K.u8bin -TruthPath=truth -IndexPath=/mnt/md0/proidx_insert_dominant_3node/spann_index -ValueType=UInt8 -Dimension=128 -BaseVectorCount=1000000 -InsertVectorCount=1000000 -DeleteVectorCount=0 -BatchNum=1 -TopK=5 -NumSearchThreads=4 -NumInsertThreads=4 -AppendThreadNum=16 -NumSearchDuringInsertThreads=1 -NumQueries=200 -DistMethod=L2 -Rebuild=false -BuildOnly=false -Resume=-1 -Layers=2 - -Storage=TIKVIO -TiKVPDAddresses=PLACEHOLDER -TiKVKeyPrefix=benchinsert_dominant_3node - -[SelectHead] -ParallelBKTBuild=true - -[BuildHead] -ParallelBKTBuild=true - -[BuildSSDIndex] -LatencyLimit=100 -MaxCheck=8192 -SearchInternalResultNum=64 -UseMultiChunkPosting=false -PostingPageLimit=8 -PostingCountCacheCapacity=1000000 -SearchCheckVersionMapOnlyLayer0=true -DistributedVersionMap=true -ReassignK=64 -AsyncMergeInSearch=true -VersionCacheMaxChunks=100000 - -[Distributed] -Enabled=true -DispatcherAddr=172.27.0.4:30001 -WorkerAddrs=172.27.0.4:30011,172.27.0.5:30002,172.27.0.6:30003 -StoreAddrs=172.27.0.4:20171,172.27.0.5:20171,172.27.0.6:20171 -PDAddrs=172.27.0.4:23791,172.27.0.5:23791,172.27.0.6:23791 From ee97d3ff732f69c91c2b35158219c5f3f1873187 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 08:21:07 +0000 Subject: [PATCH 03/12] Remove unused stride-shard experiment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strip the SPFRESH_SHARD_STRIDE opt-in code path (4 helpers + plumbing through LoadAndInsertBatch/RunBenchmark/RunWorker). No active config sets the env var; we always use the contiguous slice partition. Test/CMakeLists.txt: explicitly link ${TiKV_LIBRARIES} into SPTAGTest so a clean build (no .o cache) resolves gpr_/grpc_ symbols pulled in by the kvproto generated stubs. 
ThirdParty/kvproto/.gitignore: stop tracking regenerated stubs going forward — they are environment-specific (must match the protoc/grpc in the build env); regenerate locally via generate_cpp.sh. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Test/CMakeLists.txt | 2 +- Test/src/SPFreshTest.cpp | 148 ++-------------------- ThirdParty/kvproto/.gitignore | 4 + evaluation/distributed/run_distributed.sh | 1 - 4 files changed, 19 insertions(+), 136 deletions(-) create mode 100644 ThirdParty/kvproto/.gitignore diff --git a/Test/CMakeLists.txt b/Test/CMakeLists.txt index 27bdeebb5..9db640da2 100644 --- a/Test/CMakeLists.txt +++ b/Test/CMakeLists.txt @@ -24,7 +24,7 @@ if (NOT LIBRARYONLY) file(GLOB TEST_HDR_FILES ${PROJECT_SOURCE_DIR}/Test/inc/Test.h) file(GLOB TEST_SRC_FILES ${PROJECT_SOURCE_DIR}/Test/src/*.cpp) add_executable(SPTAGTest ${TEST_SRC_FILES} ${TEST_HDR_FILES}) - target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle) + target_link_libraries(SPTAGTest SPTAGLibStatic ssdservingLib ${Boost_LIBRARIES} ${TiKV_LIBRARIES} absl_synchronization absl_cord absl_cordz_info absl_cord_internal absl_cordz_functions absl_cordz_handle) install(TARGETS SPTAGTest RUNTIME DESTINATION bin diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp index 9ab420db9..1a2140773 100644 --- a/Test/src/SPFreshTest.cpp +++ b/Test/src/SPFreshTest.cpp @@ -62,94 +62,6 @@ static __attribute__((constructor)) void install_segfault_handler() { using namespace SPTAG; -// --------------------------------------------------------------------------- -// Stride sharding (a.k.a. 
odd/even sharding) experiment -// --------------------------------------------------------------------------- -// When the env var SPFRESH_SHARD_STRIDE is set to "1"/"true", each node, instead -// of inserting a contiguous slice [n*B/N, (n+1)*B/N) of the per-iteration batch, -// inserts the strided rows {n, n+N, n+2*N, ...} where n=nodeIndex, N=numNodes. -// This breaks any spatial structure in the input dataset (e.g. SIFT files that -// are roughly sorted by visual feature), letting us check whether the layer-0 -// split skew (driver 71 vs worker 2 in v18) is caused by contiguous slicing -// landing similar vectors on the same node and overflowing a small set of heads. -// -// The total number of vectors inserted across all nodes per iteration is the -// same; only the assignment changes. Recall measurement still works because -// the dataset and ground truth are unchanged — only insert routing differs. -static bool IsStrideShardEnabled() { - const char* e = std::getenv("SPFRESH_SHARD_STRIDE"); - if (!e) return false; - std::string v(e); - return v == "1" || v == "true" || v == "TRUE" || v == "yes"; -} - -// Compute count of indices i in [0, total) with (i % stride) == offset. -static SizeType StrideCount(SizeType total, int stride, int offset) { - if (stride <= 1) return total; - if (offset < 0 || offset >= stride) return 0; - if (total <= offset) return 0; - return (total - 1 - offset) / stride + 1; -} - -// Build a strided sub-VectorSet by copying every `stride`-th vector starting -// at `offset` into a contiguous packed ByteArray. Returns a BasicVectorSet. 
-static std::shared_ptr ExtractStridedVectors( - const std::shared_ptr& full, int stride, int offset) -{ - if (!full) return nullptr; - SizeType totalCount = full->Count(); - SizeType outCount = StrideCount(totalCount, stride, offset); - auto vt = full->GetValueType(); - auto dim = full->Dimension(); - size_t perVecSize = full->PerVectorDataSize(); - if (outCount <= 0) { - return std::make_shared(ByteArray::Alloc(0), vt, dim, 0); - } - ByteArray buf = ByteArray::Alloc(static_cast(outCount) * perVecSize); - for (SizeType i = 0; i < outCount; ++i) { - SizeType srcIdx = static_cast(offset) + i * static_cast(stride); - std::memcpy(buf.Data() + static_cast(i) * perVecSize, - full->GetVector(srcIdx), - perVecSize); - } - return std::make_shared(buf, vt, dim, outCount); -} - -// Build a strided sub-MetadataSet. Two-pass: first compute offsets, then copy. -static std::shared_ptr ExtractStridedMetadata( - const std::shared_ptr& full, int stride, int offset) -{ - if (!full) return nullptr; - SizeType totalCount = full->Count(); - SizeType outCount = StrideCount(totalCount, stride, offset); - if (outCount <= 0) { - ByteArray emptyMeta = ByteArray::Alloc(0); - ByteArray offBuf = ByteArray::Alloc(sizeof(std::uint64_t)); - *reinterpret_cast(offBuf.Data()) = 0ULL; - return std::make_shared(emptyMeta, offBuf, 0); - } - std::vector offsets(static_cast(outCount) + 1, 0ULL); - std::uint64_t total = 0; - for (SizeType i = 0; i < outCount; ++i) { - SizeType srcIdx = static_cast(offset) + i * static_cast(stride); - ByteArray meta = full->GetMetadata(srcIdx); - offsets[i] = total; - total += meta.Length(); - } - offsets[outCount] = total; - ByteArray metaBuf = ByteArray::Alloc(total > 0 ? 
total : 1); - for (SizeType i = 0; i < outCount; ++i) { - SizeType srcIdx = static_cast(offset) + i * static_cast(stride); - ByteArray meta = full->GetMetadata(srcIdx); - if (meta.Length() > 0) { - std::memcpy(metaBuf.Data() + offsets[i], meta.Data(), meta.Length()); - } - } - ByteArray offBuf = ByteArray::Alloc((static_cast(outCount) + 1) * sizeof(std::uint64_t)); - std::memcpy(offBuf.Data(), offsets.data(), offsets.size() * sizeof(std::uint64_t)); - return std::make_shared(metaBuf, offBuf, outCount); -} - // Helper: parse "host:port,host:port,..." into vector of pairs. static std::vector> ParseNodeAddrs(const std::string& addrStr) { std::vector> result; @@ -1098,7 +1010,6 @@ void LoadAndInsertBatch(SPANN::Index* spannIndex, const std::string& paddmetaidx, int dimension, int insertStart, int loadCount, int perNodeBatch, - bool strideShard, int numNodes, int nodeIndex, int numInsertThreads, SPANN::WorkerNode* router, std::shared_ptr quantizer, @@ -1121,14 +1032,6 @@ void LoadAndInsertBatch(SPANN::Index* spannIndex, addFloat->Count()); } auto addmetaset = TestUtils::TestDataGenerator::LoadMetadataSet(paddmeta, paddmetaidx, insertStart, loadCount); - if (strideShard) { - addset = ExtractStridedVectors(addset, numNodes, nodeIndex); - addmetaset = ExtractStridedMetadata(addmetaset, numNodes, nodeIndex); - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "%s: stride-shard batchStart=%d loadCount=%d -> kept=%d (every %d-th, offset=%d)\n", - logPrefix, insertStart, loadCount, - (int)(addset ? 
addset->Count() : 0), numNodes, nodeIndex); - } InsertVectors(spannIndex, numInsertThreads, perNodeBatch, addset, addmetaset, searchDuringInsertThreads, queryset, numQueries, searchK, @@ -1225,23 +1128,12 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c // Use distributed config for multi-node partitioning int nodeIndex = distCfg.workerIndex; int numNodes = distCfg.GetNumWorkers(); - bool strideShard = IsStrideShardEnabled() && numNodes > 1; - int myInsertStart, myInsertEnd, perNodeBatch; - if (strideShard) { - // Stride mode: each node loads the FULL per-iter batch then keeps rows - // where (rowIdx % numNodes) == nodeIndex. myInsertStart/End span the - // full batch; perNodeBatch is the count of strided rows. - myInsertStart = 0; - myInsertEnd = insertBatchSize; - perNodeBatch = static_cast(StrideCount(insertBatchSize, numNodes, nodeIndex)); - } else { - myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; - myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; - perNodeBatch = myInsertEnd - myInsertStart; - } + int myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; + int myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; + int perNodeBatch = myInsertEnd - myInsertStart; SPTAGLIB_LOG(Helper::LogLevel::LL_Info, - "RunBenchmark: nodeIndex=%d numNodes=%d insertBatchSize=%d myInsertStart=%d myInsertEnd=%d perNodeBatch=%d strideShard=%d\n", - nodeIndex, numNodes, insertBatchSize, myInsertStart, myInsertEnd, perNodeBatch, strideShard ? 
1 : 0); + "RunBenchmark: nodeIndex=%d numNodes=%d insertBatchSize=%d myInsertStart=%d myInsertEnd=%d perNodeBatch=%d\n", + nodeIndex, numNodes, insertBatchSize, myInsertStart, myInsertEnd, perNodeBatch); // Variables to collect JSON output data std::ostringstream tmpbenchmark; @@ -1585,19 +1477,16 @@ void RunBenchmark(const std::string &vectorPath, const std::string &queryPath, c SPANN::DispatchCommand::Type::Insert, static_cast(iter)); } - // Each node inserts its partition. Default mode: contiguous slice - // [iter*batchSize + myInsertStart, +perNodeBatch). Stride mode: - // every numNodes-th row of the full batch starting at nodeIndex - // (loads full batch then filters down to perNodeBatch rows). + // Each node inserts its contiguous slice + // [iter*batchSize + myInsertStart, +perNodeBatch). int insertStart = iter * insertBatchSize + myInsertStart; - int loadCount = strideShard ? insertBatchSize : perNodeBatch; + int loadCount = perNodeBatch; { std::string driverTag = "RunBenchmark iter=" + std::to_string(iter); start = std::chrono::high_resolution_clock::now(); LoadAndInsertBatch(static_cast*>(cloneIndex.get()), paddset, paddmeta, paddmetaidx, M, insertStart, loadCount, perNodeBatch, - strideShard, numNodes, nodeIndex, numInsertThreads, workerPtr, enableQuantization ? quantizer : nullptr, numSearchDuringInsertThreads, queryset, @@ -2914,17 +2803,9 @@ void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount, int nodeIndex = distCfg.workerIndex; int numNodes = distCfg.GetNumWorkers(); int insertBatchSize = insertVectorCount / std::max(batches, 1); - bool strideShard = IsStrideShardEnabled() && numNodes > 1; - int myInsertStart, myInsertEnd, perNodeBatch; - if (strideShard) { - myInsertStart = 0; - myInsertEnd = insertBatchSize; - perNodeBatch = static_cast(StrideCount(insertBatchSize, numNodes, nodeIndex)); - } else { - myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; - myInsertEnd = (numNodes > 1) ? 
((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; - perNodeBatch = myInsertEnd - myInsertStart; - } + int myInsertStart = (numNodes > 1) ? (nodeIndex * insertBatchSize) / numNodes : 0; + int myInsertEnd = (numNodes > 1) ? ((nodeIndex + 1) * insertBatchSize) / numNodes : insertBatchSize; + int perNodeBatch = myInsertEnd - myInsertStart; BOOST_TEST_MESSAGE("Worker node " << nodeIndex << ": Loading index from " << indexPath); std::shared_ptr index; @@ -3035,16 +2916,15 @@ void RunWorker(const std::string& indexPath, int dimension, int baseVectorCount, if (cmd.m_type == SPANN::DispatchCommand::Type::Insert) { int insertStart = cmd.m_round * insertBatchSize + myInsertStart; - int loadCount = strideShard ? insertBatchSize : perNodeBatch; - SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u - inserting %d vectors (offset %d, strideShard=%d)\n", - nodeIndex, cmd.m_round + 1, perNodeBatch, insertStart, strideShard ? 1 : 0); + int loadCount = perNodeBatch; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Worker %d: Batch %u - inserting %d vectors (offset %d)\n", + nodeIndex, cmd.m_round + 1, perNodeBatch, insertStart); auto t1 = std::chrono::high_resolution_clock::now(); std::string workerTag = "Worker " + std::to_string(nodeIndex) + " batch=" + std::to_string(cmd.m_round + 1); LoadAndInsertBatch(spannIndex, paddset, paddmeta, paddmetaidx, dimension, insertStart, loadCount, perNodeBatch, - strideShard, numNodes, nodeIndex, numInsertThreads, router, /*quantizer=*/nullptr, /*searchDuringInsertThreads=*/0, diff --git a/ThirdParty/kvproto/.gitignore b/ThirdParty/kvproto/.gitignore new file mode 100644 index 000000000..b2dab26f7 --- /dev/null +++ b/ThirdParty/kvproto/.gitignore @@ -0,0 +1,4 @@ +# Generated C++ stubs are environment-specific (protoc/grpc versions must +# match the gRPC libs in the build env). Each developer should regenerate +# locally via generate_cpp.sh instead of consuming the committed snapshot. 
+generated/ diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh index c383a7eed..bb982ab7d 100755 --- a/evaluation/distributed/run_distributed.sh +++ b/evaluation/distributed/run_distributed.sh @@ -744,7 +744,6 @@ start_remote_worker() { ssh -n $(_ssh_opts) "$SSH_USER@$host" \ "cd $SPTAG_DIR && LD_LIBRARY_PATH=$SPTAG_DIR/Release/runtime_libs:/usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH:-} \ WORKER_INDEX=${NODE_IDX} BENCHMARK_CONFIG=worker_n${NODE_IDX}.ini \ - SPFRESH_SHARD_STRIDE=${SPFRESH_SHARD_STRIDE:-0} \ ./Release/SPTAGTest --run_test=SPFreshTest/BenchmarkFromConfig 2>&1" \ "$LOG" 2>&1 & local ssh_pid=$! From 4df704f9897ede7997e6632568f7362ebe893449 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 08:36:46 +0000 Subject: [PATCH 04/12] InsertVectors: dedupe branches, log InsertThreadNum ignore in bulk path The previous if/else duplicated the thread launch+join. Restructure to a single launch with an optional search-during-insert thread: - launch insertThreadCount workers - if benchmarking, launch one search thread in parallel - join all, then compute stats (only when search ran) Also log a clear note when the bulk router path is used: the user- supplied InsertThreadNum is unused there (driver runs one launcher thread and parallelism comes from [BuildSSDIndex] AppendThreadNum inside ExtraDynamicSearcher's append/split pool). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Test/src/SPFreshTest.cpp | 50 ++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/Test/src/SPFreshTest.cpp b/Test/src/SPFreshTest.cpp index 1a2140773..5bef228a3 100644 --- a/Test/src/SPFreshTest.cpp +++ b/Test/src/SPFreshTest.cpp @@ -661,29 +661,39 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step if (useBulk) { func = bulkFunc; insertThreadCount = 1; + SPTAGLIB_LOG(Helper::LogLevel::LL_Info, + "InsertVectors: bulk path - driver launcher=1, internal parallelism comes from " + "[BuildSSDIndex] AppendThreadNum (user-supplied InsertThreadNum=%d is unused on this path)\n", + insertThreads); } else { func = perVecFunc; insertThreadCount = insertThreads; } - if (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr) { - std::vector latencies; - std::vector results; - double searchWallSeconds = 0.0; + bool withSearch = (searchThreads > 0 && queryset != nullptr && numQueries != 0 && benchmarkData != nullptr); - for (int j = 0; j < insertThreadCount; j++) - { - threads.emplace_back(func); - } - std::thread searchThread([&]() { + for (int j = 0; j < insertThreadCount; j++) + { + threads.emplace_back(func); + } + + std::vector latencies; + std::vector results; + double searchWallSeconds = 0.0; + std::thread searchThread; + if (withSearch) { + searchThread = std::thread([&]() { searchWallSeconds = ExecutePartitionedSearch( p_index, queryset, /*myStart=*/0, numQueries, k, searchThreads, results, &latencies, /*statsOut=*/nullptr); }); - for (auto &thread : threads) - { - thread.join(); - } + } + + for (auto &thread : threads) + { + thread.join(); + } + if (withSearch) { searchThread.join(); // Calculate statistics @@ -712,17 +722,6 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step *benchmarkData << " \"minLatency\": " << minLat << ",\n"; *benchmarkData << " \"maxLatency\": " << 
maxLat << ",\n"; *benchmarkData << " \"qps\": " << qps << ",\n"; - } else { - // No search-during-insert path: just run the insert threads. - // (Used by worker dispatch and any caller that doesn't need stats.) - for (int j = 0; j < insertThreadCount; j++) - { - threads.emplace_back(func); - } - for (auto &thread : threads) - { - thread.join(); - } } auto barrierStart = std::chrono::high_resolution_clock::now(); size_t barrierPolls = 0; @@ -743,9 +742,6 @@ void InsertVectors(SPANN::Index *p_index, int insertThreads, int step } - - - template void BenchmarkQueryPerformance(std::shared_ptr &index, std::shared_ptr &queryset, std::shared_ptr &truth, const std::string &truthPath, From c27a109ac297d350521478b15bcb2e33b7e1827a Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 09:10:14 +0000 Subject: [PATCH 05/12] Restore (layers+1) multiplier in BlockController IO queue size 87160070 removed the (m_layers+1) multiplier in the SPDK BlockController queue-depth formula. The change was based on an incorrect assumption that the distributed port collapses all per-layer SPDK pools into the single shared layer-0 pool. In practice only layer 0 + the RPC receiver share a pool; every inner layer (m_layer >= 1) still creates its own SPDKThreadPool in both BuildIndex and LoadIndex. With Layers=2 (current active configs) we therefore have ~2 independent pools each running insert + reassign + append worker threads, so the peak concurrent IO-submitter count remains the qianxi-original (layers+1)*(insert+reassign+append) plus search threads. Under-sizing the BlockController queue could stall IO submission under heavy split/reassign + search load; over-sizing is harmless. Restore the multiplier to match qianxi behaviour. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AnnService/src/Core/SPANN/ExtraFileController.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AnnService/src/Core/SPANN/ExtraFileController.cpp b/AnnService/src/Core/SPANN/ExtraFileController.cpp index b5db83822..24c839455 100644 --- a/AnnService/src/Core/SPANN/ExtraFileController.cpp +++ b/AnnService/src/Core/SPANN/ExtraFileController.cpp @@ -25,7 +25,7 @@ bool FileIO::BlockController::Initialize(SPANN::Options &p_opt, int p_layer) #ifndef _MSC_VER O_RDWR | O_DIRECT, numblocks, 2, 2, max(p_opt.m_ioThreads, (2 * max(p_opt.m_searchThreadNum, p_opt.m_iSSDNumberOfThreads) + - p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum)), + (p_opt.m_layers + 1) * (p_opt.m_insertThreadNum + p_opt.m_reassignThreadNum + p_opt.m_appendThreadNum))), ((std::uint64_t)p_opt.m_startFileSize) << 30 #else GENERIC_READ | GENERIC_WRITE, numblocks, 2, 2, From f3a9de98da29a208ef8eeb7311ad6c433bcfd21b Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 09:23:17 +0000 Subject: [PATCH 06/12] SetVersionBatch: bypass LRU cache, read TiKV directly All distributed runs override VersionCacheMaxChunks=0 (set by run_distributed.sh in build/run/nocache phases), so the LRU cache is effectively disabled. Using ReadChunkCached inside SetVersionBatch adds bookkeeping noise (cache hit/miss path, refresh-mutex acquire) that produces no benefit. Switch to direct ReadChunk; the dirty-byte gating still saves the WriteChunk RPC when no version byte actually changes. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AnnService/inc/Core/Common/TiKVVersionMap.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/AnnService/inc/Core/Common/TiKVVersionMap.h b/AnnService/inc/Core/Common/TiKVVersionMap.h index 69191fe1b..ff30306e8 100644 --- a/AnnService/inc/Core/Common/TiKVVersionMap.h +++ b/AnnService/inc/Core/Common/TiKVVersionMap.h @@ -386,7 +386,10 @@ namespace SPTAG } // Group writes by chunk: 1 ReadChunk + N byte-modifications + 1 WriteChunk - // per chunk, instead of N × (ReadChunk + WriteChunk). + // per chunk, instead of N × (ReadChunk + WriteChunk). Bypasses the LRU + // cache because runs that exercise this path always have + // VersionCacheMaxChunks=0; reading TiKV directly removes a layer of + // bookkeeping (cache invalidate-on-write) we no longer benefit from. void SetVersionBatch(const std::vector& vids, const std::vector& versions) override { size_t n = std::min(vids.size(), versions.size()); @@ -408,7 +411,7 @@ namespace SPTAG SizeType cid = kv.first; auto& idxs = kv.second; std::lock_guard lock(ChunkMutex(cid)); - std::string chunk = ReadChunkCached(cid); + std::string chunk = ReadChunk(cid); if (chunk.empty()) { chunk.assign(m_chunkSize, static_cast(0xff)); } From f35ae85bdb46d25d51585061de47c63b312f48c1 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 09:42:39 +0000 Subject: [PATCH 07/12] Drop high-priority job queue from SPDKThreadPool The distributed port introduced a separate m_highJobs queue + add_high in ThreadPool plus 'urgent' parameters on AppendAsync/ReassignAsync. Receiver dispatch already discovered high-priority starved Split jobs and switched to high=false. 
The remaining urgent=true callers were: - AppendAsync in CollectReAssign's non-TiKV branch (dead under Storage::TIKVIO which is the only storage we use) - ReassignAsync on head-miss in Append/BatchAppend (same starvation risk against Split that motivated the receiver-side revert) Restore ThreadPool.h to the upstream deque+addfront shape (no semantic change vs. original) and drop the urgent parameter from AppendAsync/ ReassignAsync, the high flag from JobSubmitter, and the high path from WireJobSubmitterIfReady. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Core/SPANN/Distributed/RemotePostingOps.h | 27 ++++++--------- .../inc/Core/SPANN/ExtraDynamicSearcher.h | 29 +++++----------- AnnService/inc/Helper/ThreadPool.h | 33 +++++-------------- 3 files changed, 28 insertions(+), 61 deletions(-) diff --git a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h index 577b91876..0f032c2ba 100644 --- a/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h +++ b/AnnService/inc/Core/SPANN/Distributed/RemotePostingOps.h @@ -95,7 +95,7 @@ namespace SPTAG::SPANN { // its own m_splitThreadPool, so BatchAppend items dispatch by the // request's m_layer to the matching pool. A single submitter would // pile both layers' remote appends into whichever pool wired last. - using JobSubmitter = std::function; + using JobSubmitter = std::function; void SetJobSubmitter(int layer, JobSubmitter submitter) { std::unique_lock lk(m_callbackLifetimeMutex); EnsureLayerSlot_NoLock(layer); @@ -756,13 +756,12 @@ namespace SPTAG::SPANN { SPTAGLIB_LOG(Helper::LogLevel::LL_Debug, "RemotePostingOps: Received batch of %u appends\n", batchReq->m_count); - // Submit each item as a high-priority Job to the searcher's - // shared compute pool. Pool workers run the local Append callback - // exactly like a local insert would. Last completion ACKs the - // sender. 
This puts remote work on the SAME concurrency budget - // as local Split/Merge/Reassign — eliminating the over-subscribed - // TiKV behaviour of the old separate bg executor + transient - // sub-worker threads. + // Submit each item as a Job to the searcher's shared compute pool. + // Pool workers run the local Append callback exactly like a local + // insert would. Last completion ACKs the sender. This puts remote + // work on the SAME concurrency budget as local Split/Merge/Reassign + // — eliminating the over-subscribed TiKV behaviour of the old + // separate bg executor + transient sub-worker threads. auto packetPtr = std::make_shared(std::move(packet)); const size_t total = batchReq->m_items.size(); if (total == 0) { @@ -810,15 +809,9 @@ namespace SPTAG::SPANN { // submitter we have. for (auto& s : m_jobSubmitters) { if (s) { sub = &s; break; } } } - // Normal priority. Per-layer routing (m_jobSubmitters[layer]) - // already isolates layer-N append items from other layers' - // pools. High priority starved split entirely (split:N - // in_flight, 0 completed) because once all 16 worker threads - // are running long-tail append items, fresh high-prio appends - // keep cutting in front of split. Append throughput per chunk - // is limited by pool concurrency × per-item RMW; widen the - // pool (AppendThreadNum) instead of using priority hacks. - if (sub) (*sub)(job, /*high=*/false); + // Per-layer routing (m_jobSubmitters[layer]) isolates layer-N + // append items from other layers' pools. 
+ if (sub) (*sub)(job); else { delete job; failCount->fetch_add(1); remaining->fetch_sub(1); } } } diff --git a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h index 29129bdb4..b8ca98e85 100644 --- a/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h +++ b/AnnService/inc/Core/SPANN/ExtraDynamicSearcher.h @@ -395,10 +395,7 @@ namespace SPTAG::SPANN { if (!m_worker || !m_splitThreadPool) return; auto pool = m_splitThreadPool; m_worker->SetJobSubmitter(m_layer, - [pool](Helper::ThreadPool::Job* j, bool high) { - if (high) pool->add_high(j); - else pool->add(j); - }); + [pool](Helper::ThreadPool::Job* j) { pool->add(j); }); } /// Set the external WorkerNode pointer and bind all callbacks @@ -436,7 +433,7 @@ namespace SPTAG::SPANN { // Mirror sender's version map for the records we're about // to persist so MergePostings + SearchIndex don't drop - // them as "stale". See HEAD git history for rationale. + // them as "stale". { const uint8_t* basePtr = reinterpret_cast(appendPosting.data()); size_t totalRec = appendPosting.size() / m_vectorInfoSize; @@ -1713,28 +1710,20 @@ namespace SPTAG::SPANN { m_splitThreadPool->add(curJob); } - inline void AppendAsync(SizeType headID, std::shared_ptr postingList, bool urgent = false,std::function p_callback = nullptr) + inline void AppendAsync(SizeType headID, std::shared_ptr postingList, std::function p_callback = nullptr) { auto* curJob = new AppendAsyncJob(this, headID, std::move(postingList), p_callback); m_appendJobsInFlight++; m_totalAppendSubmitted++; - if (urgent) { - m_splitThreadPool->addfront(curJob); - } else { - m_splitThreadPool->add(curJob); - } + m_splitThreadPool->add(curJob); } - inline void ReassignAsync(std::shared_ptr vectorInfo, SizeType headPrev, bool urgent = false, std::function p_callback = nullptr) + inline void ReassignAsync(std::shared_ptr vectorInfo, SizeType headPrev, std::function p_callback = nullptr) { auto* curJob = new ReassignAsyncJob(this, 
std::move(vectorInfo), headPrev, p_callback); m_reassignJobsInFlight++; m_totalReassignSubmitted++; - if (urgent) { - m_splitThreadPool->addfront(curJob); - } else { - m_splitThreadPool->add(curJob); - } + m_splitThreadPool->add(curJob); } ErrorCode CollectReAssign(ExtraWorkSpace *p_exWorkSpace, SizeType headID, std::shared_ptr headVec, @@ -1901,7 +1890,7 @@ namespace SPTAG::SPANN { if (m_opt->m_storage == Storage::TIKVIO) ret = BatchAppend(p_exWorkSpace, batchReassign, "CollectReAssign"); else { for (auto& kv : batchReassign) { - AppendAsync(kv.first, std::make_shared(kv.second), true); + AppendAsync(kv.first, std::make_shared(kv.second)); } } if (batchReassignCount > 0) { @@ -2019,7 +2008,7 @@ namespace SPTAG::SPANN { if (m_versionMap->GetVersion(VID) == version) { // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss To ReAssign: VID: %d, current version: %d\n", *(int*)(&appendPosting[idx]), version); m_stat.m_headMiss++; - ReassignAsync(vectorInfo, headID, true); + ReassignAsync(vectorInfo, headID); } // SPTAGLIB_LOG(Helper::LogLevel::LL_Info, "Head Miss Do Not To ReAssign: VID: %d, version: %d, current version: %d\n", *(int*)(&appendPosting[idx]), m_versionMap->GetVersion(*(int*)(&appendPosting[idx])), version); } @@ -2185,7 +2174,7 @@ namespace SPTAG::SPANN { uint8_t version = *(uint8_t*)(ptr + sizeof(SizeType)); if (m_versionMap->GetVersion(VID) == version) { m_stat.m_headMiss++; - ReassignAsync(std::make_shared((char*)ptr, m_vectorInfoSize), headID, true); + ReassignAsync(std::make_shared((char*)ptr, m_vectorInfoSize), headID); } } continue; diff --git a/AnnService/inc/Helper/ThreadPool.h b/AnnService/inc/Helper/ThreadPool.h index a351a75c8..01c82e2a7 100644 --- a/AnnService/inc/Helper/ThreadPool.h +++ b/AnnService/inc/Helper/ThreadPool.h @@ -5,7 +5,7 @@ #define _SPTAG_HELPER_THREADPOOL_H_ #include -#include +#include #include #include #include @@ -78,42 +78,28 @@ namespace SPTAG { { std::lock_guard lock(m_lock); - m_jobs.push(j); + m_jobs.push_back(j); } 
m_cond.notify_one(); } - // High-priority push: jobs in m_highJobs always run before m_jobs. - // Used by the distributed receiver to let inbound BatchAppend RPC - // work jump ahead of local Split/Merge/Reassign so the sender - // (driver) doesn't time out waiting for the chunk ack while the - // local pool drains long-running rebalance work. - void add_high(Job* j) + void addfront(Job* j) { { std::lock_guard lock(m_lock); - m_highJobs.push(j); + m_jobs.push_front(j); } m_cond.notify_one(); } - // Alias kept for compatibility with code that calls addfront() - // (e.g., split-async path). Same semantics as add_high. - void addfront(Job* j) { add_high(j); } - bool get(Job*& j) { std::unique_lock lock(m_lock); - while (m_jobs.empty() && m_highJobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock); + while (m_jobs.empty() && !m_abort.ShouldAbort()) m_cond.wait(lock); if (!m_abort.ShouldAbort()) { - if (!m_highJobs.empty()) { - j = m_highJobs.front(); - m_highJobs.pop(); - } else { - j = m_jobs.front(); - m_jobs.pop(); - } + j = m_jobs.front(); currentJobs++; + m_jobs.pop_front(); return true; } return false; @@ -122,7 +108,7 @@ namespace SPTAG size_t jobsize() { std::lock_guard lock(m_lock); - return m_jobs.size() + m_highJobs.size(); + return m_jobs.size(); } inline uint32_t runningJobs() { return currentJobs; } @@ -136,8 +122,7 @@ namespace SPTAG protected: std::atomic_uint32_t currentJobs{ 0 }; - std::queue m_jobs; - std::queue m_highJobs; + std::deque m_jobs; Abort m_abort; std::mutex m_lock; std::condition_variable m_cond; From a49b26d5292b90c7ccd2ead91fb71176b8e5ae4b Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 09:58:06 +0000 Subject: [PATCH 08/12] Fix space --- Test/src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Test/src/main.cpp b/Test/src/main.cpp index 49ca39950..c1a5cde60 100644 --- a/Test/src/main.cpp +++ b/Test/src/main.cpp @@ -39,7 +39,7 @@ 
struct GlobalFixture // observed to consume ~12% CPU under high worker-thread parallelism in // gRPC client paths (perf-recorded 2026-05-06). #ifdef TIKV - absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); + absl::SetMutexDeadlockDetectionMode(absl::OnDeadlockCycle::kIgnore); #endif SPTAGVisitor visitor; traverse_test_tree(framework::master_test_suite(), visitor, false); From 689e5b23e45da738b7ff77830a59283d0a58c5e4 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 10:06:24 +0000 Subject: [PATCH 09/12] Fix distributed benchmark README + drop dead orchestrator code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_distributed.sh: - Remove wait_workers_ready() — dead since the driver-listens-on-30001 handshake replaced log-grep readiness detection. - Drop the stale 'Binary already pushed; nothing else to do here' comment that sat immediately after the actual binary-push rsync block. README.md: - Correct the TiKV deployment model: the cluster is SHARED (all PDs in one raft group, all TiKVs registered as stores, max-replicas=1) — not one isolated PD+TiKV per node as the old text claimed. Architecture diagram, port table, and pre-split helper updated accordingly (one PD endpoint, not a per-node loop). - Fix Step 1 cluster-config path: configs/cluster_2node.conf (an actual shipped file), not the non-existent cluster.conf.example. - Update port defaults to match cluster_2node.conf (23791/23801/20171) and call out that the driver's router_port must not collide with the dispatcher port 30001 (cluster_2node.conf uses 30011 for this reason). - List all shipped configs (10m, 100m, insert_dominant, tikv.toml, cluster_*.conf) in the file table. - Document setup-bins subcommand alongside deploy. 
- Flag the Build / Distribute / Run split as a workaround for the missing distributed SelectHead/BuildHead implementation, so readers don't mistake it for the steady-state design. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- evaluation/distributed/README.md | 219 +++++++++++++--------- evaluation/distributed/run_distributed.sh | 33 ---- 2 files changed, 126 insertions(+), 126 deletions(-) diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md index 1f24bc865..4717efc35 100644 --- a/evaluation/distributed/README.md +++ b/evaluation/distributed/README.md @@ -1,18 +1,26 @@ # Distributed Benchmark Evaluation — Insert Dominant Multi-machine SPTAG SPANN distributed benchmark for an **insert-dominant** workload -(1M base + 10M inserts in 10 batches, with concurrent search-during-insert) on -SIFT1B. Each physical node runs its own independent PD + TiKV (no shared Raft -replication — see "TiKV deployment model" below). +(1M base + 1M-10M inserts in batches, with concurrent search-during-insert) on +SIFT1B. All nodes share a single TiKV raft cluster (see "TiKV deployment model" +below). ## Files in this folder | File | Purpose | | --- | --- | -| `configs/benchmark_insert_dominant_template.ini` | Benchmark template; `run_distributed.sh` fills `IndexPath`, `TiKVPDAddresses`, `TiKVKeyPrefix`, and `[Distributed]` from `cluster.conf`. | -| `run_distributed.sh` | Orchestrator: `deploy` / `start-tikv` / `run` / `stop-tikv` / `cleanup`. | +| `configs/benchmark_insert_dominant_template.ini` | 1M base + 1M insert, search-during-insert workload. | +| `configs/benchmark_10m_template.ini` | 9M base + 1M insert, growing-index workload. | +| `configs/benchmark_100m_template.ini` | 99M base + 1M insert, steady-state/freshness workload. | +| `configs/cluster_2node.conf`, `configs/cluster_3node.conf` | Example cluster topologies. Pick one (or write your own) and pass to the orchestrator. 
| +| `configs/tikv.toml` | TiKV server config baked into the containers. | +| `run_distributed.sh` | Orchestrator: `deploy` / `setup-bins` / `start-tikv` / `run` / `bench` / `stop-tikv` / `cleanup`. | +| `bin/` | `tikv-server` + `pd-server` binaries used by the containers (`setup-bins` downloads them if missing). | | `README.md` | This file. | +`run_distributed.sh` fills the template's `IndexPath`, `TiKVPDAddresses`, +`TiKVKeyPrefix`, and `[Distributed]` section from the cluster config. + ## Architecture ``` @@ -29,35 +37,42 @@ replication — see "TiKV deployment model" below). │ + Router│ │ + Router│ │ + Router│ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ │ │ - ▼ ▼ ▼ - ┌──────────┐ ┌──────────┐ ┌──────────┐ - │ TiKV 1 │ │ TiKV 2 │ │ TiKV N │ (one PD + one TiKV per node) - └──────────┘ └──────────┘ └──────────┘ + └────────────┼────────────┘ + ▼ + ┌───────────────────┐ + │ Shared TiKV raft │ N PDs (one raft group) + + │ cluster │ N TiKV stores (max-replicas=1) + └───────────────────┘ ``` -- **Driver** (node 0): Builds the index, sends Search/Insert/Stop commands via TCP dispatch. -- **Workers** (nodes 1..N): Receive commands, execute their shard locally, report results back. -- **TiKV (per node)**: Each node runs its own independent PD + TiKV pair. Postings - for a head live on the node that owns that head's hash partition. -- **PostingRouter**: Hash-based head routing, remote append, head sync, dispatch protocol. +- **Driver** (node 0): builds the index, sends Search/Insert/Stop commands via + TCP dispatch. +- **Workers** (nodes 1..N): receive commands, execute their shard locally, + report results back over the dispatch channel. +- **Shared TiKV cluster**: every node runs a PD + TiKV container; all PDs join + one raft group, all TiKVs point to all PDs. PD routes each key to the store + that owns its region. +- **PostingRouter**: hash-based head routing, remote append, head sync, + dispatch protocol. 
## TiKV deployment model -Unlike a single-machine multi-docker TiKV (3 PD + 3 TiKV behind 127.0.0.1 ports -22791-3 / 20161-3 sharing one Raft cluster), in this multi-machine setup **each -node runs its own isolated PD + TiKV pair** under host networking. Heads are -routed to nodes by hash, and each node's TiKV stores only its own shard. There -is no Raft replication between nodes (no cross-node region quorum), which is -intentional for insert-dominated benchmarks where Raft log overhead would dominate. +All nodes share **one** TiKV raft cluster: every node's PD joins the same raft +group, every node's TiKV registers as a store in that cluster, and PD routes +reads/writes to whichever store owns the region. `max-replicas=1` is set so +each region lives on exactly one store — we measure benchmark performance +without 3-way Raft replication. Compute nodes are stateless TiKV clients; they +read any posting through the shared client, so there is no cross-compute fetch +RPC during RNGSelection. -Per-node ports (defaults from `cluster.conf`): +Per-node ports (defaults from `configs/cluster_2node.conf`): -| Service | Port | Notes | +| Service | Default port | Notes | | --- | --- | --- | -| PD client | `2379` | Local app uses `:2379`. | -| PD peer | `2380` | Inter-PD; isolated cluster of 1 PD per node. | -| TiKV client | `20161` | The node-local SPTAG worker connects here. | -| Router | `30001+` | TCP dispatch / posting routing between nodes. | +| PD client | `23791` | TiKV client + `pd-ctl` connect here. | +| PD peer | `23801` | Inter-PD raft traffic. | +| TiKV client | `20171` | Per-node TiKV listens here. | +| Router | `30002+` | TCP dispatch / posting routing between nodes. **Driver's `router_port` must NOT be `30001`** — the dispatcher listens on `30001` and a collision will silently break worker registration. The shipped 2-node config uses `30011` on the driver for this reason. 
| ## Prerequisites @@ -69,45 +84,47 @@ Per-node ports (defaults from `cluster.conf`): cmake .. -DTIKV=ON -DTBB=ON -DCMAKE_BUILD_TYPE=Release -DGPU=OFF cmake --build . --target SPTAGTest -j$(nproc) ``` - *Note: building the full project may fail on the Java wrapper (`JAVASPTAGFileIO`) - due to a pre-existing `FileIOInterface.h` signature mismatch — the `SPTAGTest` - target alone is sufficient.* -- Passwordless SSH from driver to every other node (configure `ssh_key` in `cluster.conf`). + *Note: building the full project may fail on the Java wrapper + (`JAVASPTAGFileIO`) due to a pre-existing `FileIOInterface.h` signature + mismatch — the `SPTAGTest` target alone is sufficient.* +- Passwordless SSH from driver to every other node (configure `ssh_key` in + the cluster config). - Docker installed on every node (TiKV/PD run as containers in host network mode). - Same dataset path on every node (default `/mnt/nvme/sift1b/`): - `/mnt/nvme/sift1b/bigann_base.u8bin` (1B × 128 × u8) - `/mnt/nvme/sift1b/query.10K.u8bin` -- Same fast-storage path for index + TiKV data on every node (`data_dir` in `cluster.conf`, - default `/mnt/nvme`). +- Same fast-storage path for index + TiKV data on every node (`data_dir` in + the cluster config, default `/mnt/nvme`). 
## Step 1 — Cluster config +Pick one of the shipped templates and edit it for your hosts/paths: + ```bash -cp evaluation/distributed/cluster.conf.example cluster.conf -vim cluster.conf +cp evaluation/distributed/configs/cluster_2node.conf my_cluster.conf +vim my_cluster.conf ``` -Example: +Layout: ```ini [cluster] ssh_user=superbench +ssh_key=/home/superbench/.ssh/id_rsa sptag_dir=/home/superbench/zhangt/SPTAG data_dir=/mnt/nvme -tikv_version=v7.5.1 -pd_version=v7.5.1 +tikv_version=v8.5.1 +pd_version=v8.5.1 [nodes] -# host router_port -10.0.1.1 30001 # driver (always first) -10.0.1.2 30002 # worker 1 -10.0.1.3 30003 # worker 2 +# host router_port (driver is first; router_port must not equal 30001) +10.0.1.1 30011 # driver +10.0.1.2 30002 # worker 1 [tikv] -# host pd_client pd_peer tikv_port -10.0.1.1 2379 2380 20161 -10.0.1.2 2379 2380 20161 -10.0.1.3 2379 2380 20161 +# host pd_client_port pd_peer_port tikv_port +10.0.1.1 23791 23801 20171 +10.0.1.2 23791 23801 20171 ``` `run_distributed.sh` reads this file to fill the template's `[Distributed]`, @@ -116,50 +133,49 @@ pd_version=v7.5.1 ## Step 2 — Deploy ```bash -./evaluation/distributed/run_distributed.sh deploy cluster.conf +./evaluation/distributed/run_distributed.sh deploy my_cluster.conf +./evaluation/distributed/run_distributed.sh setup-bins my_cluster.conf ``` -This rsyncs `Release/SPTAGTest` (and required shared libs) to every node and -ensures the per-node TiKV / PD data directories exist under `data_dir`. +`deploy` rsyncs `Release/SPTAGTest` (and required shared libs) to every node +and ensures per-node TiKV / PD data directories exist under `data_dir`. +`setup-bins` downloads `tikv-server` / `pd-server` into `bin/` on every node +(idempotent; skipped automatically by `start-tikv` if binaries are already +present). 
-## Step 3 — Start TiKV (per-node, independent) +## Step 3 — Start the shared TiKV cluster ```bash -./evaluation/distributed/run_distributed.sh start-tikv cluster.conf +./evaluation/distributed/run_distributed.sh start-tikv my_cluster.conf ``` -This starts one PD + one TiKV per node in host-network containers. Single-replica -placement (`max-replicas=1`) is set so we measure benchmark performance without -3-way Raft replication. +This starts one PD + one TiKV container per node in host-network mode and +joins them into a single raft cluster (`max-replicas=1`, no 3-way replication). -Health check (run on driver, repeat per node): +Health check (single PD endpoint is enough — the cluster is shared): ```bash -for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do - curl -s "http://$ip:2379/pd/api/v1/stores" \ - | python3 -c 'import json,sys; print([s["store"]["state_name"] for s in json.load(sys.stdin)["stores"]])' -done -# Each node should report ['Up']. +curl -s "http://10.0.1.1:23791/pd/api/v1/stores" \ + | python3 -c 'import json,sys; print([s["store"]["state_name"] for s in json.load(sys.stdin)["stores"]])' +# Expected: ['Up', 'Up'] (one entry per TiKV store). ``` ### Pre-split & scatter (optional but recommended) -For the insert-dominant workload to spread region writes evenly across regions -within a node's TiKV, pre-split the keyspace at boundaries derived from -`DBKey(headID) = MaxID*layer + headID` little-endian byte 0. The TiKV raw key is -`TiKVKeyPrefix + "_" + uint32_le(DBKey)`; for multi-chunk it appends `\x00` / -`\x02` for chunk / count keys, but we split *only* on the head-key prefix so all -chunk and count variants for a head share a region. Boundaries used: `0x02, 0x04, -…, 0xfe` (127 split points → 128 regions). +For the insert-dominant workload, pre-split the keyspace so writes spread +evenly across regions and stores. 
Boundaries derive from +`DBKey(headID) = MaxID*layer + headID` little-endian byte 0; the TiKV raw key +is `TiKVKeyPrefix + "_" + uint32_le(DBKey)`. We split *only* on the head-key +prefix so all chunk/count variants for a head share a region. Used split +points: `0x02, 0x04, …, 0xfe` (127 split points → 128 regions). -Driver-side helper (each PD is independent, so run per node): +Since the cluster is shared, run the helper **once** against any PD endpoint: ```bash -PREFIX="bench_insert_dominant_3node" # keep in sync with KEY_PREFIX in run_distributed.sh -for ip in 10.0.1.1 10.0.1.2 10.0.1.3; do - PD="http://$ip:2379" - PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v7.5.1 -u "$PD") - python3 - "$PREFIX" "${PDCTL[@]}" <<'PY' +PREFIX="bench_insert_dominant_2node" # keep in sync with KEY_PREFIX in run_distributed.sh +PD="http://10.0.1.1:23791" +PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v8.5.1 -u "$PD") +python3 - "$PREFIX" "${PDCTL[@]}" <<'PY' import json, subprocess, sys prefix = sys.argv[1].encode() + b'_' pdctl = sys.argv[2:] @@ -172,48 +188,65 @@ for b in range(2, 256, 2): for r in json.loads(run(['region', 'scan']))['regions']: run(['operator', 'add', 'scatter-region', str(r['id'])]) PY -done ``` -Skip this on the very first run if you don't have load skew — `start-tikv` works -without it. For 1B-scale insert-dominant runs on a single node it materially -reduces head-region hot-spotting. +Skip this on the very first run if you don't have load skew — `start-tikv` +works without it. For 1B-scale insert-dominant runs it materially reduces +head-region hot-spotting. 
## Step 4 — Run the benchmark ```bash # Single scale, explicit node count (driver + (N-1) workers): -./evaluation/distributed/run_distributed.sh run cluster.conf insert_dominant 3 +./evaluation/distributed/run_distributed.sh run my_cluster.conf insert_dominant 2 # Or sweep 1-node baseline + N-node distributed for one or more scales: -./evaluation/distributed/run_distributed.sh bench cluster.conf insert_dominant +./evaluation/distributed/run_distributed.sh bench my_cluster.conf insert_dominant +./evaluation/distributed/run_distributed.sh bench my_cluster.conf all ``` What `run` does: 1. **Build** (driver only): driver builds the index locally with router - *disabled* (`Rebuild=true`, no `[Router]`). Output goes to `…_n0/spann_index`. + *disabled* (`Rebuild=true`, no `[Distributed]`). Output goes to + `…_n0/spann_index`. Because the TiKV cluster is shared, the driver writes + all postings straight to TiKV via PD-routed RPCs — there is no need for a + distributed build phase. 2. **Distribute**: rsync head index + perftest files from driver to each worker. -3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` and - the per-node ini (router enabled, `Rebuild=false`). -4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. The - driver dispatches Insert / Search commands across batches via TCP. +3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` + and the per-node ini (router enabled, `Rebuild=false`). +4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. + The driver dispatches Insert / Search commands across batches via TCP. 5. **Collect**: driver sends Stop, joins worker logs into `benchmark_logs/`. -Useful environment overrides (see header of `run_distributed.sh`): - -- `NOCACHE=1` — disable TiKV block cache, OS pagecache, and `VersionCacheMaxChunks`. -- `BUILD_WITH_CACHE=1` — build with caches, then drop caches before search/insert (NOCACHE only). 
-- `SKIP_TIKV_SWAP=1` — when using `BUILD_WITH_CACHE`, skip the destructive TiKV - container restart that has corrupted recall at 100M scale. -- `SKIP_SAVE_LOAD=1` — skip post-build SaveIndex / per-batch Load+Clone+Save (NOCACHE only). -- `SKIP_HEAD_BUILD=1` — reuse existing HeadIndex if present (RebuildSSDOnly). +> The "build on the driver, then distribute and run" split is a workaround: +> we don't yet have a real distributed SelectHead/BuildHead implementation, so +> Phase 1 is single-node-with-shared-TiKV. The `BuildOnly=true` / +> `RebuildSSDOnly=true` / `SkipSaveLoadCycles=true` / +> `tikv_switch_to_nocache` / `drop_caches` choreography exists because of +> this split; it is not a feature of the steady-state design. + +Useful environment overrides (see the header of `run_distributed.sh` for the +authoritative list): + +- `NOCACHE=1` — disable TiKV block cache, OS pagecache, and + `VersionCacheMaxChunks` for the search/insert phase. +- `BUILD_WITH_CACHE=1` — build with caches enabled, then drop caches before + search/insert (requires `NOCACHE=1`). Used at 100M scale where building + under nocache is impractical. +- `SKIP_TIKV_SWAP=1` — with `BUILD_WITH_CACHE`, skip the destructive TiKV + container restart that has corrupted recall at 100M scale. Relies on + drop_caches + `VersionCacheMaxChunks=0` for nocache semantics. +- `SKIP_SAVE_LOAD=1` — skip the post-build SaveIndex / per-batch + Load+Clone+Save cycle (`SkipSaveLoadCycles=true`). Required at 100M scale. +- `SKIP_HEAD_BUILD=1` — reuse existing HeadIndex if present + (`RebuildSSDOnly=true`); falls back to full build if HeadIndex is missing. 
## Step 5 — Stop / cleanup ```bash -./evaluation/distributed/run_distributed.sh stop-tikv cluster.conf -./evaluation/distributed/run_distributed.sh cleanup cluster.conf # remove deployed files +./evaluation/distributed/run_distributed.sh stop-tikv my_cluster.conf +./evaluation/distributed/run_distributed.sh cleanup my_cluster.conf # remove deployed files ``` ## Key knobs in `benchmark_insert_dominant_template.ini` diff --git a/evaluation/distributed/run_distributed.sh b/evaluation/distributed/run_distributed.sh index bb982ab7d..28404c8a3 100755 --- a/evaluation/distributed/run_distributed.sh +++ b/evaluation/distributed/run_distributed.sh @@ -751,37 +751,6 @@ start_remote_worker() { echo " Worker n${NODE_IDX} on $host (SSH PID: $ssh_pid, log: $LOG)" } -wait_workers_ready() { - local SCALE="$1" - local NODE_COUNT="$2" - local TIMEOUT=120 - - echo "Waiting for ${#WORKER_SSH_PIDS[@]} workers to be ready..." - for attempt in $(seq 1 $TIMEOUT); do - local all_ready=true - for i in $(seq 1 $((NODE_COUNT - 1))); do - local LOG="$LOGDIR/benchmark_${SCALE}_${NODE_COUNT}node_worker${i}.log" - if ! grep -q "Worker.*[Rr]eady\|Waiting for dispatch" "$LOG" 2>/dev/null; then - all_ready=false - fi - done - if $all_ready; then - echo " All workers ready (${attempt}s)" - return 0 - fi - # Check if any worker SSH process died - for idx in "${!WORKER_SSH_PIDS[@]}"; do - if ! kill -0 "${WORKER_SSH_PIDS[$idx]}" 2>/dev/null; then - echo " ERROR: Worker SSH PID ${WORKER_SSH_PIDS[$idx]} exited prematurely" - return 1 - fi - done - sleep 1 - done - echo " WARNING: Not all workers ready after ${TIMEOUT}s" - return 1 -} - stop_remote_workers() { # Wait for workers to self-exit (driver sends TCP Stop), then force-kill. local TIMEOUT=${1:-30} @@ -1140,8 +1109,6 @@ cmd_run() { fi done - # Binary already pushed; nothing else to do here. 
- # --- Phase 3: Start driver first (contains dispatcher), then workers --- echo "" From ee405d4ddff4ec218c6a827eb4084087d96432cc Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 10:09:08 +0000 Subject: [PATCH 10/12] README: clarify driver = worker 0 + dispatcher; workers peer-to-peer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous wording made it sound like the driver was a stateless coordinator and workers only talked back to it. Reality: node 0 runs as worker 0 (owns its hash shard like every other worker) and additionally hosts the dispatcher; workers talk to each other directly through PostingRouter for remote append, head sync, and merge hints — no driver-mediated forwarding. Diagram and 'What run does' steps updated. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- evaluation/distributed/README.md | 55 +++++++++++++++++++------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md index 4717efc35..2b9c0950e 100644 --- a/evaluation/distributed/README.md +++ b/evaluation/distributed/README.md @@ -24,20 +24,23 @@ below). 
## Architecture ``` - ┌──────────────┐ - │ Driver │ (node 0) - │ RunBenchmark│ - │ + Router │ - └──┬───┬───┬──┘ - TCP Dispatch│ │ │ - ┌────────┘ │ └────────┐ - ▼ ▼ ▼ + ┌────────────────────┐ + │ Driver = Worker 0│ (node 0) + │ + Dispatcher │ + └─┬──┬──┬────────────┘ + TCP Dispatch │ │ │ ▲ ▲ ▲ + (broadcast) │ │ │ │ │ │ status replies + ┌──────┘ │ └──────┐│ │ │ + ▼ ▼ ▼│ │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ Worker 1 │ │ Worker 2 │ │ Worker N │ - │ + Router│ │ + Router│ │ + Router│ - └────┬─────┘ └────┬─────┘ └────┬─────┘ - │ │ │ - └────────────┼────────────┘ + └──┬───▲───┘ └──┬───▲───┘ └──┬───▲───┘ + │ │ │ │ │ │ + └───┴────────┴───┴────────┴───┘ + PostingRouter peer-to-peer + (remote append / head sync / + merge hints, by hash owner) + │ ▼ ┌───────────────────┐ │ Shared TiKV raft │ N PDs (one raft group) + @@ -45,15 +48,19 @@ below). └───────────────────┘ ``` -- **Driver** (node 0): builds the index, sends Search/Insert/Stop commands via - TCP dispatch. -- **Workers** (nodes 1..N): receive commands, execute their shard locally, - report results back over the dispatch channel. +- **Driver** (node 0): also runs as **worker 0**. On top of the worker role, + it owns the dispatcher: builds the initial index, then broadcasts + Search/Insert/Stop commands to the other workers over TCP dispatch. +- **Workers** (nodes 0..N-1): each owns a shard of the head index by hash. + Workers talk to each other peer-to-peer through PostingRouter for remote + append, head sync, and merge hints — there is no driver-mediated forwarding. + On each `DispatchCommand` they execute the local part of the request and + report status back to the dispatcher. - **Shared TiKV cluster**: every node runs a PD + TiKV container; all PDs join one raft group, all TiKVs point to all PDs. PD routes each key to the store that owns its region. -- **PostingRouter**: hash-based head routing, remote append, head sync, - dispatch protocol. 
+- **PostingRouter**: hash-based head routing, remote append, head sync, and + the TCP dispatch transport used by the dispatcher. ## TiKV deployment model @@ -213,10 +220,14 @@ What `run` does: all postings straight to TiKV via PD-routed RPCs — there is no need for a distributed build phase. 2. **Distribute**: rsync head index + perftest files from driver to each worker. -3. **Workers**: SSH-launches `SPTAGTest` on each worker with `WORKER_INDEX=i` - and the per-node ini (router enabled, `Rebuild=false`). -4. **Driver**: relaunches `SPTAGTest` with router enabled, `Rebuild=false`. - The driver dispatches Insert / Search commands across batches via TCP. +3. **Workers**: SSH-launches `SPTAGTest` on each remote worker (nodes 1..N-1) + with `WORKER_INDEX=i` and the per-node ini (router enabled, + `Rebuild=false`). Workers wire PostingRouter so they can reach every peer + directly for remote append / head sync. +4. **Driver**: relaunches `SPTAGTest` on node 0 with router enabled, + `Rebuild=false`. The same process acts as **worker 0** (owns its hash + shard like any other worker) **and** as the dispatcher (broadcasts Insert + / Search / Stop over TCP and waits for status replies). 5. **Collect**: driver sends Stop, joins worker logs into `benchmark_logs/`. > The "build on the driver, then distribute and run" split is a workaround: From 6cf7d36e922d01a86163377a1bbc5cdc3f07f6e8 Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 10:10:26 +0000 Subject: [PATCH 11/12] README: drop unused TiKV pre-split helper section We never actually ran the pre-split/scatter helper in our benchmark runs. Keeping it in the doc gives the false impression that it's part of the recommended setup. Remove the whole section. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- evaluation/distributed/README.md | 34 -------------------------------- 1 file changed, 34 deletions(-) diff --git a/evaluation/distributed/README.md b/evaluation/distributed/README.md index 2b9c0950e..7b2234908 100644 --- a/evaluation/distributed/README.md +++ b/evaluation/distributed/README.md @@ -167,40 +167,6 @@ curl -s "http://10.0.1.1:23791/pd/api/v1/stores" \ # Expected: ['Up', 'Up'] (one entry per TiKV store). ``` -### Pre-split & scatter (optional but recommended) - -For the insert-dominant workload, pre-split the keyspace so writes spread -evenly across regions and stores. Boundaries derive from -`DBKey(headID) = MaxID*layer + headID` little-endian byte 0; the TiKV raw key -is `TiKVKeyPrefix + "_" + uint32_le(DBKey)`. We split *only* on the head-key -prefix so all chunk/count variants for a head share a region. Used split -points: `0x02, 0x04, …, 0xfe` (127 split points → 128 regions). - -Since the cluster is shared, run the helper **once** against any PD endpoint: - -```bash -PREFIX="bench_insert_dominant_2node" # keep in sync with KEY_PREFIX in run_distributed.sh -PD="http://10.0.1.1:23791" -PDCTL=(docker run --rm --network host --entrypoint /pd-ctl pingcap/pd:v8.5.1 -u "$PD") -python3 - "$PREFIX" "${PDCTL[@]}" <<'PY' -import json, subprocess, sys -prefix = sys.argv[1].encode() + b'_' -pdctl = sys.argv[2:] -def run(args): return subprocess.check_output(pdctl + args, text=True) -def region_for(hex_key): return json.loads(run(['region', 'key', '--format=hex', hex_key]))['id'] -for b in range(2, 256, 2): - key = (prefix + bytes([b, 0, 0, 0])).hex() - rid = region_for(key) - run(['operator', 'add', 'split-region', str(rid), '--policy=usekey', '--keys', key]) -for r in json.loads(run(['region', 'scan']))['regions']: - run(['operator', 'add', 'scatter-region', str(r['id'])]) -PY -``` - -Skip this on the very first run if you don't have load skew — `start-tikv` -works without it. 
For 1B-scale insert-dominant runs it materially reduces -head-region hot-spotting. - ## Step 4 — Run the benchmark ```bash From 07bdc03a6b1c3e89944da005d96cc073b733acfd Mon Sep 17 00:00:00 2001 From: TerrenceZhangX <39916879+TerrenceZhangX@users.noreply.github.com> Date: Wed, 20 May 2026 10:11:38 +0000 Subject: [PATCH 12/12] Clean comment --- AnnService/inc/Core/Common/FineGrainedLock.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/AnnService/inc/Core/Common/FineGrainedLock.h b/AnnService/inc/Core/Common/FineGrainedLock.h index 5cfad7ac6..1f7d1eab4 100644 --- a/AnnService/inc/Core/Common/FineGrainedLock.h +++ b/AnnService/inc/Core/Common/FineGrainedLock.h @@ -56,10 +56,6 @@ namespace SPTAG return GetLock(idx); } - // Per-posting lock identity. Two indices share a lock iff they are - // the same posting, so external callers can use `hash_func(a) == - // hash_func(b)` as a self-lock guard (e.g. in Split, to skip - // re-locking the same head VID). static inline unsigned hash_func(unsigned idx) { return idx;