Skip to content

PR 8: GraphRAG - FAISS Vector Store #9

Description

@gilmanb1

Overview

Implement a vector store for fast approximate nearest neighbor search. This provides the optional FAISS backend with an in-memory fallback for systems where FAISS is not available.

Dependencies

Files to Create

File Action Description
Sources/SortAI/Core/GraphRAG/FAISSVectorStore.swift Create FAISS wrapper + in-memory fallback
Sources/SortAI/Core/GraphRAG/VectorStoreProtocol.swift Create Protocol definition

Implementation Details

1. VectorStore Protocol

/// Protocol for vector similarity search backends
protocol VectorStore: Sendable {
    /// Dimension of vectors stored
    var dimension: Int { get }
    
    /// Number of vectors in the store
    var count: Int { get async }
    
    /// Add a vector with associated ID
    func add(id: String, vector: [Float]) async throws
    
    /// Add multiple vectors
    func addBatch(items: [(id: String, vector: [Float])]) async throws
    
    /// Search for k nearest neighbors
    func search(query: [Float], k: Int) async throws -> [(id: String, distance: Float)]
    
    /// Remove a vector by ID
    func remove(id: String) async throws
    
    /// Clear all vectors
    func clear() async throws
    
    /// Persist to disk
    func save(to path: URL) async throws
    
    /// Load from disk
    func load(from path: URL) async throws
}

2. InMemoryVectorStore (Default/Fallback)

import Accelerate

/// Simple in-memory vector store using brute-force search
actor InMemoryVectorStore: VectorStore {
    let dimension: Int
    
    private var vectors: [[Float]] = []
    private var ids: [String] = []
    private var idToIndex: [String: Int] = [:]
    
    var count: Int { vectors.count }
    
    init(dimension: Int) {
        self.dimension = dimension
    }
    
    func add(id: String, vector: [Float]) async throws {
        guard vector.count == dimension else {
            throw VectorStoreError.dimensionMismatch(expected: dimension, got: vector.count)
        }
        
        // Update if exists, otherwise append
        if let index = idToIndex[id] {
            vectors[index] = vector
        } else {
            let index = vectors.count
            vectors.append(vector)
            ids.append(id)
            idToIndex[id] = index
        }
    }
    
    func addBatch(items: [(id: String, vector: [Float])]) async throws {
        for (id, vector) in items {
            try await add(id: id, vector: vector)
        }
    }
    
    func search(query: [Float], k: Int) async throws -> [(id: String, distance: Float)] {
        guard query.count == dimension else {
            throw VectorStoreError.dimensionMismatch(expected: dimension, got: query.count)
        }
        
        guard !vectors.isEmpty else { return [] }
        
        // Calculate distances to all vectors
        var distances: [(Int, Float)] = []
        
        for (index, vector) in vectors.enumerated() {
            let distance = euclideanDistance(query, vector)
            distances.append((index, distance))
        }
        
        // Sort by distance and take top k
        distances.sort { $0.1 < $1.1 }
        
        return distances.prefix(k).map { (index, distance) in
            (ids[index], distance)
        }
    }
    
    func remove(id: String) async throws {
        guard let index = idToIndex[id] else { return }
        
        // Remove and update indices
        vectors.remove(at: index)
        ids.remove(at: index)
        idToIndex.removeValue(forKey: id)
        
        // Rebuild index map
        idToIndex = Dictionary(uniqueKeysWithValues: ids.enumerated().map { ($1, $0) })
    }
    
    func clear() async throws {
        vectors = []
        ids = []
        idToIndex = [:]
    }
    
    func save(to path: URL) async throws {
        let data = VectorStoreData(
            dimension: dimension,
            vectors: vectors,
            ids: ids
        )
        let encoded = try JSONEncoder().encode(data)
        try encoded.write(to: path)
    }
    
    func load(from path: URL) async throws {
        let data = try Data(contentsOf: path)
        let decoded = try JSONDecoder().decode(VectorStoreData.self, from: data)
        
        guard decoded.dimension == dimension else {
            throw VectorStoreError.dimensionMismatch(expected: dimension, got: decoded.dimension)
        }
        
        self.vectors = decoded.vectors
        self.ids = decoded.ids
        self.idToIndex = Dictionary(uniqueKeysWithValues: ids.enumerated().map { ($1, $0) })
    }
    
    // MARK: - Distance Calculation
    
    private func euclideanDistance(_ a: [Float], _ b: [Float]) -> Float {
        var diff = [Float](repeating: 0, count: a.count)
        vDSP_vsub(b, 1, a, 1, &diff, 1, vDSP_Length(a.count))
        
        var sumSquares: Float = 0
        vDSP_dotpr(diff, 1, diff, 1, &sumSquares, vDSP_Length(diff.count))
        
        return sqrt(sumSquares)
    }
}

private struct VectorStoreData: Codable {
    let dimension: Int
    let vectors: [[Float]]
    let ids: [String]
}

3. FAISSVectorStore (Optional)

#if canImport(FAISS)
import FAISS

/// FAISS-backed vector store for large-scale similarity search
actor FAISSVectorStore: VectorStore {
    let dimension: Int
    
    private var index: OpaquePointer?
    private var idMap: [Int64: String] = [:]
    private var stringToId: [String: Int64] = [:]
    private var nextId: Int64 = 0
    
    var count: Int { Int(faiss_index_ntotal(index)) }
    
    init(dimension: Int, indexType: String = "Flat") {
        self.dimension = dimension
        self.index = faiss_index_factory(Int32(dimension), indexType, METRIC_L2)
    }
    
    deinit {
        if let index = index {
            faiss_index_free(index)
        }
    }
    
    func add(id: String, vector: [Float]) async throws {
        guard vector.count == dimension else {
            throw VectorStoreError.dimensionMismatch(expected: dimension, got: vector.count)
        }
        
        let internalId = nextId
        nextId += 1
        
        idMap[internalId] = id
        stringToId[id] = internalId
        
        var mutableVector = vector
        faiss_index_add(index, 1, &mutableVector)
    }
    
    func search(query: [Float], k: Int) async throws -> [(id: String, distance: Float)] {
        guard query.count == dimension else {
            throw VectorStoreError.dimensionMismatch(expected: dimension, got: query.count)
        }
        
        var distances = [Float](repeating: 0, count: k)
        var indices = [Int64](repeating: -1, count: k)
        var mutableQuery = query
        
        faiss_index_search(index, 1, &mutableQuery, Int32(k), &distances, &indices)
        
        var results: [(String, Float)] = []
        for i in 0..<k {
            if indices[i] >= 0, let id = idMap[indices[i]] {
                results.append((id, distances[i]))
            }
        }
        return results
    }
    
    // ... other methods
}
#endif

4. VectorStoreFactory

/// Factory for creating the appropriate vector store
enum VectorStoreFactory {
    static func create(dimension: Int) -> any VectorStore {
        #if canImport(FAISS)
        return FAISSVectorStore(dimension: dimension)
        #else
        return InMemoryVectorStore(dimension: dimension)
        #endif
    }
    
    static func createInMemory(dimension: Int) -> InMemoryVectorStore {
        return InMemoryVectorStore(dimension: dimension)
    }
}

5. Error Types

enum VectorStoreError: LocalizedError {
    case dimensionMismatch(expected: Int, got: Int)
    case notFound(String)
    case persistenceFailed(Error)
    case loadFailed(Error)
    
    var errorDescription: String? {
        switch self {
        case .dimensionMismatch(let expected, let got):
            return "Vector dimension mismatch: expected \(expected), got \(got)"
        case .notFound(let id):
            return "Vector not found: \(id)"
        case .persistenceFailed(let error):
            return "Failed to save vector store: \(error.localizedDescription)"
        case .loadFailed(let error):
            return "Failed to load vector store: \(error.localizedDescription)"
        }
    }
}

Performance Characteristics

Store Type Add Search (k=10) Memory Best For
InMemory O(1) O(n) High < 100K vectors
FAISS Flat O(1) O(n) Medium < 1M vectors
FAISS IVF O(1) O(sqrt(n)) Low > 1M vectors

For SortAI's typical use case (< 100K documents), the in-memory store is sufficient and has zero dependencies.

Acceptance Criteria

  • InMemoryVectorStore works without FAISS dependency
  • Protocol allows swapping implementations
  • Dimension validation on add/search
  • Correct k-nearest neighbor results
  • Persistence to/from disk
  • Batch add support
  • Remove by ID support
  • Uses Accelerate for fast distance calculation

Testing

func testVectorStoreAdd() async throws {
    let store = InMemoryVectorStore(dimension: 512)
    let vector = [Float](repeating: 0.1, count: 512)
    
    try await store.add(id: "doc1", vector: vector)
    
    let count = await store.count
    XCTAssertEqual(count, 1)
}

func testVectorStoreSearch() async throws {
    let store = InMemoryVectorStore(dimension: 128)
    
    // Add vectors
    let vec1 = [Float](repeating: 0.1, count: 128)
    let vec2 = [Float](repeating: 0.9, count: 128)
    try await store.add(id: "doc1", vector: vec1)
    try await store.add(id: "doc2", vector: vec2)
    
    // Search for similar to vec1
    let query = [Float](repeating: 0.15, count: 128)
    let results = try await store.search(query: query, k: 2)
    
    XCTAssertEqual(results.first?.id, "doc1", "doc1 should be closest")
}

func testDimensionMismatch() async {
    let store = InMemoryVectorStore(dimension: 512)
    let wrongDimVector = [Float](repeating: 0.1, count: 256)
    
    do {
        try await store.add(id: "test", vector: wrongDimVector)
        XCTFail("Should throw dimension mismatch error")
    } catch VectorStoreError.dimensionMismatch {
        // Expected
    } catch {
        XCTFail("Wrong error type: \(error)")
    }
}

func testPersistence() async throws {
    let store = InMemoryVectorStore(dimension: 128)
    let vector = [Float](repeating: 0.5, count: 128)
    try await store.add(id: "doc1", vector: vector)
    
    // Save
    let tempURL = FileManager.default.temporaryDirectory.appendingPathComponent("test_vectors.json")
    try await store.save(to: tempURL)
    
    // Load into new store
    let store2 = InMemoryVectorStore(dimension: 128)
    try await store2.load(from: tempURL)
    
    let count = await store2.count
    XCTAssertEqual(count, 1)
    
    // Cleanup
    try FileManager.default.removeItem(at: tempURL)
}

Estimated Size

~200 lines of code (InMemory implementation + protocol)

Risk Assessment

Medium - FAISS native bridging is complex. Mitigation: in-memory fallback always available, no hard dependency on FAISS.

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancementNew feature or requestgraphragGraphRAG knowledge graph featuresphase-3Phase 3 - Integration

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions