Skip to content

PR 7: GraphRAG - Native Embedding Service #8

Description

@gilmanb1

Overview

Implement native embedding generation using Apple's NaturalLanguage framework (NLEmbedding). This service generates vector representations of text for similarity search.

Dependencies

None - Uses only built-in Apple frameworks. Can be developed in parallel with other GraphRAG PRs.

Files to Create

| File | Action | Description |
| --- | --- | --- |
| Sources/SortAI/Core/GraphRAG/AppleNLEmbeddingService.swift | Create | Embedding service |

Implementation Details

AppleNLEmbeddingService

import NaturalLanguage
import Accelerate

/// Generates vector representations of text from Apple's NaturalLanguage word embeddings.
///
/// Document embeddings are built by tokenising the text into lowercased words, looking each
/// word up in the loaded `NLEmbedding`, and averaging (optionally with weights) the vectors
/// that were found. Words without a vector are skipped. Every embedding-producing method
/// returns `nil` when no embedding is available for the configured language or when none of
/// the words in the input has a vector.
actor AppleNLEmbeddingService {
    // MARK: - Properties

    /// Word embedding for the configured language; `nil` when `NLEmbedding` provides none.
    private let embedding: NLEmbedding?

    /// Dimension reported by the loaded embedding, or `standardDimension` when none was loaded.
    private let dimension: Int

    /// Fallback dimension reported by `embeddingDimension` when no embedding could be loaded.
    ///
    /// NOTE(review): a loaded embedding's own `dimension` always takes precedence over this
    /// value. Confirm that 512 matches what the framework reports for the target language
    /// before relying on it elsewhere (tests, storage schemas).
    static let standardDimension = 512

    // MARK: - Initialization

    /// Creates a service backed by the word embedding for `language`.
    ///
    /// - Parameter language: Language whose word embedding should be loaded. If no embedding
    ///   is available, the service is still created but every lookup returns `nil`.
    init(language: NLLanguage = .english) {
        let loaded = NLEmbedding.wordEmbedding(for: language)
        self.embedding = loaded
        self.dimension = loaded?.dimension ?? Self.standardDimension
    }

    /// Number of components in the vectors produced by this service.
    var embeddingDimension: Int { dimension }

    // MARK: - Single Word Embedding

    /// Returns the embedding vector for a single word.
    ///
    /// - Parameter word: Word to look up; it is lowercased before the lookup.
    /// - Returns: The word's vector converted to `Float`, or `nil` when no embedding is loaded
    ///   or the word has no vector.
    func wordVector(for word: String) -> [Float]? {
        embedding?.vector(for: word.lowercased())?.map { Float($0) }
    }

    // MARK: - Document Embedding

    /// Generates a document embedding by averaging the vectors of its words (bag-of-words).
    ///
    /// - Parameter text: Text to embed.
    /// - Returns: The unweighted mean of all word vectors found, or `nil` when no embedding is
    ///   loaded or no word in `text` has a vector (including empty text).
    func generateEmbedding(for text: String) -> [Float]? {
        guard let embedding = embedding else { return nil }

        let vectors = lowercasedWords(in: text).compactMap { embedding.vector(for: $0) }
        guard !vectors.isEmpty else { return nil }

        return averageVectors(vectors)
    }

    /// Generates a document embedding as a weighted average of its word vectors.
    ///
    /// Each occurrence of a word contributes its vector multiplied by the word's weight.
    ///
    /// - Parameters:
    ///   - text: Text to embed.
    ///   - weights: Optional per-word weights keyed by the *lowercased* word. A word missing
    ///     from the dictionary (or every word, when `weights` is `nil`) falls back to its
    ///     term frequency in `text` divided by the highest term frequency in `text`.
    /// - Returns: The weighted mean vector, or `nil` when no embedding is loaded or no word in
    ///   `text` has a vector. If the weights do not sum to a positive value, the unweighted
    ///   mean is returned instead.
    func generateWeightedEmbedding(
        for text: String,
        weights: [String: Double]? = nil
    ) -> [Float]? {
        guard let embedding = embedding else { return nil }

        // Tokenise once and reuse the result for both the frequency count and the lookup.
        let tokens = lowercasedWords(in: text)
        let counts = tokens.reduce(into: [String: Int]()) { $0[$1, default: 0] += 1 }
        let maxCount = Double(counts.values.max() ?? 1)

        var weightedVectors: [([Double], Double)] = []
        for token in tokens {
            guard let vector = embedding.vector(for: token) else { continue }
            // Caller-supplied weight wins; otherwise use normalised term frequency.
            let weight = weights?[token] ?? (Double(counts[token] ?? 1) / maxCount)
            weightedVectors.append((vector, weight))
        }

        guard !weightedVectors.isEmpty else { return nil }

        return weightedAverageVectors(weightedVectors)
    }

    // MARK: - Similarity

    /// Calculates the cosine similarity between two embeddings.
    ///
    /// - Returns: `dot(a, b) / (|a| * |b|)`, or `0` when the inputs are empty, differ in
    ///   length, or either has zero magnitude.
    func cosineSimilarity(_ a: [Float], _ b: [Float]) -> Float {
        guard a.count == b.count, !a.isEmpty else { return 0 }

        let length = vDSP_Length(a.count)
        var dotProduct: Float = 0
        var squaredNormA: Float = 0
        var squaredNormB: Float = 0

        vDSP_dotpr(a, 1, b, 1, &dotProduct, length)
        vDSP_dotpr(a, 1, a, 1, &squaredNormA, length)
        vDSP_dotpr(b, 1, b, 1, &squaredNormB, length)

        let denominator = sqrt(squaredNormA) * sqrt(squaredNormB)
        return denominator > 0 ? dotProduct / denominator : 0
    }

    /// Finds the nearest neighbours of a word in the embedding space.
    ///
    /// - Parameters:
    ///   - word: Word to look up; it is lowercased before the lookup.
    ///   - k: Maximum number of neighbours to return.
    /// - Returns: Neighbouring words paired with the distance reported by `NLEmbedding`, or
    ///   `nil` when no embedding is loaded.
    func findSimilarWords(to word: String, k: Int = 10) -> [(String, Double)]? {
        embedding?.neighbors(for: word.lowercased(), maximumCount: k)
    }

    // MARK: - Batch Processing

    /// Generates an embedding for each text, preserving input order.
    ///
    /// - Returns: One entry per input text; an entry is `nil` where `generateEmbedding(for:)`
    ///   returns `nil` for that text.
    func generateEmbeddings(for texts: [String]) -> [[Float]?] {
        texts.map { generateEmbedding(for: $0) }
    }

    // MARK: - Private Helpers

    /// Splits `text` into word tokens (whitespace and punctuation omitted) and lowercases them.
    private func lowercasedWords(in text: String) -> [String] {
        let tagger = NLTagger(tagSchemes: [.tokenType])
        tagger.string = text

        var tokens: [String] = []
        tagger.enumerateTags(
            in: text.startIndex..<text.endIndex,
            unit: .word,
            scheme: .tokenType,
            options: [.omitWhitespace, .omitPunctuation]
        ) { _, range in
            tokens.append(text[range].lowercased())
            return true
        }
        return tokens
    }

    /// Returns the component-wise mean of `vectors`, or an empty array when there are none.
    private func averageVectors(_ vectors: [[Double]]) -> [Float] {
        guard let first = vectors.first else { return [] }

        var sums = [Double](repeating: 0, count: first.count)
        for vector in vectors {
            for (index, component) in zip(sums.indices, vector) {
                sums[index] += component
            }
        }

        let count = Double(vectors.count)
        return sums.map { Float($0 / count) }
    }

    /// Returns the weighted component-wise mean of `(vector, weight)` pairs.
    ///
    /// Falls back to the unweighted mean when the weights do not sum to a positive value,
    /// which avoids dividing by zero.
    private func weightedAverageVectors(_ vectors: [([Double], Double)]) -> [Float] {
        guard let first = vectors.first else { return [] }

        var sums = [Double](repeating: 0, count: first.0.count)
        var totalWeight: Double = 0

        for (vector, weight) in vectors {
            totalWeight += weight
            for (index, component) in zip(sums.indices, vector) {
                sums[index] += component * weight
            }
        }

        guard totalWeight > 0 else {
            return averageVectors(vectors.map { $0.0 })
        }

        return sums.map { Float($0 / totalWeight) }
    }

    // MARK: - Normalization

    /// Scales a vector to unit length (L2 norm).
    ///
    /// - Returns: The vector divided by its L2 norm, or the input unchanged when its norm is
    ///   zero (including the empty vector).
    func normalize(_ vector: [Float]) -> [Float] {
        let length = vDSP_Length(vector.count)

        var squaredNorm: Float = 0
        vDSP_dotpr(vector, 1, vector, 1, &squaredNorm, length)

        var norm = sqrt(squaredNorm)
        guard norm > 0 else { return vector }

        var normalized = [Float](repeating: 0, count: vector.count)
        vDSP_vsdiv(vector, 1, &norm, &normalized, 1, length)

        return normalized
    }
}

Design Decisions

1. Bag-of-Words Averaging

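This service builds on NLEmbedding's word-level embeddings (`NLEmbedding.wordEmbedding(for:)`) and averages the word vectors to create document embeddings. NLEmbedding also offers sentence embeddings (`sentenceEmbedding(for:)`) on newer OS releases, but this service does not use them. The bag-of-words approach is simple but effective for our use case.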

2. 512-Dimension Embeddings

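The service uses 512 as its standard (fallback) dimension. The actual size is fixed by the framework and reported at runtime by `NLEmbedding`'s `dimension` property, which the service prefers whenever an embedding is loaded. Confirm that the English word embedding really reports 512 before hard-coding that number in tests or storage schemas.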

3. Accelerate Framework

Using vDSP functions for vector operations (dot product, normalization) provides hardware-accelerated performance on Apple Silicon.

4. Weighted Averaging

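Optional term weighting allows more important words to contribute more to the final embedding. By default each word is weighted by its normalized term frequency (its count divided by the highest word count in the text); callers can instead supply their own per-word weights, such as TF-IDF scores.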

Acceptance Criteria

  • Generates embeddings for single words
  • Generates document embeddings via word averaging
  • Supports weighted embedding generation
  • Calculates cosine similarity correctly
  • Normalizes embeddings to unit length
  • Uses Accelerate framework for performance
  • Handles missing words gracefully
  • Returns consistent dimension (512)

Testing

/// Verifies that the service reports the embedding dimension required by the spec.
func testEmbeddingDimension() async {
    let service = AppleNLEmbeddingService()

    // XCTest assertion arguments are non-async autoclosures, so the actor-isolated
    // property has to be awaited before it is handed to the assertion.
    let dimension = await service.embeddingDimension

    // NOTE(review): 512 comes from the acceptance criteria; confirm it matches the value
    // the framework reports for the English word embedding.
    XCTAssertEqual(dimension, 512)
}

/// Verifies that a common word has a vector whose length matches the reported dimension.
func testWordEmbedding() async throws {
    let service = AppleNLEmbeddingService()
    let expectedDimension = await service.embeddingDimension

    // Await first: XCTUnwrap's argument is a non-async autoclosure.
    let lookup = await service.wordVector(for: "computer")
    let vector = try XCTUnwrap(lookup, "Expected a vector for a common English word")

    // Compare against the service's own dimension instead of a magic number so this test
    // checks vector/dimension consistency; the absolute value is pinned by
    // testEmbeddingDimension.
    XCTAssertEqual(vector.count, expectedDimension)
}

/// Verifies that a sentence produces a document embedding of the reported dimension.
func testDocumentEmbedding() async throws {
    let service = AppleNLEmbeddingService()
    let text = "This is a test document about computers and technology."
    let expectedDimension = await service.embeddingDimension

    // Await first: XCTUnwrap's argument is a non-async autoclosure.
    let generated = await service.generateEmbedding(for: text)
    let embedding = try XCTUnwrap(generated, "Expected an embedding for English prose")

    // Averaging word vectors must not change the vector length.
    XCTAssertEqual(embedding.count, expectedDimension)
}

/// Verifies that topically related texts score higher than unrelated ones.
func testCosineSimilarity() async throws {
    let service = AppleNLEmbeddingService()

    let text1 = "Financial quarterly report revenue"
    let text2 = "Quarterly financial earnings report"
    let text3 = "Cat playing with a ball of yarn"

    // Unwrap with XCTUnwrap rather than `!` so a missing embedding fails this test
    // instead of crashing the whole test run. The awaits happen outside the
    // assertions because XCTest's autoclosures are not async.
    let generated1 = await service.generateEmbedding(for: text1)
    let generated2 = await service.generateEmbedding(for: text2)
    let generated3 = await service.generateEmbedding(for: text3)

    let emb1 = try XCTUnwrap(generated1)
    let emb2 = try XCTUnwrap(generated2)
    let emb3 = try XCTUnwrap(generated3)

    let sim12 = await service.cosineSimilarity(emb1, emb2)
    let sim13 = await service.cosineSimilarity(emb1, emb3)

    XCTAssertGreaterThan(sim12, sim13, "Similar texts should have higher similarity")
}

/// Verifies that `normalize` returns a vector of unit L2 length.
func testNormalization() async throws {
    let service = AppleNLEmbeddingService()

    // Unwrap with XCTUnwrap rather than `!` so a missing embedding fails this test
    // instead of crashing the whole test run.
    let generated = await service.generateEmbedding(for: "test document")
    let embedding = try XCTUnwrap(generated)
    let normalized = await service.normalize(embedding)

    // The dot product of a vector with itself is its squared L2 norm.
    var squaredNorm: Float = 0
    vDSP_dotpr(normalized, 1, normalized, 1, &squaredNorm, vDSP_Length(normalized.count))

    // L2 norm should be ~1.0
    XCTAssertEqual(sqrt(squaredNorm), 1.0, accuracy: 0.001)
}

/// An empty input contains no words to embed, so the service must produce no embedding.
func testEmptyTextHandling() async {
    let emptyInput = ""
    let result = await AppleNLEmbeddingService().generateEmbedding(for: emptyInput)

    XCTAssertNil(result, "Empty text should return nil")
}

Estimated Size

~150 lines of code

Risk Assessment

Low - Uses stable NaturalLanguage and Accelerate frameworks. No external dependencies.

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancement (New feature or request) · graphrag (GraphRAG knowledge graph features) · phase-2 (Phase 2 - Parallel development)

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions