Overview
Implement native embedding generation using Apple's NaturalLanguage framework (NLEmbedding). This service generates vector representations of text for similarity search.
Dependencies
None - Uses only built-in Apple frameworks. Can be developed in parallel with other GraphRAG PRs.
Files to Create
| File | Action | Description |
| --- | --- | --- |
| Sources/SortAI/Core/GraphRAG/AppleNLEmbeddingService.swift | Create | Embedding service |
Implementation Details
AppleNLEmbeddingService
import NaturalLanguage
import Accelerate
/// Generates vector representations of text from Apple's `NLEmbedding` word
/// embeddings, for use in similarity search.
///
/// Document embeddings are built by averaging (optionally weighting) the
/// vectors of the words found in the text. Every method that needs the model
/// returns `nil` when no embedding could be loaded for the requested language.
actor AppleNLEmbeddingService {

    // MARK: - Properties

    /// Word embedding model for the configured language, or `nil` when
    /// `NLEmbedding.wordEmbedding(for:)` returned none.
    private let embedding: NLEmbedding?

    /// Dimension reported by `embedding`, or `standardDimension` when no model
    /// was loaded.
    private let dimension: Int

    /// Fallback dimension reported when no embedding model could be loaded.
    ///
    /// NOTE(review): the real dimension comes from `NLEmbedding.dimension` and
    /// may differ from this value (the English word model is believed to be
    /// 300-dimensional; 512 matches the sentence embedding) — confirm on the
    /// target OS before relying on this constant.
    static let standardDimension = 512

    // MARK: - Initialization

    /// Loads the word embedding for `language`.
    ///
    /// - Parameter language: Language whose word embedding should be used.
    init(language: NLLanguage = .english) {
        // Read the dimension from the local so `self` is not touched while it
        // is still being initialized.
        let model = NLEmbedding.wordEmbedding(for: language)
        self.embedding = model
        self.dimension = model?.dimension ?? Self.standardDimension
    }

    /// Dimension of the loaded model, or `standardDimension` when none is loaded.
    var embeddingDimension: Int { dimension }

    // MARK: - Single Word Embedding

    /// Returns the embedding of a single word.
    ///
    /// - Parameter word: Word to look up; it is lowercased before the lookup.
    /// - Returns: The word's vector converted to `Float`, or `nil` when no
    ///   model is loaded or the model has no vector for the word.
    func wordVector(for word: String) -> [Float]? {
        guard let embedding = embedding,
              let vector = embedding.vector(for: word.lowercased()) else {
            return nil
        }
        return vector.map { Float($0) }
    }

    // MARK: - Document Embedding

    /// Generates a document embedding by averaging word vectors (bag-of-words).
    ///
    /// - Parameter text: Text to embed.
    /// - Returns: The element-wise mean of the vectors of every word occurrence
    ///   the model knows, or `nil` when no model is loaded or no word of the
    ///   text has a vector (including empty text).
    func generateEmbedding(for text: String) -> [Float]? {
        guard let embedding = embedding else { return nil }

        let vectors = lowercasedWords(in: text).compactMap { embedding.vector(for: $0) }
        guard !vectors.isEmpty else { return nil }

        return averageVectors(vectors)
    }

    /// Generates a weighted document embedding (TF-IDF style).
    ///
    /// Each word occurrence contributes its vector multiplied by a weight:
    /// `weights[word]` when provided, otherwise the word's count in `text`
    /// divided by the highest count of any word in `text`. Because every
    /// occurrence is added separately, a repeated word's total contribution
    /// under the default weighting grows with the square of its frequency.
    ///
    /// - Parameters:
    ///   - text: Text to embed.
    ///   - weights: Optional per-word weights, keyed by lowercased word.
    /// - Returns: The weighted mean vector, or `nil` when no model is loaded or
    ///   no word of the text has a vector.
    func generateWeightedEmbedding(
        for text: String,
        weights: [String: Double]? = nil
    ) -> [Float]? {
        guard let embedding = embedding else { return nil }

        // Tokenize once; the counts and the weighted vectors both derive from
        // the same word list.
        let words = lowercasedWords(in: text)
        let wordCounts = words.reduce(into: [String: Int]()) { counts, word in
            counts[word, default: 0] += 1
        }
        let maxCount = Double(wordCounts.values.max() ?? 1)

        var weightedVectors: [([Double], Double)] = []
        weightedVectors.reserveCapacity(words.count)
        for word in words {
            guard let vector = embedding.vector(for: word) else { continue }
            // Use provided weights or default to TF normalization.
            let weight = weights?[word] ?? (Double(wordCounts[word] ?? 1) / maxCount)
            weightedVectors.append((vector, weight))
        }
        guard !weightedVectors.isEmpty else { return nil }

        return weightedAverageVectors(weightedVectors)
    }

    // MARK: - Similarity

    /// Calculates the cosine similarity between two embeddings.
    ///
    /// - Returns: The cosine of the angle between `a` and `b`, clamped to
    ///   `[-1, 1]` when finite. Returns `0` when the vectors differ in length,
    ///   are empty, or the product of their norms is not greater than zero.
    func cosineSimilarity(_ a: [Float], _ b: [Float]) -> Float {
        guard a.count == b.count, !a.isEmpty else { return 0 }

        let length = vDSP_Length(a.count)
        var dotProduct: Float = 0
        var squaredNormA: Float = 0
        var squaredNormB: Float = 0
        vDSP_dotpr(a, 1, b, 1, &dotProduct, length)
        vDSP_dotpr(a, 1, a, 1, &squaredNormA, length)
        vDSP_dotpr(b, 1, b, 1, &squaredNormB, length)

        let denominator = squaredNormA.squareRoot() * squaredNormB.squareRoot()
        guard denominator > 0 else { return 0 }

        let similarity = dotProduct / denominator
        // Non-finite results (overflowing inputs) are passed through unchanged.
        guard similarity.isFinite else { return similarity }
        // Float rounding can push the quotient marginally outside [-1, 1];
        // clamp so callers always receive a valid cosine.
        return min(1, max(-1, similarity))
    }

    /// Finds the nearest neighbors of a word in the embedding space.
    ///
    /// - Parameters:
    ///   - word: Query word; it is lowercased before the lookup.
    ///   - k: Maximum number of neighbors to return.
    /// - Returns: The pairs reported by `NLEmbedding.neighbors(for:maximumCount:)`,
    ///   or `nil` when no model is loaded.
    func findSimilarWords(to word: String, k: Int = 10) -> [(String, Double)]? {
        guard let embedding = embedding else { return nil }
        return embedding.neighbors(for: word.lowercased(), maximumCount: k)
    }

    // MARK: - Batch Processing

    /// Generates embeddings for multiple texts.
    ///
    /// - Returns: One entry per input text, in order; an entry is `nil` when
    ///   `generateEmbedding(for:)` returns `nil` for that text.
    func generateEmbeddings(for texts: [String]) -> [[Float]?] {
        return texts.map { generateEmbedding(for: $0) }
    }

    // MARK: - Private Helpers

    /// Splits `text` into word tokens (whitespace and punctuation omitted) and
    /// lowercases each one, preserving order and repetitions.
    private func lowercasedWords(in text: String) -> [String] {
        let tagger = NLTagger(tagSchemes: [.tokenType])
        tagger.string = text

        var words: [String] = []
        tagger.enumerateTags(
            in: text.startIndex..<text.endIndex,
            unit: .word,
            scheme: .tokenType,
            options: [.omitWhitespace, .omitPunctuation]
        ) { _, range in
            words.append(text[range].lowercased())
            return true
        }
        return words
    }

    /// Element-wise mean of `vectors`, converted to `Float`.
    /// Returns an empty array for empty input instead of trapping.
    private func averageVectors(_ vectors: [[Double]]) -> [Float] {
        guard let first = vectors.first else { return [] }

        var sums = [Double](repeating: 0, count: first.count)
        for vector in vectors {
            for i in sums.indices {
                sums[i] += vector[i]
            }
        }

        let count = Double(vectors.count)
        return sums.map { Float($0 / count) }
    }

    /// Element-wise weighted mean of `(vector, weight)` pairs, converted to
    /// `Float`. Falls back to the plain mean when the total weight is not
    /// positive, and returns an empty array for empty input.
    private func weightedAverageVectors(_ vectors: [([Double], Double)]) -> [Float] {
        guard let first = vectors.first else { return [] }

        var sums = [Double](repeating: 0, count: first.0.count)
        var totalWeight: Double = 0
        for (vector, weight) in vectors {
            totalWeight += weight
            for i in sums.indices {
                sums[i] += vector[i] * weight
            }
        }

        // A non-positive total cannot be divided by meaningfully.
        guard totalWeight > 0 else {
            return averageVectors(vectors.map { $0.0 })
        }
        return sums.map { Float($0 / totalWeight) }
    }

    // MARK: - Normalization

    /// Normalizes an embedding to unit length (L2 norm).
    ///
    /// - Returns: `embedding` divided by its L2 norm, or `embedding` unchanged
    ///   when the norm is not greater than zero (including empty input).
    func normalize(_ embedding: [Float]) -> [Float] {
        let length = vDSP_Length(embedding.count)
        var squaredNorm: Float = 0
        vDSP_dotpr(embedding, 1, embedding, 1, &squaredNorm, length)

        var norm = squaredNorm.squareRoot()
        guard norm > 0 else { return embedding }

        var normalized = [Float](repeating: 0, count: embedding.count)
        vDSP_vsdiv(embedding, 1, &norm, &normalized, 1, length)
        return normalized
    }
}
Design Decisions
1. Bag-of-Words Averaging
Apple's NLEmbedding provides word-level embeddings, not sentence embeddings. We average word vectors to create document embeddings. This is simple but effective for our use case.
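2. Framework-Defined Embedding Dimension
The embedding dimension is fixed by the framework for a given language model. The service reads it from `NLEmbedding.dimension` at initialization and uses 512 only as a fallback when no model is available. Do not assume 512 for word vectors: Apple's English word embedding is believed to be 300-dimensional (512 is the sentence-embedding size), so confirm the value on the target OS.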
3. Accelerate Framework
Using vDSP functions for vector operations (dot product, normalization) provides hardware-accelerated performance on Apple Silicon.
4. Weighted Averaging
Optional TF-IDF-style weighting allows more important words to contribute more to the final embedding.
Acceptance Criteria
Testing
/// The reported dimension must be positive and must match the length of the
/// vectors the service actually returns.
///
/// The value is not hard-coded because it is defined by the framework's model.
func testEmbeddingDimension() async throws {
    let service = AppleNLEmbeddingService()

    // `await` cannot appear inside XCTest's non-async autoclosures, so every
    // actor call is hoisted into a local first.
    let dimension = await service.embeddingDimension
    let lookup = await service.wordVector(for: "computer")
    let vector = try XCTUnwrap(lookup)

    XCTAssertGreaterThan(dimension, 0)
    XCTAssertEqual(dimension, vector.count)
}
/// A common English word must have a vector whose length equals the service's
/// reported embedding dimension.
func testWordEmbedding() async throws {
    let service = AppleNLEmbeddingService()

    let expectedDimension = await service.embeddingDimension
    let lookup = await service.wordVector(for: "computer")

    // Unwrap so a missing vector fails this test instead of leaving the count
    // assertion to compare against `nil`.
    let vector = try XCTUnwrap(lookup)
    XCTAssertEqual(vector.count, expectedDimension)
}
/// A sentence of known words must produce a document embedding whose length
/// equals the service's reported embedding dimension.
func testDocumentEmbedding() async throws {
    let service = AppleNLEmbeddingService()
    let text = "This is a test document about computers and technology."

    let expectedDimension = await service.embeddingDimension
    let result = await service.generateEmbedding(for: text)

    let embedding = try XCTUnwrap(result)
    XCTAssertEqual(embedding.count, expectedDimension)
}
/// Two texts about the same topic must be closer to each other than to a
/// text about an unrelated topic.
func testCosineSimilarity() async throws {
    let service = AppleNLEmbeddingService()
    let text1 = "Financial quarterly report revenue"
    let text2 = "Quarterly financial earnings report"
    let text3 = "Cat playing with a ball of yarn"

    // XCTUnwrap turns a missing embedding into a test failure; a force unwrap
    // would crash the whole test process.
    let result1 = await service.generateEmbedding(for: text1)
    let result2 = await service.generateEmbedding(for: text2)
    let result3 = await service.generateEmbedding(for: text3)
    let emb1 = try XCTUnwrap(result1)
    let emb2 = try XCTUnwrap(result2)
    let emb3 = try XCTUnwrap(result3)

    let sim12 = await service.cosineSimilarity(emb1, emb2)
    let sim13 = await service.cosineSimilarity(emb1, emb3)
    XCTAssertGreaterThan(sim12, sim13, "Similar texts should have higher similarity")
}
/// A normalized embedding must have an L2 norm of approximately 1.
func testNormalization() async throws {
    let service = AppleNLEmbeddingService()

    // XCTUnwrap reports a missing embedding as a failure instead of crashing.
    let result = await service.generateEmbedding(for: "test document")
    let embedding = try XCTUnwrap(result)
    let normalized = await service.normalize(embedding)

    // The dot product of the vector with itself is its squared L2 norm.
    var squaredNorm: Float = 0
    vDSP_dotpr(normalized, 1, normalized, 1, &squaredNorm, vDSP_Length(normalized.count))
    XCTAssertEqual(squaredNorm.squareRoot(), 1.0, accuracy: 0.001)
}
/// Empty input contains no words, so the service must not produce an embedding.
func testEmptyTextHandling() async {
    let emptyInput = ""
    let result = await AppleNLEmbeddingService().generateEmbedding(for: emptyInput)
    XCTAssertNil(result, "Empty text should return nil")
}
Estimated Size
~150 lines of code
Risk Assessment
Low - Uses stable NaturalLanguage and Accelerate frameworks. No external dependencies.
Overview
Implement native embedding generation using Apple's NaturalLanguage framework (NLEmbedding). This service generates vector representations of text for similarity search.
Dependencies
None - Uses only built-in Apple frameworks. Can be developed in parallel with other GraphRAG PRs.
Files to Create
Sources/SortAI/Core/GraphRAG/AppleNLEmbeddingService.swift
Implementation Details
AppleNLEmbeddingService
Design Decisions
1. Bag-of-Words Averaging
Apple's NLEmbedding provides word-level embeddings, not sentence embeddings. We average word vectors to create document embeddings. This is simple but effective for our use case.
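2. Framework-Defined Embedding Dimension
The embedding dimension is fixed by the framework for a given language model. The service reads it from `NLEmbedding.dimension` at initialization and uses 512 only as a fallback when no model is available. Do not assume 512 for word vectors: Apple's English word embedding is believed to be 300-dimensional (512 is the sentence-embedding size), so confirm the value on the target OS.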
3. Accelerate Framework
Using vDSP functions for vector operations (dot product, normalization) provides hardware-accelerated performance on Apple Silicon.
4. Weighted Averaging
Optional TF-IDF-style weighting allows more important words to contribute more to the final embedding.
Acceptance Criteria
Testing
Estimated Size
~150 lines of code
Risk Assessment
Low - Uses stable NaturalLanguage and Accelerate frameworks. No external dependencies.