Skip to content

PR 6: GraphRAG - Native Entity Extraction #7

Description

@gilmanb1

Overview

Implement native entity extraction using Apple's NaturalLanguage framework (NLTagger). This is the first component of the GraphRAG system and has no dependencies on other new code.

Dependencies

None - Uses only built-in Apple frameworks. Can be developed in parallel with LLM provider PRs.

Files to Create

| File | Action | Description |
| --- | --- | --- |
| Sources/SortAI/Core/GraphRAG/NativeEntityExtractor.swift | Create | Entity extraction using NLTagger |
| Sources/SortAI/Core/GraphRAG/EntityTypes.swift | Create | Entity type definitions |

Implementation Details

1. EntityType Enum

/// The categories an extracted span of text can be labelled with.
///
/// Each case carries a `String` raw value, so the type is `Codable` and can be
/// round-tripped through its raw string.
enum EntityType: String, CaseIterable, Codable, Sendable {
    case person = "PersonalName"
    case organization = "OrganizationName"
    case location = "PlaceName"
    case date = "Date"
    case keyword = "Keyword"

    /// Creates the entity type that corresponds to a NaturalLanguage tag.
    ///
    /// Only the three name tags (person, organization, place) have a
    /// counterpart; every other tag makes the initializer fail.
    ///
    /// - Parameter tag: The tag reported by the tagger for a token.
    init?(tag: NLTag) {
        if tag == .personalName {
            self = .person
        } else if tag == .organizationName {
            self = .organization
        } else if tag == .placeName {
            self = .location
        } else {
            return nil
        }
    }

    /// A short, human-readable label for the case.
    var displayName: String {
        let label: String
        switch self {
        case .person:
            label = "Person"
        case .organization:
            label = "Organization"
        case .location:
            label = "Location"
        case .date:
            label = "Date"
        case .keyword:
            label = "Keyword"
        }
        return label
    }
}

2. ExtractedEntity Struct

/// A single piece of text that was recognised as an entity.
///
/// Equality and hashing are compiler-synthesized, so two values are equal only
/// when all five stored properties match.
struct ExtractedEntity: Hashable, Sendable {
    /// The matched text.
    let text: String
    /// The category assigned to `text`.
    let type: EntityType
    /// A score for the match; the initializer defaults it to 0.8.
    let confidence: Double
    /// Offset where the match begins in the source text (0 when not supplied).
    let startIndex: Int
    /// Offset where the match ends in the source text (0 when not supplied).
    let endIndex: Int

    /// Creates an entity; everything except `text` and `type` is optional.
    ///
    /// - Parameters:
    ///   - text: The matched text.
    ///   - type: The category of the match.
    ///   - confidence: Score for the match. Defaults to 0.8.
    ///   - startIndex: Start offset of the match. Defaults to 0.
    ///   - endIndex: End offset of the match. Defaults to 0.
    init(
        text: String,
        type: EntityType,
        confidence: Double = 0.8,
        startIndex: Int = 0,
        endIndex: Int = 0
    ) {
        self.text = text
        self.type = type
        self.confidence = confidence
        self.startIndex = startIndex
        self.endIndex = endIndex
    }
}

3. NativeEntityExtractor

import NaturalLanguage

/// Extracts named entities, keywords, and dates from plain text using
/// `NLTagger` and `NSDataDetector`.
///
/// The type is an actor, so the single `NLTagger` it owns is only used by one
/// call at a time.
actor NativeEntityExtractor {
    /// Tagger configured for both schemes used below (name type and lexical class).
    private let tagger: NLTagger

    init() {
        self.tagger = NLTagger(tagSchemes: [.nameType, .lexicalClass])
    }

    /// Extracts person, organization, and place names from `text`.
    ///
    /// - Parameter text: The text to analyse.
    /// - Returns: One entity per distinct (case-insensitive text, type) pair, in
    ///   order of first appearance. `startIndex`/`endIndex` are `Character`
    ///   offsets from the start of `text`; `confidence` is a fixed 0.8.
    func extractEntities(from text: String) -> [ExtractedEntity] {
        var entities: [ExtractedEntity] = []

        tagger.string = text

        let options: NLTagger.Options = [
            .omitWhitespace,
            .omitPunctuation,
            .joinNames  // Combine multi-word names
        ]

        tagger.enumerateTags(
            in: text.startIndex..<text.endIndex,
            unit: .word,
            scheme: .nameType,
            options: options
        ) { tag, tokenRange in
            // Tokens without a tag, or with a tag that has no EntityType
            // counterpart, are skipped; returning true keeps enumerating.
            guard let tag = tag,
                  let entityType = EntityType(tag: tag) else {
                return true
            }

            let entityText = String(text[tokenRange])

            // Skip very short entities (likely false positives)
            guard entityText.count >= 2 else { return true }

            let entity = ExtractedEntity(
                text: entityText,
                type: entityType,
                confidence: 0.8,
                startIndex: text.distance(from: text.startIndex, to: tokenRange.lowerBound),
                endIndex: text.distance(from: text.startIndex, to: tokenRange.upperBound)
            )
            entities.append(entity)

            return true
        }

        // Deduplicate entities
        return deduplicateEntities(entities)
    }

    /// Extracts the most frequent nouns and verbs from `text` as keywords.
    ///
    /// Words are lowercased before counting. Words shorter than four characters
    /// and words in `stopWords` are ignored.
    ///
    /// - Parameters:
    ///   - text: The text to analyse.
    ///   - limit: Maximum number of keywords to return. Values of zero or less
    ///     yield an empty array.
    /// - Returns: Up to `limit` keywords ordered by descending frequency, with
    ///   ties broken alphabetically so the result is stable across runs.
    ///   `confidence` is `count / 10`, capped at 1.0; positions are left at 0.
    func extractKeywords(from text: String, limit: Int = 20) -> [ExtractedEntity] {
        // `prefix(_:)` traps on a negative length, so bail out early.
        guard limit > 0 else { return [] }

        var frequencies: [String: Int] = [:]

        tagger.string = text

        let options: NLTagger.Options = [
            .omitWhitespace,
            .omitPunctuation
        ]

        tagger.enumerateTags(
            in: text.startIndex..<text.endIndex,
            unit: .word,
            scheme: .lexicalClass,
            options: options
        ) { tag, tokenRange in
            guard tag == .noun || tag == .verb else { return true }

            let word = String(text[tokenRange]).lowercased()

            // Skip common words and short words
            guard word.count >= 4,
                  !Self.stopWords.contains(word) else {
                return true
            }

            frequencies[word, default: 0] += 1
            return true
        }

        // Sort by frequency and take the top keywords. Dictionary iteration
        // order is not stable, so equal counts are ordered by the word itself;
        // otherwise the words that survive `prefix(limit)` could change
        // between runs.
        return frequencies
            .sorted { lhs, rhs in
                if lhs.value != rhs.value { return lhs.value > rhs.value }
                return lhs.key < rhs.key
            }
            .prefix(limit)
            .map { word, count in
                ExtractedEntity(
                    text: word,
                    type: .keyword,
                    confidence: min(1.0, Double(count) / 10.0)
                )
            }
    }

    /// Extracts date expressions from `text` using `NSDataDetector`.
    ///
    /// - Parameter text: The text to analyse.
    /// - Returns: One `.date` entity per match, in match order, with a fixed
    ///   confidence of 0.9 and `Character` offsets into `text`. Returns an
    ///   empty array if the detector cannot be created.
    func extractDates(from text: String) -> [ExtractedEntity] {
        guard let detector = try? NSDataDetector(
            types: NSTextCheckingResult.CheckingType.date.rawValue
        ) else {
            return []
        }

        var dates: [ExtractedEntity] = []

        detector.enumerateMatches(
            in: text,
            options: [],
            range: NSRange(text.startIndex..., in: text)
        ) { result, _, _ in
            // Matches whose NSRange cannot be mapped back to `text` are skipped.
            guard let result = result,
                  let range = Range(result.range, in: text) else { return }

            let dateText = String(text[range])
            let entity = ExtractedEntity(
                text: dateText,
                type: .date,
                confidence: 0.9,
                startIndex: text.distance(from: text.startIndex, to: range.lowerBound),
                endIndex: text.distance(from: text.startIndex, to: range.upperBound)
            )
            dates.append(entity)
        }

        return dates
    }

    /// Runs name, date, and keyword extraction (in that order) and merges the
    /// results.
    ///
    /// - Parameter text: The text to analyse.
    /// - Returns: The combined entities with duplicates removed; keyword
    ///   extraction uses its default limit.
    func extractAll(from text: String) -> [ExtractedEntity] {
        var all: [ExtractedEntity] = []

        all.append(contentsOf: extractEntities(from: text))
        all.append(contentsOf: extractDates(from: text))
        all.append(contentsOf: extractKeywords(from: text))

        return deduplicateEntities(all)
    }

    // MARK: - Private

    /// Removes repeated entities, keeping the first occurrence of each.
    ///
    /// Two entities count as the same when their lowercased text and their type
    /// match; confidence and position are not part of the comparison.
    private func deduplicateEntities(_ entities: [ExtractedEntity]) -> [ExtractedEntity] {
        var seen = Set<String>()
        return entities.filter { entity in
            let key = "\(entity.text.lowercased()):\(entity.type.rawValue)"
            // `insert` reports whether the key was new, so one call both checks and records.
            return seen.insert(key).inserted
        }
    }

    /// Common words excluded from keyword extraction.
    ///
    /// Entries shorter than four characters are never consulted, because
    /// `extractKeywords` rejects words on length first.
    private static let stopWords: Set<String> = [
        "the", "and", "for", "that", "this", "with", "from", "have", "been",
        "were", "are", "was", "has", "had", "will", "would", "could", "should",
        "their", "there", "they", "what", "when", "where", "which", "while"
    ]
}

Performance

Based on prototype testing, NLTagger is extremely fast:

  • 1500x faster than Apple Intelligence for entity extraction
  • Processing 1000 words: ~0.6ms

This makes it ideal for preprocessing before LLM refinement.

Acceptance Criteria

  • Extracts person names correctly
  • Extracts organization names correctly
  • Extracts location names correctly
  • Extracts dates using NSDataDetector
  • Extracts keywords using lexical class tagging
  • Deduplicates extracted entities
  • Filters out stop words
  • Returns entities with position information
  • Thread-safe (actor-based)

Testing

/// Verifies that a two-word personal name comes back as one `.person` entity.
func testPersonExtraction() async {
    let sentence = "Tim Cook announced the new iPhone at Apple Park."
    let extractor = NativeEntityExtractor()

    let extracted = await extractor.extractEntities(from: sentence)

    // Same check as filtering by type first and then looking for the text.
    let foundTimCook = extracted.contains { candidate in
        candidate.type == .person && candidate.text == "Tim Cook"
    }
    XCTAssertTrue(foundTimCook)
}

/// Verifies that each company named in the sentence is reported as an
/// `.organization` entity.
func testOrganizationExtraction() async {
    let sentence = "Microsoft and Google are competing with Apple."
    let extractor = NativeEntityExtractor()

    let extracted = await extractor.extractEntities(from: sentence)

    // Reduce to the organization names once, then check each expected name.
    let organizationNames = extracted
        .filter { $0.type == .organization }
        .map { $0.text }
    XCTAssertTrue(organizationNames.contains("Microsoft"))
    XCTAssertTrue(organizationNames.contains("Google"))
    XCTAssertTrue(organizationNames.contains("Apple"))
}

/// Verifies that a single written-out date is detected.
///
/// The first element is obtained with `XCTUnwrap` rather than `entities[0]`:
/// `XCTAssertEqual` does not stop the test on failure, so subscripting an
/// empty result would crash the test process instead of reporting a failure.
func testDateExtraction() async throws {
    let extractor = NativeEntityExtractor()
    let text = "The meeting is scheduled for January 15, 2026."

    let entities = await extractor.extractDates(from: text)

    XCTAssertEqual(entities.count, 1)
    let firstDate = try XCTUnwrap(entities.first)
    XCTAssertTrue(firstDate.text.contains("January 15"))
}

/// Verifies that frequent nouns are returned as lowercased keywords.
///
/// `extractKeywords` only counts tokens tagged as nouns or verbs, so the
/// expectations use nouns from the sample text ("revenue", "report") rather
/// than the adjective "financial", which the extractor filters out.
func testKeywordExtraction() async {
    let extractor = NativeEntityExtractor()
    let text = """
    The quarterly financial report shows strong revenue growth.
    Revenue exceeded expectations due to increased sales volume.
    """

    let keywords = await extractor.extractKeywords(from: text)

    XCTAssertTrue(keywords.contains { $0.text == "revenue" })
    XCTAssertTrue(keywords.contains { $0.text == "report" })
}

/// Verifies that repeated mentions of the same entity are collapsed.
///
/// The extractor treats two entities as duplicates when their lowercased text
/// and type match, so "Apple Inc." and "Apple" may both legitimately appear.
/// The test therefore checks uniqueness of that (text, type) key instead of
/// counting every entity whose text merely contains "apple".
func testDeduplication() async {
    let extractor = NativeEntityExtractor()
    let text = "Apple Inc. announced that Apple will release new products."

    let entities = await extractor.extractEntities(from: text)

    // Guard against a vacuous pass when nothing was extracted at all.
    let appleEntities = entities.filter { $0.text.lowercased().contains("apple") }
    XCTAssertFalse(appleEntities.isEmpty)

    let keys = entities.map { "\($0.text.lowercased()):\($0.type.rawValue)" }
    XCTAssertEqual(Set(keys).count, keys.count, "Should deduplicate Apple")
}

Estimated Size

~150 lines of code

Risk Assessment

Low - Uses stable NaturalLanguage framework with no external dependencies.

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancement (New feature or request) · graphrag (GraphRAG knowledge graph features) · phase-2 (Phase 2 - Parallel development)

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions