# PR 6: GraphRAG - Native Entity Extraction

## Overview

Implement native entity extraction using Apple's NaturalLanguage framework (NLTagger). This is the first component of the GraphRAG system and has no dependencies on other new code.

## Dependencies

**None** - Uses only built-in Apple frameworks. Can be developed in parallel with LLM provider PRs.

## Files to Create

| File | Action | Description |
|------|--------|-------------|
| `Sources/SortAI/Core/GraphRAG/NativeEntityExtractor.swift` | Create | Entity extraction using NLTagger |
| `Sources/SortAI/Core/GraphRAG/EntityTypes.swift` | Create | Entity type definitions |

## Implementation Details

### 1. EntityType Enum

```swift
/// The categories of entity that extraction can report.
///
/// Each case carries a `String` raw value, which is also the form used when the
/// type is encoded or decoded through `Codable`.
enum EntityType: String, CaseIterable, Codable, Sendable {
    case person = "PersonalName"
    case organization = "OrganizationName"
    case location = "PlaceName"
    case date = "Date"
    case keyword = "Keyword"

    /// Maps a tagger result onto an entity type.
    ///
    /// Only personal-name, organization-name and place-name tags are recognised;
    /// every other tag makes the initializer fail.
    ///
    /// - Parameter tag: The `NLTag` reported by the tagger.
    init?(tag: NLTag) {
        if tag == .personalName {
            self = .person
        } else if tag == .organizationName {
            self = .organization
        } else if tag == .placeName {
            self = .location
        } else {
            return nil
        }
    }

    /// Human-readable label for this entity type.
    var displayName: String {
        let label: String
        switch self {
        case .person:
            label = "Person"
        case .organization:
            label = "Organization"
        case .location:
            label = "Location"
        case .date:
            label = "Date"
        case .keyword:
            label = "Keyword"
        }
        return label
    }
}
```

### 2. ExtractedEntity Struct

```swift
/// One entity found in a piece of text, together with its category, a
/// confidence score and its position.
struct ExtractedEntity: Hashable, Sendable {
    /// The entity text as supplied by the producer.
    let text: String
    /// The category the entity was classified as.
    let type: EntityType
    /// Confidence score assigned by the producer.
    let confidence: Double
    /// Offset at which the entity starts; `0` when no position was supplied.
    let startIndex: Int
    /// Offset at which the entity ends; `0` when no position was supplied.
    let endIndex: Int

    /// Creates an entity.
    ///
    /// - Parameters:
    ///   - text: The entity text.
    ///   - type: The entity category.
    ///   - confidence: Confidence score; defaults to `0.8`.
    ///   - startIndex: Start offset; defaults to `0`.
    ///   - endIndex: End offset; defaults to `0`.
    init(
        text: String,
        type: EntityType,
        confidence: Double = 0.8,
        startIndex: Int = 0,
        endIndex: Int = 0
    ) {
        self.startIndex = startIndex
        self.endIndex = endIndex
        self.confidence = confidence
        self.type = type
        self.text = text
    }
}
```

### 3. NativeEntityExtractor

```swift
import NaturalLanguage

/// Extracts named entities, keywords and dates from plain text using
/// `NLTagger` and `NSDataDetector`.
///
/// The type is an actor because it owns a single `NLTagger` whose `string` is
/// reassigned on every call; actor isolation keeps two extractions from
/// interleaving on that shared tagger.
actor NativeEntityExtractor {
    /// Tagger shared by the named-entity and keyword passes.
    private let tagger: NLTagger

    /// Date detector, built once and reused across calls. `nil` only if
    /// construction failed, in which case `extractDates(from:)` returns no results.
    private let dateDetector: NSDataDetector?

    init() {
        self.tagger = NLTagger(tagSchemes: [.nameType, .lexicalClass])
        self.dateDetector = try? NSDataDetector(
            types: NSTextCheckingResult.CheckingType.date.rawValue
        )
    }

    /// Extracts people, organizations and places from `text`.
    ///
    /// - Parameter text: The text to analyse.
    /// - Returns: The recognised entities in order of first appearance, with
    ///   case-insensitive duplicates of the same type removed. `startIndex` and
    ///   `endIndex` are `Character` offsets from the start of `text`.
    func extractEntities(from text: String) -> [ExtractedEntity] {
        var entities: [ExtractedEntity] = []

        tagger.string = text

        let options: NLTagger.Options = [
            .omitWhitespace,
            .omitPunctuation,
            .joinNames  // Combine multi-word names into a single token
        ]

        tagger.enumerateTags(
            in: text.startIndex..<text.endIndex,
            unit: .word,
            scheme: .nameType,
            options: options
        ) { tag, tokenRange in
            // Tokens without a name tag, or with a tag we do not model, are skipped.
            guard let tag = tag,
                  let entityType = EntityType(tag: tag) else {
                return true
            }

            let entityText = String(text[tokenRange])

            // Skip very short entities (likely false positives)
            guard entityText.count >= 2 else { return true }

            entities.append(
                ExtractedEntity(
                    text: entityText,
                    type: entityType,
                    confidence: 0.8,
                    startIndex: text.distance(from: text.startIndex, to: tokenRange.lowerBound),
                    endIndex: text.distance(from: text.startIndex, to: tokenRange.upperBound)
                )
            )

            return true
        }

        return deduplicateEntities(entities)
    }

    /// Extracts the most frequent nouns and verbs from `text` as keywords.
    ///
    /// Words are lowercased before counting. Words shorter than four characters
    /// and words in the stop-word list are ignored.
    ///
    /// - Parameters:
    ///   - text: The text to analyse.
    ///   - limit: Maximum number of keywords to return. Zero or a negative value
    ///     yields an empty result.
    /// - Returns: Keywords ordered by descending frequency, ties broken
    ///   alphabetically so the result is the same on every run. Confidence is
    ///   `count / 10`, capped at `1.0`. Keywords carry no position (offsets are `0`).
    func extractKeywords(from text: String, limit: Int = 20) -> [ExtractedEntity] {
        // `prefix(_:)` traps on a negative length, so reject non-positive limits up front.
        guard limit > 0 else { return [] }

        var frequencies: [String: Int] = [:]

        tagger.string = text

        let options: NLTagger.Options = [
            .omitWhitespace,
            .omitPunctuation
        ]

        tagger.enumerateTags(
            in: text.startIndex..<text.endIndex,
            unit: .word,
            scheme: .lexicalClass,
            options: options
        ) { tag, tokenRange in
            guard tag == .noun || tag == .verb else { return true }

            let word = String(text[tokenRange]).lowercased()

            // Skip common words and short words
            guard word.count >= 4,
                  !Self.stopWords.contains(word) else {
                return true
            }

            frequencies[word, default: 0] += 1
            return true
        }

        // Dictionary iteration order varies between runs, so sorting on the count
        // alone would make both the order of ties and the `limit` cut-off
        // non-deterministic. The word itself is used as a stable tie-breaker.
        return frequencies
            .sorted { lhs, rhs in
                if lhs.value != rhs.value { return lhs.value > rhs.value }
                return lhs.key < rhs.key
            }
            .prefix(limit)
            .map { word, count in
                ExtractedEntity(
                    text: word,
                    type: .keyword,
                    confidence: min(1.0, Double(count) / 10.0)
                )
            }
    }

    /// Extracts date expressions from `text` using `NSDataDetector`.
    ///
    /// - Parameter text: The text to analyse.
    /// - Returns: One `.date` entity per detected match, in match order, with
    ///   `Character` offsets into `text`. Empty if the detector could not be created.
    func extractDates(from text: String) -> [ExtractedEntity] {
        guard let detector = dateDetector else { return [] }

        var dates: [ExtractedEntity] = []

        detector.enumerateMatches(
            in: text,
            options: [],
            range: NSRange(text.startIndex..., in: text)
        ) { result, _, _ in
            // Matches whose NSRange cannot be mapped back onto `text` are skipped.
            guard let result = result,
                  let range = Range(result.range, in: text) else { return }

            dates.append(
                ExtractedEntity(
                    text: String(text[range]),
                    type: .date,
                    confidence: 0.9,
                    startIndex: text.distance(from: text.startIndex, to: range.lowerBound),
                    endIndex: text.distance(from: text.startIndex, to: range.upperBound)
                )
            )
        }

        return dates
    }

    /// Runs every extractor over `text`.
    ///
    /// - Parameter text: The text to analyse.
    /// - Returns: Named entities, then dates, then keywords, with duplicates removed.
    func extractAll(from text: String) -> [ExtractedEntity] {
        var all: [ExtractedEntity] = []

        all.append(contentsOf: extractEntities(from: text))
        all.append(contentsOf: extractDates(from: text))
        all.append(contentsOf: extractKeywords(from: text))

        return deduplicateEntities(all)
    }

    // MARK: - Private

    /// Removes repeated entities, keeping the first occurrence of each.
    ///
    /// Two entities count as duplicates when their lowercased text and their
    /// type are equal; confidence and position are not compared.
    private func deduplicateEntities(_ entities: [ExtractedEntity]) -> [ExtractedEntity] {
        var seen = Set<String>()
        return entities.filter { entity in
            let key = "\(entity.text.lowercased()):\(entity.type.rawValue)"
            // `insert` reports whether the key was new, so one call both tests and records it.
            return seen.insert(key).inserted
        }
    }

    /// Lowercased words that are never reported as keywords.
    private static let stopWords: Set<String> = [
        "the", "and", "for", "that", "this", "with", "from", "have", "been",
        "were", "are", "was", "has", "had", "will", "would", "could", "should",
        "their", "there", "they", "what", "when", "where", "which", "while"
    ]
}
```

## Performance

Based on prototype testing, NLTagger is extremely fast:
- **1500x faster** than Apple Intelligence for entity extraction
- Processing 1000 words: ~0.6ms

This makes it ideal for preprocessing before LLM refinement.

## Acceptance Criteria

- [ ] Extracts person names correctly
- [ ] Extracts organization names correctly
- [ ] Extracts location names correctly
- [ ] Extracts dates using NSDataDetector
- [ ] Extracts keywords using lexical class tagging
- [ ] Deduplicates extracted entities
- [ ] Filters out stop words
- [ ] Returns entities with position information
- [ ] Thread-safe (actor-based)

## Testing

```swift
/// Checks that a two-word personal name is reported as a `.person` entity.
func testPersonExtraction() async {
    let sentence = "Tim Cook announced the new iPhone at Apple Park."
    let extractor = NativeEntityExtractor()

    let found = await extractor.extractEntities(from: sentence)

    let hasTimCook = found.contains { entity in
        entity.type == .person && entity.text == "Tim Cook"
    }
    XCTAssertTrue(hasTimCook)
}

/// Checks that each company named in the sentence is reported as an
/// `.organization` entity.
func testOrganizationExtraction() async {
    let sentence = "Microsoft and Google are competing with Apple."
    let extractor = NativeEntityExtractor()

    let found = await extractor.extractEntities(from: sentence)
    let organizationNames = found
        .filter { $0.type == .organization }
        .map { $0.text }

    for expected in ["Microsoft", "Google", "Apple"] {
        XCTAssertTrue(organizationNames.contains(expected))
    }
}

/// Checks that a single written-out date is detected and its text preserved.
func testDateExtraction() async throws {
    let extractor = NativeEntityExtractor()
    let text = "The meeting is scheduled for January 15, 2026."

    let entities = await extractor.extractDates(from: text)

    XCTAssertEqual(entities.count, 1)
    // Unwrap rather than subscript: `entities[0]` would trap and abort the whole
    // test process if nothing was detected, instead of recording a failure.
    let first = try XCTUnwrap(entities.first)
    XCTAssertTrue(first.text.contains("January 15"))
}

/// Checks that frequent and salient nouns are reported as keywords.
///
/// `extractKeywords(from:limit:)` keeps only tokens tagged as nouns or verbs,
/// so the expectations are limited to nouns; an adjective such as "financial"
/// is not a keyword under that rule.
func testKeywordExtraction() async {
    let extractor = NativeEntityExtractor()
    let text = """
    The quarterly financial report shows strong revenue growth.
    Revenue exceeded expectations due to increased sales volume.
    """

    let keywords = await extractor.extractKeywords(from: text)

    // Keywords are lowercased, so "Revenue" and "revenue" count as the same word.
    XCTAssertTrue(keywords.contains { $0.text == "revenue" })
    XCTAssertTrue(keywords.contains { $0.text == "growth" })
}

/// Checks that repeated mentions of the same entity collapse into one result.
///
/// Deduplication compares the lowercased entity text and its type exactly, so
/// the input repeats the same surface form ("Apple") and the assertion matches
/// on equality. Variants such as "Apple Inc." are distinct entities under that
/// rule and would not be merged.
func testDeduplication() async {
    let extractor = NativeEntityExtractor()
    let text = "Apple announced that Apple will release new products."

    let entities = await extractor.extractEntities(from: text)

    let appleEntities = entities.filter {
        $0.type == .organization && $0.text.lowercased() == "apple"
    }
    XCTAssertEqual(appleEntities.count, 1, "Should deduplicate Apple")
}
```

## Estimated Size

~150 lines of code

## Risk Assessment

**Low** - Uses stable NaturalLanguage framework with no external dependencies.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

PR 6: GraphRAG - Native Entity Extraction #7

Overview

Dependencies

Files to Create

Implementation Details

1. EntityType Enum

2. ExtractedEntity Struct

3. NativeEntityExtractor

Performance

Acceptance Criteria

Testing

Estimated Size

Risk Assessment

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

File	Action	Description
`Sources/SortAI/Core/GraphRAG/NativeEntityExtractor.swift`	Create	Entity extraction using NLTagger
`Sources/SortAI/Core/GraphRAG/EntityTypes.swift`	Create	Entity type definitions

PR 6: GraphRAG - Native Entity Extraction #7

Description

Overview

Dependencies

Files to Create

Implementation Details

1. EntityType Enum

2. ExtractedEntity Struct

3. NativeEntityExtractor

Performance

Acceptance Criteria

Testing

Estimated Size

Risk Assessment

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions