Overview
Implement native entity extraction using Apple's NaturalLanguage framework (NLTagger). This is the first component of the GraphRAG system and has no dependencies on other new code.
Dependencies
None - Uses only built-in Apple frameworks. Can be developed in parallel with LLM provider PRs.
Files to Create
| File | Action | Description |
|------|--------|-------------|
| Sources/SortAI/Core/GraphRAG/NativeEntityExtractor.swift | Create | Entity extraction using NLTagger |
| Sources/SortAI/Core/GraphRAG/EntityTypes.swift | Create | Entity type definitions |
Implementation Details
1. EntityType Enum
/// The categories of entity that extraction can produce.
///
/// The first three cases correspond to `NLTagger` name-type tags; `date` and
/// `keyword` have no name-type tag and are never produced by `init?(tag:)`.
enum EntityType: String, CaseIterable, Codable, Sendable {
    case person = "PersonalName"
    case organization = "OrganizationName"
    case location = "PlaceName"
    case date = "Date"
    case keyword = "Keyword"

    /// Name-type tags that have an `EntityType` counterpart.
    private static let nameTagLookup: [NLTag: EntityType] = [
        .personalName: .person,
        .organizationName: .organization,
        .placeName: .location
    ]

    /// Creates an entity type from a tagger result.
    ///
    /// - Parameter tag: A tag reported by `NLTagger`.
    /// - Returns: `nil` for any tag other than `.personalName`,
    ///   `.organizationName`, or `.placeName`.
    init?(tag: NLTag) {
        guard let mapped = Self.nameTagLookup[tag] else {
            return nil
        }
        self = mapped
    }

    /// A short human-readable label for this entity type.
    var displayName: String {
        switch self {
        case .person:
            return "Person"
        case .organization:
            return "Organization"
        case .location:
            return "Location"
        case .date:
            return "Date"
        case .keyword:
            return "Keyword"
        }
    }
}
2. ExtractedEntity Struct
/// A single entity found in a piece of text.
///
/// Two values are equal (and hash alike) only when every stored property
/// matches, because `Hashable` conformance is synthesized.
struct ExtractedEntity: Hashable, Sendable {
    /// The entity's surface text.
    let text: String
    /// The category assigned to the entity.
    let type: EntityType
    /// A score attached by the producer of the entity; defaults to `0.8`.
    let confidence: Double
    /// Offset at which the entity starts in the source text; defaults to `0`.
    let startIndex: Int
    /// Offset at which the entity ends in the source text; defaults to `0`.
    let endIndex: Int

    /// Creates an entity.
    ///
    /// - Parameters:
    ///   - text: The entity's surface text.
    ///   - type: The category of the entity.
    ///   - confidence: Score for the entity (default `0.8`).
    ///   - startIndex: Start offset in the source text (default `0`).
    ///   - endIndex: End offset in the source text (default `0`).
    init(
        text: String,
        type: EntityType,
        confidence: Double = 0.8,
        startIndex: Int = 0,
        endIndex: Int = 0
    ) {
        self.startIndex = startIndex
        self.endIndex = endIndex
        self.confidence = confidence
        self.type = type
        self.text = text
    }
}
3. NativeEntityExtractor
import NaturalLanguage
/// Extracts named entities, dates, and keywords from text using `NLTagger`
/// and `NSDataDetector`.
///
/// The type is an actor because it owns one `NLTagger` whose `string`
/// property is reassigned on every extraction call.
actor NativeEntityExtractor {
    /// Tagger configured for both the name-type and lexical-class schemes.
    private let tagger: NLTagger

    /// Date detector, built once instead of on every `extractDates` call.
    /// `nil` if the detector could not be created, in which case
    /// `extractDates(from:)` returns no results.
    private let dateDetector: NSDataDetector?

    init() {
        self.tagger = NLTagger(tagSchemes: [.nameType, .lexicalClass])
        self.dateDetector = try? NSDataDetector(
            types: NSTextCheckingResult.CheckingType.date.rawValue
        )
    }

    /// Extracts people, organizations, and places from `text`.
    ///
    /// - Parameter text: The text to analyze.
    /// - Returns: Entities in order of first appearance, de-duplicated by
    ///   case-insensitive text and type. Offsets are `Character` distances
    ///   from the start of `text`.
    func extractEntities(from text: String) -> [ExtractedEntity] {
        guard !text.isEmpty else { return [] }

        var entities: [ExtractedEntity] = []
        tagger.string = text
        // Do not keep the (possibly large) input alive after the call.
        defer { tagger.string = nil }

        let options: NLTagger.Options = [
            .omitWhitespace,
            .omitPunctuation,
            .joinNames // Combine multi-word names
        ]

        tagger.enumerateTags(
            in: text.startIndex..<text.endIndex,
            unit: .word,
            scheme: .nameType,
            options: options
        ) { tag, tokenRange in
            guard let tag = tag,
                  let entityType = EntityType(tag: tag) else {
                return true
            }

            let entityText = String(text[tokenRange])
            // Skip very short entities (likely false positives)
            guard entityText.count >= 2 else { return true }

            entities.append(
                ExtractedEntity(
                    text: entityText,
                    type: entityType,
                    confidence: 0.8,
                    startIndex: text.distance(from: text.startIndex, to: tokenRange.lowerBound),
                    endIndex: text.distance(from: text.startIndex, to: tokenRange.upperBound)
                )
            )
            return true
        }

        return deduplicateEntities(entities)
    }

    /// Extracts the most frequent nouns and verbs from `text` as keywords.
    ///
    /// Words are lowercased; words shorter than four characters and stop
    /// words are ignored. Confidence is `count / 10`, capped at `1.0`.
    ///
    /// - Parameters:
    ///   - text: The text to analyze.
    ///   - limit: Maximum number of keywords to return. Values of zero or
    ///     less yield an empty array.
    /// - Returns: Keywords ordered by descending frequency; ties are ordered
    ///   alphabetically so the result is stable from run to run.
    func extractKeywords(from text: String, limit: Int = 20) -> [ExtractedEntity] {
        // `prefix(_:)` traps on a negative length, so reject it up front.
        guard limit > 0, !text.isEmpty else { return [] }

        var frequencies: [String: Int] = [:]
        tagger.string = text
        defer { tagger.string = nil }

        let options: NLTagger.Options = [
            .omitWhitespace,
            .omitPunctuation
        ]

        tagger.enumerateTags(
            in: text.startIndex..<text.endIndex,
            unit: .word,
            scheme: .lexicalClass,
            options: options
        ) { tag, tokenRange in
            guard tag == .noun || tag == .verb else { return true }

            let word = String(text[tokenRange]).lowercased()
            // Skip common words and short words
            guard word.count >= 4,
                  !Self.stopWords.contains(word) else {
                return true
            }

            frequencies[word, default: 0] += 1
            return true
        }

        return frequencies
            .sorted { lhs, rhs in
                // Dictionary order is arbitrary; break frequency ties by word
                // so the selection under `limit` is deterministic.
                if lhs.value != rhs.value { return lhs.value > rhs.value }
                return lhs.key < rhs.key
            }
            .prefix(limit)
            .map { entry in
                ExtractedEntity(
                    text: entry.key,
                    type: .keyword,
                    confidence: min(1.0, Double(entry.value) / 10.0)
                )
            }
    }

    /// Extracts date expressions from `text`.
    ///
    /// - Parameter text: The text to analyze.
    /// - Returns: One `.date` entity per detector match, with confidence
    ///   `0.9` and `Character` offsets into `text`. Empty when the date
    ///   detector is unavailable.
    func extractDates(from text: String) -> [ExtractedEntity] {
        guard let detector = dateDetector else { return [] }

        var dates: [ExtractedEntity] = []
        detector.enumerateMatches(
            in: text,
            options: [],
            range: NSRange(text.startIndex..., in: text)
        ) { result, _, _ in
            guard let result = result,
                  let range = Range(result.range, in: text) else { return }

            dates.append(
                ExtractedEntity(
                    text: String(text[range]),
                    type: .date,
                    confidence: 0.9,
                    startIndex: text.distance(from: text.startIndex, to: range.lowerBound),
                    endIndex: text.distance(from: text.startIndex, to: range.upperBound)
                )
            )
        }
        return dates
    }

    /// Runs named-entity, date, and keyword extraction and merges the results.
    ///
    /// - Parameter text: The text to analyze.
    /// - Returns: Named entities, then dates, then keywords, de-duplicated.
    func extractAll(from text: String) -> [ExtractedEntity] {
        var all: [ExtractedEntity] = []
        all.append(contentsOf: extractEntities(from: text))
        all.append(contentsOf: extractDates(from: text))
        all.append(contentsOf: extractKeywords(from: text))
        return deduplicateEntities(all)
    }

    // MARK: - Private

    /// Keeps the first occurrence of each (lowercased text, type) pair and
    /// preserves the input order.
    private func deduplicateEntities(_ entities: [ExtractedEntity]) -> [ExtractedEntity] {
        var seen = Set<String>()
        return entities.filter { entity in
            let key = "\(entity.text.lowercased()):\(entity.type.rawValue)"
            return seen.insert(key).inserted
        }
    }

    /// Common words excluded from keyword extraction.
    private static let stopWords: Set<String> = [
        "the", "and", "for", "that", "this", "with", "from", "have", "been",
        "were", "are", "was", "has", "had", "will", "would", "could", "should",
        "their", "there", "they", "what", "when", "where", "which", "while"
    ]
}
Performance
Based on prototype testing, NLTagger is extremely fast:
- 1500x faster than Apple Intelligence for entity extraction
- Processing 1000 words: ~0.6ms
This makes it ideal for preprocessing before LLM refinement.
Acceptance Criteria
Testing
/// Checks that a multi-word personal name is reported as a single `.person` entity.
func testPersonExtraction() async {
    let sentence = "Tim Cook announced the new iPhone at Apple Park."
    let found = await NativeEntityExtractor().extractEntities(from: sentence)

    let hasTimCook = found.contains { candidate in
        candidate.type == .person && candidate.text == "Tim Cook"
    }
    XCTAssertTrue(hasTimCook)
}
/// Checks that each company named in the sentence is reported as an `.organization`.
func testOrganizationExtraction() async {
    let sentence = "Microsoft and Google are competing with Apple."
    let found = await NativeEntityExtractor().extractEntities(from: sentence)

    let organizationNames = found
        .filter { $0.type == .organization }
        .map { $0.text }

    // One assertion per expected company, in the order they appear.
    for expectedName in ["Microsoft", "Google", "Apple"] {
        XCTAssertTrue(organizationNames.contains(expectedName))
    }
}
/// Checks that a single written-out date is detected and that its text
/// contains the month and day.
func testDateExtraction() async {
    let extractor = NativeEntityExtractor()
    let text = "The meeting is scheduled for January 15, 2026."
    let entities = await extractor.extractDates(from: text)

    XCTAssertEqual(entities.count, 1)
    // A failed count assertion does not stop the test, so indexing `[0]` on
    // an empty result would trap and take down the whole test run. The
    // failure has already been recorded above; just bail out.
    guard let firstDate = entities.first else { return }
    XCTAssertTrue(firstDate.text.contains("January 15"))
}
/// Checks that frequent nouns in the text are returned as lowercased keywords.
func testKeywordExtraction() async {
    let extractor = NativeEntityExtractor()
    let text = """
    The quarterly financial report shows strong revenue growth.
    Revenue exceeded expectations due to increased sales volume.
    """
    let keywords = await extractor.extractKeywords(from: text)

    XCTAssertTrue(keywords.contains { $0.text == "revenue" })
    // `extractKeywords` keeps only nouns and verbs, so assert on a noun;
    // an adjective such as "financial" is filtered out by design.
    XCTAssertTrue(keywords.contains { $0.text == "growth" })
}
/// Checks that repeated mentions of the same entity collapse into one result.
func testDeduplication() async {
    let extractor = NativeEntityExtractor()
    // De-duplication keys on the exact (case-insensitive) entity text plus
    // its type, so the input repeats the identical surface form. Variants
    // such as "Apple Inc." and "Apple" are distinct keys and would both be kept.
    let text = "Apple announced that Apple will release new products."
    let entities = await extractor.extractEntities(from: text)

    let appleEntities = entities.filter { $0.text.lowercased().contains("apple") }
    XCTAssertEqual(appleEntities.count, 1, "Should deduplicate Apple")
}
Estimated Size
~150 lines of code
Risk Assessment
Low - Uses stable NaturalLanguage framework with no external dependencies.
Overview
Implement native entity extraction using Apple's NaturalLanguage framework (NLTagger). This is the first component of the GraphRAG system and has no dependencies on other new code.
Dependencies
None - Uses only built-in Apple frameworks. Can be developed in parallel with LLM provider PRs.
Files to Create
- Sources/SortAI/Core/GraphRAG/NativeEntityExtractor.swift
- Sources/SortAI/Core/GraphRAG/EntityTypes.swift

Implementation Details
1. EntityType Enum
2. ExtractedEntity Struct
3. NativeEntityExtractor
Performance
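Based on prototype testing, NLTagger is extremely fast: roughly 1500x faster than Apple Intelligence for entity extraction, processing 1000 words in ~0.6ms.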
This makes it ideal for preprocessing before LLM refinement.
Acceptance Criteria
Testing
Estimated Size
~150 lines of code
Risk Assessment
Low - Uses stable NaturalLanguage framework with no external dependencies.