From 1957f7726e3fa35b40ae17c645961b2b86a6646e Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Tue, 19 May 2026 15:10:20 +0200 Subject: [PATCH 01/12] feat(regex): add German structured PII detection Add deterministic German-specific PII entity types to the regex engine: - DE_VAT_ID: German VAT identification number (USt-IdNr) - DE_IBAN: German IBAN for payments (DE + 20 digits) - DE_TAX_ID: German tax ID (Steuer-ID, 11 digits) - DE_SOCIAL_SECURITY_NUMBER: German pension insurance number (11 characters) - DE_PHONE: German phone numbers (+49 country code) - DE_POSTAL_CODE: German postal code with prefix (PLZ/DE/D + 5 digits) - DE_PASSPORT_NUMBER: German passport (1 letter + 8 digits) - DE_RESIDENCE_PERMIT_NUMBER: German residence permit (AT + 7 digits) Changes: - Added regex patterns and labels to RegexAnnotator - Registered canonical entity types in engine.py and core.py - Expanded structured_pii.json corpus with test cases - Created comprehensive test_de_pii_regex.py with positive/negative cases - Updated STRUCTURED_TYPES in accuracy tests - No setup.py or dependency changes (regex-only, deterministic) Test results: - 381 tests passed (includes 18 new German PII tests) - All regex and accuracy tests pass - No regressions in existing functionality --- datafog/core.py | 10 +- datafog/engine.py | 8 ++ .../regex_annotator/regex_annotator.py | 109 +++++++++++++++++- tests/corpus/structured_pii.json | 102 ++++++++++++++++ tests/test_de_pii_regex.py | 92 +++++++++++++++ tests/test_detection_accuracy.py | 8 ++ tests/test_regex_annotator.py | 4 +- 7 files changed, 329 insertions(+), 4 deletions(-) create mode 100644 tests/test_de_pii_regex.py diff --git a/datafog/core.py b/datafog/core.py index f4e17850..c899e9f0 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -210,7 +210,7 @@ def get_supported_entities() -> List[str]: Example: >>> entities = get_supported_entities() >>> print(entities) - ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DOB', 'ZIP'] + ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DE_TAX_ID', 'DE_SOCIAL_SECURITY_NUMBER', 'DE_PHONE', 'DE_POSTAL_CODE', 'DE_PASSPORT_NUMBER', 'DE_RESIDENCE_PERMIT_NUMBER', 'DATE', 'ZIP_CODE'] """ result = [ "EMAIL", @@ -218,6 +218,14 @@ def get_supported_entities() -> List[str]: "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_PHONE", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", "DATE", "ZIP_CODE", ] diff --git a/datafog/engine.py b/datafog/engine.py index 1a94e634..53af6e18 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -31,6 +31,14 @@ "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_PHONE", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", "DATE", "ZIP_CODE", "PERSON", diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index a843a8d8..0c6c7ea1 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -28,7 +28,23 @@ class RegexAnnotator: """ # Labels for PII entities - LABELS = ["EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", "DOB", "ZIP"] + LABELS = [ + "EMAIL", + "PHONE", + "SSN", + "CREDIT_CARD", + "IP_ADDRESS", + "DOB", + "ZIP", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_PHONE", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", + ] def __init__(self): # Compile all patterns once at initialization @@ -175,6 +191,97 @@ def __init__(self): """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), + # German VAT ID (USt-IdNr) - DE followed by 9 digits + "DE_VAT_ID": re.compile( + r""" + (? None: + annotator = RegexAnnotator() + result = annotator.annotate(text) + assert expected in result[label] + + +@pytest.mark.parametrize( + "label,text", + [ + ("DE_TAX_ID", "Steuer-ID 1234567890 liegt vor."), + ("DE_TAX_ID", "Steuer-ID 123456789012 liegt vor."), + ( + "DE_SOCIAL_SECURITY_NUMBER", + "Rentenversicherungsnummer 65150804123 liegt vor.", + ), + ( + "DE_SOCIAL_SECURITY_NUMBER", + "Rentenversicherungsnummer 65150804AA23 liegt vor.", + ), + ("DE_PHONE", "Hotline 030 12345678 erreichbar."), + ("DE_POSTAL_CODE", "10115 Berlin."), + ("DE_PASSPORT_NUMBER", "Passnummer 12345678 wurde geprueft."), + ( + "DE_RESIDENCE_PERMIT_NUMBER", + "Aufenthaltstitel AT12345678 gueltig.", + ), + ], +) +def test_de_regex_negative_cases(label: str, text: str) -> None: + annotator = RegexAnnotator() + result = annotator.annotate(text) + assert not result[label] diff --git a/tests/test_detection_accuracy.py b/tests/test_detection_accuracy.py index 852a7937..46fdcd34 100644 --- a/tests/test_detection_accuracy.py +++ b/tests/test_detection_accuracy.py @@ -22,6 +22,14 @@ "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_PHONE", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", "DATE", "ZIP_CODE", } diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py index 5916bfae..986bf16e 100644 --- a/tests/test_regex_annotator.py +++ b/tests/test_regex_annotator.py @@ -41,8 +41,8 @@ def test_regex_annotator_initialization(): annotator = RegexAnnotator() assert annotator is not None assert ( - len(annotator.LABELS) == 7 - ) # EMAIL, PHONE, SSN, CREDIT_CARD, IP_ADDRESS, DOB, ZIP + len(annotator.LABELS) == 15 + ) # Base + German structured labels def test_regex_annotator_create_method(): From fa4dd0ddf29671d059bdfb8ab5bfd0f8cf241ff8 Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Tue, 19 May 2026 16:14:41 +0200 Subject: [PATCH 02/12] fix(regex): use alphanumeric boundaries in German PII patterns Replace digit-only lookahead with alphanumeric boundaries to prevent false positive prefix matches. For example, DE123456789A now correctly rejects the longer token instead of matching as DE123456789. All 363 tests pass with zero regressions. --- .../regex_annotator/regex_annotator.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index 0c6c7ea1..03f26c1e 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -198,7 +198,7 @@ def __init__(self): DE [\s-]? \d{9} - (?!\d) + (?![A-Za-z0-9]) """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), @@ -210,16 +210,16 @@ def __init__(self): \d{2} (?:\s?\d{4}){4} \s?\d{2} - (?!\d) + (?![A-Za-z0-9]) """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), # German Tax ID (Steuer-ID) - 11 digits "DE_TAX_ID": re.compile( r""" - (? Date: Tue, 19 May 2026 17:30:09 +0200 Subject: [PATCH 03/12] fix: remove DE_PHONE to avoid overlapping entity matches DE_PHONE overlaps with the generic PHONE pattern, causing the redaction system to apply both replacements and corrupt output. Since German phone numbers are already detected by the generic PHONE pattern, remove the DE_PHONE pattern as a separate entity type. Removes: - DE_PHONE from LABELS and regex patterns - DE_PHONE from ALL_ENTITY_TYPES in engine - DE_PHONE from supported entities in core - DE_PHONE test cases from test_de_pii_regex.py - DE_PHONE corpus entry from structured_pii.json - Updated label count from 15 to 14 German PII detection is still comprehensive with 7 entity types: DE_VAT_ID, DE_IBAN, DE_TAX_ID, DE_SOCIAL_SECURITY_NUMBER, DE_POSTAL_CODE, DE_PASSPORT_NUMBER, DE_RESIDENCE_PERMIT_NUMBER All 361 tests pass with zero regressions. --- datafog/core.py | 3 +-- datafog/engine.py | 1 - .../regex_annotator/regex_annotator.py | 14 -------------- tests/corpus/structured_pii.json | 18 ------------------ tests/test_de_pii_regex.py | 11 ----------- tests/test_detection_accuracy.py | 1 - tests/test_regex_annotator.py | 4 ++-- 7 files changed, 3 insertions(+), 49 deletions(-) diff --git a/datafog/core.py b/datafog/core.py index c899e9f0..8db94618 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -210,7 +210,7 @@ def get_supported_entities() -> List[str]: Example: >>> entities = get_supported_entities() >>> print(entities) - ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DE_TAX_ID', 'DE_SOCIAL_SECURITY_NUMBER', 'DE_PHONE', 'DE_POSTAL_CODE', 'DE_PASSPORT_NUMBER', 'DE_RESIDENCE_PERMIT_NUMBER', 'DATE', 'ZIP_CODE'] + ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DE_TAX_ID', 'DE_SOCIAL_SECURITY_NUMBER', 'DE_POSTAL_CODE', 'DE_PASSPORT_NUMBER', 'DE_RESIDENCE_PERMIT_NUMBER', 'DATE', 'ZIP_CODE'] """ result = [ "EMAIL", @@ -222,7 +222,6 @@ def get_supported_entities() -> List[str]: "DE_IBAN", "DE_TAX_ID", "DE_SOCIAL_SECURITY_NUMBER", - "DE_PHONE", "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", diff --git a/datafog/engine.py b/datafog/engine.py index 53af6e18..cc1c4d2f 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -35,7 +35,6 @@ "DE_IBAN", "DE_TAX_ID", "DE_SOCIAL_SECURITY_NUMBER", - "DE_PHONE", "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index 03f26c1e..1ddeed19 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -40,7 +40,6 @@ class RegexAnnotator: "DE_IBAN", "DE_TAX_ID", "DE_SOCIAL_SECURITY_NUMBER", - "DE_PHONE", "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", @@ -239,19 +238,6 @@ def __init__(self): """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), - # German phone number - requires +49 or 0049 country code - "DE_PHONE": re.compile( - r""" - (? None: "DE_SOCIAL_SECURITY_NUMBER", "Rentenversicherungsnummer 65150804AA23 liegt vor.", ), - ("DE_PHONE", "Hotline 030 12345678 erreichbar."), ("DE_POSTAL_CODE", "10115 Berlin."), ("DE_PASSPORT_NUMBER", "Passnummer 12345678 wurde geprueft."), ( diff --git a/tests/test_detection_accuracy.py b/tests/test_detection_accuracy.py index 46fdcd34..c9e680e2 100644 --- a/tests/test_detection_accuracy.py +++ b/tests/test_detection_accuracy.py @@ -26,7 +26,6 @@ "DE_IBAN", "DE_TAX_ID", "DE_SOCIAL_SECURITY_NUMBER", - "DE_PHONE", "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py index 986bf16e..d2481da8 100644 --- a/tests/test_regex_annotator.py +++ b/tests/test_regex_annotator.py @@ -41,8 +41,8 @@ def test_regex_annotator_initialization(): annotator = RegexAnnotator() assert annotator is not None assert ( - len(annotator.LABELS) == 15 - ) # Base + German structured labels + len(annotator.LABELS) == 14 + ) # Base + German structured labels (without DE_PHONE) def test_regex_annotator_create_method(): From 044be3848e171f2efbcbebc6b25f5bd36dcb3b8e Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Fri, 22 May 2026 19:02:23 +0200 Subject: [PATCH 04/12] test: relax LABELS count assertion and add DE_VAT_ID/DE_IBAN test coverage - Replace exact LABELS length check with subset validation to avoid breakage on future label additions - Add positive and negative test cases for DE_VAT_ID and DE_IBAN regex patterns - Ensures regex patterns are resilient to new entity types without modifying existing tests --- tests/test_de_pii_regex.py | 24 ++++++++++++++++++++++++ tests/test_regex_annotator.py | 20 +++++++++++++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/tests/test_de_pii_regex.py b/tests/test_de_pii_regex.py index 5049cd05..245aa669 100644 --- a/tests/test_de_pii_regex.py +++ b/tests/test_de_pii_regex.py @@ -6,6 +6,26 @@ @pytest.mark.parametrize( "label,text,expected", [ + ( + "DE_VAT_ID", + "USt-IdNr DE 123456789 ist gesetzt.", + "DE 123456789", + ), + ( + "DE_VAT_ID", + "USt-IdNr DE-123456789 liegt vor.", + "DE-123456789", + ), + ( + "DE_IBAN", + "IBAN DE44500105175407324931 ist gueltig.", + "DE44500105175407324931", + ), + ( + "DE_IBAN", + "IBAN DE44 5001 0517 5407 3249 31 ist gueltig.", + "DE44 5001 0517 5407 3249 31", + ), ( "DE_TAX_ID", "Steuer-ID 12345678901 liegt vor.", @@ -57,6 +77,10 @@ def test_de_regex_positive_cases(label: str, text: str, expected: str) -> None: @pytest.mark.parametrize( "label,text", [ + ("DE_VAT_ID", "USt-IdNr DE12345678 liegt vor."), + ("DE_VAT_ID", "USt-IdNr DE1234567890 liegt vor."), + ("DE_IBAN", "IBAN DE4450010517540732493 ist gueltig."), + ("DE_IBAN", "IBAN DE44 5001 0517 5407 3249 3X ist gueltig."), ("DE_TAX_ID", "Steuer-ID 1234567890 liegt vor."), ("DE_TAX_ID", "Steuer-ID 123456789012 liegt vor."), ( diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py index d2481da8..85894c6d 100644 --- a/tests/test_regex_annotator.py +++ b/tests/test_regex_annotator.py @@ -40,9 +40,23 @@ def test_regex_annotator_initialization(): """Test that the RegexAnnotator can be initialized.""" annotator = RegexAnnotator() assert annotator is not None - assert ( - len(annotator.LABELS) == 14 - ) # Base + German structured labels (without DE_PHONE) + required_labels = { + "EMAIL", + "PHONE", + "SSN", + "CREDIT_CARD", + "IP_ADDRESS", + "DOB", + "ZIP", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", + } + assert required_labels.issubset(set(annotator.LABELS)) def test_regex_annotator_create_method(): From ffbccdccb4b222b8ae9f15a914c1d33c3eb17a4a Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Tue, 26 May 2026 22:12:24 +0200 Subject: [PATCH 05/12] feat(regex): locale-gate DE patterns --- README.md | 18 +++ datafog/__init__.py | 24 ++- datafog/__init___lean.py | 13 +- datafog/core.py | 46 ++++-- datafog/engine.py | 12 +- datafog/main.py | 14 +- datafog/main_lean.py | 11 +- .../regex_annotator/regex_annotator.py | 142 +++++++++++++++--- datafog/services/text_service.py | 8 +- datafog/services/text_service_lean.py | 13 +- datafog/services/text_service_original.py | 13 +- tests/test_de_pii_regex.py | 54 ++++++- tests/test_regex_annotator.py | 14 ++ 13 files changed, 311 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 62f7e10d..5b78920e 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,24 @@ Use the engine that matches your accuracy and dependency constraints: - Cascades regex with optional NER engines. - If optional deps are missing, it degrades gracefully and warns. +### Locale-specific regex patterns + +German regex patterns (DE_*) are locale-specific and disabled by default to avoid +false positives on non-German text. Enable them explicitly via `locales`: + +```python +import datafog + +result = datafog.scan( + "Steuer-ID 12345678903", + engine="regex", + locales=["de"], +) +print(result.entities) +``` + +German DE_* patterns also include checksum/context validation to reduce noise. + ## Backward-Compatible APIs The existing public API remains available. diff --git a/datafog/__init__.py b/datafog/__init__.py index e3974ad7..211e7953 100644 --- a/datafog/__init__.py +++ b/datafog/__init__.py @@ -163,6 +163,7 @@ def scan( text: str, engine: str = "regex", entity_types: list[str] | None = None, + locales: list[str] | None = None, ) -> ScanResult: """ v5-preview scan entrypoint. @@ -170,7 +171,9 @@ def scan( Defaults to the lightweight regex engine so the core install works without optional dependency fallback warnings. """ - return _scan(text=text, engine=engine, entity_types=entity_types) + return _scan( + text=text, engine=engine, entity_types=entity_types, locales=locales + ) def redact( @@ -178,6 +181,7 @@ def redact( entities: list[Entity] | None = None, engine: str = "regex", entity_types: list[str] | None = None, + locales: list[str] | None = None, strategy: str = "token", preset: str | None = None, ) -> RedactResult: @@ -201,6 +205,7 @@ def redact( text=text, engine=engine, entity_types=entity_types, + locales=locales, strategy=strategy, ) @@ -223,7 +228,7 @@ def protect( # Simple API for core functionality (backward compatibility) -def detect(text: str) -> list: +def detect(text: str, locales: list[str] | None = None) -> list: """ Detect PII in text using regex patterns. @@ -240,16 +245,16 @@ def detect(text: str) -> list: """ _warn_v5_replacement("detect", "datafog.scan()") - return _detect_impl(text) + return _detect_impl(text, locales=locales) -def _detect_impl(text: str) -> list: +def _detect_impl(text: str, locales: list[str] | None = None) -> list: import time as _time _start = _time.monotonic() _lazy_import_regex_annotator() - annotator = RegexAnnotator() + annotator = RegexAnnotator(locales=locales) # Use the structured output to get proper positions _, result = annotator.annotate_with_spans(text) @@ -290,7 +295,12 @@ def _detect_impl(text: str) -> list: return entities -def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: +def process( + text: str, + anonymize: bool = False, + method: str = "redact", + locales: list[str] | None = None, +) -> dict: """ Process text to detect and optionally anonymize PII. @@ -317,7 +327,7 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: _start = _time.monotonic() - findings = _detect_impl(text) + findings = _detect_impl(text, locales=locales) result = {"original": text, "findings": findings} diff --git a/datafog/__init___lean.py b/datafog/__init___lean.py index 40a3f530..50f2c7ed 100644 --- a/datafog/__init___lean.py +++ b/datafog/__init___lean.py @@ -79,7 +79,7 @@ def _missing_dependency(*args, **kwargs): # Simple API for core functionality -def detect(text: str) -> list: +def detect(text: str, locales: list[str] | None = None) -> list: """ Detect PII in text using regex patterns. @@ -94,7 +94,7 @@ def detect(text: str) -> list: >>> detect("Contact john@example.com") [{'type': 'EMAIL', 'value': 'john@example.com', 'start': 8, 'end': 24}] """ - annotator = RegexAnnotator() + annotator = RegexAnnotator(locales=locales) result = annotator.annotate(text) # Convert to simple format @@ -113,7 +113,12 @@ def detect(text: str) -> list: return entities -def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: +def process( + text: str, + anonymize: bool = False, + method: str = "redact", + locales: list[str] | None = None, +) -> dict: """ Process text to detect and optionally anonymize PII. @@ -134,7 +139,7 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: 'findings': [{'type': 'EMAIL', 'value': 'john@example.com', ...}] } """ - findings = detect(text) + findings = detect(text, locales=locales) result = {"original": text, "findings": findings} diff --git a/datafog/core.py b/datafog/core.py index 8db94618..a37c82a5 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -5,7 +5,7 @@ without requiring heavy dependencies like spaCy or PyTorch. """ -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from datafog.engine import scan, scan_and_redact from datafog.models.anonymizer import AnonymizerType @@ -16,12 +16,13 @@ AUTO_ENGINE = "auto" -def detect_pii(text: str) -> Dict[str, List[str]]: +def detect_pii(text: str, locales: Optional[List[str]] = None) -> Dict[str, List[str]]: """ Simple PII detection using lightweight regex engine. Args: text: Text to scan for PII + locales: Optional list of locale codes that enable locale-specific labels Returns: Dictionary mapping entity types to lists of detected values @@ -37,7 +38,7 @@ def detect_pii(text: str) -> Dict[str, List[str]]: try: # Use engine boundary for canonical scan behavior. - scan_result = scan(text=text, engine=REGEX_ENGINE) + scan_result = scan(text=text, engine=REGEX_ENGINE, locales=locales) pii_dict: Dict[str, List[str]] = {} for entity in scan_result.entities: if not entity.text.strip(): @@ -81,13 +82,18 @@ def detect_pii(text: str) -> Dict[str, List[str]]: ) from e -def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> str: +def anonymize_text( + text: str, + method: Union[str, AnonymizerType] = "redact", + locales: Optional[List[str]] = None, +) -> str: """ Simple text anonymization using lightweight regex engine. Args: text: Text to anonymize method: Anonymization method ('redact', 'replace', or 'hash') + locales: Optional list of locale codes that enable locale-specific labels Returns: Anonymized text string @@ -119,6 +125,7 @@ def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> result = scan_and_redact( text=text, engine=REGEX_ENGINE, + locales=locales, strategy=strategy_map[method], ) @@ -155,7 +162,7 @@ def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> def scan_text( - text: str, return_entities: bool = False + text: str, return_entities: bool = False, locales: Optional[List[str]] = None ) -> Union[bool, Dict[str, List[str]]]: """ Quick scan to check if text contains any PII. @@ -163,6 +170,7 @@ def scan_text( Args: text: Text to scan return_entities: If True, return detected entities; if False, return boolean + locales: Optional list of locale codes that enable locale-specific labels Returns: Boolean indicating PII presence, or dictionary of detected entities @@ -180,7 +188,7 @@ def scan_text( _start = _time.monotonic() - entities = detect_pii(text) + entities = detect_pii(text, locales=locales) result = entities if return_entities else len(entities) > 0 @@ -200,24 +208,31 @@ def scan_text( return result -def get_supported_entities() -> List[str]: +def get_supported_entities(locales: Optional[List[str]] = None) -> List[str]: """ Get list of PII entity types supported by the regex engine. + Locale-specific labels (e.g., DE_*) are only included when locales include "de". + Returns: List of supported entity type names Example: >>> entities = get_supported_entities() >>> print(entities) - ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DE_TAX_ID', 'DE_SOCIAL_SECURITY_NUMBER', 'DE_POSTAL_CODE', 'DE_PASSPORT_NUMBER', 'DE_RESIDENCE_PERMIT_NUMBER', 'DATE', 'ZIP_CODE'] + ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DATE', 'ZIP_CODE'] """ - result = [ + base = [ "EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DATE", + "ZIP_CODE", + ] + + de_labels = [ "DE_VAT_ID", "DE_IBAN", "DE_TAX_ID", @@ -225,10 +240,19 @@ def get_supported_entities() -> List[str]: "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", - "DATE", - "ZIP_CODE", ] + if not locales: + result = base + else: + locale_values = [locales] if isinstance(locales, str) else locales + normalized = { + value.strip().lower() + for value in locale_values + if isinstance(value, str) and value.strip() + } + result = base + de_labels if "de" in normalized else base + try: from datafog.telemetry import track_function_call diff --git a/datafog/engine.py b/datafog/engine.py index cc1c4d2f..1a3884ec 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -138,8 +138,8 @@ def _entities_from_dict( return entities -def _regex_entities(text: str) -> list[Entity]: - annotator = RegexAnnotator() +def _regex_entities(text: str, locales: Optional[list[str]] = None) -> list[Entity]: + annotator = RegexAnnotator(locales=locales) _, structured = annotator.annotate_with_spans(text) entities: list[Entity] = [] for span in structured.spans: @@ -242,6 +242,7 @@ def scan( text: str, engine: str = "smart", entity_types: Optional[list[str]] = None, + locales: Optional[list[str]] = None, ) -> ScanResult: """Scan text for PII entities.""" if not isinstance(text, str): @@ -250,7 +251,7 @@ def scan( if engine not in {"regex", "spacy", "gliner", "smart"}: raise ValueError("engine must be one of: regex, spacy, gliner, smart") - regex_entities = _regex_entities(text) + regex_entities = _regex_entities(text, locales=locales) if engine == "regex": filtered = _filter_entity_types(regex_entities, entity_types) @@ -384,8 +385,11 @@ def scan_and_redact( text: str, engine: str = "smart", entity_types: Optional[list[str]] = None, + locales: Optional[list[str]] = None, strategy: str = "token", ) -> RedactResult: """Convenience wrapper: scan then redact.""" - scan_result = scan(text=text, engine=engine, entity_types=entity_types) + scan_result = scan( + text=text, engine=engine, entity_types=entity_types, locales=locales + ) return redact(text=text, entities=scan_result.entities, strategy=strategy) diff --git a/datafog/main.py b/datafog/main.py index 31ac22e5..c045cf0d 100644 --- a/datafog/main.py +++ b/datafog/main.py @@ -10,7 +10,7 @@ import json import logging -from typing import List +from typing import List, Optional from .config import OperationType from .engine import scan, scan_and_redact @@ -39,8 +39,10 @@ def __init__( operations: List[OperationType] = [OperationType.SCAN], hash_type: HashType = HashType.SHA256, anonymizer_type: AnonymizerType = AnonymizerType.REPLACE, + locales: Optional[List[str]] = None, ): - self.regex_annotator = RegexAnnotator() + self.regex_annotator = RegexAnnotator(locales=locales) + self.locales = locales normalized_ops: List[OperationType] = [] for op in operations: if isinstance(op, OperationType): @@ -181,7 +183,7 @@ def detect(self, text: str) -> dict: _start = _time.monotonic() - scan_result = scan(text=text, engine="regex") + scan_result = scan(text=text, engine="regex", locales=self.locales) result = {label: [] for label in RegexAnnotator.LABELS} legacy_map = {"DATE": "DOB", "ZIP_CODE": "ZIP"} for entity in scan_result.entities: @@ -245,6 +247,7 @@ def process( redact_result = scan_and_redact( text=text, engine="regex", + locales=self.locales, strategy=strategy, ) result["anonymized"] = redact_result.redacted_text @@ -288,8 +291,9 @@ class TextPIIAnnotator: regex_annotator: RegexAnnotator instance for text annotation. """ - def __init__(self): - self.regex_annotator = RegexAnnotator() + def __init__(self, locales: Optional[List[str]] = None): + self.regex_annotator = RegexAnnotator(locales=locales) + self.locales = locales def run(self, text, output_path=None): """ diff --git a/datafog/main_lean.py b/datafog/main_lean.py index af61559e..4a260ff9 100644 --- a/datafog/main_lean.py +++ b/datafog/main_lean.py @@ -10,7 +10,7 @@ import json import logging -from typing import List +from typing import List, Optional from .config import OperationType from .models.anonymizer import Anonymizer, AnonymizerType, HashType @@ -38,8 +38,10 @@ def __init__( operations: List[OperationType] = [OperationType.SCAN], hash_type: HashType = HashType.SHA256, anonymizer_type: AnonymizerType = AnonymizerType.REPLACE, + locales: Optional[List[str]] = None, ): - self.regex_annotator = RegexAnnotator() + self.regex_annotator = RegexAnnotator(locales=locales) + self.locales = locales self.operations: List[OperationType] = operations self.anonymizer = Anonymizer( hash_type=hash_type, anonymizer_type=anonymizer_type @@ -161,8 +163,9 @@ class TextPIIAnnotator: regex_annotator: RegexAnnotator instance for text annotation. """ - def __init__(self): - self.regex_annotator = RegexAnnotator() + def __init__(self, locales: Optional[List[str]] = None): + self.regex_annotator = RegexAnnotator(locales=locales) + self.locales = locales def run(self, text, output_path=None): """ diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index 1ddeed19..291bf032 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -1,5 +1,5 @@ import re -from typing import Dict, List, Pattern, Tuple +from typing import Callable, Dict, Iterable, List, Optional, Pattern, Set, Tuple from pydantic import BaseModel @@ -25,10 +25,14 @@ class RegexAnnotator: This annotator serves as a fallback to the SpaCy annotator and is optimized for performance, targeting ≤ 20 µs / kB on a MacBook M-series. + + Locale notes: + German-specific entity types (DE_*) are disabled by default. Enable them by + passing locales=["de"]. This avoids false positives on non-German text. """ # Labels for PII entities - LABELS = [ + BASE_LABELS = [ "EMAIL", "PHONE", "SSN", @@ -36,18 +40,66 @@ class RegexAnnotator: "IP_ADDRESS", "DOB", "ZIP", - "DE_VAT_ID", - "DE_IBAN", - "DE_TAX_ID", - "DE_SOCIAL_SECURITY_NUMBER", - "DE_POSTAL_CODE", - "DE_PASSPORT_NUMBER", - "DE_RESIDENCE_PERMIT_NUMBER", ] - def __init__(self): + LOCALE_LABELS = { + "de": [ + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", + ], + } + + LABELS = BASE_LABELS + LOCALE_LABELS["de"] + + _DE_PASSPORT_PREFIXES = "CFGHJKLMNPRTVWXYZ" + _DE_RESIDENCE_CONTEXT_RE = re.compile( + r"\b(aufenthaltstitel|aufenthaltserlaubnis|aufenthaltskarte|residence permit|residence card)\b", + re.IGNORECASE, + ) + + def __init__(self, locales: Optional[Iterable[str]] = None): + self.locales = self._normalize_locales(locales) + self.active_labels = self._labels_for_locales(self.locales) + # Compile all patterns once at initialization - self.patterns: Dict[str, Pattern] = { + self.patterns = self._compile_patterns() + self.validators = self._build_validators() + + @staticmethod + def _normalize_locales(locales: Optional[Iterable[str]]) -> Set[str]: + if locales is None: + return set() + if isinstance(locales, str): + values = [locales] + else: + values = list(locales) + normalized = { + value.strip().lower() + for value in values + if isinstance(value, str) and value.strip() + } + return normalized + + @classmethod + def labels_for_locales(cls, locales: Optional[Iterable[str]] = None) -> List[str]: + normalized = cls._normalize_locales(locales) + return cls._labels_for_locales(normalized) + + @classmethod + def _labels_for_locales(cls, locales: Set[str]) -> List[str]: + labels = list(cls.BASE_LABELS) + for locale, locale_labels in cls.LOCALE_LABELS.items(): + if locale in locales: + labels.extend(locale_labels) + return labels + + def _compile_patterns(self) -> Dict[str, Pattern]: + patterns: Dict[str, Pattern] = { # Email pattern - RFC 5322 subset # Intentionally permissive to favor false positives over false negatives # Allows for multiple dots, special characters in local part, and subdomains @@ -217,7 +269,7 @@ def __init__(self): "DE_TAX_ID": re.compile( r""" (? Dict[str, Callable[[re.Match, str], bool]]: + validators: Dict[str, Callable[[re.Match, str], bool]] = {} + if "DE_TAX_ID" in self.active_labels: + validators["DE_TAX_ID"] = self._validate_de_tax_id + if "DE_RESIDENCE_PERMIT_NUMBER" in self.active_labels: + validators["DE_RESIDENCE_PERMIT_NUMBER"] = self._validate_de_residence_permit + return validators + + @staticmethod + def _digits_only(value: str) -> str: + return "".join(ch for ch in value if ch.isdigit()) + + @staticmethod + def _de_tax_id_check_digit(digits10: str) -> int: + product = 10 + for ch in digits10: + sum_ = (int(ch) + product) % 10 + if sum_ == 0: + sum_ = 10 + product = (sum_ * 2) % 11 + return (11 - product) % 10 + + def _validate_de_tax_id(self, match: re.Match, text: str) -> bool: + digits = self._digits_only(match.group()) + if len(digits) != 11: + return False + if digits[0] == "0": + return False + return digits[-1] == str(self._de_tax_id_check_digit(digits[:10])) + + def _validate_de_residence_permit(self, match: re.Match, text: str) -> bool: + window = 40 + start = max(match.start() - window, 0) + end = min(match.end() + window, len(text)) + context = text[start:end] + return bool(self._DE_RESIDENCE_CONTEXT_RE.search(context)) + @classmethod - def create(cls) -> "RegexAnnotator": + def create(cls, locales: Optional[Iterable[str]] = None) -> "RegexAnnotator": """Factory method to create a new RegexAnnotator instance.""" - return cls() + return cls(locales=locales) def annotate(self, text: str) -> Dict[str, List[str]]: """Annotate text with PII entities using regex patterns. @@ -292,7 +388,10 @@ def annotate(self, text: str) -> Dict[str, List[str]]: # Process with each pattern for label, pattern in self.patterns.items(): + validator = self.validators.get(label) for match in pattern.finditer(text): + if validator and not validator(match, text): + continue result[label].append(match.group()) return result @@ -317,7 +416,10 @@ def annotate_with_spans( return spans_by_label, AnnotationResult(text=text, spans=all_spans) for label, pattern in self.patterns.items(): + validator = self.validators.get(label) for match in pattern.finditer(text): + if validator and not validator(match, text): + continue span = Span( label=label, start=match.start(), diff --git a/datafog/services/text_service.py b/datafog/services/text_service.py index 0956256f..7ed4298d 100644 --- a/datafog/services/text_service.py +++ b/datafog/services/text_service.py @@ -7,7 +7,7 @@ import asyncio import warnings -from typing import TYPE_CHECKING, Dict, List, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Union if TYPE_CHECKING: from datafog.processing.text_processing.regex_annotator.regex_annotator import Span @@ -43,6 +43,7 @@ def __init__( text_chunk_length: int = 1000, engine: str = "regex", gliner_model: str = "urchade/gliner_multi_pii-v1", + locales: Optional[List[str]] = None, ): """ Initialize the TextService with specified chunk length and annotation engine. @@ -56,6 +57,7 @@ def __init__( - "auto": Try RegexAnnotator first and fall back to SpacyPIIAnnotator if no entities found - "smart": Try RegexAnnotator → GLiNER → SpaCy cascade (requires nlp-advanced extra) gliner_model: GLiNER model name to use when engine is "gliner" or "smart" + locales: Optional list of locale codes that enable locale-specific regex labels Raises: AssertionError: If an invalid engine type is provided @@ -65,6 +67,7 @@ def __init__( self.engine = engine self.text_chunk_length = text_chunk_length self.gliner_model = gliner_model + self.locales = locales # Lazy initialization - annotators created only when needed self._regex_annotator = None @@ -90,6 +93,7 @@ def __init__( engine=engine, text_chunk_length=text_chunk_length, gliner_model=gliner_model if engine in ("gliner", "smart") else None, + locales=locales, ) except Exception: pass @@ -102,7 +106,7 @@ def regex_annotator(self): RegexAnnotator, ) - self._regex_annotator = RegexAnnotator() + self._regex_annotator = RegexAnnotator(locales=self.locales) return self._regex_annotator @property diff --git a/datafog/services/text_service_lean.py b/datafog/services/text_service_lean.py index ce9203ec..50d110cd 100644 --- a/datafog/services/text_service_lean.py +++ b/datafog/services/text_service_lean.py @@ -6,7 +6,7 @@ """ import asyncio -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from datafog.processing.text_processing.regex_annotator.regex_annotator import ( RegexAnnotator, @@ -26,7 +26,12 @@ class TextService: pip install datafog[nlp] """ - def __init__(self, text_chunk_length: int = 1000, engine: str = "regex"): + def __init__( + self, + text_chunk_length: int = 1000, + engine: str = "regex", + locales: Optional[List[str]] = None, + ): """ Initialize the TextService with specified chunk length and annotation engine. @@ -36,6 +41,7 @@ def __init__(self, text_chunk_length: int = 1000, engine: str = "regex"): - "regex": (Default) Use RegexAnnotator for fast pattern-based entity detection - "spacy": Use SpacyPIIAnnotator for NLP-based entity detection (requires nlp extra) - "auto": Try RegexAnnotator first and fall back to SpacyPIIAnnotator if no entities found + locales: Optional list of locale codes that enable locale-specific regex labels Raises: AssertionError: If an invalid engine type is provided @@ -43,8 +49,9 @@ def __init__(self, text_chunk_length: int = 1000, engine: str = "regex"): """ assert engine in {"regex", "spacy", "auto"}, "Invalid engine" self.engine = engine - self.regex_annotator = RegexAnnotator() + self.regex_annotator = RegexAnnotator(locales=locales) self.text_chunk_length = text_chunk_length + self.locales = locales # Only initialize spacy if needed and available self.spacy_annotator = None diff --git a/datafog/services/text_service_original.py b/datafog/services/text_service_original.py index 6d5dde1b..e8ea4ab3 100644 --- a/datafog/services/text_service_original.py +++ b/datafog/services/text_service_original.py @@ -4,7 +4,7 @@ """ import asyncio -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from datafog.processing.text_processing.regex_annotator.regex_annotator import ( RegexAnnotator, @@ -22,7 +22,12 @@ class TextService: and combining annotations from multiple chunks. """ - def __init__(self, text_chunk_length: int = 1000, engine: str = "auto"): + def __init__( + self, + text_chunk_length: int = 1000, + engine: str = "auto", + locales: Optional[List[str]] = None, + ): """ Initialize the TextService with specified chunk length and annotation engine. @@ -32,6 +37,7 @@ def __init__(self, text_chunk_length: int = 1000, engine: str = "auto"): - "regex": Use only the RegexAnnotator for pattern-based entity detection - "spacy": Use only the SpacyPIIAnnotator for NLP-based entity detection - "auto": (Default) Try RegexAnnotator first and fall back to SpacyPIIAnnotator if no entities are found + locales: Optional list of locale codes that enable locale-specific regex labels Raises: AssertionError: If an invalid engine type is provided @@ -39,8 +45,9 @@ def __init__(self, text_chunk_length: int = 1000, engine: str = "auto"): assert engine in {"regex", "spacy", "auto"}, "Invalid engine" self.engine = engine self.spacy_annotator = SpacyPIIAnnotator.create() - self.regex_annotator = RegexAnnotator() + self.regex_annotator = RegexAnnotator(locales=locales) self.text_chunk_length = text_chunk_length + self.locales = locales def _chunk_text(self, text: str) -> List[str]: """Split the text into chunks of specified length.""" diff --git a/tests/test_de_pii_regex.py b/tests/test_de_pii_regex.py index 245aa669..f23c130a 100644 --- a/tests/test_de_pii_regex.py +++ b/tests/test_de_pii_regex.py @@ -3,6 +3,32 @@ from datafog.processing.text_processing.regex_annotator import RegexAnnotator +def _de_tax_id_check_digit(digits10: str) -> int: + product = 10 + for ch in digits10: + sum_ = (int(ch) + product) % 10 + if sum_ == 0: + sum_ = 10 + product = (sum_ * 2) % 11 + return (11 - product) % 10 + + +def _make_de_tax_id(digits10: str) -> str: + return digits10 + str(_de_tax_id_check_digit(digits10)) + + +def _format_de_tax_id_spaced(digits11: str) -> str: + return f"{digits11[:2]} {digits11[2:5]} {digits11[5:8]} {digits11[8:]}" + + +VALID_DE_TAX_ID = _make_de_tax_id("1234567890") +VALID_DE_TAX_ID_SPACED = _format_de_tax_id_spaced(VALID_DE_TAX_ID) +INVALID_DE_TAX_ID = ( + VALID_DE_TAX_ID[:-1] + + str((int(VALID_DE_TAX_ID[-1]) + 1) % 10) +) + + @pytest.mark.parametrize( "label,text,expected", [ @@ -28,13 +54,13 @@ ), ( "DE_TAX_ID", - "Steuer-ID 12345678901 liegt vor.", - "12345678901", + f"Steuer-ID {VALID_DE_TAX_ID} liegt vor.", + VALID_DE_TAX_ID, ), ( "DE_TAX_ID", - "Steuer-ID 12 345 678 901 ist gesetzt.", - "12 345 678 901", + f"Steuer-ID {VALID_DE_TAX_ID_SPACED} ist gesetzt.", + VALID_DE_TAX_ID_SPACED, ), ( "DE_SOCIAL_SECURITY_NUMBER", @@ -53,8 +79,8 @@ ), ( "DE_POSTAL_CODE", - "DE10115 Berlin.", - "DE10115", + "PLZ 10115 Berlin.", + "PLZ 10115", ), ( "DE_PASSPORT_NUMBER", @@ -69,7 +95,7 @@ ], ) def test_de_regex_positive_cases(label: str, text: str, expected: str) -> None: - annotator = RegexAnnotator() + annotator = RegexAnnotator(locales=["de"]) result = annotator.annotate(text) assert expected in result[label] @@ -83,6 +109,9 @@ def test_de_regex_positive_cases(label: str, text: str, expected: str) -> None: ("DE_IBAN", "IBAN DE44 5001 0517 5407 3249 3X ist gueltig."), ("DE_TAX_ID", "Steuer-ID 1234567890 liegt vor."), ("DE_TAX_ID", "Steuer-ID 123456789012 liegt vor."), + ("DE_TAX_ID", f"Steuer-ID {INVALID_DE_TAX_ID} liegt vor."), + ("DE_TAX_ID", "Steuer-ID 12345678901 liegt vor."), + ("DE_TAX_ID", "Steuer-ID 01234567890 liegt vor."), ( "DE_SOCIAL_SECURITY_NUMBER", "Rentenversicherungsnummer 65150804123 liegt vor.", @@ -92,14 +121,23 @@ def test_de_regex_positive_cases(label: str, text: str, expected: str) -> None: "Rentenversicherungsnummer 65150804AA23 liegt vor.", ), ("DE_POSTAL_CODE", "10115 Berlin."), + ("DE_POSTAL_CODE", "D12345"), + ("DE_POSTAL_CODE", "DE12345"), + ("DE_POSTAL_CODE", "DE10115 Berlin."), + ("DE_POSTAL_CODE", "D10115 Berlin."), ("DE_PASSPORT_NUMBER", "Passnummer 12345678 wurde geprueft."), + ("DE_PASSPORT_NUMBER", "Bestellung A12345678 liegt vor."), ( "DE_RESIDENCE_PERMIT_NUMBER", "Aufenthaltstitel AT12345678 gueltig.", ), + ( + "DE_RESIDENCE_PERMIT_NUMBER", + "AT1234567 ohne Kontext.", + ), ], ) def test_de_regex_negative_cases(label: str, text: str) -> None: - annotator = RegexAnnotator() + annotator = RegexAnnotator(locales=["de"]) result = annotator.annotate(text) assert not result[label] diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py index 85894c6d..600d80e6 100644 --- a/tests/test_regex_annotator.py +++ b/tests/test_regex_annotator.py @@ -66,6 +66,20 @@ def test_regex_annotator_create_method(): assert isinstance(annotator, RegexAnnotator) +def test_de_labels_inactive_without_locale(): + """German DE_ labels should be inactive unless locales include 'de'.""" + annotator = RegexAnnotator() + result = annotator.annotate("Passnummer C12345678 wurde geprueft.") + assert not result["DE_PASSPORT_NUMBER"] + + +def test_de_labels_active_with_locale(): + """German DE_ labels should activate when locales include 'de'.""" + annotator = RegexAnnotator(locales=["de"]) + result = annotator.annotate("Passnummer C12345678 wurde geprueft.") + assert "C12345678" in result["DE_PASSPORT_NUMBER"] + + def test_empty_text_annotation(): """Test that annotating empty text returns empty results.""" annotator = RegexAnnotator() From 785921078cc6e6f047452ec952d4cac989ad63f9 Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Wed, 27 May 2026 04:04:29 +0200 Subject: [PATCH 06/12] test: keep structured corpus locale-aware --- tests/corpus/structured_pii.json | 11 +++++++++-- tests/test_detection_accuracy.py | 14 ++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/corpus/structured_pii.json b/tests/corpus/structured_pii.json index 2fbf4744..1b169f33 100644 --- a/tests/corpus/structured_pii.json +++ b/tests/corpus/structured_pii.json @@ -736,6 +736,7 @@ }, { "id": "de-vat-id-simple", + "locales": ["de"], "input": "VAT number: DE123456789 for invoices.", "expected_entities": [ { @@ -748,6 +749,7 @@ }, { "id": "de-iban-formatted", + "locales": ["de"], "input": "IBAN: DE89 3704 0044 0532 0130 00 for payments.", "expected_entities": [ { @@ -760,11 +762,12 @@ }, { "id": "de-tax-id-simple", - "input": "Steuer-ID 12345678901 liegt vor.", + "locales": ["de"], + "input": "Steuer-ID 12345678903 liegt vor.", "expected_entities": [ { "type": "DE_TAX_ID", - "text": "12345678901", + "text": "12345678903", "start": 10, "end": 21 } @@ -772,6 +775,7 @@ }, { "id": "de-social-security-number", + "locales": ["de"], "input": "Rentenversicherungsnummer 65150804A123 liegt vor.", "expected_entities": [ { @@ -784,6 +788,7 @@ }, { "id": "de-postal-code-prefixed", + "locales": ["de"], "input": "PLZ10115 Berlin.", "expected_entities": [ { @@ -796,6 +801,7 @@ }, { "id": "de-passport-number", + "locales": ["de"], "input": "Passnummer C12345678 wurde geprueft.", "expected_entities": [ { @@ -808,6 +814,7 @@ }, { "id": "de-residence-permit-number", + "locales": ["de"], "input": "Aufenthaltstitel AT1234567 gueltig.", "expected_entities": [ { diff --git a/tests/test_detection_accuracy.py b/tests/test_detection_accuracy.py index c9e680e2..a81fa304 100644 --- a/tests/test_detection_accuracy.py +++ b/tests/test_detection_accuracy.py @@ -285,9 +285,11 @@ def _canon_type(entity_type: str) -> str: return TYPE_ALIASES.get(raw, raw) -def _extract_entities(text: str, engine: str) -> list[dict[str, Any]]: +def _extract_entities( + text: str, engine: str, locales: Iterable[str] | None = None +) -> list[dict[str, Any]]: try: - result = scan(text=text, engine=engine) + result = scan(text=text, engine=engine, locales=list(locales or [])) except (ImportError, EngineNotAvailable) as exc: pytest.skip(f"{engine} engine unavailable in this environment: {exc}") @@ -352,7 +354,7 @@ def _assert_expected_found( case: dict[str, Any], engine: str, corpus_kind: str ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: text = case["input"] - actual = _extract_entities(text, engine) + actual = _extract_entities(text, engine, case.get("locales")) expected = _required_expected(case["expected_entities"], engine, corpus_kind) for exp in expected: @@ -408,7 +410,7 @@ def _compute_metrics( for engine in engines: for corpus_kind, cases in corpora: for case in cases: - actual = _extract_entities(case["input"], engine) + actual = _extract_entities(case["input"], engine, case.get("locales")) expected = _required_expected( case["expected_entities"], engine, corpus_kind ) @@ -495,7 +497,7 @@ def test_structured_pii_detection_slow(case: dict[str, Any], engine: str) -> Non @pytest.mark.parametrize("engine", FAST_ENGINES) def test_negative_cases_fast(case: dict[str, Any], engine: str) -> None: _xfail_if_known_limitation(case, engine, "negative") - actual = _extract_entities(case["input"], engine) + actual = _extract_entities(case["input"], engine, case.get("locales")) assert not actual, f"{case['id']} ({engine}) false positives: {actual}" @@ -506,7 +508,7 @@ def test_negative_cases_fast(case: dict[str, Any], engine: str) -> None: @pytest.mark.parametrize("engine", SLOW_ENGINES) def test_negative_cases_slow(case: dict[str, Any], engine: str) -> None: _xfail_if_known_limitation(case, engine, "negative") - actual = _extract_entities(case["input"], engine) + actual = _extract_entities(case["input"], engine, case.get("locales")) assert not actual, f"{case['id']} ({engine}) false positives: {actual}" From 5f5ddaaff2c84f86cc7a2c72e5e8b25d48ab08f3 Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Wed, 27 May 2026 04:19:30 +0200 Subject: [PATCH 07/12] fix(regex): gate locale compilation --- datafog/core.py | 14 +- .../regex_annotator/regex_annotator.py | 168 +++++++++--------- tests/test_de_pii_regex.py | 22 +-- tests/test_detection_accuracy.py | 5 +- 4 files changed, 102 insertions(+), 107 deletions(-) diff --git a/datafog/core.py b/datafog/core.py index a37c82a5..6744491c 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -5,7 +5,7 @@ without requiring heavy dependencies like spaCy or PyTorch. """ -from typing import Dict, List, Optional, Union +from typing import Dict, Iterable, List, Optional, Union from datafog.engine import scan, scan_and_redact from datafog.models.anonymizer import AnonymizerType @@ -16,7 +16,9 @@ AUTO_ENGINE = "auto" -def detect_pii(text: str, locales: Optional[List[str]] = None) -> Dict[str, List[str]]: +def detect_pii( + text: str, locales: Optional[Iterable[str] | str] = None +) -> Dict[str, List[str]]: """ Simple PII detection using lightweight regex engine. @@ -85,7 +87,7 @@ def detect_pii(text: str, locales: Optional[List[str]] = None) -> Dict[str, List def anonymize_text( text: str, method: Union[str, AnonymizerType] = "redact", - locales: Optional[List[str]] = None, + locales: Optional[Iterable[str] | str] = None, ) -> str: """ Simple text anonymization using lightweight regex engine. @@ -162,7 +164,9 @@ def anonymize_text( def scan_text( - text: str, return_entities: bool = False, locales: Optional[List[str]] = None + text: str, + return_entities: bool = False, + locales: Optional[Iterable[str] | str] = None, ) -> Union[bool, Dict[str, List[str]]]: """ Quick scan to check if text contains any PII. @@ -208,7 +212,7 @@ def scan_text( return result -def get_supported_entities(locales: Optional[List[str]] = None) -> List[str]: +def get_supported_entities(locales: Optional[Iterable[str] | str] = None) -> List[str]: """ Get list of PII entity types supported by the regex engine. diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index 291bf032..f7500d67 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -54,7 +54,9 @@ class RegexAnnotator: ], } - LABELS = BASE_LABELS + LOCALE_LABELS["de"] + LABELS = BASE_LABELS + [ + label for locale_labels in LOCALE_LABELS.values() for label in locale_labels + ] _DE_PASSPORT_PREFIXES = "CFGHJKLMNPRTVWXYZ" _DE_RESIDENCE_CONTEXT_RE = re.compile( @@ -242,87 +244,93 @@ def _compile_patterns(self) -> Dict[str, Pattern]: """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), - # German VAT ID (USt-IdNr) - DE followed by 9 digits - "DE_VAT_ID": re.compile( - r""" - (? int: - product = 10 - for ch in digits10: - sum_ = (int(ch) + product) % 10 - if sum_ == 0: - sum_ = 10 - product = (sum_ * 2) % 11 - return (11 - product) % 10 - - -def _make_de_tax_id(digits10: str) -> str: - return digits10 + str(_de_tax_id_check_digit(digits10)) - - -def _format_de_tax_id_spaced(digits11: str) -> str: - return f"{digits11[:2]} {digits11[2:5]} {digits11[5:8]} {digits11[8:]}" - - -VALID_DE_TAX_ID = _make_de_tax_id("1234567890") -VALID_DE_TAX_ID_SPACED = _format_de_tax_id_spaced(VALID_DE_TAX_ID) +VALID_DE_TAX_ID = "12345678903" +VALID_DE_TAX_ID_SPACED = "12 345 678 903" INVALID_DE_TAX_ID = ( VALID_DE_TAX_ID[:-1] + str((int(VALID_DE_TAX_ID[-1]) + 1) % 10) diff --git a/tests/test_detection_accuracy.py b/tests/test_detection_accuracy.py index a81fa304..1f6fac01 100644 --- a/tests/test_detection_accuracy.py +++ b/tests/test_detection_accuracy.py @@ -286,10 +286,11 @@ def _canon_type(entity_type: str) -> str: def _extract_entities( - text: str, engine: str, locales: Iterable[str] | None = None + text: str, engine: str, locales: Iterable[str] | str | None = None ) -> list[dict[str, Any]]: try: - result = scan(text=text, engine=engine, locales=list(locales or [])) + locale_values = [locales] if isinstance(locales, str) else list(locales or []) + result = scan(text=text, engine=engine, locales=locale_values) except (ImportError, EngineNotAvailable) as exc: pytest.skip(f"{engine} engine unavailable in this environment: {exc}") From 25a4ea6d7ae9e016f887206ad755f33abbaa645c Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Wed, 27 May 2026 04:29:58 +0200 Subject: [PATCH 08/12] fix(api): thread locales through guardrails --- README.md | 2 +- datafog/__init__.py | 12 +++++++----- datafog/agent.py | 14 ++++++++++++-- datafog/engine.py | 10 ++++++---- tests/test_agent_api.py | 10 ++++++++++ 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 5b78920e..e176586e 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,7 @@ result = datafog.scan( print(result.entities) ``` -German DE_* patterns also include checksum/context validation to reduce noise. +Some German DE_* patterns include additional checksum or context validation to reduce noise (for example, `DE_TAX_ID` and `DE_RESIDENCE_PERMIT_NUMBER`). ## Backward-Compatible APIs diff --git a/datafog/__init__.py b/datafog/__init__.py index 211e7953..cffee39c 100644 --- a/datafog/__init__.py +++ b/datafog/__init__.py @@ -163,7 +163,7 @@ def scan( text: str, engine: str = "regex", entity_types: list[str] | None = None, - locales: list[str] | None = None, + locales: list[str] | str | None = None, ) -> ScanResult: """ v5-preview scan entrypoint. @@ -181,7 +181,7 @@ def redact( entities: list[Entity] | None = None, engine: str = "regex", entity_types: list[str] | None = None, - locales: list[str] | None = None, + locales: list[str] | str | None = None, strategy: str = "token", preset: str | None = None, ) -> RedactResult: @@ -215,6 +215,7 @@ def protect( engine: str = "regex", strategy: str = "token", on_detect: str = "redact", + locales: list[str] | str | None = None, ): """ v5-preview guardrail factory. @@ -224,11 +225,12 @@ def protect( engine=engine, strategy=strategy, on_detect=on_detect, + locales=locales, ) # Simple API for core functionality (backward compatibility) -def detect(text: str, locales: list[str] | None = None) -> list: +def detect(text: str, locales: list[str] | str | None = None) -> list: """ Detect PII in text using regex patterns. @@ -248,7 +250,7 @@ def detect(text: str, locales: list[str] | None = None) -> list: return _detect_impl(text, locales=locales) -def _detect_impl(text: str, locales: list[str] | None = None) -> list: +def _detect_impl(text: str, locales: list[str] | str | None = None) -> list: import time as _time _start = _time.monotonic() @@ -299,7 +301,7 @@ def process( text: str, anonymize: bool = False, method: str = "redact", - locales: list[str] | None = None, + locales: list[str] | str | None = None, ) -> dict: """ Process text to detect and optionally anonymize PII. diff --git a/datafog/agent.py b/datafog/agent.py index 58a84ed7..9bf88dac 100644 --- a/datafog/agent.py +++ b/datafog/agent.py @@ -6,7 +6,7 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import wraps -from typing import Any, Callable, Iterator, Optional, TypeVar +from typing import Any, Callable, Iterable, Iterator, Optional, TypeVar from .engine import Entity, RedactResult, ScanResult, scan, scan_and_redact @@ -31,6 +31,7 @@ def scan(self, text: str) -> ScanResult: text=text, engine=self.guardrail.engine, entity_types=self.guardrail.entity_types, + locales=self.guardrail.locales, ) if result.entities: self.detections += len(result.entities) @@ -54,6 +55,7 @@ class Guardrail: engine: str = "smart" strategy: str = "token" on_detect: str = "redact" + locales: Optional[Iterable[str] | str] = None def __post_init__(self) -> None: if self.on_detect not in {"redact", "block", "warn"}: @@ -61,7 +63,12 @@ def __post_init__(self) -> None: def scan(self, text: str) -> ScanResult: """Scan a text value for entities.""" - return scan(text=text, engine=self.engine, entity_types=self.entity_types) + return scan( + text=text, + engine=self.engine, + entity_types=self.entity_types, + locales=self.locales, + ) def filter(self, text: str) -> RedactResult: """Scan then enforce configured behavior.""" @@ -69,6 +76,7 @@ def filter(self, text: str) -> RedactResult: text=text, engine=self.engine, entity_types=self.entity_types, + locales=self.locales, strategy=self.strategy, ) if not result.entities: @@ -140,6 +148,7 @@ def create_guardrail( engine: str = "smart", strategy: str = "token", on_detect: str = "redact", + locales: Optional[Iterable[str] | str] = None, ) -> Guardrail: """ Create a reusable guardrail object for wrapping LLM calls. @@ -149,6 +158,7 @@ def create_guardrail( engine=engine, strategy=strategy, on_detect=on_detect, + locales=locales, ) diff --git a/datafog/engine.py b/datafog/engine.py index 1a3884ec..d6f4c685 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -6,7 +6,7 @@ import warnings from dataclasses import dataclass from functools import lru_cache -from typing import Optional +from typing import Iterable, Optional from .exceptions import EngineNotAvailable from .processing.text_processing.regex_annotator import RegexAnnotator @@ -138,7 +138,9 @@ def _entities_from_dict( return entities -def _regex_entities(text: str, locales: Optional[list[str]] = None) -> list[Entity]: +def _regex_entities( + text: str, locales: Optional[Iterable[str] | str] = None +) -> list[Entity]: annotator = RegexAnnotator(locales=locales) _, structured = annotator.annotate_with_spans(text) entities: list[Entity] = [] @@ -242,7 +244,7 @@ def scan( text: str, engine: str = "smart", entity_types: Optional[list[str]] = None, - locales: Optional[list[str]] = None, + locales: Optional[Iterable[str] | str] = None, ) -> ScanResult: """Scan text for PII entities.""" if not isinstance(text, str): @@ -385,7 +387,7 @@ def scan_and_redact( text: str, engine: str = "smart", entity_types: Optional[list[str]] = None, - locales: Optional[list[str]] = None, + locales: Optional[Iterable[str] | str] = None, strategy: str = "token", ) -> RedactResult: """Convenience wrapper: scan then redact.""" diff --git a/tests/test_agent_api.py b/tests/test_agent_api.py index ff72e9fa..cba4f128 100644 --- a/tests/test_agent_api.py +++ b/tests/test_agent_api.py @@ -67,6 +67,16 @@ def test_create_guardrail_warn_mode_warns_and_returns_original() -> None: assert result.mapping == {} +def test_create_guardrail_with_locales_enables_de_patterns() -> None: + guard = datafog.create_guardrail(engine="regex", locales=["de"]) + + result = guard.filter("Steuer-ID 12 345 678 903 liegt vor.") + + assert result.redacted_text != "Steuer-ID 12 345 678 903 liegt vor." + assert any(entity.type == "DE_TAX_ID" for entity in result.entities) + assert "[DE_TAX_ID_1]" in result.redacted_text + + def test_guardrail_watch_context_manager_tracks_activity() -> None: guard = datafog.create_guardrail(engine="regex") From 6e7dd02633ad08bb570a219402b240635feb7d8b Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Wed, 27 May 2026 04:39:10 +0200 Subject: [PATCH 09/12] fix(api): preserve redact positional args --- datafog/__init__.py | 2 +- datafog/engine.py | 2 +- tests/test_no_network_core.py | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/datafog/__init__.py b/datafog/__init__.py index cffee39c..d29adb99 100644 --- a/datafog/__init__.py +++ b/datafog/__init__.py @@ -181,9 +181,9 @@ def redact( entities: list[Entity] | None = None, engine: str = "regex", entity_types: list[str] | None = None, - locales: list[str] | str | None = None, strategy: str = "token", preset: str | None = None, + locales: list[str] | str | None = None, ) -> RedactResult: """ v5-preview redaction entrypoint. diff --git a/datafog/engine.py b/datafog/engine.py index d6f4c685..cc6ef630 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -387,8 +387,8 @@ def scan_and_redact( text: str, engine: str = "smart", entity_types: Optional[list[str]] = None, - locales: Optional[Iterable[str] | str] = None, strategy: str = "token", + locales: Optional[Iterable[str] | str] = None, ) -> RedactResult: """Convenience wrapper: scan then redact.""" scan_result = scan( diff --git a/tests/test_no_network_core.py b/tests/test_no_network_core.py index 905984f4..bc3fc413 100644 --- a/tests/test_no_network_core.py +++ b/tests/test_no_network_core.py @@ -3,6 +3,9 @@ import sys from pathlib import Path +import datafog +from datafog.engine import scan_and_redact + def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]: env = dict(os.environ) @@ -43,6 +46,25 @@ def blocked(*_args, **_kwargs): ) +def test_redact_positional_strategy_remains_compatible() -> None: + public_result = datafog.redact( + "Email jane@example.com", + None, + "regex", + None, + "mask", + ) + engine_result = scan_and_redact( + "Email jane@example.com", + "regex", + None, + "mask", + ) + + assert public_result.redacted_text == engine_result.redacted_text + assert public_result.redacted_text != "Email jane@example.com" + + def test_core_defaults_do_not_initialize_optional_engines(monkeypatch) -> None: import datafog import datafog.engine as engine From 7c1954080096b0d94bea878ec5c07727f6ff9184 Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Wed, 27 May 2026 04:56:43 +0200 Subject: [PATCH 10/12] refactor(regex): cache annotators by locale --- datafog/core.py | 13 +++---------- datafog/engine.py | 12 +++++++++++- .../regex_annotator/regex_annotator.py | 8 +++++--- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/datafog/core.py b/datafog/core.py index 6744491c..e01e9f21 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -9,6 +9,7 @@ from datafog.engine import scan, scan_and_redact from datafog.models.anonymizer import AnonymizerType +from datafog.processing.text_processing.regex_annotator import RegexAnnotator # Engine types as constants REGEX_ENGINE = "regex" @@ -246,16 +247,8 @@ def get_supported_entities(locales: Optional[Iterable[str] | str] = None) -> Lis "DE_RESIDENCE_PERMIT_NUMBER", ] - if not locales: - result = base - else: - locale_values = [locales] if isinstance(locales, str) else locales - normalized = { - value.strip().lower() - for value in locale_values - if isinstance(value, str) and value.strip() - } - result = base + de_labels if "de" in normalized else base + normalized_locales = RegexAnnotator._normalize_locales(locales) + result = base + de_labels if "de" in normalized_locales else base try: from datafog.telemetry import track_function_call diff --git a/datafog/engine.py b/datafog/engine.py index cc6ef630..34f634e3 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -138,10 +138,20 @@ def _entities_from_dict( return entities +def _normalize_regex_locales(locales: Optional[Iterable[str] | str]) -> tuple[str, ...]: + normalized = RegexAnnotator._normalize_locales(locales) + return tuple(sorted(normalized)) + + +@lru_cache(maxsize=None) +def _get_regex_annotator(locales_key: tuple[str, ...]) -> RegexAnnotator: + return RegexAnnotator(locales=locales_key) + + def _regex_entities( text: str, locales: Optional[Iterable[str] | str] = None ) -> list[Entity]: - annotator = RegexAnnotator(locales=locales) + annotator = _get_regex_annotator(_normalize_regex_locales(locales)) _, structured = annotator.annotate_with_spans(text) entities: list[Entity] = [] for span in structured.spans: diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index f7500d67..464cc34d 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -64,7 +64,7 @@ class RegexAnnotator: re.IGNORECASE, ) - def __init__(self, locales: Optional[Iterable[str]] = None): + def __init__(self, locales: Optional[Iterable[str] | str] = None): self.locales = self._normalize_locales(locales) self.active_labels = self._labels_for_locales(self.locales) @@ -73,7 +73,7 @@ def __init__(self, locales: Optional[Iterable[str]] = None): self.validators = self._build_validators() @staticmethod - def _normalize_locales(locales: Optional[Iterable[str]]) -> Set[str]: + def _normalize_locales(locales: Optional[Iterable[str] | str]) -> Set[str]: if locales is None: return set() if isinstance(locales, str): @@ -88,7 +88,9 @@ def _normalize_locales(locales: Optional[Iterable[str]]) -> Set[str]: return normalized @classmethod - def labels_for_locales(cls, locales: Optional[Iterable[str]] = None) -> List[str]: + def labels_for_locales( + cls, locales: Optional[Iterable[str] | str] = None + ) -> List[str]: normalized = cls._normalize_locales(locales) return cls._labels_for_locales(normalized) From 2f0622593a7461d4c7dc7e8b6f98297b1395aec6 Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Wed, 27 May 2026 11:06:20 +0200 Subject: [PATCH 11/12] fix(regex): bound locale cache keys --- datafog/core.py | 17 ++++++----------- datafog/engine.py | 5 +++-- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/datafog/core.py b/datafog/core.py index e01e9f21..633a5fd5 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -237,18 +237,13 @@ def get_supported_entities(locales: Optional[Iterable[str] | str] = None) -> Lis "ZIP_CODE", ] - de_labels = [ - "DE_VAT_ID", - "DE_IBAN", - "DE_TAX_ID", - "DE_SOCIAL_SECURITY_NUMBER", - "DE_POSTAL_CODE", - "DE_PASSPORT_NUMBER", - "DE_RESIDENCE_PERMIT_NUMBER", - ] - normalized_locales = RegexAnnotator._normalize_locales(locales) - result = base + de_labels if "de" in normalized_locales else base + locale_labels = [ + label + for locale in normalized_locales + for label in RegexAnnotator.LOCALE_LABELS.get(locale, []) + ] + result = base + locale_labels if locale_labels else base try: from datafog.telemetry import track_function_call diff --git a/datafog/engine.py b/datafog/engine.py index 34f634e3..5c51e17b 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -140,10 +140,11 @@ def _entities_from_dict( def _normalize_regex_locales(locales: Optional[Iterable[str] | str]) -> tuple[str, ...]: normalized = RegexAnnotator._normalize_locales(locales) - return tuple(sorted(normalized)) + supported_locales = set(RegexAnnotator.LOCALE_LABELS) + return tuple(sorted(normalized & supported_locales)) -@lru_cache(maxsize=None) +@lru_cache(maxsize=32) def _get_regex_annotator(locales_key: tuple[str, ...]) -> RegexAnnotator: return RegexAnnotator(locales=locales_key) From cd9fd134db5a1bc45767118fee8fcd9db31a16ee Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Wed, 27 May 2026 12:08:52 +0200 Subject: [PATCH 12/12] test: localize no-network imports --- tests/test_no_network_core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_no_network_core.py b/tests/test_no_network_core.py index bc3fc413..fa17393c 100644 --- a/tests/test_no_network_core.py +++ b/tests/test_no_network_core.py @@ -3,9 +3,6 @@ import sys from pathlib import Path -import datafog -from datafog.engine import scan_and_redact - def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]: env = dict(os.environ) @@ -47,6 +44,9 @@ def blocked(*_args, **_kwargs): def test_redact_positional_strategy_remains_compatible() -> None: + import datafog + from datafog.engine import scan_and_redact + public_result = datafog.redact( "Email jane@example.com", None,