From a79cd8f67d0aedb421dae9f07128cafeeaf5637f Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Tue, 26 May 2026 19:00:34 +0200 Subject: [PATCH 1/4] chore: land v4.5 release prep baseline Refs DFPY-71 --- .bumpversion.cfg | 6 +-- .gitignore | 5 +- Claude.md => AGENTS.md | 93 ++++++++++++++++++++++++-------- datafog/__init__.py | 95 +++++++++++++++------------------ docs/agents/domain.md | 31 +++++++++++ docs/agents/issue-tracker.md | 24 +++++++++ docs/agents/triage-labels.md | 13 +++++ docs/conf.py | 7 ++- setup.py | 2 + tests/test_donut_lazy_import.py | 67 ++++++++++++----------- tests/test_install_profiles.py | 3 ++ 11 files changed, 231 insertions(+), 115 deletions(-) rename Claude.md => AGENTS.md (82%) create mode 100644 docs/agents/domain.md create mode 100644 docs/agents/issue-tracker.md create mode 100644 docs/agents/triage-labels.md diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 463cf894..0f1713f9 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.3.0 +current_version = 4.4.0a5 commit = True tag = True tag_name = v{new_version} @@ -20,7 +20,3 @@ values = [bumpversion:file:datafog/__about__.py] search = __version__ = "{current_version}" replace = __version__ = "{new_version}" - -[bumpversion:file:setup.py] -search = version="{current_version}" -replace = version="{new_version}" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2f62eff9..bb9f8105 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ error_log.txt # Environment .env .venv +.venv*/ venv/ env/ examples/venv/ @@ -58,14 +59,14 @@ docs/* !docs/conf.py !docs/Makefile !docs/make.bat +!docs/agents/ +!docs/agents/** !docs/audit/ !docs/audit/** # Keep all directories but ignore their contents */**/__pycache__/ -# Keep all files but ignore their contents -Claude.md notes/benchmarking_notes.md Roadmap.md notes/* diff --git a/Claude.md b/AGENTS.md similarity index 82% rename from Claude.md rename to AGENTS.md index dcbe7934..a46c4402 100644 --- a/Claude.md +++ b/AGENTS.md @@ -1,18 +1,26 @@ -# DataFog - Claude Development Guide +# DataFog - Agent Development Guide ## Project Overview + **DataFog** is an open-source Python library for PII detection and anonymization with a focus on speed and lightweight architecture. ## Core Value Proposition + - **Ultra-Fast Performance**: 190x faster than spaCy for structured PII, 32x faster with GLiNER - **Lightweight Core**: <2MB package with optional ML extras - **Modern Engine Options**: Regex, GLiNER, spaCy, and smart cascading - **Production Ready**: Comprehensive testing, CI/CD, and performance validation ## Current Project Status -**Version: 4.3.0** + +**Stable version: 4.4.0** + +**Development version: 4.4.0a5** + +**Next minor target: 4.5.0** ### ✅ Recently Completed (Latest) + - **GLiNER Integration**: Modern NER engine with PII-specialized models - **Smart Cascading**: Intelligent regex → GLiNER → spaCy progression - **Enhanced CLI**: Model management with `--engine` flags @@ -43,6 +51,7 @@ python -c "from datafog.services.text_service import TextService; print('✅ All ## Architecture Overview ### Engine Ecosystem (Updated with GLiNER) + ```python from datafog.services.text_service import TextService @@ -59,21 +68,23 @@ auto_service = TextService(engine="auto") # Legacy: regex→spaCy ``` ### Performance Comparison (Validated) -| Engine | Speed vs spaCy | Accuracy | Use Case | Install | -|---------|----------------|----------|----------|---------| -| `regex` | **190x faster** | High (structured) | Emails, phones, SSNs | Core only | -| `gliner` | **32x faster** | Very High | Modern NER, custom entities | `[nlp-advanced]` | -| `spacy` | 1x (baseline) | Good | Traditional NLP | `[nlp]` | -| `smart` | **60x faster** | Highest | Best balance | `[nlp-advanced]` | + +| Engine | Speed vs spaCy | Accuracy | Use Case | Install | +| -------- | --------------- | ----------------- | --------------------------- | ---------------- | +| `regex` | **190x faster** | High (structured) | Emails, phones, SSNs | Core only | +| `gliner` | **32x faster** | Very High | Modern NER, custom entities | `[nlp-advanced]` | +| `spacy` | 1x (baseline) | Good | Traditional NLP | `[nlp]` | +| `smart` | **60x faster** | Highest | Best balance | `[nlp-advanced]` | ### Dependency Strategy + ```python # Lightweight core (<2MB) pip install datafog # Optional ML engines pip install datafog[nlp] # spaCy (traditional NLP) -pip install datafog[nlp-advanced] # GLiNER (modern NER) +pip install datafog[nlp-advanced] # GLiNER (modern NER) pip install datafog[ocr] # Image processing pip install datafog[all] # Everything ``` @@ -81,15 +92,18 @@ pip install datafog[all] # Everything ## GLiNER Integration (NEW) ### Overview + GLiNER (Generalist Model for Named Entity Recognition) provides modern, accurate NER capabilities optimized for PII detection. ### Key Features + - **PII-Specialized Models**: `urchade/gliner_multi_pii-v1` trained specifically for PII - **Custom Entity Types**: Configurable entity detection beyond default PII types - **Smart Cascading**: Automatically tries regex first, GLiNER second, spaCy last - **CLI Management**: Download and manage GLiNER models via CLI ### Usage Examples + ```python # GLiNER engine from datafog.services.text_service import TextService @@ -108,6 +122,7 @@ subprocess.run(["datafog", "list-models", "--engine", "gliner"]) ``` ### Available GLiNER Models + - `urchade/gliner_multi_pii-v1` - PII-specialized (recommended) - `urchade/gliner_base` - General purpose starter - `urchade/gliner_large-v2` - Higher accuracy @@ -116,17 +131,19 @@ subprocess.run(["datafog", "list-models", "--engine", "gliner"]) ## Development Workflow ### Git Branch Strategy + - **main**: Production releases only - **dev**: Main development branch (use this) -- **feature/***: New features from dev -- **fix/***: Bug fixes from dev +- **feature/\***: New features from dev +- **fix/\***: Bug fixes from dev ### Making Changes + ```bash # Start from dev git checkout dev && git pull origin dev -# Create feature branch +# Create feature branch git checkout -b feature/your-change # Make changes, test, commit @@ -137,6 +154,7 @@ git push -u origin feature/your-change ``` ### Testing + ```bash # Run specific test suites pytest tests/test_text_service.py -v # Core functionality @@ -149,13 +167,14 @@ PYTEST_DONUT=yes pytest tests/test_ocr_integration.py # OCR with real models # Performance requirements # - Regex: 150x+ faster than spaCy -# - GLiNER: 25x+ faster than spaCy +# - GLiNER: 25x+ faster than spaCy # - Package size: Core <2MB, full <8MB ``` ## Key Implementation Patterns ### Simple API (Recommended) + ```python # Always available, lightweight from datafog import detect, process @@ -164,6 +183,7 @@ result = process("john@example.com", method="redact") ``` ### Advanced Engine Selection + ```python # For specialized use cases from datafog.services.text_service import TextService @@ -173,7 +193,7 @@ service = TextService(engine="regex") # Modern NER with custom entities service = TextService( - engine="gliner", + engine="gliner", gliner_model="urchade/gliner_base" ) @@ -182,6 +202,7 @@ service = TextService(engine="smart") ``` ### Graceful Degradation + ```python # Handles missing dependencies elegantly try: @@ -194,18 +215,21 @@ except ImportError: ## Common Tasks ### Adding New Entity Types + 1. Update regex patterns in `regex_annotator.py` 2. Add GLiNER entity types in `gliner_annotator.py` 3. Update tests and benchmarks 4. Validate performance doesn't regress >10% ### Performance Optimization + 1. Profile with existing benchmarks 2. Maintain speed thresholds (regex 150x+, GLiNER 25x+) 3. Update baselines when making improvements 4. Test across all engines ### CLI Enhancements + 1. Update `client.py` with new commands 2. Support `--engine` flag for multi-engine commands 3. Add comprehensive help text and examples @@ -215,31 +239,36 @@ except ImportError: ### Workflow Architecture (3 workflows) -| Workflow | Purpose | Trigger | -|----------|---------|---------| -| `ci.yml` | Lint + Test + Coverage + Wheel size | Push/PR to main/dev | -| `release.yml` | Alpha/Beta/Stable publishing | Schedule + manual dispatch | -| `benchmark.yml` | Performance benchmarks | Push/PR/weekly | +| Workflow | Purpose | Trigger | +| --------------- | ----------------------------------- | -------------------------- | +| `ci.yml` | Lint + Test + Coverage + Wheel size | Push/PR to main/dev | +| `release.yml` | Alpha/Beta/Stable publishing | Schedule + manual dispatch | +| `benchmark.yml` | Performance benchmarks | Push/PR/weekly | ### Release Cadence + - **Alpha** (Mon-Wed 2AM UTC): Automatic from `dev`, date+commit versioning - **Beta** (Thursday 2AM UTC): Automatic from `dev`, incremental beta numbers - **Stable** (manual dispatch): From `main`, base version or override ### Release Pipeline + `determine-release` → `test` → `publish` → `cleanup` + - Tests are a hard gate — no tests = no publish - Stable releases check out `main`; alpha/beta check out `dev` - Old alphas pruned to 7, betas to 5 - `[skip ci]` in version bump commits to prevent loops ### Pre-commit Hooks + - **isort**, **black**, **flake8**, **ruff**: Code formatting and linting - **prettier**: Markdown, JSON, YAML formatting - **gitleaks**: Secret scanning - **pre-commit-hooks**: Large file checks, merge conflict detection, YAML validation ## Environment Variables + ```bash # Testing configuration export PYTEST_DONUT=yes # Enable real OCR testing @@ -250,33 +279,51 @@ export PYTHONPATH=$(pwd) # Local development imports ``` ## Performance Requirements + - **Core Package**: <2MB (from ~8MB in v4.0.x) - **Regex Engine**: 150x+ faster than spaCy (currently 190x) -- **GLiNER Engine**: 25x+ faster than spaCy (currently 32x) +- **GLiNER Engine**: 25x+ faster than spaCy (currently 32x) - **Memory Usage**: Graceful handling of large texts (1MB+ chunks) - **Model Loading**: Cache GLiNER models to avoid repeated downloads -## Best Practices for Claude Agents +## Agent skills + +### Issue tracker + +Issues and PRDs are tracked in Linear under the DFPY team. See `docs/agents/issue-tracker.md`. + +### Triage labels + +Use the default five-label triage vocabulary. See `docs/agents/triage-labels.md`. + +### Domain docs + +Single-context repo: use root `CONTEXT.md` and root `docs/adr/` when present. See `docs/agents/domain.md`. + +## Best Practices for Agents Before beginning any task please checkout a branch from `dev` and create a pull request to `dev`. ### Code Quality + - Follow existing patterns before implementing new approaches - Add comprehensive tests for all new functionality - Update documentation immediately with code changes - Run benchmarks for any text processing modifications ### GLiNER Development + - Use PII-specialized models when available (`urchade/gliner_multi_pii-v1`) - Test graceful degradation when GLiNER dependencies missing - Validate smart cascading thresholds with real data - Consider model download time and caching strategies ### Release Preparation + - Alpha/beta releases are automated via `release.yml` schedule - Stable releases: merge `dev` → `main`, then trigger `release.yml` with `stable` type - Use `dry_run: true` to validate before actual publish - Performance validation on realistic data sets -- In Release Notes or Comments, do not reference that it was authored by Claude (all code is anonymously authored) +- In Release Notes or Comments, do not reference that it was authored by an AI agent (all code is anonymously authored) -This guide provides the essential information for DataFog development while maintaining focus on current priorities and recent GLiNER integration work. \ No newline at end of file +This guide provides the essential information for DataFog development while maintaining focus on current priorities and recent GLiNER integration work. diff --git a/datafog/__init__.py b/datafog/__init__.py index e3974ad7..b7d8e4e7 100644 --- a/datafog/__init__.py +++ b/datafog/__init__.py @@ -61,24 +61,6 @@ def _lazy_import_regex_annotator(): globals()["RegexAnnotator"] = RegexAnnotator -# Optional imports with graceful fallback -try: - from .client import app -except ImportError: - app = None - -try: - from .main import DataFog, TextPIIAnnotator -except ImportError: - DataFog = None - TextPIIAnnotator = None - -try: - from .services.text_service import TextService -except ImportError: - TextService = None - - def __getattr__(name: str): """Handle lazy imports for better lightweight performance.""" # Lazy import core models when first accessed @@ -98,46 +80,53 @@ def __getattr__(name: str): _lazy_import_regex_annotator() return globals()[name] - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + elif name in _LAZY_EXPORTS: + module_path, attr_name, extra_name = _LAZY_EXPORTS[name] + try: + module = __import__(module_path, fromlist=[attr_name]) + value = getattr(module, attr_name) + except ImportError: + if extra_name is None: + value = None + else: + def _missing_dependency(*args, **kwargs): + raise ImportError( + f"{name} requires additional dependencies. " + f"Install with: pip install datafog[{extra_name}]" + ) -# Optional heavy features - only import if dependencies available -def _optional_import(name, module_path, extra_name): - """Helper to import optional modules with helpful error messages.""" - try: - module = __import__(module_path, fromlist=[name]) - return getattr(module, name) - except ImportError: - - def _missing_dependency(*args, **kwargs): - raise ImportError( - f"{name} requires additional dependencies. " - f"Install with: pip install datafog[{extra_name}]" - ) + value = _missing_dependency + + globals()[name] = value + return value - return _missing_dependency + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") -# OCR/Image processing - requires 'ocr' extra -DonutProcessor = _optional_import( - "DonutProcessor", "datafog.processing.image_processing.donut_processor", "ocr" -) -PytesseractProcessor = _optional_import( - "PytesseractProcessor", - "datafog.processing.image_processing.pytesseract_processor", - "ocr", -) -ImageService = _optional_import("ImageService", "datafog.services.image_service", "ocr") - -# NLP processing - requires 'nlp' extra -SpacyPIIAnnotator = _optional_import( - "SpacyPIIAnnotator", "datafog.processing.text_processing.spacy_pii_annotator", "nlp" -) - -# Distributed processing - requires 'distributed' extra -SparkService = _optional_import( - "SparkService", "datafog.services.spark_service", "distributed" -) +_LAZY_EXPORTS = { + "app": ("datafog.client", "app", None), + "DataFog": ("datafog.main", "DataFog", None), + "TextPIIAnnotator": ("datafog.main", "TextPIIAnnotator", None), + "TextService": ("datafog.services.text_service", "TextService", None), + "DonutProcessor": ( + "datafog.processing.image_processing.donut_processor", + "DonutProcessor", + "ocr", + ), + "PytesseractProcessor": ( + "datafog.processing.image_processing.pytesseract_processor", + "PytesseractProcessor", + "ocr", + ), + "ImageService": ("datafog.services.image_service", "ImageService", "ocr"), + "SpacyPIIAnnotator": ( + "datafog.processing.text_processing.spacy_pii_annotator", + "SpacyPIIAnnotator", + "nlp", + ), + "SparkService": ("datafog.services.spark_service", "SparkService", "distributed"), +} _REDACT_PRESETS = { diff --git a/docs/agents/domain.md b/docs/agents/domain.md new file mode 100644 index 00000000..4fbc0445 --- /dev/null +++ b/docs/agents/domain.md @@ -0,0 +1,31 @@ +# Domain Docs + +How the engineering skills should consume this repo's domain documentation when exploring the codebase. + +Configured layout: single-context. + +## Before exploring, read these + +- **`CONTEXT.md`** at the repo root. +- **`docs/adr/`** for ADRs that touch the area you're about to work in. + +If any of these files don't exist, proceed silently. Don't flag their absence; don't suggest creating them upfront. The producer skill (`/grill-with-docs`) creates them lazily when terms or decisions actually get resolved. + +## File structure + +```text +/ +|-- CONTEXT.md +|-- docs/adr/ +`-- datafog/ +``` + +## Use the glossary's vocabulary + +When your output names a domain concept in an issue title, refactor proposal, hypothesis, or test name, use the term as defined in `CONTEXT.md`. Don't drift to synonyms the glossary explicitly avoids. + +If the concept you need isn't in the glossary yet, that's a signal: either you're inventing language the project doesn't use, or there's a real gap to note for `/grill-with-docs`. + +## Flag ADR conflicts + +If your output contradicts an existing ADR, surface it explicitly rather than silently overriding. diff --git a/docs/agents/issue-tracker.md b/docs/agents/issue-tracker.md new file mode 100644 index 00000000..b07dfca3 --- /dev/null +++ b/docs/agents/issue-tracker.md @@ -0,0 +1,24 @@ +# Issue tracker: Linear + +Issues and PRDs for this repo live in Linear under the DFPY team: + +https://linear.app/threadfork/team/DFPY/all + +Use the Linear connector/app when available. Do not create GitHub or GitLab issues for this repo unless the user explicitly asks for that. + +## Conventions + +- Create new issues in the DFPY team. +- Use the triage labels mapped in `docs/agents/triage-labels.md`. +- Keep issue titles concise and action-oriented. +- Include enough context, acceptance criteria, and verification notes for an AFK agent or human implementer to pick up the work. +- When referencing code, include repo-relative file paths and relevant symbols. +- When a task comes from a PRD, link related Linear issues together where possible. + +## When a skill says "publish to the issue tracker" + +Create a Linear issue in the DFPY team. + +## When a skill says "fetch the relevant ticket" + +Use the Linear connector/app to read the referenced Linear issue, including description, labels, status, comments, and linked issues. diff --git a/docs/agents/triage-labels.md b/docs/agents/triage-labels.md new file mode 100644 index 00000000..0806b2f8 --- /dev/null +++ b/docs/agents/triage-labels.md @@ -0,0 +1,13 @@ +# Triage Labels + +The skills speak in terms of five canonical triage roles. This file maps those roles to the actual label strings used in this repo's issue tracker. + +| Label in mattpocock/skills | Label in our tracker | Meaning | +| -------------------------- | -------------------- | ---------------------------------------- | +| `needs-triage` | `needs-triage` | Maintainer needs to evaluate this issue | +| `needs-info` | `needs-info` | Waiting on reporter for more information | +| `ready-for-agent` | `ready-for-agent` | Fully specified, ready for an AFK agent | +| `ready-for-human` | `ready-for-human` | Requires human implementation | +| `wontfix` | `wontfix` | Will not be actioned | + +When a skill mentions a role, use the corresponding label string from this table. diff --git a/docs/conf.py b/docs/conf.py index 1cb1c895..d71e76b0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,13 +3,18 @@ # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html +import re +from pathlib import Path + # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "DataFog" copyright = "2024, DataFog Inc." author = "Sid Mohan" -release = "v4.1.1" +_version_file = Path(__file__).resolve().parents[1] / "datafog" / "__about__.py" +_version_match = re.search(r'^__version__ = "([^"]+)"', _version_file.read_text(), re.M) +release = f"v{_version_match.group(1)}" if _version_match else "v0.0.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/setup.py b/setup.py index f84c241a..39f01651 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ # Optional heavy dependencies nlp_deps = [ + "click>=8.0,<9.0", "spacy>=3.7.0,<4.0", ] @@ -57,6 +58,7 @@ ] cli_deps = [ + "click>=8.0,<9.0", "typer>=0.12.0", "pydantic-settings>=2.0.0", ] diff --git a/tests/test_donut_lazy_import.py b/tests/test_donut_lazy_import.py index 80c9ec09..9b2a28f1 100644 --- a/tests/test_donut_lazy_import.py +++ b/tests/test_donut_lazy_import.py @@ -1,23 +1,36 @@ +import os +import subprocess import sys +from pathlib import Path -from datafog.services.image_service import ImageService + +def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]: + env = dict(os.environ) + env["PYTHONPATH"] = str(Path.cwd()) + env["DATAFOG_NO_TELEMETRY"] = "1" + env["DO_NOT_TRACK"] = "1" + return subprocess.run( + [sys.executable, "-c", script], + check=True, + env=env, + text=True, + capture_output=True, + ) def test_no_torch_import_when_donut_disabled(): """Test that torch is not imported when use_donut is False""" - # Remove torch and transformers from sys.modules if they're already imported - if "torch" in sys.modules: - del sys.modules["torch"] - if "transformers" in sys.modules: - del sys.modules["transformers"] + _run_isolated_python( + """ +import sys +from datafog.services.image_service import ImageService - # Create ImageService with use_donut=False - # The variable is used indirectly by creating the service which affects sys.modules - _ = ImageService(use_donut=False, use_tesseract=True) +_ = ImageService(use_donut=False, use_tesseract=True) - # Verify that torch and transformers were not imported - assert "torch" not in sys.modules - assert "transformers" not in sys.modules +assert "torch" not in sys.modules +assert "transformers" not in sys.modules +""" + ) def test_lazy_import_mechanism(): @@ -26,24 +39,16 @@ def test_lazy_import_mechanism(): # to use lazy imports. We don't need to actually test the imports themselves, # just that the structure is correct. - # First, ensure torch and transformers are not in sys.modules - if "torch" in sys.modules: - del sys.modules["torch"] - if "transformers" in sys.modules: - del sys.modules["transformers"] - - # Import the DonutProcessor directly - from datafog.processing.image_processing.donut_processor import DonutProcessor - - # Create a processor instance - processor = DonutProcessor() - - # Verify that torch and transformers were not imported just by creating the processor - assert "torch" not in sys.modules - assert "transformers" not in sys.modules + _run_isolated_python( + """ +import sys +from datafog.processing.image_processing.donut_processor import DonutProcessor - # Verify that the extract_text_from_image method exists - assert hasattr(processor, "extract_text_from_image") +processor = DonutProcessor() - # Runtime package installation helpers should not exist on the processor. - assert not hasattr(processor, "ensure_installed") +assert "torch" not in sys.modules +assert "transformers" not in sys.modules +assert hasattr(processor, "extract_text_from_image") +assert not hasattr(processor, "ensure_installed") +""" + ) diff --git a/tests/test_install_profiles.py b/tests/test_install_profiles.py index e17261be..7222186e 100644 --- a/tests/test_install_profiles.py +++ b/tests/test_install_profiles.py @@ -17,10 +17,13 @@ def test_install_profile_import_surface() -> None: assert datafog.scan("Email jane@example.com").entities assert datafog.redact("Email jane@example.com").redacted_text elif profile == "cli": + import click # noqa: F401 + from datafog.client import app assert app is not None elif profile == "nlp": + import click # noqa: F401 import spacy # noqa: F401 from datafog.models.spacy_nlp import SpacyAnnotator From 216b7380647604c7a74acb3bdbdb5828c233099e Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Tue, 26 May 2026 19:27:54 +0200 Subject: [PATCH 2/4] fix: keep agent helpers on regex core path Refs DFPY-74 --- datafog/agent.py | 16 +++++------ tests/test_no_network_core.py | 52 +++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 8 deletions(-) diff --git a/datafog/agent.py b/datafog/agent.py index 58a84ed7..6603ba11 100644 --- a/datafog/agent.py +++ b/datafog/agent.py @@ -51,7 +51,7 @@ class Guardrail: """Reusable text guardrail for wrapping LLM prompts and outputs.""" entity_types: Optional[list[str]] = None - engine: str = "smart" + engine: str = "regex" strategy: str = "token" on_detect: str = "redact" @@ -111,33 +111,33 @@ def watch(self) -> Iterator[GuardrailWatch]: yield watcher -def sanitize(text: str, **kwargs: Any) -> str: +def sanitize(text: str, engine: str = "regex", **kwargs: Any) -> str: """ One-liner PII removal. Returns the redacted text only. """ - result = scan_and_redact(text=text, **kwargs) + result = scan_and_redact(text=text, engine=engine, **kwargs) return result.redacted_text -def scan_prompt(prompt: str, **kwargs: Any) -> ScanResult: +def scan_prompt(prompt: str, engine: str = "regex", **kwargs: Any) -> ScanResult: """ Scan an LLM prompt for PII without modifying the input text. """ - return scan(prompt, **kwargs) + return scan(prompt, engine=engine, **kwargs) -def filter_output(output: str, **kwargs: Any) -> RedactResult: +def filter_output(output: str, engine: str = "regex", **kwargs: Any) -> RedactResult: """ Scan and redact PII from model output before returning to users. """ - return scan_and_redact(output, **kwargs) + return scan_and_redact(output, engine=engine, **kwargs) def create_guardrail( entity_types: Optional[list[str]] = None, - engine: str = "smart", + engine: str = "regex", strategy: str = "token", on_detect: str = "redact", ) -> Guardrail: diff --git a/tests/test_no_network_core.py b/tests/test_no_network_core.py index 905984f4..f06e8360 100644 --- a/tests/test_no_network_core.py +++ b/tests/test_no_network_core.py @@ -63,6 +63,19 @@ def fail_optional_engine_probe(): guarded = guardrail.filter("Email jane@example.com") assert guarded.redacted_text == "Email [EMAIL_1]" + sanitized = datafog.sanitize("Email jane@example.com") + assert sanitized == "Email [EMAIL_1]" + + prompt_result = datafog.scan_prompt("Email jane@example.com") + assert [entity.type for entity in prompt_result.entities] == ["EMAIL"] + + output_result = datafog.filter_output("Email jane@example.com") + assert output_result.redacted_text == "Email [EMAIL_1]" + + agent_guardrail = datafog.create_guardrail() + agent_guarded = agent_guardrail.filter("Email jane@example.com") + assert agent_guarded.redacted_text == "Email [EMAIL_1]" + def test_import_probes_do_not_load_optional_models() -> None: _run_isolated_python( @@ -94,3 +107,42 @@ def from_pretrained(*_args, **_kwargs): assert datafog.scan("Email jane@example.com").entities """ ) + + +def test_core_path_does_not_import_optional_dependency_modules() -> None: + _run_isolated_python( + """ +import importlib.abc +import sys + +blocked = { + "aiohttp", + "certifi", + "gliner", + "PIL", + "pyspark", + "pytesseract", + "spacy", + "torch", + "transformers", +} + +class BlockOptionalImports(importlib.abc.MetaPathFinder): + def find_spec(self, fullname, path=None, target=None): + if fullname.split(".", 1)[0] in blocked: + raise AssertionError(f"optional dependency imported: {fullname}") + return None + +sys.meta_path.insert(0, BlockOptionalImports()) + +import datafog + +assert datafog.scan("Email jane@example.com").entities +assert datafog.redact("Email jane@example.com").redacted_text == "Email [EMAIL_1]" +assert datafog.protect().filter("Email jane@example.com").redacted_text == "Email [EMAIL_1]" +assert datafog.sanitize("Email jane@example.com") == "Email [EMAIL_1]" +assert datafog.scan_prompt("Email jane@example.com").entities +assert datafog.filter_output("Email jane@example.com").redacted_text == "Email [EMAIL_1]" +assert datafog.create_guardrail().filter("Email jane@example.com").redacted_text == "Email [EMAIL_1]" +""" + ) From 90c426d320dab289d3ce77c25a6cb3ad923f6202 Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Tue, 26 May 2026 19:38:56 +0200 Subject: [PATCH 3/4] docs: clarify optional OCR and Spark surfaces Refs DFPY-75 --- .gitignore | 1 + README.md | 29 +++++++ docs/cli.rst | 12 +++ docs/index.rst | 38 ++++----- docs/optional-surfaces.rst | 109 ++++++++++++++++++++++++ docs/python-sdk.rst | 42 ++++++++- docs/roadmap.rst | 34 ++++++-- tests/test_runtime_dependency_safety.py | 53 ++++++++++++ 8 files changed, 287 insertions(+), 31 deletions(-) create mode 100644 docs/optional-surfaces.rst diff --git a/.gitignore b/.gitignore index bb9f8105..1316a1f3 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,7 @@ docs/* !docs/conf.py !docs/Makefile !docs/make.bat +!docs/optional-surfaces.rst !docs/agents/ !docs/agents/** !docs/audit/ diff --git a/README.md b/README.md index 62f7e10d..2ff68cd2 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,12 @@ pip install datafog[nlp] # Add GLiNER + spaCy support pip install datafog[nlp-advanced] +# Add local OCR support +pip install datafog[ocr] + +# Add Spark/distributed support +pip install datafog[distributed] + # Everything pip install datafog[all] ``` @@ -95,6 +101,29 @@ Use the engine that matches your accuracy and dependency constraints: - Cascades regex with optional NER engines. - If optional deps are missing, it degrades gracefully and warns. +## Optional OCR And Spark Surfaces + +DataFog 4.5 keeps the main package story centered on lightweight text PII +screening. OCR and Spark remain supported optional surfaces for users who +already rely on them, but they are not required for the core import, default +scan/redact helpers, or guardrail helpers. + +- OCR: + - Install `datafog[ocr]` for local image OCR helpers. + - URL-based image downloading also needs `datafog[web,ocr]`. + - Tesseract usage requires the system `tesseract` binary. + - Donut OCR requires `datafog[nlp-advanced,ocr]` and a model already available + locally. +- Spark: + - Install `datafog[distributed]` for `SparkService`. + - Spark PII UDF helpers also require `datafog[nlp]` and an installed spaCy + model. + - A Java runtime is required by PySpark. + +OCR and Spark are not deprecated. Their broader API and packaging overhaul is +deferred; the 4.5 goal is to keep them explicit, documented, and isolated from +the lightweight core path. + ## Backward-Compatible APIs The existing public API remains available. diff --git a/docs/cli.rst b/docs/cli.rst index a4c67272..ec452f53 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -7,6 +7,18 @@ Overview The main entrypoint for the CLI is through the DataFog client file, defined in :mod:`datafog.client`. We use Typer to build the CLI, with each command defined as a separate function. +Core text commands such as ``scan-text``, ``redact-text``, ``replace-text``, +and ``hash-text`` are the primary 4.5 CLI path. OCR commands remain available +for existing users, but they are optional: + +* Local image OCR requires ``datafog[ocr]`` and any needed system OCR binaries + such as Tesseract. +* URL-based image OCR also requires ``datafog[web,ocr]``. +* Donut OCR requires ``datafog[nlp-advanced,ocr]`` and a local model. + +Spark/distributed workflows are Python SDK surfaces rather than first-path CLI +commands. Install ``datafog[distributed]`` when using ``SparkService``. + Definitions ----------- .. automodule:: datafog.client diff --git a/docs/index.rst b/docs/index.rst index a22af1c5..7e05a867 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,7 +2,9 @@ DataFog Documentation ===================== -DataFog is an open-source tool for PII detection and anonymization of unstructured data. This documentation covers the CLI and Python SDK. +DataFog is an open-source tool for lightweight text PII detection and +anonymization. The core install focuses on fast regex-based scanning and +redaction, with optional extras for NLP, OCR, and Spark-style workflows. .. toctree:: :maxdepth: 2 @@ -10,6 +12,7 @@ DataFog is an open-source tool for PII detection and anonymization of unstructur important-concepts cli python-sdk + optional-surfaces definitions roadmap v44-bridge-release @@ -24,13 +27,14 @@ Getting Started Installation ------------ -Install DataFog via pip: +Install the lightweight text screening core via pip: .. code-block:: bash pip install datafog -This installs the latest stable version with CLI support. +Optional extras such as ``nlp``, ``nlp-advanced``, ``ocr``, ``distributed``, +and ``web`` are installed only when you need those surfaces. --------------------- CLI Usage @@ -48,19 +52,21 @@ Scan text for PII: datafog scan-text "Your text here" -Extract text from image: +Image/OCR commands are optional. Local OCR requires ``datafog[ocr]``; URL-based +image downloading requires ``datafog[web,ocr]``. .. code-block:: bash datafog scan-image "path/to/image.png" --operations extract -Scan for PII in image: +Scan for PII in image text: .. code-block:: bash datafog scan-image "path/to/image.png" --operations scan -For more information on the CLI, see :doc:`cli`. +For more information on optional OCR and Spark surfaces, see +:doc:`optional-surfaces`. --------------------- Python SDK Usage @@ -71,22 +77,14 @@ Scan text for PII: .. code-block:: python - import requests - from datafog import DataFog - - # For text annotation - client = DataFog(operations="scan") + import datafog - # Fetch sample medical record - doc_url = "https://gist.githubusercontent.com/sidmohan0/b43b72693226422bac5f083c941ecfdb/raw/b819affb51796204d59987893f89dee18428ed5d/note1.txt" - response = requests.get(doc_url) - text_lines = [line for line in response.text.splitlines() if line.strip()] + text = "Contact jane@example.com or call 415-555-1212" + result = datafog.scan(text, engine="regex") + print(result.entities) + print(datafog.redact(text, engine="regex").redacted_text) - # Run annotation - annotations = client.run_text_pipeline_sync(str_list=text_lines) - print(annotations) - -Scan image for PII: +Run OCR and then scan extracted text only when the OCR extra is installed: .. code-block:: python diff --git a/docs/optional-surfaces.rst b/docs/optional-surfaces.rst new file mode 100644 index 00000000..aa9b725e --- /dev/null +++ b/docs/optional-surfaces.rst @@ -0,0 +1,109 @@ +========================= +Optional OCR And Spark +========================= + +DataFog 4.5 keeps the core package focused on lightweight text PII screening. +The default path is: + +.. code-block:: bash + + pip install datafog + +.. code-block:: python + + import datafog + + result = datafog.redact("Email jane@example.com", engine="regex") + print(result.redacted_text) + +OCR and Spark are supported optional surfaces. They are useful for image and +distributed workflows, but they should not be treated as required for the core +install, package import, text scanning, text redaction, or guardrail helpers. + +OCR +--- + +Use OCR when you need to extract text from images before running PII detection. + +Install local OCR support: + +.. code-block:: bash + + pip install "datafog[ocr]" + +Use URL-based image downloads: + +.. code-block:: bash + + pip install "datafog[web,ocr]" + +Use Donut OCR: + +.. code-block:: bash + + pip install "datafog[nlp-advanced,ocr]" + +Notes: + +* Tesseract usage requires the system ``tesseract`` binary in addition to the + Python extra. +* Donut OCR requires a model that is already available locally. DataFog should + not download models implicitly during normal runtime usage. +* OCR is not deprecated. A broader OCR API and packaging overhaul is deferred + beyond the 4.5 focus release. + +Example local OCR flow: + +.. code-block:: python + + import asyncio + from datafog.services.image_service import ImageService + + async def main(): + service = ImageService(use_tesseract=True, use_donut=False) + extracted = await service.ocr_extract(["./invoice.png"]) + print(extracted) + + asyncio.run(main()) + +Spark +------ + +Use Spark when you need distributed processing around DataFog PII detection. + +Install Spark support: + +.. code-block:: bash + + pip install "datafog[distributed]" + +Use Spark PII UDF helpers: + +.. code-block:: bash + + pip install "datafog[distributed,nlp]" + +Notes: + +* ``SparkService`` requires PySpark and a Java runtime. +* Spark PII UDF helpers also require spaCy and an installed spaCy model. +* Spark is not deprecated. A broader Spark overhaul is deferred beyond the 4.5 + focus release. + +Example local Spark flow: + +.. code-block:: python + + from datafog.services.spark_service import SparkService + + service = SparkService(master="local[1]") + rows = service.read_json("./records.json") + print(rows) + +Core-path verification +---------------------- + +The repository includes tests that block optional dependency imports while +importing ``datafog`` and running the default text helpers. These checks verify +that OCR, Spark, NLP, model-loading, and web dependencies are not required for +the core path. diff --git a/docs/python-sdk.rst b/docs/python-sdk.rst index dbf1982d..a1adec5d 100644 --- a/docs/python-sdk.rst +++ b/docs/python-sdk.rst @@ -4,8 +4,46 @@ DataFog Python SDK Overview -------- -The main entrypoint for the SDK is through the DataFog class, defined in :mod:`datafog.main`. -Here you can initialize the different services, including TextService, ImageService, and SparkService. +The primary 4.5 SDK path is lightweight text PII screening through the +top-level ``datafog`` helpers. These helpers use the regex engine by default +and do not require OCR, Spark, model downloads, or distributed dependencies. + +.. code-block:: python + + import datafog + + text = "Contact jane@example.com or call 415-555-1212" + + scan_result = datafog.scan(text, engine="regex") + print(scan_result.entities) + + redact_result = datafog.redact(text, engine="regex") + print(redact_result.redacted_text) + + print(datafog.sanitize(text)) + +The backward-compatible ``DataFog`` and ``TextService`` classes remain +available for existing users. ``TextService(engine="regex")`` is the +dependency-light service path; ``spacy``, ``gliner``, ``smart``, OCR, and Spark +surfaces require their explicit extras. + +Optional services +----------------- + +OCR and Spark are supported optional surfaces, not the primary 4.5 path: + +* Use ``datafog[ocr]`` for local OCR helpers such as ``ImageService`` and + ``PytesseractProcessor``. +* Use ``datafog[web,ocr]`` when OCR inputs must be downloaded from URLs. +* Use ``datafog[nlp-advanced,ocr]`` for Donut OCR, with the model already + available locally. +* Use ``datafog[distributed]`` for ``SparkService``. +* Use ``datafog[distributed,nlp]`` plus an installed spaCy model for Spark PII + UDF helpers. + +OCR and Spark are not deprecated. Their broader overhaul is deferred so the +4.5 release can keep the core package tight while preserving existing optional +usage. See :doc:`optional-surfaces` for install notes and limitations. Definitions ----------- diff --git a/docs/roadmap.rst b/docs/roadmap.rst index acf8b6a0..9db4d433 100644 --- a/docs/roadmap.rst +++ b/docs/roadmap.rst @@ -134,13 +134,29 @@ All features will remain backward compatible with the lightweight architecture. 4.5.0 ------ -Version ``4.5.0`` will introduce: - -* **Enterprise features** in dedicated extras -* **Advanced analytics** for PII detection patterns -* **Multi-language support** for international PII types -* **Cloud integration** helpers for AWS, GCP, Azure -* **Performance monitoring** and metrics collection +Version ``4.5.0`` is a focus release for lightweight text PII screening. It +should make the core package easier to install, reason about, test, and use +before larger v5 middleware work. + +4.5.0 should focus on: + +* Core text scanning, redaction, and guardrail helpers that stay dependency + light by default. +* Clear install-profile documentation for core, NLP, OCR, Spark, CLI, and web + surfaces. +* OCR and Spark as supported optional surfaces, not the main 4.5 adoption path. +* Documentation cleanup so users and contributors can find the current package + story without reading historical planning material first. +* German PII regex support if the external PR passes review and does not + compromise core precision. + +Deferred beyond 4.5.0: + +* Full middleware adapters for Sentry, OpenTelemetry, logging frameworks, or + cloud DLP services. +* OCR architecture overhaul. +* Spark architecture overhaul. +* Enterprise dashboards and analytics. -The lightweight core will remain unchanged, ensuring existing -integrations continue to work without modification. +The lightweight core remains the first path; optional surfaces should stay +explicit and isolated from default import, scan, redact, and guardrail usage. diff --git a/tests/test_runtime_dependency_safety.py b/tests/test_runtime_dependency_safety.py index 9410ddc6..d34ceb8a 100644 --- a/tests/test_runtime_dependency_safety.py +++ b/tests/test_runtime_dependency_safety.py @@ -1,4 +1,6 @@ import importlib +import os +import subprocess import sys import types from pathlib import Path @@ -6,6 +8,20 @@ import pytest +def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]: + env = dict(os.environ) + env["PYTHONPATH"] = str(Path.cwd()) + env["DATAFOG_NO_TELEMETRY"] = "1" + env["DO_NOT_TRACK"] = "1" + return subprocess.run( + [sys.executable, "-c", script], + check=True, + env=env, + text=True, + capture_output=True, + ) + + def test_runtime_code_does_not_install_packages() -> None: blocked_snippets = [ "subprocess.check_call", @@ -25,6 +41,43 @@ def test_runtime_code_does_not_install_packages() -> None: assert offenders == [] +def test_ocr_and_spark_public_services_do_not_require_optional_imports() -> None: + _run_isolated_python( + """ +import importlib.abc +import sys + +blocked = { + "aiohttp", + "certifi", + "PIL", + "pyspark", + "pytesseract", + "torch", + "transformers", +} + +class BlockOptionalImports(importlib.abc.MetaPathFinder): + def find_spec(self, fullname, path=None, target=None): + if fullname.split(".", 1)[0] in blocked: + raise AssertionError(f"optional dependency imported: {fullname}") + return None + +sys.meta_path.insert(0, BlockOptionalImports()) + +import datafog +from datafog.services import ImageService, SparkService, TextService + +assert datafog.scan("Email jane@example.com").entities +assert ImageService is not None +assert SparkService is not None +assert TextService is not None +assert datafog.ImageService is ImageService +assert datafog.SparkService is SparkService +""" + ) + + def test_spacy_pii_missing_model_requires_explicit_download( monkeypatch: pytest.MonkeyPatch, ) -> None: From 6f57af2dd14bafd9c7403acf8d4ee6041449f4bb Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Tue, 26 May 2026 19:51:59 +0200 Subject: [PATCH 4/4] docs: make v4.5 core story obvious Refs DFPY-72 --- README.md | 8 +++ docs/getting-started.rst | 112 +++++++++++++++++++++++++++++++++++ docs/index.rst | 120 ++++++++++---------------------------- docs/planning-history.rst | 30 ++++++++++ 4 files changed, 181 insertions(+), 89 deletions(-) create mode 100644 docs/getting-started.rst create mode 100644 docs/planning-history.rst diff --git a/README.md b/README.md index 2ff68cd2..3f2473e9 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,14 @@ It provides: - A simple agent-oriented API for LLM applications - Backward-compatible `DataFog` and `TextService` classes +## 4.5 Focus + +DataFog 4.5 is focused on lightweight text PII screening: a small core install, +fast regex-based scan/redact helpers, explicit optional extras, and a clearer +path toward future middleware use cases. Dedicated Sentry, OpenTelemetry, +logging-framework, and cloud DLP adapters are future-facing work and are not +part of the 4.5 release. + ## Installation ```bash diff --git a/docs/getting-started.rst b/docs/getting-started.rst new file mode 100644 index 00000000..69637611 --- /dev/null +++ b/docs/getting-started.rst @@ -0,0 +1,112 @@ +================================ +Getting Started With DataFog 4.5 +================================ + +DataFog 4.5 focuses on lightweight text PII screening. A core install should +let you scan and redact common structured PII without installing OCR, Spark, +large NLP models, or middleware integrations. + +Install Profiles +================ + +Core text screening: + +.. code-block:: bash + + pip install datafog + +Optional extras are explicit: + +.. list-table:: + :header-rows: 1 + + * - Profile + - Install command + - Use when + * - Core + - ``pip install datafog`` + - You need regex-based text scanning, redaction, and guardrail helpers. + * - NLP + - ``pip install "datafog[nlp]"`` + - You need spaCy-backed named entity recognition. + * - Advanced NLP + - ``pip install "datafog[nlp-advanced]"`` + - You need GLiNER-backed named entity recognition. + * - OCR + - ``pip install "datafog[ocr]"`` + - You need local image text extraction before PII scanning. + * - OCR from URLs + - ``pip install "datafog[web,ocr]"`` + - You need DataFog to download image inputs before OCR. + * - Spark + - ``pip install "datafog[distributed]"`` + - You need the optional ``SparkService`` surface. + * - Everything + - ``pip install "datafog[all]"`` + - You are developing or deliberately want every optional surface. + +Python Usage +============ + +Use the top-level helpers for the 4.5 core path: + +.. code-block:: python + + import datafog + + text = "Contact jane@example.com or call 415-555-1212" + + scan_result = datafog.scan(text, engine="regex") + print(scan_result.entities) + + redact_result = datafog.redact(text, engine="regex") + print(redact_result.redacted_text) + + print(datafog.sanitize("Card: 4111-1111-1111-1111")) + +Agent-oriented helpers use the same lightweight text path: + +.. code-block:: python + + import datafog + + prompt = "My SSN is 123-45-6789" + scan_result = datafog.scan_prompt(prompt, engine="regex") + + if scan_result.entities: + print("PII detected before sending the prompt") + + output = "Email me at jane.doe@example.com" + safe_output = datafog.filter_output(output, engine="regex") + print(safe_output.redacted_text) + +CLI Usage +========= + +The CLI core path is text-first: + +.. code-block:: bash + + datafog scan-text "Contact jane@example.com" + datafog redact-text "Contact jane@example.com" + datafog replace-text "Contact jane@example.com" + datafog hash-text "Contact jane@example.com" + +Image commands are optional. Install ``datafog[ocr]`` for local OCR and +``datafog[web,ocr]`` when the CLI needs to download image inputs. + +What 4.5 Is Not +=============== + +DataFog 4.5 prepares the package for future middleware use cases, but it does +not ship dedicated Sentry, OpenTelemetry, logging-framework, or cloud DLP +adapters. Those integrations are future-facing work built on the same core +text screening path. + +Next Pages +========== + +* :doc:`python-sdk` documents the Python API surface. +* :doc:`cli` documents command-line usage. +* :doc:`optional-surfaces` documents OCR and Spark install notes. +* :doc:`roadmap` explains how 4.5 leads toward later middleware work. diff --git a/docs/index.rst b/docs/index.rst index 7e05a867..2c4dab59 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,105 +2,47 @@ DataFog Documentation ===================== -DataFog is an open-source tool for lightweight text PII detection and -anonymization. The core install focuses on fast regex-based scanning and -redaction, with optional extras for NLP, OCR, and Spark-style workflows. +DataFog 4.5 is a lightweight text PII screening package for Python. The +primary path is a small core install, fast regex-based scanning and redaction, +agent-friendly guardrail helpers, and explicit optional extras when you need +NLP, OCR, Spark, or web inputs. + +Start with :doc:`getting-started` if you want the shortest route from install +to scanning text. The roadmap and historical planning pages remain available, +but the live user docs are the first path for 4.5. + +Use DataFog 4.5 +=============== .. toctree:: :maxdepth: 2 + :caption: Use DataFog 4.5 - important-concepts - cli + getting-started python-sdk + cli optional-surfaces - definitions - roadmap - v44-bridge-release - v5-product-brief - v5-compatibility-matrix - v5-cut-line - -===================== -Getting Started -===================== - -Installation ------------- - -Install the lightweight text screening core via pip: - -.. code-block:: bash - - pip install datafog - -Optional extras such as ``nlp``, ``nlp-advanced``, ``ocr``, ``distributed``, -and ``web`` are installed only when you need those surfaces. - ---------------------- -CLI Usage ---------------------- - -For a list of available operations, run: - -.. code-block:: bash - - datafog --help - -Scan text for PII: - -.. code-block:: bash - - datafog scan-text "Your text here" - -Image/OCR commands are optional. Local OCR requires ``datafog[ocr]``; URL-based -image downloading requires ``datafog[web,ocr]``. - -.. code-block:: bash - - datafog scan-image "path/to/image.png" --operations extract - -Scan for PII in image text: - -.. code-block:: bash - - datafog scan-image "path/to/image.png" --operations scan - -For more information on optional OCR and Spark surfaces, see -:doc:`optional-surfaces`. - ---------------------- -Python SDK Usage ---------------------- - -Scan text for PII: - -.. code-block:: python - - - import datafog - - text = "Contact jane@example.com or call 415-555-1212" - result = datafog.scan(text, engine="regex") - print(result.entities) - print(datafog.redact(text, engine="regex").redacted_text) + important-concepts -Run OCR and then scan extracted text only when the OCR extra is installed: +Reference +========= -.. code-block:: python +.. toctree:: + :maxdepth: 2 + :caption: Reference - - import asyncio - from datafog import DataFog + definitions - # For OCR and PII annotation - ocr_client = DataFog(operations="extract,scan") +Planning And History +==================== - async def run_ocr_pipeline_demo(): - image_url = "https://s3.amazonaws.com/thumbnails.venngage.com/template/dc377004-1c2d-49f2-8ddf-d63f11c8d9c2.png" - results = await ocr_client.run_ocr_pipeline(image_urls=[image_url]) - print("OCR Pipeline Results:", results) +The pages below document release planning, migration history, and future +direction. They are useful context, but they are secondary to the live 4.5 +usage path above. - # Run the async function - asyncio.run(run_ocr_pipeline_demo()) +.. toctree:: + :maxdepth: 1 + :caption: Planning and history -For detailed information on the Python SDK, see :doc:`python-sdk`. + roadmap + planning-history diff --git a/docs/planning-history.rst b/docs/planning-history.rst new file mode 100644 index 00000000..20025573 --- /dev/null +++ b/docs/planning-history.rst @@ -0,0 +1,30 @@ +==================== +Planning And History +==================== + +These pages and artifacts are preserved for context, but they are not the +first path for using DataFog 4.5. Start with :doc:`getting-started` for live +user docs. + +Release Planning +================ + +.. toctree:: + :maxdepth: 1 + + v44-bridge-release + v5-product-brief + v5-compatibility-matrix + v5-cut-line + +Audit Artifacts +=============== + +Historical audit notes remain available in the repository for maintainers who +need the detailed background: + +* :download:`Reconnaissance notes ` +* :download:`Coverage baseline ` +* :download:`Detection accuracy review ` +* :download:`Architecture review ` +* :download:`Final coverage notes `