From a79cd8f67d0aedb421dae9f07128cafeeaf5637f Mon Sep 17 00:00:00 2001
From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com>
Date: Tue, 26 May 2026 19:00:34 +0200
Subject: [PATCH 1/4] chore: land v4.5 release prep baseline

Refs DFPY-71
---
 .bumpversion.cfg                |  6 +--
 .gitignore                      |  5 +-
 Claude.md => AGENTS.md          | 93 ++++++++++++++++++++++++--------
 datafog/__init__.py             | 95 +++++++++++++++------------------
 docs/agents/domain.md           | 31 +++++++++++
 docs/agents/issue-tracker.md    | 24 +++++++++
 docs/agents/triage-labels.md    | 13 +++++
 docs/conf.py                    |  7 ++-
 setup.py                        |  2 +
 tests/test_donut_lazy_import.py | 67 ++++++++++++-----------
 tests/test_install_profiles.py  |  3 ++
 11 files changed, 231 insertions(+), 115 deletions(-)
 rename Claude.md => AGENTS.md (82%)
 create mode 100644 docs/agents/domain.md
 create mode 100644 docs/agents/issue-tracker.md
 create mode 100644 docs/agents/triage-labels.md

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 463cf894..0f1713f9 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 4.3.0
+current_version = 4.4.0a5
 commit = True
 tag = True
 tag_name = v{new_version}
@@ -20,7 +20,3 @@ values =
 [bumpversion:file:datafog/__about__.py]
 search = __version__ = "{current_version}"
 replace = __version__ = "{new_version}"
-
-[bumpversion:file:setup.py]
-search = version="{current_version}"
-replace = version="{new_version}"
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 2f62eff9..bb9f8105 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ error_log.txt
 # Environment
 .env
 .venv
+.venv*/
 venv/
 env/
 examples/venv/
@@ -58,14 +59,14 @@ docs/*
 !docs/conf.py
 !docs/Makefile
 !docs/make.bat
+!docs/agents/
+!docs/agents/**
 !docs/audit/
 !docs/audit/**
 
 # Keep all directories but ignore their contents
 */**/__pycache__/
 
-# Keep all files but ignore their contents
-Claude.md
 notes/benchmarking_notes.md
 Roadmap.md
 notes/*
diff --git a/Claude.md b/AGENTS.md
similarity index 82%
rename from Claude.md
rename to AGENTS.md
index dcbe7934..a46c4402 100644
--- a/Claude.md
+++ b/AGENTS.md
@@ -1,18 +1,26 @@
-# DataFog - Claude Development Guide
+# DataFog - Agent Development Guide
 
 ## Project Overview
+
 **DataFog** is an open-source Python library for PII detection and anonymization with a focus on speed and lightweight architecture.
 
 ## Core Value Proposition
+
 - **Ultra-Fast Performance**: 190x faster than spaCy for structured PII, 32x faster with GLiNER
 - **Lightweight Core**: <2MB package with optional ML extras
 - **Modern Engine Options**: Regex, GLiNER, spaCy, and smart cascading
 - **Production Ready**: Comprehensive testing, CI/CD, and performance validation
 
 ## Current Project Status
-**Version: 4.3.0**
+
+**Stable version: 4.4.0**
+
+**Development version: 4.4.0a5**
+
+**Next minor target: 4.5.0**
 
 ### ✅ Recently Completed (Latest)
+
 - **GLiNER Integration**: Modern NER engine with PII-specialized models
 - **Smart Cascading**: Intelligent regex → GLiNER → spaCy progression
 - **Enhanced CLI**: Model management with `--engine` flags
@@ -43,6 +51,7 @@ python -c "from datafog.services.text_service import TextService; print('✅ All
 ## Architecture Overview
 
 ### Engine Ecosystem (Updated with GLiNER)
+
 ```python
 from datafog.services.text_service import TextService
 
@@ -59,21 +68,23 @@ auto_service = TextService(engine="auto")        # Legacy: regex→spaCy
 ```
 
 ### Performance Comparison (Validated)
-| Engine  | Speed vs spaCy | Accuracy | Use Case | Install |
-|---------|----------------|----------|----------|---------|
-| `regex` | **190x faster** | High (structured) | Emails, phones, SSNs | Core only |
-| `gliner` | **32x faster** | Very High | Modern NER, custom entities | `[nlp-advanced]` |
-| `spacy` | 1x (baseline) | Good | Traditional NLP | `[nlp]` |
-| `smart` | **60x faster** | Highest | Best balance | `[nlp-advanced]` |
+
+| Engine   | Speed vs spaCy  | Accuracy          | Use Case                    | Install          |
+| -------- | --------------- | ----------------- | --------------------------- | ---------------- |
+| `regex`  | **190x faster** | High (structured) | Emails, phones, SSNs        | Core only        |
+| `gliner` | **32x faster**  | Very High         | Modern NER, custom entities | `[nlp-advanced]` |
+| `spacy`  | 1x (baseline)   | Good              | Traditional NLP             | `[nlp]`          |
+| `smart`  | **60x faster**  | Highest           | Best balance                | `[nlp-advanced]` |
 
 ### Dependency Strategy
+
 ```python
 # Lightweight core (<2MB)
 pip install datafog
 
 # Optional ML engines
 pip install datafog[nlp]           # spaCy (traditional NLP)
-pip install datafog[nlp-advanced]  # GLiNER (modern NER) 
+pip install datafog[nlp-advanced]  # GLiNER (modern NER)
 pip install datafog[ocr]           # Image processing
 pip install datafog[all]           # Everything
 ```
@@ -81,15 +92,18 @@ pip install datafog[all]           # Everything
 ## GLiNER Integration (NEW)
 
 ### Overview
+
 GLiNER (Generalist Model for Named Entity Recognition) provides modern, accurate NER capabilities optimized for PII detection.
 
 ### Key Features
+
 - **PII-Specialized Models**: `urchade/gliner_multi_pii-v1` trained specifically for PII
 - **Custom Entity Types**: Configurable entity detection beyond default PII types
 - **Smart Cascading**: Automatically tries regex first, GLiNER second, spaCy last
 - **CLI Management**: Download and manage GLiNER models via CLI
 
 ### Usage Examples
+
 ```python
 # GLiNER engine
 from datafog.services.text_service import TextService
@@ -108,6 +122,7 @@ subprocess.run(["datafog", "list-models", "--engine", "gliner"])
 ```
 
 ### Available GLiNER Models
+
 - `urchade/gliner_multi_pii-v1` - PII-specialized (recommended)
 - `urchade/gliner_base` - General purpose starter
 - `urchade/gliner_large-v2` - Higher accuracy
@@ -116,17 +131,19 @@ subprocess.run(["datafog", "list-models", "--engine", "gliner"])
 ## Development Workflow
 
 ### Git Branch Strategy
+
 - **main**: Production releases only
 - **dev**: Main development branch (use this)
-- **feature/***: New features from dev
-- **fix/***: Bug fixes from dev
+- **feature/\***: New features from dev
+- **fix/\***: Bug fixes from dev
 
 ### Making Changes
+
 ```bash
 # Start from dev
 git checkout dev && git pull origin dev
 
-# Create feature branch  
+# Create feature branch
 git checkout -b feature/your-change
 
 # Make changes, test, commit
@@ -137,6 +154,7 @@ git push -u origin feature/your-change
 ```
 
 ### Testing
+
 ```bash
 # Run specific test suites
 pytest tests/test_text_service.py -v           # Core functionality
@@ -149,13 +167,14 @@ PYTEST_DONUT=yes pytest tests/test_ocr_integration.py  # OCR with real models
 
 # Performance requirements
 # - Regex: 150x+ faster than spaCy
-# - GLiNER: 25x+ faster than spaCy  
+# - GLiNER: 25x+ faster than spaCy
 # - Package size: Core <2MB, full <8MB
 ```
 
 ## Key Implementation Patterns
 
 ### Simple API (Recommended)
+
 ```python
 # Always available, lightweight
 from datafog import detect, process
@@ -164,6 +183,7 @@ result = process("john@example.com", method="redact")
 ```
 
 ### Advanced Engine Selection
+
 ```python
 # For specialized use cases
 from datafog.services.text_service import TextService
@@ -173,7 +193,7 @@ service = TextService(engine="regex")
 
 # Modern NER with custom entities
 service = TextService(
-    engine="gliner", 
+    engine="gliner",
     gliner_model="urchade/gliner_base"
 )
 
@@ -182,6 +202,7 @@ service = TextService(engine="smart")
 ```
 
 ### Graceful Degradation
+
 ```python
 # Handles missing dependencies elegantly
 try:
@@ -194,18 +215,21 @@ except ImportError:
 ## Common Tasks
 
 ### Adding New Entity Types
+
 1. Update regex patterns in `regex_annotator.py`
 2. Add GLiNER entity types in `gliner_annotator.py`
 3. Update tests and benchmarks
 4. Validate performance doesn't regress >10%
 
 ### Performance Optimization
+
 1. Profile with existing benchmarks
 2. Maintain speed thresholds (regex 150x+, GLiNER 25x+)
 3. Update baselines when making improvements
 4. Test across all engines
 
 ### CLI Enhancements
+
 1. Update `client.py` with new commands
 2. Support `--engine` flag for multi-engine commands
 3. Add comprehensive help text and examples
@@ -215,31 +239,36 @@ except ImportError:
 
 ### Workflow Architecture (3 workflows)
 
-| Workflow | Purpose | Trigger |
-|----------|---------|---------|
-| `ci.yml` | Lint + Test + Coverage + Wheel size | Push/PR to main/dev |
-| `release.yml` | Alpha/Beta/Stable publishing | Schedule + manual dispatch |
-| `benchmark.yml` | Performance benchmarks | Push/PR/weekly |
+| Workflow        | Purpose                             | Trigger                    |
+| --------------- | ----------------------------------- | -------------------------- |
+| `ci.yml`        | Lint + Test + Coverage + Wheel size | Push/PR to main/dev        |
+| `release.yml`   | Alpha/Beta/Stable publishing        | Schedule + manual dispatch |
+| `benchmark.yml` | Performance benchmarks              | Push/PR/weekly             |
 
 ### Release Cadence
+
 - **Alpha** (Mon-Wed 2AM UTC): Automatic from `dev`, date+commit versioning
 - **Beta** (Thursday 2AM UTC): Automatic from `dev`, incremental beta numbers
 - **Stable** (manual dispatch): From `main`, base version or override
 
 ### Release Pipeline
+
 `determine-release` → `test` → `publish` → `cleanup`
+
 - Tests are a hard gate — no tests = no publish
 - Stable releases check out `main`; alpha/beta check out `dev`
 - Old alphas pruned to 7, betas to 5
 - `[skip ci]` in version bump commits to prevent loops
 
 ### Pre-commit Hooks
+
 - **isort**, **black**, **flake8**, **ruff**: Code formatting and linting
 - **prettier**: Markdown, JSON, YAML formatting
 - **gitleaks**: Secret scanning
 - **pre-commit-hooks**: Large file checks, merge conflict detection, YAML validation
 
 ## Environment Variables
+
 ```bash
 # Testing configuration
 export PYTEST_DONUT=yes              # Enable real OCR testing
@@ -250,33 +279,51 @@ export PYTHONPATH=$(pwd)             # Local development imports
 ```
 
 ## Performance Requirements
+
 - **Core Package**: <2MB (from ~8MB in v4.0.x)
 - **Regex Engine**: 150x+ faster than spaCy (currently 190x)
-- **GLiNER Engine**: 25x+ faster than spaCy (currently 32x)  
+- **GLiNER Engine**: 25x+ faster than spaCy (currently 32x)
 - **Memory Usage**: Graceful handling of large texts (1MB+ chunks)
 - **Model Loading**: Cache GLiNER models to avoid repeated downloads
 
-## Best Practices for Claude Agents
+## Agent skills
+
+### Issue tracker
+
+Issues and PRDs are tracked in Linear under the DFPY team. See `docs/agents/issue-tracker.md`.
+
+### Triage labels
+
+Use the default five-label triage vocabulary. See `docs/agents/triage-labels.md`.
+
+### Domain docs
+
+Single-context repo: use root `CONTEXT.md` and root `docs/adr/` when present. See `docs/agents/domain.md`.
+
+## Best Practices for Agents
 
 Before beginning any task please checkout a branch from `dev` and create a pull request to `dev`.
 
 ### Code Quality
+
 - Follow existing patterns before implementing new approaches
 - Add comprehensive tests for all new functionality
 - Update documentation immediately with code changes
 - Run benchmarks for any text processing modifications
 
 ### GLiNER Development
+
 - Use PII-specialized models when available (`urchade/gliner_multi_pii-v1`)
 - Test graceful degradation when GLiNER dependencies missing
 - Validate smart cascading thresholds with real data
 - Consider model download time and caching strategies
 
 ### Release Preparation
+
 - Alpha/beta releases are automated via `release.yml` schedule
 - Stable releases: merge `dev` → `main`, then trigger `release.yml` with `stable` type
 - Use `dry_run: true` to validate before actual publish
 - Performance validation on realistic data sets
-- In Release Notes or Comments, do not reference that it was authored by Claude (all code is anonymously authored)
+- In release notes and code comments, do not mention that the work was authored by an AI agent (all code is anonymously authored)
 
-This guide provides the essential information for DataFog development while maintaining focus on current priorities and recent GLiNER integration work.
\ No newline at end of file
+This guide provides the essential information for DataFog development while maintaining focus on current priorities and recent GLiNER integration work.
diff --git a/datafog/__init__.py b/datafog/__init__.py
index e3974ad7..b7d8e4e7 100644
--- a/datafog/__init__.py
+++ b/datafog/__init__.py
@@ -61,24 +61,6 @@ def _lazy_import_regex_annotator():
         globals()["RegexAnnotator"] = RegexAnnotator
 
 
-# Optional imports with graceful fallback
-try:
-    from .client import app
-except ImportError:
-    app = None
-
-try:
-    from .main import DataFog, TextPIIAnnotator
-except ImportError:
-    DataFog = None
-    TextPIIAnnotator = None
-
-try:
-    from .services.text_service import TextService
-except ImportError:
-    TextService = None
-
-
 def __getattr__(name: str):
     """Handle lazy imports for better lightweight performance."""
     # Lazy import core models when first accessed
@@ -98,46 +80,53 @@ def __getattr__(name: str):
         _lazy_import_regex_annotator()
         return globals()[name]
 
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    elif name in _LAZY_EXPORTS:
+        module_path, attr_name, extra_name = _LAZY_EXPORTS[name]
+        try:
+            module = __import__(module_path, fromlist=[attr_name])
+            value = getattr(module, attr_name)
+        except ImportError:
+            if extra_name is None:
+                value = None
+            else:
 
+                def _missing_dependency(*args, **kwargs):
+                    raise ImportError(
+                        f"{name} requires additional dependencies. "
+                        f"Install with: pip install datafog[{extra_name}]"
+                    )
 
-# Optional heavy features - only import if dependencies available
-def _optional_import(name, module_path, extra_name):
-    """Helper to import optional modules with helpful error messages."""
-    try:
-        module = __import__(module_path, fromlist=[name])
-        return getattr(module, name)
-    except ImportError:
-
-        def _missing_dependency(*args, **kwargs):
-            raise ImportError(
-                f"{name} requires additional dependencies. "
-                f"Install with: pip install datafog[{extra_name}]"
-            )
+                value = _missing_dependency
+
+        globals()[name] = value
+        return value
 
-        return _missing_dependency
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
-# OCR/Image processing - requires 'ocr' extra
-DonutProcessor = _optional_import(
-    "DonutProcessor", "datafog.processing.image_processing.donut_processor", "ocr"
-)
-PytesseractProcessor = _optional_import(
-    "PytesseractProcessor",
-    "datafog.processing.image_processing.pytesseract_processor",
-    "ocr",
-)
-ImageService = _optional_import("ImageService", "datafog.services.image_service", "ocr")
-
-# NLP processing - requires 'nlp' extra
-SpacyPIIAnnotator = _optional_import(
-    "SpacyPIIAnnotator", "datafog.processing.text_processing.spacy_pii_annotator", "nlp"
-)
-
-# Distributed processing - requires 'distributed' extra
-SparkService = _optional_import(
-    "SparkService", "datafog.services.spark_service", "distributed"
-)
+_LAZY_EXPORTS = {
+    "app": ("datafog.client", "app", None),
+    "DataFog": ("datafog.main", "DataFog", None),
+    "TextPIIAnnotator": ("datafog.main", "TextPIIAnnotator", None),
+    "TextService": ("datafog.services.text_service", "TextService", None),
+    "DonutProcessor": (
+        "datafog.processing.image_processing.donut_processor",
+        "DonutProcessor",
+        "ocr",
+    ),
+    "PytesseractProcessor": (
+        "datafog.processing.image_processing.pytesseract_processor",
+        "PytesseractProcessor",
+        "ocr",
+    ),
+    "ImageService": ("datafog.services.image_service", "ImageService", "ocr"),
+    "SpacyPIIAnnotator": (
+        "datafog.processing.text_processing.spacy_pii_annotator",
+        "SpacyPIIAnnotator",
+        "nlp",
+    ),
+    "SparkService": ("datafog.services.spark_service", "SparkService", "distributed"),
+}
 
 
 _REDACT_PRESETS = {
diff --git a/docs/agents/domain.md b/docs/agents/domain.md
new file mode 100644
index 00000000..4fbc0445
--- /dev/null
+++ b/docs/agents/domain.md
@@ -0,0 +1,31 @@
+# Domain Docs
+
+How the engineering skills should consume this repo's domain documentation when exploring the codebase.
+
+Configured layout: single-context.
+
+## Before exploring, read these
+
+- **`CONTEXT.md`** at the repo root.
+- **`docs/adr/`** for ADRs that touch the area you're about to work in.
+
+If either of these paths doesn't exist, proceed silently. Don't flag the absence; don't suggest creating them upfront. The producer skill (`/grill-with-docs`) creates them lazily when terms or decisions actually get resolved.
+
+## File structure
+
+```text
+/
+|-- CONTEXT.md
+|-- docs/adr/
+`-- datafog/
+```
+
+## Use the glossary's vocabulary
+
+When your output names a domain concept in an issue title, refactor proposal, hypothesis, or test name, use the term as defined in `CONTEXT.md`. Don't drift to synonyms the glossary explicitly avoids.
+
+If the concept you need isn't in the glossary yet, that's a signal: either you're inventing language the project doesn't use, or there's a real gap to note for `/grill-with-docs`.
+
+## Flag ADR conflicts
+
+If your output contradicts an existing ADR, surface it explicitly rather than silently overriding.
diff --git a/docs/agents/issue-tracker.md b/docs/agents/issue-tracker.md
new file mode 100644
index 00000000..b07dfca3
--- /dev/null
+++ b/docs/agents/issue-tracker.md
@@ -0,0 +1,24 @@
+# Issue tracker: Linear
+
+Issues and PRDs for this repo live in Linear under the DFPY team:
+
+https://linear.app/threadfork/team/DFPY/all
+
+Use the Linear connector/app when available. Do not create GitHub or GitLab issues for this repo unless the user explicitly asks for that.
+
+## Conventions
+
+- Create new issues in the DFPY team.
+- Use the triage labels mapped in `docs/agents/triage-labels.md`.
+- Keep issue titles concise and action-oriented.
+- Include enough context, acceptance criteria, and verification notes for an AFK agent or human implementer to pick up the work.
+- When referencing code, include repo-relative file paths and relevant symbols.
+- When a task comes from a PRD, link related Linear issues together where possible.
+
+## When a skill says "publish to the issue tracker"
+
+Create a Linear issue in the DFPY team.
+
+## When a skill says "fetch the relevant ticket"
+
+Use the Linear connector/app to read the referenced Linear issue, including description, labels, status, comments, and linked issues.
diff --git a/docs/agents/triage-labels.md b/docs/agents/triage-labels.md
new file mode 100644
index 00000000..0806b2f8
--- /dev/null
+++ b/docs/agents/triage-labels.md
@@ -0,0 +1,13 @@
+# Triage Labels
+
+The skills speak in terms of five canonical triage roles. This file maps those roles to the actual label strings used in this repo's issue tracker.
+
+| Label in mattpocock/skills | Label in our tracker | Meaning                                  |
+| -------------------------- | -------------------- | ---------------------------------------- |
+| `needs-triage`             | `needs-triage`       | Maintainer needs to evaluate this issue  |
+| `needs-info`               | `needs-info`         | Waiting on reporter for more information |
+| `ready-for-agent`          | `ready-for-agent`    | Fully specified, ready for an AFK agent  |
+| `ready-for-human`          | `ready-for-human`    | Requires human implementation            |
+| `wontfix`                  | `wontfix`            | Will not be actioned                     |
+
+When a skill mentions a role, use the corresponding label string from this table.
diff --git a/docs/conf.py b/docs/conf.py
index 1cb1c895..d71e76b0 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -3,13 +3,18 @@
 # For the full list of built-in configuration values, see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 
+import re
+from pathlib import Path
+
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
 project = "DataFog"
 copyright = "2024, DataFog Inc."
 author = "Sid Mohan"
-release = "v4.1.1"
+_version_file = Path(__file__).resolve().parents[1] / "datafog" / "__about__.py"
+_version_match = re.search(r'^__version__ = "([^"]+)"', _version_file.read_text(), re.M)
+release = f"v{_version_match.group(1)}" if _version_match else "v0.0.0"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/setup.py b/setup.py
index f84c241a..39f01651 100644
--- a/setup.py
+++ b/setup.py
@@ -25,6 +25,7 @@
 
 # Optional heavy dependencies
 nlp_deps = [
+    "click>=8.0,<9.0",
     "spacy>=3.7.0,<4.0",
 ]
 
@@ -57,6 +58,7 @@
 ]
 
 cli_deps = [
+    "click>=8.0,<9.0",
     "typer>=0.12.0",
     "pydantic-settings>=2.0.0",
 ]
diff --git a/tests/test_donut_lazy_import.py b/tests/test_donut_lazy_import.py
index 80c9ec09..9b2a28f1 100644
--- a/tests/test_donut_lazy_import.py
+++ b/tests/test_donut_lazy_import.py
@@ -1,23 +1,36 @@
+import os
+import subprocess
 import sys
+from pathlib import Path
 
-from datafog.services.image_service import ImageService
+
+def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]:
+    env = dict(os.environ)
+    env["PYTHONPATH"] = str(Path.cwd())
+    env["DATAFOG_NO_TELEMETRY"] = "1"
+    env["DO_NOT_TRACK"] = "1"
+    return subprocess.run(
+        [sys.executable, "-c", script],
+        check=True,
+        env=env,
+        text=True,
+        capture_output=True,
+    )
 
 
 def test_no_torch_import_when_donut_disabled():
     """Test that torch is not imported when use_donut is False"""
-    # Remove torch and transformers from sys.modules if they're already imported
-    if "torch" in sys.modules:
-        del sys.modules["torch"]
-    if "transformers" in sys.modules:
-        del sys.modules["transformers"]
+    _run_isolated_python(
+        """
+import sys
+from datafog.services.image_service import ImageService
 
-    # Create ImageService with use_donut=False
-    # The variable is used indirectly by creating the service which affects sys.modules
-    _ = ImageService(use_donut=False, use_tesseract=True)
+_ = ImageService(use_donut=False, use_tesseract=True)
 
-    # Verify that torch and transformers were not imported
-    assert "torch" not in sys.modules
-    assert "transformers" not in sys.modules
+assert "torch" not in sys.modules
+assert "transformers" not in sys.modules
+"""
+    )
 
 
 def test_lazy_import_mechanism():
@@ -26,24 +39,16 @@ def test_lazy_import_mechanism():
     # to use lazy imports. We don't need to actually test the imports themselves,
     # just that the structure is correct.
 
-    # First, ensure torch and transformers are not in sys.modules
-    if "torch" in sys.modules:
-        del sys.modules["torch"]
-    if "transformers" in sys.modules:
-        del sys.modules["transformers"]
-
-    # Import the DonutProcessor directly
-    from datafog.processing.image_processing.donut_processor import DonutProcessor
-
-    # Create a processor instance
-    processor = DonutProcessor()
-
-    # Verify that torch and transformers were not imported just by creating the processor
-    assert "torch" not in sys.modules
-    assert "transformers" not in sys.modules
+    _run_isolated_python(
+        """
+import sys
+from datafog.processing.image_processing.donut_processor import DonutProcessor
 
-    # Verify that the extract_text_from_image method exists
-    assert hasattr(processor, "extract_text_from_image")
+processor = DonutProcessor()
 
-    # Runtime package installation helpers should not exist on the processor.
-    assert not hasattr(processor, "ensure_installed")
+assert "torch" not in sys.modules
+assert "transformers" not in sys.modules
+assert hasattr(processor, "extract_text_from_image")
+assert not hasattr(processor, "ensure_installed")
+"""
+    )
diff --git a/tests/test_install_profiles.py b/tests/test_install_profiles.py
index e17261be..7222186e 100644
--- a/tests/test_install_profiles.py
+++ b/tests/test_install_profiles.py
@@ -17,10 +17,13 @@ def test_install_profile_import_surface() -> None:
         assert datafog.scan("Email jane@example.com").entities
         assert datafog.redact("Email jane@example.com").redacted_text
     elif profile == "cli":
+        import click  # noqa: F401
+
         from datafog.client import app
 
         assert app is not None
     elif profile == "nlp":
+        import click  # noqa: F401
         import spacy  # noqa: F401
 
         from datafog.models.spacy_nlp import SpacyAnnotator

From 216b7380647604c7a74acb3bdbdb5828c233099e Mon Sep 17 00:00:00 2001
From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com>
Date: Tue, 26 May 2026 19:27:54 +0200
Subject: [PATCH 2/4] fix: keep agent helpers on regex core path

Refs DFPY-74
---
 datafog/agent.py              | 16 +++++------
 tests/test_no_network_core.py | 52 +++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/datafog/agent.py b/datafog/agent.py
index 58a84ed7..6603ba11 100644
--- a/datafog/agent.py
+++ b/datafog/agent.py
@@ -51,7 +51,7 @@ class Guardrail:
     """Reusable text guardrail for wrapping LLM prompts and outputs."""
 
     entity_types: Optional[list[str]] = None
-    engine: str = "smart"
+    engine: str = "regex"
     strategy: str = "token"
     on_detect: str = "redact"
 
@@ -111,33 +111,33 @@ def watch(self) -> Iterator[GuardrailWatch]:
         yield watcher
 
 
-def sanitize(text: str, **kwargs: Any) -> str:
+def sanitize(text: str, engine: str = "regex", **kwargs: Any) -> str:
     """
     One-liner PII removal.
 
     Returns the redacted text only.
     """
-    result = scan_and_redact(text=text, **kwargs)
+    result = scan_and_redact(text=text, engine=engine, **kwargs)
     return result.redacted_text
 
 
-def scan_prompt(prompt: str, **kwargs: Any) -> ScanResult:
+def scan_prompt(prompt: str, engine: str = "regex", **kwargs: Any) -> ScanResult:
     """
     Scan an LLM prompt for PII without modifying the input text.
     """
-    return scan(prompt, **kwargs)
+    return scan(prompt, engine=engine, **kwargs)
 
 
-def filter_output(output: str, **kwargs: Any) -> RedactResult:
+def filter_output(output: str, engine: str = "regex", **kwargs: Any) -> RedactResult:
     """
     Scan and redact PII from model output before returning to users.
     """
-    return scan_and_redact(output, **kwargs)
+    return scan_and_redact(output, engine=engine, **kwargs)
 
 
 def create_guardrail(
     entity_types: Optional[list[str]] = None,
-    engine: str = "smart",
+    engine: str = "regex",
     strategy: str = "token",
     on_detect: str = "redact",
 ) -> Guardrail:
diff --git a/tests/test_no_network_core.py b/tests/test_no_network_core.py
index 905984f4..f06e8360 100644
--- a/tests/test_no_network_core.py
+++ b/tests/test_no_network_core.py
@@ -63,6 +63,19 @@ def fail_optional_engine_probe():
     guarded = guardrail.filter("Email jane@example.com")
     assert guarded.redacted_text == "Email [EMAIL_1]"
 
+    sanitized = datafog.sanitize("Email jane@example.com")
+    assert sanitized == "Email [EMAIL_1]"
+
+    prompt_result = datafog.scan_prompt("Email jane@example.com")
+    assert [entity.type for entity in prompt_result.entities] == ["EMAIL"]
+
+    output_result = datafog.filter_output("Email jane@example.com")
+    assert output_result.redacted_text == "Email [EMAIL_1]"
+
+    agent_guardrail = datafog.create_guardrail()
+    agent_guarded = agent_guardrail.filter("Email jane@example.com")
+    assert agent_guarded.redacted_text == "Email [EMAIL_1]"
+
 
 def test_import_probes_do_not_load_optional_models() -> None:
     _run_isolated_python(
@@ -94,3 +107,42 @@ def from_pretrained(*_args, **_kwargs):
 assert datafog.scan("Email jane@example.com").entities
 """
     )
+
+
+def test_core_path_does_not_import_optional_dependency_modules() -> None:
+    _run_isolated_python(
+        """
+import importlib.abc
+import sys
+
+blocked = {
+    "aiohttp",
+    "certifi",
+    "gliner",
+    "PIL",
+    "pyspark",
+    "pytesseract",
+    "spacy",
+    "torch",
+    "transformers",
+}
+
+class BlockOptionalImports(importlib.abc.MetaPathFinder):
+    def find_spec(self, fullname, path=None, target=None):
+        if fullname.split(".", 1)[0] in blocked:
+            raise AssertionError(f"optional dependency imported: {fullname}")
+        return None
+
+sys.meta_path.insert(0, BlockOptionalImports())
+
+import datafog
+
+assert datafog.scan("Email jane@example.com").entities
+assert datafog.redact("Email jane@example.com").redacted_text == "Email [EMAIL_1]"
+assert datafog.protect().filter("Email jane@example.com").redacted_text == "Email [EMAIL_1]"
+assert datafog.sanitize("Email jane@example.com") == "Email [EMAIL_1]"
+assert datafog.scan_prompt("Email jane@example.com").entities
+assert datafog.filter_output("Email jane@example.com").redacted_text == "Email [EMAIL_1]"
+assert datafog.create_guardrail().filter("Email jane@example.com").redacted_text == "Email [EMAIL_1]"
+"""
+    )

From 90c426d320dab289d3ce77c25a6cb3ad923f6202 Mon Sep 17 00:00:00 2001
From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com>
Date: Tue, 26 May 2026 19:38:56 +0200
Subject: [PATCH 3/4] docs: clarify optional OCR and Spark surfaces

Refs DFPY-75
---
 .gitignore                              |   1 +
 README.md                               |  29 +++++++
 docs/cli.rst                            |  12 +++
 docs/index.rst                          |  38 ++++-----
 docs/optional-surfaces.rst              | 109 ++++++++++++++++++++++++
 docs/python-sdk.rst                     |  42 ++++++++-
 docs/roadmap.rst                        |  34 ++++++--
 tests/test_runtime_dependency_safety.py |  53 ++++++++++++
 8 files changed, 287 insertions(+), 31 deletions(-)
 create mode 100644 docs/optional-surfaces.rst

diff --git a/.gitignore b/.gitignore
index bb9f8105..1316a1f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,6 +59,7 @@ docs/*
 !docs/conf.py
 !docs/Makefile
 !docs/make.bat
+!docs/optional-surfaces.rst
 !docs/agents/
 !docs/agents/**
 !docs/audit/
diff --git a/README.md b/README.md
index 62f7e10d..2ff68cd2 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,12 @@ pip install datafog[nlp]
 # Add GLiNER + spaCy support
 pip install datafog[nlp-advanced]
 
+# Add local OCR support
+pip install datafog[ocr]
+
+# Add Spark/distributed support
+pip install datafog[distributed]
+
 # Everything
 pip install datafog[all]
 ```
@@ -95,6 +101,29 @@ Use the engine that matches your accuracy and dependency constraints:
   - Cascades regex with optional NER engines.
   - If optional deps are missing, it degrades gracefully and warns.
 
+## Optional OCR And Spark Surfaces
+
+DataFog 4.5 keeps the main package story centered on lightweight text PII
+screening. OCR and Spark remain supported optional surfaces for users who
+already rely on them, but they are not required for the core import, default
+scan/redact helpers, or guardrail helpers.
+
+- OCR:
+  - Install `datafog[ocr]` for local image OCR helpers.
+  - URL-based image downloading also needs `datafog[web,ocr]`.
+  - Tesseract usage requires the system `tesseract` binary.
+  - Donut OCR requires `datafog[nlp-advanced,ocr]` and a model already available
+    locally.
+- Spark:
+  - Install `datafog[distributed]` for `SparkService`.
+  - Spark PII UDF helpers also require `datafog[nlp]` and an installed spaCy
+    model.
+  - A Java runtime is required by PySpark.
+
+OCR and Spark are not deprecated. Their broader API and packaging overhaul is
+deferred; the 4.5 goal is to keep them explicit, documented, and isolated from
+the lightweight core path.
+
 ## Backward-Compatible APIs
 
 The existing public API remains available.
diff --git a/docs/cli.rst b/docs/cli.rst
index a4c67272..ec452f53 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -7,6 +7,18 @@ Overview
 The main entrypoint for the CLI is through the DataFog client file, defined in :mod:`datafog.client`.
 We use Typer to build the CLI, with each command defined as a separate function.
 
+Core text commands such as ``scan-text``, ``redact-text``, ``replace-text``,
+and ``hash-text`` are the primary 4.5 CLI path. OCR commands remain available
+for existing users, but they are optional:
+
+* Local image OCR requires ``datafog[ocr]`` and any needed system OCR binaries
+  such as Tesseract.
+* URL-based image OCR also requires ``datafog[web,ocr]``.
+* Donut OCR requires ``datafog[nlp-advanced,ocr]`` and a local model.
+
+Spark/distributed workflows are Python SDK surfaces rather than first-path CLI
+commands. Install ``datafog[distributed]`` when using ``SparkService``.
+
 Definitions
 -----------
 .. automodule:: datafog.client
diff --git a/docs/index.rst b/docs/index.rst
index a22af1c5..7e05a867 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -2,7 +2,9 @@
 DataFog Documentation
 =====================
 
-DataFog is an open-source tool for PII detection and anonymization of unstructured data. This documentation covers the CLI and Python SDK.
+DataFog is an open-source tool for lightweight text PII detection and
+anonymization. The core install focuses on fast regex-based scanning and
+redaction, with optional extras for NLP, OCR, and Spark-style workflows.
 
 .. toctree::
    :maxdepth: 2
@@ -10,6 +12,7 @@ DataFog is an open-source tool for PII detection and anonymization of unstructur
    important-concepts
    cli
    python-sdk
+   optional-surfaces
    definitions
    roadmap
    v44-bridge-release
@@ -24,13 +27,14 @@ Getting Started
 Installation
 ------------
 
-Install DataFog via pip:
+Install the lightweight text screening core via pip:
 
 .. code-block:: bash
 
     pip install datafog
 
-This installs the latest stable version with CLI support.
+Optional extras such as ``nlp``, ``nlp-advanced``, ``ocr``, ``distributed``,
+and ``web`` are installed only when you need those surfaces.
 
 ---------------------
 CLI Usage
@@ -48,19 +52,21 @@ Scan text for PII:
 
     datafog scan-text "Your text here"
 
-Extract text from image:
+Image/OCR commands are optional. Local OCR requires ``datafog[ocr]``; URL-based
+image downloading requires ``datafog[web,ocr]``.
 
 .. code-block:: bash
 
     datafog scan-image "path/to/image.png" --operations extract
 
-Scan for PII in image:
+Scan for PII in image text:
 
 .. code-block:: bash
 
     datafog scan-image "path/to/image.png" --operations scan
 
-For more information on the CLI, see :doc:`cli`.
+For more information on optional OCR and Spark surfaces, see
+:doc:`optional-surfaces`.
 
 ---------------------
 Python SDK Usage
@@ -71,22 +77,14 @@ Scan text for PII:
 .. code-block:: python
 
    
-   import requests
-   from datafog import DataFog
-
-   # For text annotation
-   client = DataFog(operations="scan")
+   import datafog
 
-   # Fetch sample medical record
-   doc_url = "https://gist.githubusercontent.com/sidmohan0/b43b72693226422bac5f083c941ecfdb/raw/b819affb51796204d59987893f89dee18428ed5d/note1.txt"
-   response = requests.get(doc_url)
-   text_lines = [line for line in response.text.splitlines() if line.strip()]
+   text = "Contact jane@example.com or call 415-555-1212"
+   result = datafog.scan(text, engine="regex")
+   print(result.entities)
+   print(datafog.redact(text, engine="regex").redacted_text)
 
-   # Run annotation
-   annotations = client.run_text_pipeline_sync(str_list=text_lines)
-   print(annotations)
-    
-Scan image for PII:
+Run OCR and then scan extracted text only when the OCR extra is installed:
 
 .. code-block:: python
 
diff --git a/docs/optional-surfaces.rst b/docs/optional-surfaces.rst
new file mode 100644
index 00000000..aa9b725e
--- /dev/null
+++ b/docs/optional-surfaces.rst
@@ -0,0 +1,109 @@
+=========================
+Optional OCR And Spark
+=========================
+
+DataFog 4.5 keeps the core package focused on lightweight text PII screening.
+The default path is:
+
+.. code-block:: bash
+
+   pip install datafog
+
+.. code-block:: python
+
+   import datafog
+
+   result = datafog.redact("Email jane@example.com", engine="regex")
+   print(result.redacted_text)
+
+OCR and Spark are supported optional surfaces. They are useful for image and
+distributed workflows, but they are not required for the core install,
+package import, text scanning, text redaction, or guardrail helpers.
+
+OCR
+---
+
+Use OCR when you need to extract text from images before running PII detection.
+
+Install local OCR support:
+
+.. code-block:: bash
+
+   pip install "datafog[ocr]"
+
+Install URL-based image download support:
+
+.. code-block:: bash
+
+   pip install "datafog[web,ocr]"
+
+Install Donut OCR support:
+
+.. code-block:: bash
+
+   pip install "datafog[nlp-advanced,ocr]"
+
+Notes:
+
+* Tesseract usage requires the system ``tesseract`` binary in addition to the
+  Python extra.
+* Donut OCR requires a model that is already available locally. DataFog should
+  not download models implicitly during normal runtime usage.
+* OCR is not deprecated. A broader OCR API and packaging overhaul is deferred
+  beyond the 4.5 focus release.
+
+Example local OCR flow:
+
+.. code-block:: python
+
+   import asyncio
+   from datafog.services.image_service import ImageService
+
+   async def main():
+       service = ImageService(use_tesseract=True, use_donut=False)
+       extracted = await service.ocr_extract(["./invoice.png"])
+       print(extracted)
+
+   asyncio.run(main())
+
+Spark
+------
+
+Use Spark when you need distributed processing around DataFog PII detection.
+
+Install Spark support:
+
+.. code-block:: bash
+
+   pip install "datafog[distributed]"
+
+Install Spark PII UDF helper support:
+
+.. code-block:: bash
+
+   pip install "datafog[distributed,nlp]"
+
+Notes:
+
+* ``SparkService`` requires PySpark and a Java runtime.
+* Spark PII UDF helpers also require spaCy and an installed spaCy model.
+* Spark is not deprecated. A broader Spark overhaul is deferred beyond the 4.5
+  focus release.
+
+Example local Spark flow:
+
+.. code-block:: python
+
+   from datafog.services.spark_service import SparkService
+
+   service = SparkService(master="local[1]")
+   rows = service.read_json("./records.json")
+   print(rows)
+
+Core-path verification
+----------------------
+
+The repository includes tests that block optional dependency imports while
+importing ``datafog`` and running the default text helpers. These checks verify
+that OCR, Spark, NLP, model-loading, and web dependencies are not required for
+the core path.
diff --git a/docs/python-sdk.rst b/docs/python-sdk.rst
index dbf1982d..a1adec5d 100644
--- a/docs/python-sdk.rst
+++ b/docs/python-sdk.rst
@@ -4,8 +4,46 @@ DataFog Python SDK
 
 Overview
 --------
-The main entrypoint for the SDK is through the DataFog class, defined in :mod:`datafog.main`.
-Here you can initialize the different services, including TextService, ImageService, and SparkService.
+The primary 4.5 SDK path is lightweight text PII screening through the
+top-level ``datafog`` helpers. These helpers use the regex engine by default
+and do not require OCR, Spark, model downloads, or distributed dependencies.
+
+.. code-block:: python
+
+   import datafog
+
+   text = "Contact jane@example.com or call 415-555-1212"
+
+   scan_result = datafog.scan(text, engine="regex")
+   print(scan_result.entities)
+
+   redact_result = datafog.redact(text, engine="regex")
+   print(redact_result.redacted_text)
+
+   print(datafog.sanitize(text))
+
+The backward-compatible ``DataFog`` and ``TextService`` classes remain
+available for existing users. ``TextService(engine="regex")`` is the
+dependency-light service path; ``spacy``, ``gliner``, ``smart``, OCR, and Spark
+surfaces require their explicit extras.
+
+Optional services
+-----------------
+
+OCR and Spark are supported optional surfaces, not the primary 4.5 path:
+
+* Use ``datafog[ocr]`` for local OCR helpers such as ``ImageService`` and
+  ``PytesseractProcessor``.
+* Use ``datafog[web,ocr]`` when OCR inputs must be downloaded from URLs.
+* Use ``datafog[nlp-advanced,ocr]`` for Donut OCR, with the model already
+  available locally.
+* Use ``datafog[distributed]`` for ``SparkService``.
+* Use ``datafog[distributed,nlp]`` plus an installed spaCy model for Spark PII
+  UDF helpers.
+
+OCR and Spark are not deprecated. Their broader overhaul is deferred so the
+4.5 release can keep the core package tight while preserving existing optional
+usage. See :doc:`optional-surfaces` for install notes and limitations.
 
 Definitions
 -----------
diff --git a/docs/roadmap.rst b/docs/roadmap.rst
index acf8b6a0..9db4d433 100644
--- a/docs/roadmap.rst
+++ b/docs/roadmap.rst
@@ -134,13 +134,29 @@ All features will remain backward compatible with the lightweight architecture.
 
 4.5.0
 ------
-Version ``4.5.0`` will introduce:
-
-* **Enterprise features** in dedicated extras
-* **Advanced analytics** for PII detection patterns
-* **Multi-language support** for international PII types
-* **Cloud integration** helpers for AWS, GCP, Azure
-* **Performance monitoring** and metrics collection
+Version ``4.5.0`` is a focus release for lightweight text PII screening. It
+should make the core package easier to install, reason about, test, and use
+before larger v5 middleware work.
+
+4.5.0 should focus on:
+
+* Core text scanning, redaction, and guardrail helpers that stay dependency
+  light by default.
+* Clear install-profile documentation for core, NLP, OCR, Spark, CLI, and web
+  surfaces.
+* OCR and Spark as supported optional surfaces, not the main 4.5 adoption path.
+* Documentation cleanup so users and contributors can find the current package
+  story without reading historical planning material first.
+* German PII regex support if the external PR passes review and does not
+  compromise core precision.
+
+Deferred beyond 4.5.0:
+
+* Full middleware adapters for Sentry, OpenTelemetry, logging frameworks, or
+  cloud DLP services.
+* OCR architecture overhaul.
+* Spark architecture overhaul.
+* Enterprise dashboards and analytics.
 
-The lightweight core will remain unchanged, ensuring existing
-integrations continue to work without modification.
+The lightweight core remains the first path; optional surfaces should stay
+explicit and isolated from default import, scan, redact, and guardrail usage.
diff --git a/tests/test_runtime_dependency_safety.py b/tests/test_runtime_dependency_safety.py
index 9410ddc6..d34ceb8a 100644
--- a/tests/test_runtime_dependency_safety.py
+++ b/tests/test_runtime_dependency_safety.py
@@ -1,4 +1,6 @@
 import importlib
+import os
+import subprocess
 import sys
 import types
 from pathlib import Path
@@ -6,6 +8,20 @@
 import pytest
 
 
+def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]:
+    env = dict(os.environ)
+    env["PYTHONPATH"] = str(Path.cwd())
+    env["DATAFOG_NO_TELEMETRY"] = "1"
+    env["DO_NOT_TRACK"] = "1"
+    return subprocess.run(
+        [sys.executable, "-c", script],
+        check=True,
+        env=env,
+        text=True,
+        capture_output=True,
+    )
+
+
 def test_runtime_code_does_not_install_packages() -> None:
     blocked_snippets = [
         "subprocess.check_call",
@@ -25,6 +41,43 @@ def test_runtime_code_does_not_install_packages() -> None:
     assert offenders == []
 
 
+def test_ocr_and_spark_public_services_do_not_require_optional_imports() -> None:
+    _run_isolated_python(
+        """
+import importlib.abc
+import sys
+
+blocked = {
+    "aiohttp",
+    "certifi",
+    "PIL",
+    "pyspark",
+    "pytesseract",
+    "torch",
+    "transformers",
+}
+
+class BlockOptionalImports(importlib.abc.MetaPathFinder):
+    def find_spec(self, fullname, path=None, target=None):
+        if fullname.split(".", 1)[0] in blocked:
+            raise AssertionError(f"optional dependency imported: {fullname}")
+        return None
+
+sys.meta_path.insert(0, BlockOptionalImports())
+
+import datafog
+from datafog.services import ImageService, SparkService, TextService
+
+assert datafog.scan("Email jane@example.com").entities
+assert ImageService is not None
+assert SparkService is not None
+assert TextService is not None
+assert datafog.ImageService is ImageService
+assert datafog.SparkService is SparkService
+"""
+    )
+
+
 def test_spacy_pii_missing_model_requires_explicit_download(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:

From 6f57af2dd14bafd9c7403acf8d4ee6041449f4bb Mon Sep 17 00:00:00 2001
From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com>
Date: Tue, 26 May 2026 19:51:59 +0200
Subject: [PATCH 4/4] docs: make v4.5 core story obvious

Refs DFPY-72
---
 README.md                 |   8 +++
 docs/getting-started.rst  | 112 +++++++++++++++++++++++++++++++++++
 docs/index.rst            | 120 ++++++++++----------------------------
 docs/planning-history.rst |  30 ++++++++++
 4 files changed, 181 insertions(+), 89 deletions(-)
 create mode 100644 docs/getting-started.rst
 create mode 100644 docs/planning-history.rst

diff --git a/README.md b/README.md
index 2ff68cd2..3f2473e9 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,14 @@ It provides:
 - A simple agent-oriented API for LLM applications
 - Backward-compatible `DataFog` and `TextService` classes
 
+## 4.5 Focus
+
+DataFog 4.5 is focused on lightweight text PII screening: a small core install,
+fast regex-based scan/redact helpers, explicit optional extras, and a clearer
+path toward future middleware use cases. Dedicated Sentry, OpenTelemetry,
+logging-framework, and cloud DLP adapters are future-facing work and are not
+part of the 4.5 release.
+
 ## Installation
 
 ```bash
diff --git a/docs/getting-started.rst b/docs/getting-started.rst
new file mode 100644
index 00000000..69637611
--- /dev/null
+++ b/docs/getting-started.rst
@@ -0,0 +1,112 @@
+================================
+Getting Started With DataFog 4.5
+================================
+
+DataFog 4.5 focuses on lightweight text PII screening. A core install should
+let you scan and redact common structured PII without installing OCR, Spark,
+large NLP models, or middleware integrations.
+
+Install Profiles
+================
+
+Core text screening:
+
+.. code-block:: bash
+
+   pip install datafog
+
+Optional extras are explicit:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Profile
+     - Install command
+     - Use when
+   * - Core
+     - ``pip install datafog``
+     - You need regex-based text scanning, redaction, and guardrail helpers.
+   * - NLP
+     - ``pip install "datafog[nlp]"``
+     - You need spaCy-backed named entity recognition.
+   * - Advanced NLP
+     - ``pip install "datafog[nlp-advanced]"``
+     - You need GLiNER-backed named entity recognition.
+   * - OCR
+     - ``pip install "datafog[ocr]"``
+     - You need local image text extraction before PII scanning.
+   * - OCR from URLs
+     - ``pip install "datafog[web,ocr]"``
+     - You need DataFog to download image inputs before OCR.
+   * - Spark
+     - ``pip install "datafog[distributed]"``
+     - You need the optional ``SparkService`` surface.
+   * - Everything
+     - ``pip install "datafog[all]"``
+     - You are developing or deliberately want every optional surface.
+
+Python Usage
+============
+
+Use the top-level helpers for the 4.5 core path:
+
+.. code-block:: python
+
+   import datafog
+
+   text = "Contact jane@example.com or call 415-555-1212"
+
+   scan_result = datafog.scan(text, engine="regex")
+   print(scan_result.entities)
+
+   redact_result = datafog.redact(text, engine="regex")
+   print(redact_result.redacted_text)
+
+   print(datafog.sanitize("Card: 4111-1111-1111-1111"))
+
+Agent-oriented helpers use the same lightweight text path:
+
+.. code-block:: python
+
+   import datafog
+
+   prompt = "My SSN is 123-45-6789"
+   scan_result = datafog.scan_prompt(prompt, engine="regex")
+
+   if scan_result.entities:
+       print("PII detected before sending the prompt")
+
+   output = "Email me at jane.doe@example.com"
+   safe_output = datafog.filter_output(output, engine="regex")
+   print(safe_output.redacted_text)
+
+CLI Usage
+=========
+
+The CLI core path is text-first:
+
+.. code-block:: bash
+
+   datafog scan-text "Contact jane@example.com"
+   datafog redact-text "Contact jane@example.com"
+   datafog replace-text "Contact jane@example.com"
+   datafog hash-text "Contact jane@example.com"
+
+Image commands are optional. Install ``datafog[ocr]`` for local OCR and
+``datafog[web,ocr]`` when the CLI needs to download image inputs.
+
+What 4.5 Is Not
+===============
+
+DataFog 4.5 prepares the package for future middleware use cases, but it does
+not ship dedicated Sentry, OpenTelemetry, logging-framework, or cloud DLP
+adapters. Those integrations are planned future work that will build on the
+same core text screening path.
+
+Next Pages
+==========
+
+* :doc:`python-sdk` documents the Python API surface.
+* :doc:`cli` documents command-line usage.
+* :doc:`optional-surfaces` documents OCR and Spark install notes.
+* :doc:`roadmap` explains how 4.5 leads toward later middleware work.
diff --git a/docs/index.rst b/docs/index.rst
index 7e05a867..2c4dab59 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -2,105 +2,47 @@
 DataFog Documentation
 =====================
 
-DataFog is an open-source tool for lightweight text PII detection and
-anonymization. The core install focuses on fast regex-based scanning and
-redaction, with optional extras for NLP, OCR, and Spark-style workflows.
+DataFog 4.5 is a lightweight text PII screening package for Python. The
+primary path is a small core install, fast regex-based scanning and redaction,
+agent-friendly guardrail helpers, and explicit optional extras when you need
+NLP, OCR, Spark, or web inputs.
+
+Start with :doc:`getting-started` if you want the shortest route from install
+to scanning text. The roadmap and historical planning pages remain available,
+but the current user guides are the recommended starting point for 4.5.
+
+Use DataFog 4.5
+===============
 
 .. toctree::
    :maxdepth: 2
+   :caption: Use DataFog 4.5
 
-   important-concepts
-   cli
+   getting-started
    python-sdk
+   cli
    optional-surfaces
-   definitions
-   roadmap
-   v44-bridge-release
-   v5-product-brief
-   v5-compatibility-matrix
-   v5-cut-line
-
-=====================
-Getting Started
-=====================
-
-Installation
-------------
-
-Install the lightweight text screening core via pip:
-
-.. code-block:: bash
-
-    pip install datafog
-
-Optional extras such as ``nlp``, ``nlp-advanced``, ``ocr``, ``distributed``,
-and ``web`` are installed only when you need those surfaces.
-
----------------------
-CLI Usage
----------------------
-
-For a list of available operations, run:
-
-.. code-block:: bash
-
-    datafog --help
-
-Scan text for PII:
-
-.. code-block:: bash
-
-    datafog scan-text "Your text here"
-
-Image/OCR commands are optional. Local OCR requires ``datafog[ocr]``; URL-based
-image downloading requires ``datafog[web,ocr]``.
-
-.. code-block:: bash
-
-    datafog scan-image "path/to/image.png" --operations extract
-
-Scan for PII in image text:
-
-.. code-block:: bash
-
-    datafog scan-image "path/to/image.png" --operations scan
-
-For more information on optional OCR and Spark surfaces, see
-:doc:`optional-surfaces`.
-
----------------------
-Python SDK Usage
----------------------
-
-Scan text for PII:
-
-.. code-block:: python
-
-   
-   import datafog
-
-   text = "Contact jane@example.com or call 415-555-1212"
-   result = datafog.scan(text, engine="regex")
-   print(result.entities)
-   print(datafog.redact(text, engine="regex").redacted_text)
+   important-concepts
 
-Run OCR and then scan extracted text only when the OCR extra is installed:
+Reference
+=========
 
-.. code-block:: python
+.. toctree::
+   :maxdepth: 2
+   :caption: Reference
 
-   
-   import asyncio
-   from datafog import DataFog
+   definitions
 
-   # For OCR and PII annotation
-   ocr_client = DataFog(operations="extract,scan")
+Planning And History
+====================
 
-   async def run_ocr_pipeline_demo():
-       image_url = "https://s3.amazonaws.com/thumbnails.venngage.com/template/dc377004-1c2d-49f2-8ddf-d63f11c8d9c2.png"
-       results = await ocr_client.run_ocr_pipeline(image_urls=[image_url])
-       print("OCR Pipeline Results:", results)
+The pages below document release planning, migration history, and future
+direction. They are useful context, but they are secondary to the live 4.5
+usage path above.
 
-   # Run the async function
-   asyncio.run(run_ocr_pipeline_demo())
+.. toctree::
+   :maxdepth: 1
+   :caption: Planning and history
 
-For detailed information on the Python SDK, see :doc:`python-sdk`.
+   roadmap
+   planning-history
diff --git a/docs/planning-history.rst b/docs/planning-history.rst
new file mode 100644
index 00000000..20025573
--- /dev/null
+++ b/docs/planning-history.rst
@@ -0,0 +1,30 @@
+====================
+Planning And History
+====================
+
+These pages and artifacts are preserved for context, but they are not the
+recommended starting point for using DataFog 4.5. Start with
+:doc:`getting-started` for the current user documentation.
+
+Release Planning
+================
+
+.. toctree::
+   :maxdepth: 1
+
+   v44-bridge-release
+   v5-product-brief
+   v5-compatibility-matrix
+   v5-cut-line
+
+Audit Artifacts
+===============
+
+Historical audit notes remain available in the repository for maintainers who
+need the detailed background:
+
+* :download:`Reconnaissance notes <audit/00-reconnaissance.md>`
+* :download:`Coverage baseline <audit/01-coverage-baseline.md>`
+* :download:`Detection accuracy review <audit/02-detection-accuracy.md>`
+* :download:`Architecture review <audit/03-architecture-review.md>`
+* :download:`Final coverage notes <audit/06-final-coverage.md>`