diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 463cf894..0f1713f9 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.3.0 +current_version = 4.4.0a5 commit = True tag = True tag_name = v{new_version} @@ -20,7 +20,3 @@ values = [bumpversion:file:datafog/__about__.py] search = __version__ = "{current_version}" replace = __version__ = "{new_version}" - -[bumpversion:file:setup.py] -search = version="{current_version}" -replace = version="{new_version}" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2f62eff9..bb9f8105 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ error_log.txt # Environment .env .venv +.venv*/ venv/ env/ examples/venv/ @@ -58,14 +59,14 @@ docs/* !docs/conf.py !docs/Makefile !docs/make.bat +!docs/agents/ +!docs/agents/** !docs/audit/ !docs/audit/** # Keep all directories but ignore their contents */**/__pycache__/ -# Keep all files but ignore their contents -Claude.md notes/benchmarking_notes.md Roadmap.md notes/* diff --git a/Claude.md b/AGENTS.md similarity index 82% rename from Claude.md rename to AGENTS.md index dcbe7934..a46c4402 100644 --- a/Claude.md +++ b/AGENTS.md @@ -1,18 +1,26 @@ -# DataFog - Claude Development Guide +# DataFog - Agent Development Guide ## Project Overview + **DataFog** is an open-source Python library for PII detection and anonymization with a focus on speed and lightweight architecture. ## Core Value Proposition + - **Ultra-Fast Performance**: 190x faster than spaCy for structured PII, 32x faster with GLiNER - **Lightweight Core**: <2MB package with optional ML extras - **Modern Engine Options**: Regex, GLiNER, spaCy, and smart cascading - **Production Ready**: Comprehensive testing, CI/CD, and performance validation ## Current Project Status -**Version: 4.3.0** + +**Stable version: 4.4.0** + +**Development version: 4.4.0a5** + +**Next minor target: 4.5.0** ### ✅ Recently Completed (Latest) + - **GLiNER Integration**: Modern NER engine with PII-specialized models - **Smart Cascading**: Intelligent regex → GLiNER → spaCy progression - **Enhanced CLI**: Model management with `--engine` flags @@ -43,6 +51,7 @@ python -c "from datafog.services.text_service import TextService; print('✅ All ## Architecture Overview ### Engine Ecosystem (Updated with GLiNER) + ```python from datafog.services.text_service import TextService @@ -59,21 +68,23 @@ auto_service = TextService(engine="auto") # Legacy: regex→spaCy ``` ### Performance Comparison (Validated) -| Engine | Speed vs spaCy | Accuracy | Use Case | Install | -|---------|----------------|----------|----------|---------| -| `regex` | **190x faster** | High (structured) | Emails, phones, SSNs | Core only | -| `gliner` | **32x faster** | Very High | Modern NER, custom entities | `[nlp-advanced]` | -| `spacy` | 1x (baseline) | Good | Traditional NLP | `[nlp]` | -| `smart` | **60x faster** | Highest | Best balance | `[nlp-advanced]` | + +| Engine | Speed vs spaCy | Accuracy | Use Case | Install | +| -------- | --------------- | ----------------- | --------------------------- | ---------------- | +| `regex` | **190x faster** | High (structured) | Emails, phones, SSNs | Core only | +| `gliner` | **32x faster** | Very High | Modern NER, custom entities | `[nlp-advanced]` | +| `spacy` | 1x (baseline) | Good | Traditional NLP | `[nlp]` | +| `smart` | **60x faster** | Highest | Best balance | `[nlp-advanced]` | ### Dependency Strategy + ```python # Lightweight core (<2MB) pip install datafog # Optional ML engines pip install datafog[nlp] # spaCy (traditional NLP) -pip install datafog[nlp-advanced] # GLiNER (modern NER) +pip install datafog[nlp-advanced] # GLiNER (modern NER) pip install datafog[ocr] # Image processing pip install datafog[all] # Everything ``` @@ -81,15 +92,18 @@ pip install datafog[all] # Everything ## GLiNER Integration (NEW) ### Overview + GLiNER (Generalist Model for Named Entity Recognition) provides modern, accurate NER capabilities optimized for PII detection. ### Key Features + - **PII-Specialized Models**: `urchade/gliner_multi_pii-v1` trained specifically for PII - **Custom Entity Types**: Configurable entity detection beyond default PII types - **Smart Cascading**: Automatically tries regex first, GLiNER second, spaCy last - **CLI Management**: Download and manage GLiNER models via CLI ### Usage Examples + ```python # GLiNER engine from datafog.services.text_service import TextService @@ -108,6 +122,7 @@ subprocess.run(["datafog", "list-models", "--engine", "gliner"]) ``` ### Available GLiNER Models + - `urchade/gliner_multi_pii-v1` - PII-specialized (recommended) - `urchade/gliner_base` - General purpose starter - `urchade/gliner_large-v2` - Higher accuracy @@ -116,17 +131,19 @@ subprocess.run(["datafog", "list-models", "--engine", "gliner"]) ## Development Workflow ### Git Branch Strategy + - **main**: Production releases only - **dev**: Main development branch (use this) -- **feature/***: New features from dev -- **fix/***: Bug fixes from dev +- **feature/\***: New features from dev +- **fix/\***: Bug fixes from dev ### Making Changes + ```bash # Start from dev git checkout dev && git pull origin dev -# Create feature branch +# Create feature branch git checkout -b feature/your-change # Make changes, test, commit @@ -137,6 +154,7 @@ git push -u origin feature/your-change ``` ### Testing + ```bash # Run specific test suites pytest tests/test_text_service.py -v # Core functionality @@ -149,13 +167,14 @@ PYTEST_DONUT=yes pytest tests/test_ocr_integration.py # OCR with real models # Performance requirements # - Regex: 150x+ faster than spaCy -# - GLiNER: 25x+ faster than spaCy +# - GLiNER: 25x+ faster than spaCy # - Package size: Core <2MB, full <8MB ``` ## Key Implementation Patterns ### Simple API (Recommended) + ```python # Always available, lightweight from datafog import detect, process @@ -164,6 +183,7 @@ result = process("john@example.com", method="redact") ``` ### Advanced Engine Selection + ```python # For specialized use cases from datafog.services.text_service import TextService @@ -173,7 +193,7 @@ service = TextService(engine="regex") # Modern NER with custom entities service = TextService( - engine="gliner", + engine="gliner", gliner_model="urchade/gliner_base" ) @@ -182,6 +202,7 @@ service = TextService(engine="smart") ``` ### Graceful Degradation + ```python # Handles missing dependencies elegantly try: @@ -194,18 +215,21 @@ except ImportError: ## Common Tasks ### Adding New Entity Types + 1. Update regex patterns in `regex_annotator.py` 2. Add GLiNER entity types in `gliner_annotator.py` 3. Update tests and benchmarks 4. Validate performance doesn't regress >10% ### Performance Optimization + 1. Profile with existing benchmarks 2. Maintain speed thresholds (regex 150x+, GLiNER 25x+) 3. Update baselines when making improvements 4. Test across all engines ### CLI Enhancements + 1. Update `client.py` with new commands 2. Support `--engine` flag for multi-engine commands 3. Add comprehensive help text and examples @@ -215,31 +239,36 @@ except ImportError: ### Workflow Architecture (3 workflows) -| Workflow | Purpose | Trigger | -|----------|---------|---------| -| `ci.yml` | Lint + Test + Coverage + Wheel size | Push/PR to main/dev | -| `release.yml` | Alpha/Beta/Stable publishing | Schedule + manual dispatch | -| `benchmark.yml` | Performance benchmarks | Push/PR/weekly | +| Workflow | Purpose | Trigger | +| --------------- | ----------------------------------- | -------------------------- | +| `ci.yml` | Lint + Test + Coverage + Wheel size | Push/PR to main/dev | +| `release.yml` | Alpha/Beta/Stable publishing | Schedule + manual dispatch | +| `benchmark.yml` | Performance benchmarks | Push/PR/weekly | ### Release Cadence + - **Alpha** (Mon-Wed 2AM UTC): Automatic from `dev`, date+commit versioning - **Beta** (Thursday 2AM UTC): Automatic from `dev`, incremental beta numbers - **Stable** (manual dispatch): From `main`, base version or override ### Release Pipeline + `determine-release` → `test` → `publish` → `cleanup` + - Tests are a hard gate — no tests = no publish - Stable releases check out `main`; alpha/beta check out `dev` - Old alphas pruned to 7, betas to 5 - `[skip ci]` in version bump commits to prevent loops ### Pre-commit Hooks + - **isort**, **black**, **flake8**, **ruff**: Code formatting and linting - **prettier**: Markdown, JSON, YAML formatting - **gitleaks**: Secret scanning - **pre-commit-hooks**: Large file checks, merge conflict detection, YAML validation ## Environment Variables + ```bash # Testing configuration export PYTEST_DONUT=yes # Enable real OCR testing @@ -250,33 +279,51 @@ export PYTHONPATH=$(pwd) # Local development imports ``` ## Performance Requirements + - **Core Package**: <2MB (from ~8MB in v4.0.x) - **Regex Engine**: 150x+ faster than spaCy (currently 190x) -- **GLiNER Engine**: 25x+ faster than spaCy (currently 32x) +- **GLiNER Engine**: 25x+ faster than spaCy (currently 32x) - **Memory Usage**: Graceful handling of large texts (1MB+ chunks) - **Model Loading**: Cache GLiNER models to avoid repeated downloads -## Best Practices for Claude Agents +## Agent skills + +### Issue tracker + +Issues and PRDs are tracked in Linear under the DFPY team. See `docs/agents/issue-tracker.md`. + +### Triage labels + +Use the default five-label triage vocabulary. See `docs/agents/triage-labels.md`. + +### Domain docs + +Single-context repo: use root `CONTEXT.md` and root `docs/adr/` when present. See `docs/agents/domain.md`. + +## Best Practices for Agents Before beginning any task please checkout a branch from `dev` and create a pull request to `dev`. ### Code Quality + - Follow existing patterns before implementing new approaches - Add comprehensive tests for all new functionality - Update documentation immediately with code changes - Run benchmarks for any text processing modifications ### GLiNER Development + - Use PII-specialized models when available (`urchade/gliner_multi_pii-v1`) - Test graceful degradation when GLiNER dependencies missing - Validate smart cascading thresholds with real data - Consider model download time and caching strategies ### Release Preparation + - Alpha/beta releases are automated via `release.yml` schedule - Stable releases: merge `dev` → `main`, then trigger `release.yml` with `stable` type - Use `dry_run: true` to validate before actual publish - Performance validation on realistic data sets -- In Release Notes or Comments, do not reference that it was authored by Claude (all code is anonymously authored) +- In Release Notes or Comments, do not reference that it was authored by an AI agent (all code is anonymously authored) -This guide provides the essential information for DataFog development while maintaining focus on current priorities and recent GLiNER integration work. \ No newline at end of file +This guide provides the essential information for DataFog development while maintaining focus on current priorities and recent GLiNER integration work. diff --git a/datafog/__init__.py b/datafog/__init__.py index e3974ad7..b7d8e4e7 100644 --- a/datafog/__init__.py +++ b/datafog/__init__.py @@ -61,24 +61,6 @@ def _lazy_import_regex_annotator(): globals()["RegexAnnotator"] = RegexAnnotator -# Optional imports with graceful fallback -try: - from .client import app -except ImportError: - app = None - -try: - from .main import DataFog, TextPIIAnnotator -except ImportError: - DataFog = None - TextPIIAnnotator = None - -try: - from .services.text_service import TextService -except ImportError: - TextService = None - - def __getattr__(name: str): """Handle lazy imports for better lightweight performance.""" # Lazy import core models when first accessed @@ -98,46 +80,53 @@ def __getattr__(name: str): _lazy_import_regex_annotator() return globals()[name] - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + elif name in _LAZY_EXPORTS: + module_path, attr_name, extra_name = _LAZY_EXPORTS[name] + try: + module = __import__(module_path, fromlist=[attr_name]) + value = getattr(module, attr_name) + except ImportError: + if extra_name is None: + value = None + else: + def _missing_dependency(*args, **kwargs): + raise ImportError( + f"{name} requires additional dependencies. " + f"Install with: pip install datafog[{extra_name}]" + ) -# Optional heavy features - only import if dependencies available -def _optional_import(name, module_path, extra_name): - """Helper to import optional modules with helpful error messages.""" - try: - module = __import__(module_path, fromlist=[name]) - return getattr(module, name) - except ImportError: - - def _missing_dependency(*args, **kwargs): - raise ImportError( - f"{name} requires additional dependencies. " - f"Install with: pip install datafog[{extra_name}]" - ) + value = _missing_dependency + + globals()[name] = value + return value - return _missing_dependency + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") -# OCR/Image processing - requires 'ocr' extra -DonutProcessor = _optional_import( - "DonutProcessor", "datafog.processing.image_processing.donut_processor", "ocr" -) -PytesseractProcessor = _optional_import( - "PytesseractProcessor", - "datafog.processing.image_processing.pytesseract_processor", - "ocr", -) -ImageService = _optional_import("ImageService", "datafog.services.image_service", "ocr") - -# NLP processing - requires 'nlp' extra -SpacyPIIAnnotator = _optional_import( - "SpacyPIIAnnotator", "datafog.processing.text_processing.spacy_pii_annotator", "nlp" -) - -# Distributed processing - requires 'distributed' extra -SparkService = _optional_import( - "SparkService", "datafog.services.spark_service", "distributed" -) +_LAZY_EXPORTS = { + "app": ("datafog.client", "app", None), + "DataFog": ("datafog.main", "DataFog", None), + "TextPIIAnnotator": ("datafog.main", "TextPIIAnnotator", None), + "TextService": ("datafog.services.text_service", "TextService", None), + "DonutProcessor": ( + "datafog.processing.image_processing.donut_processor", + "DonutProcessor", + "ocr", + ), + "PytesseractProcessor": ( + "datafog.processing.image_processing.pytesseract_processor", + "PytesseractProcessor", + "ocr", + ), + "ImageService": ("datafog.services.image_service", "ImageService", "ocr"), + "SpacyPIIAnnotator": ( + "datafog.processing.text_processing.spacy_pii_annotator", + "SpacyPIIAnnotator", + "nlp", + ), + "SparkService": ("datafog.services.spark_service", "SparkService", "distributed"), +} _REDACT_PRESETS = { diff --git a/docs/agents/domain.md b/docs/agents/domain.md new file mode 100644 index 00000000..4fbc0445 --- /dev/null +++ b/docs/agents/domain.md @@ -0,0 +1,31 @@ +# Domain Docs + +How the engineering skills should consume this repo's domain documentation when exploring the codebase. + +Configured layout: single-context. + +## Before exploring, read these + +- **`CONTEXT.md`** at the repo root. +- **`docs/adr/`** for ADRs that touch the area you're about to work in. + +If any of these files don't exist, proceed silently. Don't flag their absence; don't suggest creating them upfront. The producer skill (`/grill-with-docs`) creates them lazily when terms or decisions actually get resolved. + +## File structure + +```text +/ +|-- CONTEXT.md +|-- docs/adr/ +`-- datafog/ +``` + +## Use the glossary's vocabulary + +When your output names a domain concept in an issue title, refactor proposal, hypothesis, or test name, use the term as defined in `CONTEXT.md`. Don't drift to synonyms the glossary explicitly avoids. + +If the concept you need isn't in the glossary yet, that's a signal: either you're inventing language the project doesn't use, or there's a real gap to note for `/grill-with-docs`. + +## Flag ADR conflicts + +If your output contradicts an existing ADR, surface it explicitly rather than silently overriding. diff --git a/docs/agents/issue-tracker.md b/docs/agents/issue-tracker.md new file mode 100644 index 00000000..b07dfca3 --- /dev/null +++ b/docs/agents/issue-tracker.md @@ -0,0 +1,24 @@ +# Issue tracker: Linear + +Issues and PRDs for this repo live in Linear under the DFPY team: + +https://linear.app/threadfork/team/DFPY/all + +Use the Linear connector/app when available. Do not create GitHub or GitLab issues for this repo unless the user explicitly asks for that. + +## Conventions + +- Create new issues in the DFPY team. +- Use the triage labels mapped in `docs/agents/triage-labels.md`. +- Keep issue titles concise and action-oriented. +- Include enough context, acceptance criteria, and verification notes for an AFK agent or human implementer to pick up the work. +- When referencing code, include repo-relative file paths and relevant symbols. +- When a task comes from a PRD, link related Linear issues together where possible. + +## When a skill says "publish to the issue tracker" + +Create a Linear issue in the DFPY team. + +## When a skill says "fetch the relevant ticket" + +Use the Linear connector/app to read the referenced Linear issue, including description, labels, status, comments, and linked issues. diff --git a/docs/agents/triage-labels.md b/docs/agents/triage-labels.md new file mode 100644 index 00000000..0806b2f8 --- /dev/null +++ b/docs/agents/triage-labels.md @@ -0,0 +1,13 @@ +# Triage Labels + +The skills speak in terms of five canonical triage roles. This file maps those roles to the actual label strings used in this repo's issue tracker. + +| Label in mattpocock/skills | Label in our tracker | Meaning | +| -------------------------- | -------------------- | ---------------------------------------- | +| `needs-triage` | `needs-triage` | Maintainer needs to evaluate this issue | +| `needs-info` | `needs-info` | Waiting on reporter for more information | +| `ready-for-agent` | `ready-for-agent` | Fully specified, ready for an AFK agent | +| `ready-for-human` | `ready-for-human` | Requires human implementation | +| `wontfix` | `wontfix` | Will not be actioned | + +When a skill mentions a role, use the corresponding label string from this table. diff --git a/docs/conf.py b/docs/conf.py index 1cb1c895..d71e76b0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,13 +3,18 @@ # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html +import re +from pathlib import Path + # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "DataFog" copyright = "2024, DataFog Inc." author = "Sid Mohan" -release = "v4.1.1" +_version_file = Path(__file__).resolve().parents[1] / "datafog" / "__about__.py" +_version_match = re.search(r'^__version__ = "([^"]+)"', _version_file.read_text(), re.M) +release = f"v{_version_match.group(1)}" if _version_match else "v0.0.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/setup.py b/setup.py index f84c241a..39f01651 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ # Optional heavy dependencies nlp_deps = [ + "click>=8.0,<9.0", "spacy>=3.7.0,<4.0", ] @@ -57,6 +58,7 @@ ] cli_deps = [ + "click>=8.0,<9.0", "typer>=0.12.0", "pydantic-settings>=2.0.0", ] diff --git a/tests/test_donut_lazy_import.py b/tests/test_donut_lazy_import.py index 80c9ec09..9b2a28f1 100644 --- a/tests/test_donut_lazy_import.py +++ b/tests/test_donut_lazy_import.py @@ -1,23 +1,36 @@ +import os +import subprocess import sys +from pathlib import Path -from datafog.services.image_service import ImageService + +def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]: + env = dict(os.environ) + env["PYTHONPATH"] = str(Path.cwd()) + env["DATAFOG_NO_TELEMETRY"] = "1" + env["DO_NOT_TRACK"] = "1" + return subprocess.run( + [sys.executable, "-c", script], + check=True, + env=env, + text=True, + capture_output=True, + ) def test_no_torch_import_when_donut_disabled(): """Test that torch is not imported when use_donut is False""" - # Remove torch and transformers from sys.modules if they're already imported - if "torch" in sys.modules: - del sys.modules["torch"] - if "transformers" in sys.modules: - del sys.modules["transformers"] + _run_isolated_python( + """ +import sys +from datafog.services.image_service import ImageService - # Create ImageService with use_donut=False - # The variable is used indirectly by creating the service which affects sys.modules - _ = ImageService(use_donut=False, use_tesseract=True) +_ = ImageService(use_donut=False, use_tesseract=True) - # Verify that torch and transformers were not imported - assert "torch" not in sys.modules - assert "transformers" not in sys.modules +assert "torch" not in sys.modules +assert "transformers" not in sys.modules +""" + ) def test_lazy_import_mechanism(): @@ -26,24 +39,16 @@ def test_lazy_import_mechanism(): # to use lazy imports. We don't need to actually test the imports themselves, # just that the structure is correct. - # First, ensure torch and transformers are not in sys.modules - if "torch" in sys.modules: - del sys.modules["torch"] - if "transformers" in sys.modules: - del sys.modules["transformers"] - - # Import the DonutProcessor directly - from datafog.processing.image_processing.donut_processor import DonutProcessor - - # Create a processor instance - processor = DonutProcessor() - - # Verify that torch and transformers were not imported just by creating the processor - assert "torch" not in sys.modules - assert "transformers" not in sys.modules + _run_isolated_python( + """ +import sys +from datafog.processing.image_processing.donut_processor import DonutProcessor - # Verify that the extract_text_from_image method exists - assert hasattr(processor, "extract_text_from_image") +processor = DonutProcessor() - # Runtime package installation helpers should not exist on the processor. - assert not hasattr(processor, "ensure_installed") +assert "torch" not in sys.modules +assert "transformers" not in sys.modules +assert hasattr(processor, "extract_text_from_image") +assert not hasattr(processor, "ensure_installed") +""" + ) diff --git a/tests/test_install_profiles.py b/tests/test_install_profiles.py index e17261be..7222186e 100644 --- a/tests/test_install_profiles.py +++ b/tests/test_install_profiles.py @@ -17,10 +17,13 @@ def test_install_profile_import_surface() -> None: assert datafog.scan("Email jane@example.com").entities assert datafog.redact("Email jane@example.com").redacted_text elif profile == "cli": + import click # noqa: F401 + from datafog.client import app assert app is not None elif profile == "nlp": + import click # noqa: F401 import spacy # noqa: F401 from datafog.models.spacy_nlp import SpacyAnnotator