diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..fb60d65 --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# ── Google Drive integration ──────────────────────────────────────────────── +# Required by scripts/full_pipeline.py (--api mode) and scripts/ci_pipeline.py. +# Obtain from the project's Google Cloud service account (ask the project owner). + +# The full JSON key for the Google service account, as a single-line string. +GOOGLE_SERVICE_ACCOUNT_JSON= + +# The ID of the root Google Drive folder containing 'useful' and 'not-useful' subfolders. +# Found in the folder's URL: drive.google.com/drive/folders/ +GOOGLE_DRIVE_ROOT_FOLDER_ID= + +# Set to "true" if the Drive folder is a shared drive (Team Drive). +# Leave blank or omit for standard My Drive folders. +GOOGLE_DRIVE_USE_SHARED_DRIVE= diff --git a/.github/workflows/working_sw.yml b/.github/workflows/working_sw.yml index dbf8647..8cb055e 100644 --- a/.github/workflows/working_sw.yml +++ b/.github/workflows/working_sw.yml @@ -20,7 +20,7 @@ jobs: sudo apt-get update sudo apt-get install -y tesseract-ocr python -m pip install --upgrade pip - pip install -r requirements.txt + pip install ".[dev]" # CI pipeline (streams 20 PDFs per class from Drive and trains model) - name: CI pipeline (Drive stream) @@ -36,10 +36,10 @@ jobs: uses: actions/cache@v3 with: path: | - src/model/models + src/classifier/models ~/.joblib ~/.sklearn - key: ${{ runner.os }}-sklearn-model-${{ hashFiles('src/model/**/*.py') }} + key: ${{ runner.os }}-sklearn-model-${{ hashFiles('src/classifier/**/*.py') }} restore-keys: | ${{ runner.os }}-sklearn-model- @@ -60,17 +60,17 @@ jobs: - name: Run PDF extraction script (repo test asset) run: | INPUT_PDF="tests/test.pdf" - python src/preprocessing/pdf_text_extraction.py "$INPUT_PDF" + python -m src.io.pdf_text_extraction "$INPUT_PDF" # - name: Generate labels # run: | # echo "Generating labels for extracted text..." - # python src/preprocessing/generate_labels.py + # python src/io/generate_labels.py # - name: Load and preprocess data # run: | # echo "Loading and preprocessing dataset..." - # python src/preprocessing/data_loader.py + # python src/io/data_loader.py # No full pipeline here; use full_training_pipeline.py on main or a scheduled workflow - name: Validate preprocessing pipeline @@ -113,8 +113,9 @@ jobs: with: name: trained-model path: | - src/model/models/pdf_classifier_model.pkl - src/model/models/tfidf_vectorizer.pkl + src/classifier/models/pdf_classifier.json + src/classifier/models/tfidf_vectorizer.pkl + src/classifier/models/label_encoder.pkl - name: Show pipeline summary run: | @@ -124,4 +125,4 @@ jobs: echo "2. Label Generation Results:" ls -l data/labels.json echo "3. Model Training Results:" - ls -l src/model/models/ + ls -l src/classifier/models/ diff --git a/.gitignore b/.gitignore index 2592117..c7bf3fb 100644 --- a/.gitignore +++ b/.gitignore @@ -211,6 +211,7 @@ data/needs-check data/not-useful data/processed-text data/useful -src/model/models/*.pkl +data/results/ +src/classifier/models/*.pkl diff --git a/README.md b/README.md index 9d33fb9..487f58e 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ git clone https://github.com/NovakLabOSU/FracFeedExtractor.git cd FracFeedExtractor python3 -m venv venv source venv/bin/activate -pip install -r requirements.txt +pip install -e ".[dev]" ``` ```bash @@ -134,17 +134,17 @@ git clone https://github.com/NovakLabOSU/FracFeedExtractor.git cd FracFeedExtractor py -m venv venv ./venv/Scripts/activate -pip install -r requirements.txt +pip install -e ".[dev]" ``` ### Quick Start ```bash # Classify and extract from a folder of PDFs -python classify_extract.py path/to/pdfs/ +python src/pipeline/classify_extract.py path/to/pdfs/ # Adjust the LLM model or confidence threshold -python classify_extract.py path/to/pdfs/ --llm-model llama3.1:8b --confidence-threshold 0.70 +python src/pipeline/classify_extract.py path/to/pdfs/ --llm-model qwen2.5:7b --confidence-threshold 0.70 ``` Results are written to `data/results/metrics/` (per-paper JSON) and `data/results/summaries/` (pipeline CSV). diff --git a/data/results/Adams_1989_results.json b/data/results/Adams_1989_results.json deleted file mode 100644 index 70d465e..0000000 --- a/data/results/Adams_1989_results.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "source_file": "Adams_1989.txt", - "metrics": { - "species_name": "Gentoo Penguin", - "study_location": "Marion Island", - "study_date": "1984-1985", - "num_empty_stomachs": null, - "num_nonempty_stomachs": null, - "sample_size": 144, - "fraction_feeding": null, - "source_pages": [ - 1, - 2, - 6 - ] - } -} \ No newline at end of file diff --git a/data/results/Ferreira_1999_results.json b/data/results/Ferreira_1999_results.json deleted file mode 100644 index 8265458..0000000 --- a/data/results/Ferreira_1999_results.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "source_file": "Ferreira_1999.txt", - "metrics": { - "species_name": null, - "study_location": null, - "study_date": null, - "num_empty_stomachs": null, - "num_nonempty_stomachs": null, - "sample_size": null, - "fraction_feeding": null - } -} \ No newline at end of file diff --git a/data/results/Fisher_2008_results.json b/data/results/Fisher_2008_results.json deleted file mode 100644 index f6bc255..0000000 --- a/data/results/Fisher_2008_results.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "source_file": "Fisher_2008.txt", - "metrics": { - "species_name": "Nucella lapillus", - "study_location": "Swans Island, Maine, USA", - "study_date": "2004-2005", - "num_empty_stomachs": null, - "num_nonempty_stomachs": 15, - "sample_size": 225, - "fraction_feeding": 0.0667, - "source_pages": [ - 1, - 2 - ] - } -} \ No newline at end of file diff --git a/data/results/Sousa_2015_results.json b/data/results/Sousa_2015_results.json deleted file mode 100644 index 5aa33c6..0000000 --- a/data/results/Sousa_2015_results.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "source_file": "Sousa_2015.txt", - "metrics": { - "species_name": null, - "study_location": null, - "study_date": null, - "num_empty_stomachs": null, - "num_nonempty_stomachs": null, - "sample_size": null, - "fraction_feeding": null - } -} \ No newline at end of file diff --git a/data/results/classifications.csv b/data/results/classifications.csv deleted file mode 100644 index f192e26..0000000 --- a/data/results/classifications.csv +++ /dev/null @@ -1,11 +0,0 @@ -filename,classification,confidence,model_version,processing_time_seconds,timestamp,text_length,error -Adams_1989.pdf,not-useful,0.35939544439315796,1.0.0,0.5106580257415771,2026-02-02T03:18:04.485316,27601, -Berg_2002.pdf,unknown,0.0,1.0.0,,2026-02-02T03:18:05.431344,,No text extracted from PDF -Dale_2011.pdf,not-useful,0.4046034812927246,1.0.0,0.4978008270263672,2026-02-02T03:18:06.053123,75685, -Fisher_2008.pdf,not-useful,0.35939544439315796,1.0.0,0.32041358947753906,2026-02-02T03:18:06.385193,54637, -Harris_2009.pdf,not-useful,0.3941650390625,1.0.0,0.270871639251709,2026-02-02T03:18:06.659290,48939, -Kerle_2000.pdf,not-useful,0.35939544439315796,1.0.0,0.2882812023162842,2026-02-02T03:18:06.954093,33742, -Marques_2015.pdf,not-useful,0.6508813202381134,1.0.0,0.33272790908813477,2026-02-02T03:18:07.305519,49378, -Pakhomov_1998.pdf,not-useful,0.35939544439315796,1.0.0,0.20516109466552734,2026-02-02T03:18:07.515447,30262, -Sousa_2015.pdf,not-useful,0.37260109186172485,1.0.0,0.35510683059692383,2026-02-02T03:18:07.875944,23718, -Wu_2005.pdf,not-useful,0.3673085570335388,1.0.0,0.8690674304962158,2026-02-02T03:18:08.778555,73418, diff --git a/data/results/classifications.json b/data/results/classifications.json deleted file mode 100644 index 8961d1d..0000000 --- a/data/results/classifications.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "metadata": { - "export_timestamp": "2026-02-02T03:18:08.781554", - "total_files": 10, - "useful_count": 0, - "not_useful_count": 9 - }, - "results": [ - { - "filename": "Adams_1989.pdf", - "classification": "not-useful", - "confidence": 0.35939544439315796, - "model_version": "1.0.0", - "processing_time_seconds": 0.5106580257415771, - "timestamp": "2026-02-02T03:18:04.485316", - "text_length": 27601, - "error": null - }, - { - "filename": "Berg_2002.pdf", - "classification": "unknown", - "confidence": 0.0, - "model_version": "1.0.0", - "processing_time_seconds": null, - "timestamp": "2026-02-02T03:18:05.431344", - "text_length": null, - "error": "No text extracted from PDF" - }, - { - "filename": "Dale_2011.pdf", - "classification": "not-useful", - "confidence": 0.4046034812927246, - "model_version": "1.0.0", - "processing_time_seconds": 0.4978008270263672, - "timestamp": "2026-02-02T03:18:06.053123", - "text_length": 75685, - "error": null - }, - { - "filename": "Fisher_2008.pdf", - "classification": "not-useful", - "confidence": 0.35939544439315796, - "model_version": "1.0.0", - "processing_time_seconds": 0.32041358947753906, - "timestamp": "2026-02-02T03:18:06.385193", - "text_length": 54637, - "error": null - }, - { - "filename": "Harris_2009.pdf", - "classification": "not-useful", - "confidence": 0.3941650390625, - "model_version": "1.0.0", - "processing_time_seconds": 0.270871639251709, - "timestamp": "2026-02-02T03:18:06.659290", - "text_length": 48939, - "error": null - }, - { - "filename": "Kerle_2000.pdf", - "classification": "not-useful", - "confidence": 0.35939544439315796, - "model_version": "1.0.0", - "processing_time_seconds": 0.2882812023162842, - "timestamp": "2026-02-02T03:18:06.954093", - "text_length": 33742, - "error": null - }, - { - "filename": "Marques_2015.pdf", - "classification": "not-useful", - "confidence": 0.6508813202381134, - "model_version": "1.0.0", - "processing_time_seconds": 0.33272790908813477, - "timestamp": "2026-02-02T03:18:07.305519", - "text_length": 49378, - "error": null - }, - { - "filename": "Pakhomov_1998.pdf", - "classification": "not-useful", - "confidence": 0.35939544439315796, - "model_version": "1.0.0", - "processing_time_seconds": 0.20516109466552734, - "timestamp": "2026-02-02T03:18:07.515447", - "text_length": 30262, - "error": null - }, - { - "filename": "Sousa_2015.pdf", - "classification": "not-useful", - "confidence": 0.37260109186172485, - "model_version": "1.0.0", - "processing_time_seconds": 0.35510683059692383, - "timestamp": "2026-02-02T03:18:07.875944", - "text_length": 23718, - "error": null - }, - { - "filename": "Wu_2005.pdf", - "classification": "not-useful", - "confidence": 0.3673085570335388, - "model_version": "1.0.0", - "processing_time_seconds": 0.8690674304962158, - "timestamp": "2026-02-02T03:18:08.778555", - "text_length": 73418, - "error": null - } - ] -} \ No newline at end of file diff --git a/data/results/metrics/Adams_1989_results.json b/data/results/metrics/Adams_1989_results.json deleted file mode 100644 index 851776d..0000000 --- a/data/results/metrics/Adams_1989_results.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "source_file": "Adams_1989.txt", - "file_type": ".txt", - "metrics": { - "species_name": null, - "study_location": "Marion Island, sub-Antarctic", - "study_date": null, - "num_empty_stomachs": null, - "num_nonempty_stomachs": null, - "sample_size": null, - "fraction_feeding": null, - "source_pages": null - } -} \ No newline at end of file diff --git a/data/results/summaries/txt_pipeline_summary_20260227_211255.csv b/data/results/summaries/txt_pipeline_summary_20260227_211255.csv deleted file mode 100644 index b88726f..0000000 --- a/data/results/summaries/txt_pipeline_summary_20260227_211255.csv +++ /dev/null @@ -1,2 +0,0 @@ -filename,raw_chars,cleaned_chars,trimmed_chars,extraction_status,species_name,study_location,study_date,sample_size,num_empty_stomachs,num_nonempty_stomachs,fraction_feeding -Adams_1989.txt,27673,22739,4999,success,,"Marion Island, sub-Antarctic",,,,, diff --git a/data/results/test_biomistral_results.json b/data/results/test_biomistral_results.json deleted file mode 100644 index f327cec..0000000 --- a/data/results/test_biomistral_results.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "source_file": "test.txt", - "metrics": { - "species_name": "Homo sapiens", - "study_location": null, - "study_date": null, - "num_empty_stomachs": null, - "num_nonempty_stomachs": null, - "sample_size": 2000000000000000, - "fraction_feeding": null - } -} \ No newline at end of file diff --git a/data/results/test_quick_results.json b/data/results/test_quick_results.json deleted file mode 100644 index 65978be..0000000 --- a/data/results/test_quick_results.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "source_file": "test.txt", - "metrics": { - "species_name": "Gadus morhua", - "study_location": "North Sea", - "study_date": "2019", - "num_empty_stomachs": 89, - "num_nonempty_stomachs": 253, - "sample_size": 342, - "fraction_feeding": 0.7398 - } -} \ No newline at end of file diff --git a/data/results/test_results.json b/data/results/test_results.json deleted file mode 100644 index ea61fcf..0000000 --- a/data/results/test_results.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "source_file": "test.txt", - "metrics": { - "species_name": null, - "study_location": null, - "study_date": null, - "num_empty_stomachs": null, - "num_nonempty_stomachs": null, - "sample_size": null, - "fraction_feeding": null - } -} \ No newline at end of file diff --git a/documentation/CONTRIBUTING.md b/documentation/CONTRIBUTING.md index 7e95624..f75a6ea 100644 --- a/documentation/CONTRIBUTING.md +++ b/documentation/CONTRIBUTING.md @@ -20,11 +20,10 @@ All contributors must follow the Oregon State University Student Code of Conduct * pip installed * Access to GitHub repository * [Ollama](https://ollama.com) installed and running locally - * Minimum hardware: 8 GB RAM (16 GB recommended for `llama3.1:8b`) + * Minimum hardware: 8 GB RAM (16 GB recommended for `qwen2.5:7b`) * Pull the required models before running the classify/extract pipeline: ```bash - ollama pull llama3.1:8b # default extraction model (~5 GB) - ollama pull qwen2.5:7b # alternative model (~5 GB) + ollama pull qwen2.5:7b # default extraction model (~5 GB) ``` * Verify Ollama is running: `ollama list` * ### Setup Instructions @@ -34,7 +33,7 @@ All contributors must follow the Oregon State University Student Code of Conduct python -m venv venv source venv/bin/activate # Windows: venv\Scripts\activate - pip install -r requirements.txt + pip install -e ".[dev]" ``` * ### Running the application * If you do have the [dataset](https://drive.google.com/drive/u/2/folders/1U3_-TmnXnuBPR9vukkyV-3ITsxPr-nfo) downloaded locally on your machine: @@ -57,20 +56,20 @@ All contributors must follow the Oregon State University Student Code of Conduct ``` * Note: You will need access to the .env file * ### Running the classify/extract pipeline - Use `classify_extract.py` to classify PDFs and extract structured diet data in a single step. - Requires trained model artifacts in `src/model/models/` (run the full pipeline first, + Use `src/pipeline/classify_extract.py` to classify PDFs and extract structured diet data in a single step. + Requires trained model artifacts in `src/classifier/models/` (run the full pipeline first, or see [Retraining the Classifier](#retraining-the-classifier-and-extending-extraction) below). ```bash # Single PDF - python classify_extract.py path/to/file.pdf + python src/pipeline/classify_extract.py path/to/file.pdf # Folder of PDFs (sequential) - python classify_extract.py path/to/pdfs/ + python src/pipeline/classify_extract.py path/to/pdfs/ # All options - python classify_extract.py path/to/pdfs/ \ - --model-dir src/model/models \ - --llm-model llama3.1:8b \ + python src/pipeline/classify_extract.py path/to/pdfs/ \ + --model-dir src/classifier/models \ + --llm-model qwen2.5:7b \ --output-dir data/results \ --confidence-threshold 0.70 \ --max-chars 12000 \ @@ -79,8 +78,8 @@ All contributors must follow the Oregon State University Student Code of Conduct ``` | Flag | Default | Description | |------|---------|-------------| - | `--model-dir` | `src/model/models` | Directory containing classifier artifacts | - | `--llm-model` | `llama3.1:8b` | Ollama model for extraction | + | `--model-dir` | `src/classifier/models` | Directory containing classifier artifacts | + | `--llm-model` | `qwen2.5:7b` | Ollama model for extraction | | `--output-dir` | `data/results` | Destination for JSON results and summary CSV | | `--confidence-threshold` | `0.70` | Probability threshold for "useful" classification | | `--max-chars` | `12000` | Maximum characters sent to the LLM | @@ -264,7 +263,7 @@ dependency update policy, and scanning tools. * Never commit sensitive credentials, tokens, or API keys. * Secrets are stored locally in .env files and excluded via .gitignore. -* Dependencies are managed in requirements.txt. +* Dependencies are managed in `pyproject.toml`. * Use pip-audit monthly to check for vulnerabilities. * Security issues or potential breaches should be reported privately to the Project Manager and TA. @@ -342,27 +341,27 @@ Example entry: ### Retraining the XGBoost Classifier -The classifier artifacts are saved in `src/model/models/`. To retrain with new or updated labeled data: +The classifier artifacts are saved in `src/classifier/models/`. To retrain with new or updated labeled data: 1. **Add labeled text files** to `data/processed-text/` and update `data/labels.json` with `"filename.txt": "useful"` or `"filename.txt": "not useful"` entries. 2. **Run the trainer directly:** ```bash - python src/model/train_model.py + python -m src.classifier.train_model ``` This reads from `data/processed-text/` and `data/labels.json`, trains a TF-IDF + XGBoost model, and saves three artifacts: - - `src/model/models/pdf_classifier.json` - XGBoost model - - `src/model/models/tfidf_vectorizer.pkl` - TF-IDF vectorizer - - `src/model/models/label_encoder.pkl` - LabelEncoder + - `src/classifier/models/pdf_classifier.json` - XGBoost model + - `src/classifier/models/tfidf_vectorizer.pkl` - TF-IDF vectorizer + - `src/classifier/models/label_encoder.pkl` - LabelEncoder 3. **Or run the full pipeline**, which trains the model as a final step: ```bash python scripts/full_pipeline.py --local ``` -Key tunable parameters in `src/model/train_model.py`: +Key tunable parameters in `src/classifier/train_model.py`: - `max_features` in `TfidfVectorizer` (default: 10,000) - `eta`, `max_depth`, `subsample` in the XGBoost `params` dict - `early_stopping_rounds` (default: 20) @@ -371,20 +370,19 @@ Key tunable parameters in `src/model/train_model.py`: Extraction fields are defined in two places: -1. **`src/llm/models.py`** - the `PredatorDietMetrics` Pydantic model. +1. **`src/extraction/models.py`** - the `PredatorDietMetrics` Pydantic model. Add a new optional field with the appropriate type and a `None` default: ```python prey_taxa: Optional[list[str]] = None ``` -2. **`src/llm/llm_client.py`** - the system prompt that instructs the LLM. +2. **`src/extraction/llm_client.py`** - the system prompt that instructs the LLM. Add a description of the new field and its expected format to the prompt string. -3. **`classify_extract.py`** and **`extract-from-txt.py`** - update the `row` dict +3. **`src/pipeline/classify_extract.py`** and **`src/pipeline/extract_from_txt.py`** - update the `row` dict and `fieldnames` list in the summary CSV writer to include the new column. -After adding a field, run `pytest tests/test_llm_text.py` to verify that the prompt -changes do not break existing extraction tests. +After adding a field, run `pytest tests/test_llm_text.py` to verify that the prompt changes do not break existing extraction tests. ## Support & Contact * **Primary Communications**: Slack and Teams diff --git a/pyproject.toml b/pyproject.toml index 42da6f0..4133488 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,39 @@ requires = ["setuptools", "wheel"] build-backend = "setuptools.build_meta" +[project] +name = "fracfeedextractor" +version = "0.1.0" +requires-python = ">=3.10" +dependencies = [ + "pymupdf", + "pillow", + "pytesseract", + "scikit-learn", + "python-dotenv", + "google-auth", + "google-auth-oauthlib", + "google-auth-httplib2", + "numpy", + "google-api-python-client", + "ollama", + "pydantic", + "camelot-py[base]", + "opencv-python", + "pymupdf_layout", + "xgboost", + "pyspellchecker", + "joblib", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "coverage", + "flake8==6.1.0", + "black==25.9.0", +] + [tool.black] line-length = 200 target-version = ["py311"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 443d114..0000000 --- a/requirements.txt +++ /dev/null @@ -1,23 +0,0 @@ -pymupdf -pillow -pytesseract -tesseract -pytest -coverage -flake8==6.1.0 -black==25.9.0 -scikit-learn -dotenv -google-auth -google-auth-oauthlib -google-auth-httplib2 -numpy -google-api-python-client -ollama -pydantic -camelot-py[base] -opencv-python -pymupdf_layout -xgboost -opencv-python -pyspellchecker \ No newline at end of file diff --git a/scripts/ci_pipeline.py b/scripts/ci_pipeline.py index 38670fd..a24eb2f 100644 --- a/scripts/ci_pipeline.py +++ b/scripts/ci_pipeline.py @@ -33,7 +33,7 @@ download_file_bytes, sanitize_filename, ) -from src.preprocessing.pdf_text_extraction import extract_text_from_pdf_bytes +from src.io.pdf_text_extraction import extract_text_from_pdf_bytes def write_labels(labels: Dict[str, str], output_file: Path): @@ -91,7 +91,7 @@ def main(): # Train model on the CI sample print("\nStarting model training on CI sample...") - r = subprocess.run([sys.executable, "src/model/train_model.py"], env={**os.environ, "CI_TRAIN": "1"}) + r = subprocess.run([sys.executable, "-m", "src.classifier.train_model"], env={**os.environ, "CI_TRAIN": "1"}) if r.returncode != 0: print("Model training failed") raise SystemExit(r.returncode) diff --git a/scripts/full_pipeline.py b/scripts/full_pipeline.py index b03bb4c..85aa00e 100644 --- a/scripts/full_pipeline.py +++ b/scripts/full_pipeline.py @@ -17,7 +17,7 @@ - API mode: Streams every PDF (no local PDF persistence) and writes extracted text to data/processed-text. - Local mode: Processes PDFs from specified local directory (expects 'useful' and 'not-useful' subfolders). - Generates labels.json based on folder origin. - - Trains model with src/model/train_model.py. + - Trains model with src/classifier/train_model.py. """ from __future__ import annotations @@ -48,7 +48,7 @@ download_file_bytes, sanitize_filename, ) -from src.preprocessing.pdf_text_extraction import extract_text_from_pdf_bytes +from src.io.pdf_text_extraction import extract_text_from_pdf_bytes # Module-level flag set once per worker process via initializer @@ -263,7 +263,7 @@ def main(): process_api_mode() print("Beginning model training...") - run([sys.executable, "src/model/train_model.py"]) + run([sys.executable, "-m", "src.classifier.train_model"]) print("Training complete.") diff --git a/src/model/models/test.txt b/src/__init__.py similarity index 100% rename from src/model/models/test.txt rename to src/__init__.py diff --git a/src/classifier/__init__.py b/src/classifier/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/model/models/pdf_classifier.json b/src/classifier/models/pdf_classifier.json similarity index 100% rename from src/model/models/pdf_classifier.json rename to src/classifier/models/pdf_classifier.json diff --git a/src/model/pdf_classifier.py b/src/classifier/pdf_classifier.py similarity index 91% rename from src/model/pdf_classifier.py rename to src/classifier/pdf_classifier.py index d43e46f..0066c5c 100644 --- a/src/model/pdf_classifier.py +++ b/src/classifier/pdf_classifier.py @@ -7,7 +7,7 @@ Usage (standalone): python pdf_classifier.py --pdf-path path/to/file.pdf - python pdf_classifier.py --pdf-path path/to/file.pdf --model-dir src/model/models + python pdf_classifier.py --pdf-path path/to/file.pdf --model-dir src/classifier/models python pdf_classifier.py --pdf-path path/to/file.pdf --threshold 0.80 """ @@ -20,8 +20,7 @@ import joblib import xgboost as xgb -sys.path.append(str(Path(__file__).resolve().parents[2])) -from src.preprocessing.pdf_text_extraction import extract_text_from_pdf +from src.io.pdf_text_extraction import extract_text_from_pdf from src.utils.logger import setup_logging import warnings @@ -32,7 +31,7 @@ log = logging.getLogger(__name__) -def load_classifier(model_dir: str = "src/model/models") -> Tuple: +def load_classifier(model_dir: str = "src/classifier/models") -> Tuple: """Load the trained classifier artifacts from disk. Args: @@ -92,7 +91,7 @@ def classify_text( def classify_pdf( pdf_path: str, - model_dir: str = "src/model/models", + model_dir: str = "src/classifier/models", threshold: float = 0.70, ) -> Tuple[str, float, float]: """Convenience wrapper: extract text from a PDF and classify it.""" @@ -123,7 +122,7 @@ def classify_pdf( if __name__ == "__main__": parser = argparse.ArgumentParser(description="Classify a PDF as useful or not useful.") parser.add_argument("--pdf-path", type=str, required=True, help="Path to the PDF file to classify.") - parser.add_argument("--model-dir", type=str, default="src/model/models", help="Directory containing the trained model artifacts (default: src/model/models).") + parser.add_argument("--model-dir", type=str, default="src/classifier/models", help="Directory containing the trained model artifacts (default: src/classifier/models).") parser.add_argument("--threshold", type=float, default=0.70, help="Probability threshold for the 'useful' class (default: 0.70).") args = parser.parse_args() diff --git a/src/model/train_model.py b/src/classifier/train_model.py similarity index 95% rename from src/model/train_model.py rename to src/classifier/train_model.py index a557e4f..9ff7b1f 100644 --- a/src/model/train_model.py +++ b/src/classifier/train_model.py @@ -19,8 +19,6 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder -sys.path.insert(0, str(Path(__file__).resolve().parents[2])) - from src.utils.logger import setup_logging log = logging.getLogger(__name__) @@ -47,7 +45,7 @@ def load_labeled_data(data_dir="data/processed-text", labels_file="data/labels.j return texts, labels, filenames -def train_pdf_classifier(texts, labels, output_dir="src/model/models"): +def train_pdf_classifier(texts, labels, output_dir="src/classifier/models"): if not texts or not labels: print("[ERROR] No training samples found.") @@ -145,7 +143,7 @@ def train_pdf_classifier(texts, labels, output_dir="src/model/models"): setup_logging() texts, labels, _ = load_labeled_data() - result = train_pdf_classifier(texts, labels, "src/model/models") + result = train_pdf_classifier(texts, labels, "src/classifier/models") if result is None: sys.exit(1) print(f"Model trained successfully! Accuracy: {result['accuracy']:.2f}") diff --git a/src/extraction/__init__.py b/src/extraction/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/llm/chunked_extraction.py b/src/extraction/chunked_extraction.py similarity index 87% rename from src/llm/chunked_extraction.py rename to src/extraction/chunked_extraction.py index 5c8238b..6ebb8b6 100644 --- a/src/llm/chunked_extraction.py +++ b/src/extraction/chunked_extraction.py @@ -6,18 +6,15 @@ import xgboost as xgb from pathlib import Path from collections import Counter +from typing import Any, Optional -# Add project root to path -project_root = Path(__file__).parent.parent.parent -sys.path.insert(0, str(project_root)) +from src.extraction.llm_client import extract_metrics_from_text +from src.classifier.pdf_classifier import load_classifier -from src.llm.llm_client import extract_metrics_from_text -from src.model.pdf_classifier import load_classifier - -def chunk_text(text, chunk_size=3000, overlap=300): +def chunk_text(text: str, chunk_size: int = 3000, overlap: int = 300) -> list[str]: """Split text into overlapping chunks.""" - chunks = [] + chunks: list[str] = [] start = 0 while start < len(text): @@ -41,7 +38,7 @@ def chunk_text(text, chunk_size=3000, overlap=300): return chunks -def score_chunk(chunk, model, vectorizer): +def score_chunk(chunk: str, model: xgb.Booster, vectorizer: Any) -> float: """Score a chunk using XGBoost classifier.""" X_vec = vectorizer.transform([chunk]) dtest = xgb.DMatrix(X_vec) @@ -49,14 +46,14 @@ def score_chunk(chunk, model, vectorizer): return float(score) -def merge_results(results): +def merge_results(results: list[Optional[dict[str, Any]]]) -> dict[str, Any]: """Merge extraction results from multiple chunks using voting.""" results = [r for r in results if r is not None] if not results: return {} - merged = {} + merged: dict[str, Any] = {} fields = ['species_name', 'study_location', 'study_date', 'num_empty_stomachs', 'num_nonempty_stomachs', 'sample_size'] for field in fields: @@ -82,14 +79,14 @@ def merge_results(results): def extract_with_chunking( - text, - model_dir="src/model/models", - llm_model="qwen2.5:7b", # Changed from biomistral - num_ctx=8192, - top_n=3, - chunk_size=3000, - overlap=300, -): + text: str, + model_dir: str = "src/classifier/models", + llm_model: str = "qwen2.5:7b", + num_ctx: int = 8192, + top_n: int = 3, + chunk_size: int = 3000, + overlap: int = 300, +) -> dict[str, Any]: """Main extraction with chunking pipeline.""" print(" [CHUNK] Loading classifier...", file=sys.stderr) diff --git a/src/llm/llm_client.py b/src/extraction/llm_client.py similarity index 97% rename from src/llm/llm_client.py rename to src/extraction/llm_client.py index 1103c97..88df016 100644 --- a/src/llm/llm_client.py +++ b/src/extraction/llm_client.py @@ -7,7 +7,7 @@ Usage (standalone): python llm_client.py path/to/file.pdf python llm_client.py path/to/file.txt - python llm_client.py path/to/file.pdf --model llama3.1:8b + python llm_client.py path/to/file.pdf --model qwen2.5:7b python llm_client.py path/to/file.txt --output-dir results/ """ @@ -22,11 +22,8 @@ from ollama import chat -project_root = Path(__file__).parent.parent.parent -sys.path.insert(0, str(project_root)) - -from src.llm.models import PredatorDietMetrics -from src.llm.llm_text import extract_key_sections, load_document +from src.extraction.models import PredatorDietMetrics +from src.extraction.llm_text import extract_key_sections, load_document from src.utils.logger import setup_logging log = logging.getLogger(__name__) @@ -60,7 +57,7 @@ def _call_ollama_with_retry(model, messages, format, options): def extract_metrics_from_text( text: str, - # model: str = "llama3.1:8b", + # model: str = "qwen2.5:7b", model: str = "qwen2.5:7b", num_ctx: int = 8192, _retry: bool = False, @@ -283,7 +280,7 @@ def save_extraction_result( def main(): parser = argparse.ArgumentParser(description="Extract predator diet metrics from PDFs or text files using LLM") parser.add_argument("input_file", type=str, help="Path to the input file (.pdf or .txt)") - # parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)") + # parser.add_argument("--model", type=str, default="qwen2.5:7b", help="Ollama model to use (default: qwen2.5:7b)") parser.add_argument("--model", type=str, default="qwen2.5:7b", help="Ollama model to use (default: qwen2.5:7b)") parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results/metrics)") parser.add_argument("--max-chars", type=int, default=12000, help="Maximum characters of text to send to the model (default: 12000). Reduce if you hit CUDA/OOM errors.") diff --git a/src/llm/llm_text.py b/src/extraction/llm_text.py similarity index 99% rename from src/llm/llm_text.py rename to src/extraction/llm_text.py index 9bc8862..dbc377c 100644 --- a/src/llm/llm_text.py +++ b/src/extraction/llm_text.py @@ -412,7 +412,7 @@ def load_document(file_path: Path) -> str: if suffix == '.pdf': print("[INFO] Reading PDF file...", file=sys.stderr) - from src.preprocessing.pdf_text_extraction import extract_text_from_pdf + from src.io.pdf_text_extraction import extract_text_from_pdf return extract_text_from_pdf(str(file_path)) elif suffix in ['.txt', '.text']: diff --git a/src/llm/models.py b/src/extraction/models.py similarity index 100% rename from src/llm/models.py rename to src/extraction/models.py diff --git a/src/io/__init__.py b/src/io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/preprocessing/data_loader.py b/src/io/data_loader.py similarity index 100% rename from src/preprocessing/data_loader.py rename to src/io/data_loader.py diff --git a/src/preprocessing/generate_labels.py b/src/io/generate_labels.py similarity index 100% rename from src/preprocessing/generate_labels.py rename to src/io/generate_labels.py diff --git a/src/preprocessing/pdf_text_extraction.py b/src/io/pdf_text_extraction.py similarity index 100% rename from src/preprocessing/pdf_text_extraction.py rename to src/io/pdf_text_extraction.py diff --git a/src/preprocessing/section_filter.py b/src/io/section_filter.py similarity index 100% rename from src/preprocessing/section_filter.py rename to src/io/section_filter.py diff --git a/src/preprocessing/text_cleaner.py b/src/io/text_cleaner.py similarity index 100% rename from src/preprocessing/text_cleaner.py rename to src/io/text_cleaner.py diff --git a/src/llm/chunked_biomistral_llm.py b/src/llm/chunked_biomistral_llm.py deleted file mode 100644 index 9064fc1..0000000 --- a/src/llm/chunked_biomistral_llm.py +++ /dev/null @@ -1,197 +0,0 @@ -"""Chunked extraction pipeline — split papers, score chunks, extract from top-N, merge. - -Instead of sending one big trimmed blob to the LLM, this module: - 1. Splits the document into overlapping character-level chunks - 2. Scores each chunk with the XGBoost classifier (higher = more likely "useful" content) - 3. Sends the top-N scoring chunks through the LLM independently - 4. Merges the per-chunk results via majority voting - -This improves recall on long papers where the single-pass trim can miss -data-rich paragraphs buried in the middle of the document. -""" - -import logging -import sys -from collections import Counter -from pathlib import Path -from typing import Dict, List, Optional, Tuple - -project_root = Path(__file__).parent.parent.parent -sys.path.insert(0, str(project_root)) - -import xgboost as xgb - -from src.llm.llm_client import extract_metrics_from_text -from src.llm.models import PredatorDietMetrics -from src.model.pdf_classifier import load_classifier - -log = logging.getLogger(__name__) - -# fields we try to merge across chunks -_MERGE_FIELDS = [ - "species_name", - "study_location", - "study_date", - "num_empty_stomachs", - "num_nonempty_stomachs", - "sample_size", -] - - -def chunk_text( - text: str, - chunk_size: int = 4000, - overlap: int = 500, - min_chunk_len: int = 100, -) -> List[str]: - """Split *text* into overlapping character-level chunks. - - Tries to break at paragraph boundaries first, then sentence boundaries, - so chunks don't start/end mid-word. - """ - chunks = [] - start = 0 - - while start < len(text): - end = start + chunk_size - - # try to snap to a paragraph break - if end < len(text): - para_break = text.rfind("\n\n", start, end) - if para_break > start + chunk_size // 2: - end = para_break - else: - sent_break = text.rfind(". ", start, end) - if sent_break > start + chunk_size // 2: - end = sent_break + 1 - - chunk = text[start:end].strip() - if len(chunk) >= min_chunk_len: - chunks.append(chunk) - - start = end - overlap - - return chunks - - -def score_chunks( - chunks: List[str], - model_dir: str = "src/model/models", -) -> List[Tuple[str, float]]: - """Score each chunk using the XGBoost classifier. - - Returns a list of (chunk, score) sorted by score descending. - Higher score = chunk looks more like a "useful" paper section. - """ - model, vectorizer, _encoder = load_classifier(model_dir) - - scored = [] - for chunk in chunks: - X_vec = vectorizer.transform([chunk]) - dtest = xgb.DMatrix(X_vec) - score = float(model.predict(dtest)[0]) - scored.append((chunk, score)) - - scored.sort(key=lambda x: x[1], reverse=True) - return scored - - -def merge_results(results: List[dict]) -> dict: - """Merge extraction dicts from multiple chunks via majority voting. - - For each field, the value that appears most often across chunks wins. - A confidence score (votes / total) is stored alongside each field. - """ - if not results: - return {} - - merged = {} - for field in _MERGE_FIELDS: - values = [r.get(field) for r in results if r.get(field) is not None] - - if not values: - merged[field] = None - merged[f"{field}_confidence"] = 0.0 - else: - counter = Counter(values) - most_common_val, most_common_count = counter.most_common(1)[0] - merged[field] = most_common_val - merged[f"{field}_confidence"] = round(most_common_count / len(values), 2) - - return merged - - -def extract_with_chunking( - text: str, - model_dir: str = "src/model/models", - llm_model: str = "qwen2.5:7b", - num_ctx: int = 8192, - top_n: int = 3, - chunk_size: int = 4000, - overlap: int = 500, -) -> dict: - """Full chunked extraction pipeline. - - Args: - text: Full document text. - model_dir: Path to XGBoost model artifacts. - llm_model: Ollama model name for extraction. - num_ctx: Context window size for Ollama. - top_n: Number of highest-scoring chunks to extract from. - chunk_size: Character size per chunk. - overlap: Overlap between consecutive chunks. - - Returns: - Merged metrics dict with per-field confidence scores and - fraction_feeding computed from the merged counts. - """ - # chunk - chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap) - print(f" [CHUNK] Split into {len(chunks)} chunks", file=sys.stderr) - - if not chunks: - log.warning("No chunks produced from text of length %d", len(text)) - return {} - - # score - scored = score_chunks(chunks, model_dir=model_dir) - top_chunks = scored[:top_n] - scores_str = ", ".join(f"{s:.3f}" for _, s in top_chunks) - print(f" [CHUNK] Top {len(top_chunks)} chunk scores: [{scores_str}]", file=sys.stderr) - - # extract from each chunk - results = [] - for i, (chunk, score) in enumerate(top_chunks): - print(f" [CHUNK] Extracting from chunk {i + 1}/{len(top_chunks)} (score={score:.3f})...", file=sys.stderr) - try: - metrics = extract_metrics_from_text( - text=chunk, - model=llm_model, - num_ctx=num_ctx, - ) - result = metrics.model_dump() - results.append(result) - print( - f" Got: species={result.get('species_name')}, " f"n={result.get('sample_size')}", - file=sys.stderr, - ) - except Exception as e: - print(f" Failed: {e}", file=sys.stderr) - log.error("Chunk %d extraction failed: %s", i + 1, e) - - if not results: - log.warning("All chunk extractions failed") - return {} - - # merge via voting - merged = merge_results(results) - - # compute fraction_feeding from merged counts - nonempty = merged.get("num_nonempty_stomachs") - sample = merged.get("sample_size") - if nonempty is not None and sample is not None and sample > 0: - merged["fraction_feeding"] = round(nonempty / sample, 4) - else: - merged["fraction_feeding"] = None - - return merged diff --git a/src/pipeline/__init__.py b/src/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/classify_extract.py b/src/pipeline/classify_extract.py similarity index 93% rename from classify_extract.py rename to src/pipeline/classify_extract.py index 2c8b5fa..adc7d63 100644 --- a/classify_extract.py +++ b/src/pipeline/classify_extract.py @@ -6,15 +6,15 @@ Usage: # Single PDF - python classify-extract.py path/to/file.pdf + python src/pipeline/classify_extract.py path/to/file.pdf # Folder of PDFs - python classify-extract.py path/to/folder/ + python src/pipeline/classify_extract.py path/to/folder/ # Custom options - python classify-extract.py path/to/folder/ \\ - --model-dir src/model/models \\ - --llm-model llama3.1:8b \\ + python src/pipeline/classify_extract.py path/to/folder/ \\ + --model-dir src/classifier/models \\ + --llm-model qwen2.5:7b \\ --output-dir results/ \\ --confidence-threshold 0.70 \\ --max-chars 12000 \\ @@ -34,10 +34,10 @@ from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path -from src.preprocessing.pdf_text_extraction import extract_text_from_pdf -from src.model.pdf_classifier import load_classifier, classify_text -from src.llm.llm_text import extract_key_sections -from src.llm.llm_client import extract_metrics_from_text, save_extraction_result +from src.io.pdf_text_extraction import extract_text_from_pdf +from src.classifier.pdf_classifier import load_classifier, classify_text +from src.extraction.llm_text import extract_key_sections +from src.extraction.llm_client import extract_metrics_from_text, save_extraction_result from src.utils.logger import setup_logging log = logging.getLogger(__name__) @@ -310,16 +310,16 @@ def main(): epilog=""" Examples: Single PDF: - python classify-extract.py paper.pdf + python src/pipeline/classify_extract.py paper.pdf Folder of PDFs: - python classify-extract.py data/pdfs/ + python src/pipeline/classify_extract.py data/pdfs/ Custom options: - python classify-extract.py data/pdfs/ \\ - --model-dir src/model/models \\ + python src/pipeline/classify_extract.py data/pdfs/ \\ + --model-dir src/classifier/models \\ --output-dir results/ \\ - --llm-model llama3.1:8b \\ + --llm-model qwen2.5:7b \\ --confidence-threshold 0.70 """, ) @@ -331,14 +331,14 @@ def main(): parser.add_argument( "--model-dir", type=str, - default="src/model/models", - help="Directory containing classifier model artifacts (default: src/model/models).", + default="src/classifier/models", + help="Directory containing classifier model artifacts (default: src/classifier/models).", ) parser.add_argument( "--llm-model", type=str, - default="llama3.1:8b", - help="Ollama model to use for extraction (default: llama3.1:8b).", + default="qwen2.5:7b", + help="Ollama model to use for extraction (default: qwen2.5:7b).", ) parser.add_argument( "--output-dir", diff --git a/extract-from-txt.py b/src/pipeline/extract_from_txt.py similarity index 92% rename from extract-from-txt.py rename to src/pipeline/extract_from_txt.py index 52c636a..9cee179 100644 --- a/extract-from-txt.py +++ b/src/pipeline/extract_from_txt.py @@ -5,32 +5,32 @@ classifier entirely. Every .txt file fed to this script is assumed to have already been confirmed -as useful (e.g. by the classifier in classify-extract.py or by manual review). +as useful (e.g. by the classifier in src/pipeline/classify_extract.py or by manual review). The pipeline: 1. Read raw .txt file 2. Strip noise (references, acknowledgements, affiliations, captions, …) - via src/preprocessing/text_cleaner.py + via src/io/text_cleaner.py 3. Drop irrelevant paragraphs (taxonomy, morphometrics, stats methods, …) - via src/preprocessing/section_filter.py + via src/io/section_filter.py 4. Trim to the character budget using section-priority ranking - via src/llm/llm_text.py::extract_key_sections() - 5. Call Ollama for structured extraction via src/llm/llm_client.py + via src/extraction/llm_text.py::extract_key_sections() + 5. Call Ollama for structured extraction via src/extraction/llm_client.py 6. Save result JSON per file and a summary CSV Usage:: # Process the default directory (data/processed-text/) - python extract-from-txt.py + python src/pipeline/extract_from_txt.py # Custom input directory - python extract-from-txt.py --input-dir path/to/txt_files/ + python src/pipeline/extract_from_txt.py --input-dir path/to/txt_files/ # Full options - python extract-from-txt.py \\ + python src/pipeline/extract_from_txt.py \\ --input-dir data/processed-text/ \\ --output-dir data/results/ \\ - --llm-model llama3.1:8b \\ + --llm-model qwen2.5:7b \\ --max-chars 10000 \\ --num-ctx 8192 @@ -48,16 +48,11 @@ from datetime import datetime from pathlib import Path -# Ensure the project root is on sys.path regardless of where this script is -# invoked from. -_PROJECT_ROOT = Path(__file__).resolve().parent -sys.path.insert(0, str(_PROJECT_ROOT)) - -from src.preprocessing.text_cleaner import clean_text -from src.preprocessing.section_filter import filter_relevant_sections -from src.llm.llm_text import extract_key_sections -from src.llm.llm_client import extract_metrics_from_text, save_extraction_result -from src.llm.chunked_extraction import extract_with_chunking +from src.io.text_cleaner import clean_text +from src.io.section_filter import filter_relevant_sections +from src.extraction.llm_text import extract_key_sections +from src.extraction.llm_client import extract_metrics_from_text, save_extraction_result +from src.extraction.chunked_extraction import extract_with_chunking # --------------------------------------------------------------------------- @@ -77,7 +72,7 @@ def run_txt_pipeline( top_chunks: int = 3, chunk_size: int = 4000, chunk_overlap: int = 500, - model_dir: str = "src/model/models", + model_dir: str = "src/classifier/models", ) -> None: """Process every .txt file in *input_dir* through clean → filter → trim → extract. @@ -85,7 +80,7 @@ def run_txt_pipeline( input_dir: Directory containing pre-classified useful .txt files. Ignored when *single_file* is provided. output_dir: Root output directory for JSON results and summary CSV. - llm_model: Ollama model name (e.g. ``"llama3.1:8b"``). + llm_model: Ollama model name (e.g. ``"qwen2.5:7b"``). max_chars: Character budget for the text sent to Ollama. num_ctx: Context window size requested from Ollama. single_file: If set, process only this one .txt file. @@ -337,13 +332,13 @@ def main() -> None: epilog=""" Examples: Default (data/processed-text/ → data/results/): - python extract-from-txt.py + python src/pipeline/extract_from_txt.py Custom directories: - python extract-from-txt.py --input-dir data/useful-txt/ --output-dir out/ + python src/pipeline/extract_from_txt.py --input-dir data/useful-txt/ --output-dir out/ Different model / tighter budget: - python extract-from-txt.py --llm-model mistral:7b --max-chars 4500 + python src/pipeline/extract_from_txt.py --llm-model mistral:7b --max-chars 4500 """, ) parser.add_argument( @@ -367,7 +362,7 @@ def main() -> None: parser.add_argument( "--llm-model", type=str, - # default="llama3.1:8b", + # default="qwen2.5:7b", default="qwen2.5:7b", help="Ollama model name (default: qwen2.5:7b).", ) @@ -416,8 +411,8 @@ def main() -> None: parser.add_argument( "--model-dir", type=str, - default="src/model/models", - help="Directory containing XGBoost model artifacts (default: src/model/models). Only used with --chunked.", + default="src/classifier/models", + help="Directory containing XGBoost model artifacts (default: src/classifier/models). Only used with --chunked.", ) args = parser.parse_args() diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index 9ebc47f..4d272f9 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -1,6 +1,6 @@ import pytest from pathlib import Path -from src.preprocessing.data_loader import load_processed_text +from src.io.data_loader import load_processed_text import subprocess import shutil @@ -60,7 +60,7 @@ def test_load_processed_text_encoding(tmp_path): def test_main_prints_summary(monkeypatch, tmp_path, capsys): - from src.preprocessing import data_loader + from src.io import data_loader data_dir = tmp_path / "data" / "processed-text" data_dir.mkdir(parents=True) @@ -84,7 +84,7 @@ def test_data_loader_main_executes(tmp_path): data_dir.mkdir(parents=True) (data_dir / "sample.txt").write_text("Predator diet study data", encoding="utf-8") - script_path = Path("src/preprocessing/data_loader.py") + script_path = Path("src/io/data_loader.py") tmp_script_path = tmp_path / "data_loader.py" shutil.copy(script_path, tmp_script_path) diff --git a/tests/test_generate_labels.py b/tests/test_generate_labels.py index 692a112..6b3bea1 100644 --- a/tests/test_generate_labels.py +++ b/tests/test_generate_labels.py @@ -2,7 +2,7 @@ import json import subprocess from pathlib import Path -from src.preprocessing.generate_labels import generate_labels +from src.io.generate_labels import generate_labels def test_generate_labels_creates_json(tmp_path): @@ -85,7 +85,7 @@ def test_generate_labels_cli(tmp_path): shutil.copytree(repo_src, tmp_src) result = subprocess.run( - ["python", "src/preprocessing/generate_labels.py"], + ["python", "src/io/generate_labels.py"], capture_output=True, text=True, cwd=tmp_path, diff --git a/tests/test_llm_text.py b/tests/test_llm_text.py index b22c98b..c9c006d 100644 --- a/tests/test_llm_text.py +++ b/tests/test_llm_text.py @@ -1,4 +1,4 @@ -"""Unit tests for src/llm/llm_text.py — section extraction and text preprocessing.""" +"""Unit tests for src/extraction/llm_text.py — section extraction and text preprocessing.""" import sys import tempfile @@ -8,7 +8,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[1])) -from src.llm.llm_text import ( +from src.extraction.llm_text import ( _section_priority, _score_paragraph, _truncate_at_sentence, @@ -331,10 +331,10 @@ def test_pdf_extension_triggers_pdf_path(self, tmp_path, monkeypatch): pdf_file = tmp_path / "paper.pdf" pdf_file.write_bytes(b"%PDF-1.4 fake content") - import src.llm.llm_text as llm_text_module + import src.extraction.llm_text as llm_text_module monkeypatch.setattr( - "src.llm.llm_text.extract_text_from_pdf", + "src.extraction.llm_text.extract_text_from_pdf", lambda path: f"extracted from {Path(path).name}", raising=False, ) @@ -342,9 +342,9 @@ def test_pdf_extension_triggers_pdf_path(self, tmp_path, monkeypatch): import importlib import sys as _sys - fake_module = type(_sys)("src.preprocessing.pdf_text_extraction") + fake_module = type(_sys)("src.io.pdf_text_extraction") fake_module.extract_text_from_pdf = lambda path: f"extracted from {Path(path).name}" - monkeypatch.setitem(_sys.modules, "src.preprocessing.pdf_text_extraction", fake_module) + monkeypatch.setitem(_sys.modules, "src.io.pdf_text_extraction", fake_module) result = load_document(pdf_file) assert "extracted from paper.pdf" in result diff --git a/tests/test_models.py b/tests/test_models.py index fe2c935..cd46dad 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,4 +1,4 @@ -"""Unit tests for src/llm/models.py — PredatorDietMetrics Pydantic schema.""" +"""Unit tests for src/extraction/models.py — PredatorDietMetrics Pydantic schema.""" import sys from pathlib import Path @@ -8,7 +8,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[1])) -from src.llm.models import PredatorDietMetrics +from src.extraction.models import PredatorDietMetrics # --------------------------------------------------------------------------- diff --git a/tests/test_pdf_classifier.py b/tests/test_pdf_classifier.py index 659d6c4..b9d7215 100644 --- a/tests/test_pdf_classifier.py +++ b/tests/test_pdf_classifier.py @@ -6,7 +6,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.preprocessing import LabelEncoder from unittest.mock import patch -from src.model.pdf_classifier import classify_pdf +from src.classifier.pdf_classifier import classify_pdf @pytest.fixture @@ -40,7 +40,7 @@ def model_dir_with_mock_model(tmp_path): return model_dir -@patch("src.model.pdf_classifier.extract_text_from_pdf", return_value="predator stomach content analysis") +@patch("src.classifier.pdf_classifier.extract_text_from_pdf", return_value="predator stomach content analysis") def test_classify_pdf_valid_case(mock_extract, model_dir_with_mock_model, capsys): """Happy path: result header, prediction label, confidence percentage all present.""" classify_pdf(Path("tests/test.pdf"), model_dir_with_mock_model) @@ -69,7 +69,7 @@ def test_classify_pdf_missing_model(capsys, tmp_path): assert "[ERROR]" in capsys.readouterr().err -@patch("src.model.pdf_classifier.extract_text_from_pdf", return_value="") +@patch("src.classifier.pdf_classifier.extract_text_from_pdf", return_value="") def test_classify_pdf_no_text(mock_extract, model_dir_with_mock_model, capsys): classify_pdf(Path("tests/empty.pdf"), model_dir_with_mock_model) diff --git a/tests/test_pdf_extraction.py b/tests/test_pdf_extraction.py index 9ed95c2..7d3e022 100644 --- a/tests/test_pdf_extraction.py +++ b/tests/test_pdf_extraction.py @@ -3,7 +3,7 @@ import fitz from pathlib import Path import sys -from src.preprocessing.pdf_text_extraction import extract_text_from_pdf, save_to_file, main +from src.io.pdf_text_extraction import extract_text_from_pdf, save_to_file, main def test_extract_text_exists(): @@ -75,7 +75,7 @@ def test_main_cli(tmp_path): output_file = output_dir / "test.txt" result = subprocess.run( - ["python", "src/preprocessing/pdf_text_extraction.py", input_pdf], + ["python", "src/io/pdf_text_extraction.py", input_pdf], capture_output=True, text=True, ) diff --git a/tests/test_section_filter.py b/tests/test_section_filter.py index 423bbae..06bc5c2 100644 --- a/tests/test_section_filter.py +++ b/tests/test_section_filter.py @@ -1,4 +1,4 @@ -"""Unit tests for src/preprocessing/section_filter.py""" +"""Unit tests for src/io/section_filter.py""" import sys from pathlib import Path @@ -7,7 +7,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[1])) -from src.preprocessing.section_filter import ( +from src.io.section_filter import ( filter_relevant_sections, _has_positive_signal, _has_negative_signal, diff --git a/tests/test_text_cleaner.py b/tests/test_text_cleaner.py index da6798d..3aae784 100644 --- a/tests/test_text_cleaner.py +++ b/tests/test_text_cleaner.py @@ -1,4 +1,4 @@ -"""Unit tests for src/preprocessing/text_cleaner.py""" +"""Unit tests for src/io/text_cleaner.py""" import sys from pathlib import Path @@ -7,7 +7,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[1])) -from src.preprocessing.text_cleaner import clean_text +from src.io.text_cleaner import clean_text # --------------------------------------------------------------------------- diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 37d8418..b6644dd 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -2,7 +2,7 @@ import json import joblib from pathlib import Path -from src.model.train_model import load_labeled_data, train_pdf_classifier +from src.classifier.train_model import load_labeled_data, train_pdf_classifier @pytest.fixture