diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..fb60d65
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,14 @@
+# ── Google Drive integration ────────────────────────────────────────────────
+# Required by scripts/full_pipeline.py (--api mode) and scripts/ci_pipeline.py.
+# Obtain from the project's Google Cloud service account (ask the project owner).
+
+# The full JSON key for the Google service account, as a single-line string.
+GOOGLE_SERVICE_ACCOUNT_JSON=
+
+# The ID of the root Google Drive folder containing 'useful' and 'not-useful' subfolders.
+# Found in the folder's URL: drive.google.com/drive/folders/<ID>
+GOOGLE_DRIVE_ROOT_FOLDER_ID=
+
+# Set to "true" if the Drive folder is a shared drive (Team Drive).
+# Leave blank or omit for standard My Drive folders.
+GOOGLE_DRIVE_USE_SHARED_DRIVE=
diff --git a/.github/workflows/working_sw.yml b/.github/workflows/working_sw.yml
index dbf8647..8cb055e 100644
--- a/.github/workflows/working_sw.yml
+++ b/.github/workflows/working_sw.yml
@@ -20,7 +20,7 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y tesseract-ocr
           python -m pip install --upgrade pip
-          pip install -r requirements.txt
+          pip install ".[dev]"
       
       # CI pipeline (streams 20 PDFs per class from Drive and trains model)
       - name: CI pipeline (Drive stream)
@@ -36,10 +36,10 @@ jobs:
         uses: actions/cache@v3
         with:
           path: |
-            src/model/models
+            src/classifier/models
             ~/.joblib
             ~/.sklearn
-          key: ${{ runner.os }}-sklearn-model-${{ hashFiles('src/model/**/*.py') }}
+          key: ${{ runner.os }}-sklearn-model-${{ hashFiles('src/classifier/**/*.py') }}
           restore-keys: |
             ${{ runner.os }}-sklearn-model-
 
@@ -60,17 +60,17 @@ jobs:
       - name: Run PDF extraction script (repo test asset)
         run: |
           INPUT_PDF="tests/test.pdf"
-          python src/preprocessing/pdf_text_extraction.py "$INPUT_PDF"
+          python -m src.io.pdf_text_extraction "$INPUT_PDF"
 
       # - name: Generate labels
       #   run: |
       #     echo "Generating labels for extracted text..."
-      #     python src/preprocessing/generate_labels.py
+      #     python -m src.io.generate_labels
           
       # - name: Load and preprocess data
       #   run: |
       #     echo "Loading and preprocessing dataset..."
-      #     python src/preprocessing/data_loader.py
+      #     python -m src.io.data_loader
       # No full pipeline here; use full_training_pipeline.py on main or a scheduled workflow
 
       - name: Validate preprocessing pipeline
@@ -113,8 +113,9 @@ jobs:
         with:
           name: trained-model
           path: |
-            src/model/models/pdf_classifier_model.pkl
-            src/model/models/tfidf_vectorizer.pkl
+            src/classifier/models/pdf_classifier.json
+            src/classifier/models/tfidf_vectorizer.pkl
+            src/classifier/models/label_encoder.pkl
 
       - name: Show pipeline summary
         run: |
@@ -124,4 +125,4 @@ jobs:
           echo "2. Label Generation Results:"
           ls -l data/labels.json
           echo "3. Model Training Results:"
-          ls -l src/model/models/
+          ls -l src/classifier/models/
diff --git a/.gitignore b/.gitignore
index 2592117..c7bf3fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -211,6 +211,7 @@ data/needs-check
 data/not-useful
 data/processed-text
 data/useful
-src/model/models/*.pkl
+data/results/
+src/classifier/models/*.pkl
 
 
diff --git a/README.md b/README.md
index 9d33fb9..487f58e 100644
--- a/README.md
+++ b/README.md
@@ -125,7 +125,7 @@ git clone https://github.com/NovakLabOSU/FracFeedExtractor.git
 cd FracFeedExtractor
 python3 -m venv venv
 source venv/bin/activate
-pip install -r requirements.txt
+pip install -e ".[dev]"
 ```
 
 ```bash
@@ -134,17 +134,17 @@ git clone https://github.com/NovakLabOSU/FracFeedExtractor.git
 cd FracFeedExtractor
 py -m venv venv
 ./venv/Scripts/activate
-pip install -r requirements.txt
+pip install -e ".[dev]"
 ```
 
 ### Quick Start
 
 ```bash
 # Classify and extract from a folder of PDFs
-python classify_extract.py path/to/pdfs/
+python src/pipeline/classify_extract.py path/to/pdfs/
 
 # Adjust the LLM model or confidence threshold
-python classify_extract.py path/to/pdfs/ --llm-model llama3.1:8b --confidence-threshold 0.70
+python src/pipeline/classify_extract.py path/to/pdfs/ --llm-model qwen2.5:7b --confidence-threshold 0.70
 ```
 
 Results are written to `data/results/metrics/` (per-paper JSON) and `data/results/summaries/` (pipeline CSV).
diff --git a/data/results/Adams_1989_results.json b/data/results/Adams_1989_results.json
deleted file mode 100644
index 70d465e..0000000
--- a/data/results/Adams_1989_results.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "source_file": "Adams_1989.txt",
-  "metrics": {
-    "species_name": "Gentoo Penguin",
-    "study_location": "Marion Island",
-    "study_date": "1984-1985",
-    "num_empty_stomachs": null,
-    "num_nonempty_stomachs": null,
-    "sample_size": 144,
-    "fraction_feeding": null,
-    "source_pages": [
-      1,
-      2,
-      6
-    ]
-  }
-}
\ No newline at end of file
diff --git a/data/results/Ferreira_1999_results.json b/data/results/Ferreira_1999_results.json
deleted file mode 100644
index 8265458..0000000
--- a/data/results/Ferreira_1999_results.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "source_file": "Ferreira_1999.txt",
-  "metrics": {
-    "species_name": null,
-    "study_location": null,
-    "study_date": null,
-    "num_empty_stomachs": null,
-    "num_nonempty_stomachs": null,
-    "sample_size": null,
-    "fraction_feeding": null
-  }
-}
\ No newline at end of file
diff --git a/data/results/Fisher_2008_results.json b/data/results/Fisher_2008_results.json
deleted file mode 100644
index f6bc255..0000000
--- a/data/results/Fisher_2008_results.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "source_file": "Fisher_2008.txt",
-  "metrics": {
-    "species_name": "Nucella lapillus",
-    "study_location": "Swans Island, Maine, USA",
-    "study_date": "2004-2005",
-    "num_empty_stomachs": null,
-    "num_nonempty_stomachs": 15,
-    "sample_size": 225,
-    "fraction_feeding": 0.0667,
-    "source_pages": [
-      1,
-      2
-    ]
-  }
-}
\ No newline at end of file
diff --git a/data/results/Sousa_2015_results.json b/data/results/Sousa_2015_results.json
deleted file mode 100644
index 5aa33c6..0000000
--- a/data/results/Sousa_2015_results.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "source_file": "Sousa_2015.txt",
-  "metrics": {
-    "species_name": null,
-    "study_location": null,
-    "study_date": null,
-    "num_empty_stomachs": null,
-    "num_nonempty_stomachs": null,
-    "sample_size": null,
-    "fraction_feeding": null
-  }
-}
\ No newline at end of file
diff --git a/data/results/classifications.csv b/data/results/classifications.csv
deleted file mode 100644
index f192e26..0000000
--- a/data/results/classifications.csv
+++ /dev/null
@@ -1,11 +0,0 @@
-filename,classification,confidence,model_version,processing_time_seconds,timestamp,text_length,error
-Adams_1989.pdf,not-useful,0.35939544439315796,1.0.0,0.5106580257415771,2026-02-02T03:18:04.485316,27601,
-Berg_2002.pdf,unknown,0.0,1.0.0,,2026-02-02T03:18:05.431344,,No text extracted from PDF
-Dale_2011.pdf,not-useful,0.4046034812927246,1.0.0,0.4978008270263672,2026-02-02T03:18:06.053123,75685,
-Fisher_2008.pdf,not-useful,0.35939544439315796,1.0.0,0.32041358947753906,2026-02-02T03:18:06.385193,54637,
-Harris_2009.pdf,not-useful,0.3941650390625,1.0.0,0.270871639251709,2026-02-02T03:18:06.659290,48939,
-Kerle_2000.pdf,not-useful,0.35939544439315796,1.0.0,0.2882812023162842,2026-02-02T03:18:06.954093,33742,
-Marques_2015.pdf,not-useful,0.6508813202381134,1.0.0,0.33272790908813477,2026-02-02T03:18:07.305519,49378,
-Pakhomov_1998.pdf,not-useful,0.35939544439315796,1.0.0,0.20516109466552734,2026-02-02T03:18:07.515447,30262,
-Sousa_2015.pdf,not-useful,0.37260109186172485,1.0.0,0.35510683059692383,2026-02-02T03:18:07.875944,23718,
-Wu_2005.pdf,not-useful,0.3673085570335388,1.0.0,0.8690674304962158,2026-02-02T03:18:08.778555,73418,
diff --git a/data/results/classifications.json b/data/results/classifications.json
deleted file mode 100644
index 8961d1d..0000000
--- a/data/results/classifications.json
+++ /dev/null
@@ -1,110 +0,0 @@
-{
-  "metadata": {
-    "export_timestamp": "2026-02-02T03:18:08.781554",
-    "total_files": 10,
-    "useful_count": 0,
-    "not_useful_count": 9
-  },
-  "results": [
-    {
-      "filename": "Adams_1989.pdf",
-      "classification": "not-useful",
-      "confidence": 0.35939544439315796,
-      "model_version": "1.0.0",
-      "processing_time_seconds": 0.5106580257415771,
-      "timestamp": "2026-02-02T03:18:04.485316",
-      "text_length": 27601,
-      "error": null
-    },
-    {
-      "filename": "Berg_2002.pdf",
-      "classification": "unknown",
-      "confidence": 0.0,
-      "model_version": "1.0.0",
-      "processing_time_seconds": null,
-      "timestamp": "2026-02-02T03:18:05.431344",
-      "text_length": null,
-      "error": "No text extracted from PDF"
-    },
-    {
-      "filename": "Dale_2011.pdf",
-      "classification": "not-useful",
-      "confidence": 0.4046034812927246,
-      "model_version": "1.0.0",
-      "processing_time_seconds": 0.4978008270263672,
-      "timestamp": "2026-02-02T03:18:06.053123",
-      "text_length": 75685,
-      "error": null
-    },
-    {
-      "filename": "Fisher_2008.pdf",
-      "classification": "not-useful",
-      "confidence": 0.35939544439315796,
-      "model_version": "1.0.0",
-      "processing_time_seconds": 0.32041358947753906,
-      "timestamp": "2026-02-02T03:18:06.385193",
-      "text_length": 54637,
-      "error": null
-    },
-    {
-      "filename": "Harris_2009.pdf",
-      "classification": "not-useful",
-      "confidence": 0.3941650390625,
-      "model_version": "1.0.0",
-      "processing_time_seconds": 0.270871639251709,
-      "timestamp": "2026-02-02T03:18:06.659290",
-      "text_length": 48939,
-      "error": null
-    },
-    {
-      "filename": "Kerle_2000.pdf",
-      "classification": "not-useful",
-      "confidence": 0.35939544439315796,
-      "model_version": "1.0.0",
-      "processing_time_seconds": 0.2882812023162842,
-      "timestamp": "2026-02-02T03:18:06.954093",
-      "text_length": 33742,
-      "error": null
-    },
-    {
-      "filename": "Marques_2015.pdf",
-      "classification": "not-useful",
-      "confidence": 0.6508813202381134,
-      "model_version": "1.0.0",
-      "processing_time_seconds": 0.33272790908813477,
-      "timestamp": "2026-02-02T03:18:07.305519",
-      "text_length": 49378,
-      "error": null
-    },
-    {
-      "filename": "Pakhomov_1998.pdf",
-      "classification": "not-useful",
-      "confidence": 0.35939544439315796,
-      "model_version": "1.0.0",
-      "processing_time_seconds": 0.20516109466552734,
-      "timestamp": "2026-02-02T03:18:07.515447",
-      "text_length": 30262,
-      "error": null
-    },
-    {
-      "filename": "Sousa_2015.pdf",
-      "classification": "not-useful",
-      "confidence": 0.37260109186172485,
-      "model_version": "1.0.0",
-      "processing_time_seconds": 0.35510683059692383,
-      "timestamp": "2026-02-02T03:18:07.875944",
-      "text_length": 23718,
-      "error": null
-    },
-    {
-      "filename": "Wu_2005.pdf",
-      "classification": "not-useful",
-      "confidence": 0.3673085570335388,
-      "model_version": "1.0.0",
-      "processing_time_seconds": 0.8690674304962158,
-      "timestamp": "2026-02-02T03:18:08.778555",
-      "text_length": 73418,
-      "error": null
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/results/metrics/Adams_1989_results.json b/data/results/metrics/Adams_1989_results.json
deleted file mode 100644
index 851776d..0000000
--- a/data/results/metrics/Adams_1989_results.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
-  "source_file": "Adams_1989.txt",
-  "file_type": ".txt",
-  "metrics": {
-    "species_name": null,
-    "study_location": "Marion Island, sub-Antarctic",
-    "study_date": null,
-    "num_empty_stomachs": null,
-    "num_nonempty_stomachs": null,
-    "sample_size": null,
-    "fraction_feeding": null,
-    "source_pages": null
-  }
-}
\ No newline at end of file
diff --git a/data/results/summaries/txt_pipeline_summary_20260227_211255.csv b/data/results/summaries/txt_pipeline_summary_20260227_211255.csv
deleted file mode 100644
index b88726f..0000000
--- a/data/results/summaries/txt_pipeline_summary_20260227_211255.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-filename,raw_chars,cleaned_chars,trimmed_chars,extraction_status,species_name,study_location,study_date,sample_size,num_empty_stomachs,num_nonempty_stomachs,fraction_feeding
-Adams_1989.txt,27673,22739,4999,success,,"Marion Island, sub-Antarctic",,,,,
diff --git a/data/results/test_biomistral_results.json b/data/results/test_biomistral_results.json
deleted file mode 100644
index f327cec..0000000
--- a/data/results/test_biomistral_results.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "source_file": "test.txt",
-  "metrics": {
-    "species_name": "Homo sapiens",
-    "study_location": null,
-    "study_date": null,
-    "num_empty_stomachs": null,
-    "num_nonempty_stomachs": null,
-    "sample_size": 2000000000000000,
-    "fraction_feeding": null
-  }
-}
\ No newline at end of file
diff --git a/data/results/test_quick_results.json b/data/results/test_quick_results.json
deleted file mode 100644
index 65978be..0000000
--- a/data/results/test_quick_results.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "source_file": "test.txt",
-  "metrics": {
-    "species_name": "Gadus morhua",
-    "study_location": "North Sea",
-    "study_date": "2019",
-    "num_empty_stomachs": 89,
-    "num_nonempty_stomachs": 253,
-    "sample_size": 342,
-    "fraction_feeding": 0.7398
-  }
-}
\ No newline at end of file
diff --git a/data/results/test_results.json b/data/results/test_results.json
deleted file mode 100644
index ea61fcf..0000000
--- a/data/results/test_results.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "source_file": "test.txt",
-  "metrics": {
-    "species_name": null,
-    "study_location": null,
-    "study_date": null,
-    "num_empty_stomachs": null,
-    "num_nonempty_stomachs": null,
-    "sample_size": null,
-    "fraction_feeding": null
-  }
-}
\ No newline at end of file
diff --git a/documentation/CONTRIBUTING.md b/documentation/CONTRIBUTING.md
index 7e95624..f75a6ea 100644
--- a/documentation/CONTRIBUTING.md
+++ b/documentation/CONTRIBUTING.md
@@ -20,11 +20,10 @@ All contributors must follow the Oregon State University Student Code of Conduct
   * pip installed
   * Access to GitHub repository
   * [Ollama](https://ollama.com) installed and running locally
-    * Minimum hardware: 8 GB RAM (16 GB recommended for `llama3.1:8b`)
+    * Minimum hardware: 8 GB RAM (16 GB recommended for `qwen2.5:7b`)
-    * Pull the required models before running the classify/extract pipeline:
+    * Pull the required model before running the classify/extract pipeline:
       ```bash
-      ollama pull llama3.1:8b   # default extraction model (~5 GB)
-      ollama pull qwen2.5:7b    # alternative model (~5 GB)
+      ollama pull qwen2.5:7b   # default extraction model (~5 GB)
       ```
     * Verify Ollama is running: `ollama list`
 * ### Setup Instructions
@@ -34,7 +33,7 @@ All contributors must follow the Oregon State University Student Code of Conduct
     python -m venv venv
     source venv/bin/activate   
     # Windows: venv\Scripts\activate
-    pip install -r requirements.txt
+    pip install -e ".[dev]"
 ```
 * ### Running the application
   * If you do have the [dataset](https://drive.google.com/drive/u/2/folders/1U3_-TmnXnuBPR9vukkyV-3ITsxPr-nfo) downloaded locally on your machine:
@@ -57,20 +56,20 @@ All contributors must follow the Oregon State University Student Code of Conduct
     ```
     * Note: You will need access to the .env file
 * ### Running the classify/extract pipeline
-  Use `classify_extract.py` to classify PDFs and extract structured diet data in a single step.
-  Requires trained model artifacts in `src/model/models/` (run the full pipeline first,
+  Use `src/pipeline/classify_extract.py` to classify PDFs and extract structured diet data in a single step.
+  Requires trained model artifacts in `src/classifier/models/` (run the full pipeline first,
   or see [Retraining the Classifier](#retraining-the-classifier-and-extending-extraction) below).
   ```bash
   # Single PDF
-  python classify_extract.py path/to/file.pdf
+  python src/pipeline/classify_extract.py path/to/file.pdf
 
   # Folder of PDFs (sequential)
-  python classify_extract.py path/to/pdfs/
+  python src/pipeline/classify_extract.py path/to/pdfs/
 
   # All options
-  python classify_extract.py path/to/pdfs/ \
-      --model-dir src/model/models \
-      --llm-model llama3.1:8b \
+  python src/pipeline/classify_extract.py path/to/pdfs/ \
+      --model-dir src/classifier/models \
+      --llm-model qwen2.5:7b \
       --output-dir data/results \
       --confidence-threshold 0.70 \
       --max-chars 12000 \
@@ -79,8 +78,8 @@ All contributors must follow the Oregon State University Student Code of Conduct
   ```
   | Flag | Default | Description |
   |------|---------|-------------|
-  | `--model-dir` | `src/model/models` | Directory containing classifier artifacts |
-  | `--llm-model` | `llama3.1:8b` | Ollama model for extraction |
+  | `--model-dir` | `src/classifier/models` | Directory containing classifier artifacts |
+  | `--llm-model` | `qwen2.5:7b` | Ollama model for extraction |
   | `--output-dir` | `data/results` | Destination for JSON results and summary CSV |
   | `--confidence-threshold` | `0.70` | Probability threshold for "useful" classification |
   | `--max-chars` | `12000` | Maximum characters sent to the LLM |
@@ -264,7 +263,7 @@ dependency update policy, and scanning tools.
 
 * Never commit sensitive credentials, tokens, or API keys.
 * Secrets are stored locally in .env files and excluded via .gitignore.
-* Dependencies are managed in requirements.txt.
+* Dependencies are managed in `pyproject.toml`.
 * Use pip-audit monthly to check for vulnerabilities.
 * Security issues or potential breaches should be reported privately to the Project Manager and TA.
 
@@ -342,27 +341,27 @@ Example entry:
 
 ### Retraining the XGBoost Classifier
 
-The classifier artifacts are saved in `src/model/models/`. To retrain with new or updated labeled data:
+The classifier artifacts are saved in `src/classifier/models/`. To retrain with new or updated labeled data:
 
 1. **Add labeled text files** to `data/processed-text/` and update `data/labels.json`
    with `"filename.txt": "useful"` or `"filename.txt": "not useful"` entries.
 
 2. **Run the trainer directly:**
    ```bash
-   python src/model/train_model.py
+   python -m src.classifier.train_model
    ```
    This reads from `data/processed-text/` and `data/labels.json`, trains a TF-IDF +
    XGBoost model, and saves three artifacts:
-   - `src/model/models/pdf_classifier.json` - XGBoost model
-   - `src/model/models/tfidf_vectorizer.pkl` - TF-IDF vectorizer
-   - `src/model/models/label_encoder.pkl` - LabelEncoder
+   - `src/classifier/models/pdf_classifier.json` - XGBoost model
+   - `src/classifier/models/tfidf_vectorizer.pkl` - TF-IDF vectorizer
+   - `src/classifier/models/label_encoder.pkl` - LabelEncoder
 
 3. **Or run the full pipeline**, which trains the model as a final step:
    ```bash
    python scripts/full_pipeline.py --local <path_to_dataset>
    ```
 
-Key tunable parameters in `src/model/train_model.py`:
+Key tunable parameters in `src/classifier/train_model.py`:
 - `max_features` in `TfidfVectorizer` (default: 10,000)
 - `eta`, `max_depth`, `subsample` in the XGBoost `params` dict
 - `early_stopping_rounds` (default: 20)
@@ -371,20 +370,19 @@ Key tunable parameters in `src/model/train_model.py`:
 
 Extraction fields are defined in two places:
 
-1. **`src/llm/models.py`** - the `PredatorDietMetrics` Pydantic model.
+1. **`src/extraction/models.py`** - the `PredatorDietMetrics` Pydantic model.
    Add a new optional field with the appropriate type and a `None` default:
    ```python
    prey_taxa: Optional[list[str]] = None
    ```
 
-2. **`src/llm/llm_client.py`** - the system prompt that instructs the LLM.
+2. **`src/extraction/llm_client.py`** - the system prompt that instructs the LLM.
    Add a description of the new field and its expected format to the prompt string.
 
-3. **`classify_extract.py`** and **`extract-from-txt.py`** - update the `row` dict
+3. **`src/pipeline/classify_extract.py`** and **`src/pipeline/extract_from_txt.py`** - update the `row` dict
    and `fieldnames` list in the summary CSV writer to include the new column.
 
-After adding a field, run `pytest tests/test_llm_text.py` to verify that the prompt
-changes do not break existing extraction tests.
+After adding a field, run `pytest tests/test_llm_text.py` to verify that the prompt changes do not break existing extraction tests.
 
 ## Support & Contact
 * **Primary Communications**: Slack and Teams
diff --git a/pyproject.toml b/pyproject.toml
index 42da6f0..4133488 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,6 +2,39 @@
 requires = ["setuptools", "wheel"]
 build-backend = "setuptools.build_meta"
 
+[project]
+name = "fracfeedextractor"
+version = "0.1.0"
+requires-python = ">=3.10"
+dependencies = [
+    "pymupdf",
+    "pillow",
+    "pytesseract",
+    "scikit-learn",
+    "python-dotenv",
+    "google-auth",
+    "google-auth-oauthlib",
+    "google-auth-httplib2",
+    "numpy",
+    "google-api-python-client",
+    "ollama",
+    "pydantic",
+    "camelot-py[base]",
+    "opencv-python",
+    "pymupdf_layout",
+    "xgboost",
+    "pyspellchecker",
+    "joblib",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "coverage",
+    "flake8==6.1.0",
+    "black==25.9.0",
+]
+
 [tool.black]
 line-length = 200
 target-version = ["py311"]
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 443d114..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-pymupdf
-pillow
-pytesseract
-tesseract
-pytest
-coverage
-flake8==6.1.0
-black==25.9.0
-scikit-learn
-dotenv
-google-auth 
-google-auth-oauthlib 
-google-auth-httplib2
-numpy
-google-api-python-client
-ollama
-pydantic
-camelot-py[base]
-opencv-python
-pymupdf_layout
-xgboost
-opencv-python
-pyspellchecker
\ No newline at end of file
diff --git a/scripts/ci_pipeline.py b/scripts/ci_pipeline.py
index 38670fd..a24eb2f 100644
--- a/scripts/ci_pipeline.py
+++ b/scripts/ci_pipeline.py
@@ -33,7 +33,7 @@
     download_file_bytes,
     sanitize_filename,
 )
-from src.preprocessing.pdf_text_extraction import extract_text_from_pdf_bytes
+from src.io.pdf_text_extraction import extract_text_from_pdf_bytes
 
 
 def write_labels(labels: Dict[str, str], output_file: Path):
@@ -91,7 +91,7 @@ def main():
 
     # Train model on the CI sample
     print("\nStarting model training on CI sample...")
-    r = subprocess.run([sys.executable, "src/model/train_model.py"], env={**os.environ, "CI_TRAIN": "1"})
+    r = subprocess.run([sys.executable, "-m", "src.classifier.train_model"], env={**os.environ, "CI_TRAIN": "1"})
     if r.returncode != 0:
         print("Model training failed")
         raise SystemExit(r.returncode)
diff --git a/scripts/full_pipeline.py b/scripts/full_pipeline.py
index b03bb4c..85aa00e 100644
--- a/scripts/full_pipeline.py
+++ b/scripts/full_pipeline.py
@@ -17,7 +17,7 @@
  - API mode: Streams every PDF (no local PDF persistence) and writes extracted text to data/processed-text.
  - Local mode: Processes PDFs from specified local directory (expects 'useful' and 'not-useful' subfolders).
  - Generates labels.json based on folder origin.
- - Trains model with src/model/train_model.py.
+ - Trains model with src/classifier/train_model.py.
 """
 
 from __future__ import annotations
@@ -48,7 +48,7 @@
     download_file_bytes,
     sanitize_filename,
 )
-from src.preprocessing.pdf_text_extraction import extract_text_from_pdf_bytes
+from src.io.pdf_text_extraction import extract_text_from_pdf_bytes
 
 
 # Module-level flag set once per worker process via initializer
@@ -263,7 +263,7 @@ def main():
         process_api_mode()
 
     print("Beginning model training...")
-    run([sys.executable, "src/model/train_model.py"])
+    run([sys.executable, "-m", "src.classifier.train_model"])
     print("Training complete.")
 
 
diff --git a/src/model/models/test.txt b/src/__init__.py
similarity index 100%
rename from src/model/models/test.txt
rename to src/__init__.py
diff --git a/src/classifier/__init__.py b/src/classifier/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/model/models/pdf_classifier.json b/src/classifier/models/pdf_classifier.json
similarity index 100%
rename from src/model/models/pdf_classifier.json
rename to src/classifier/models/pdf_classifier.json
diff --git a/src/model/pdf_classifier.py b/src/classifier/pdf_classifier.py
similarity index 91%
rename from src/model/pdf_classifier.py
rename to src/classifier/pdf_classifier.py
index d43e46f..0066c5c 100644
--- a/src/model/pdf_classifier.py
+++ b/src/classifier/pdf_classifier.py
@@ -7,7 +7,7 @@
 
 Usage (standalone):
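-    python pdf_classifier.py --pdf-path path/to/file.pdf
-    python pdf_classifier.py --pdf-path path/to/file.pdf --model-dir src/model/models
-    python pdf_classifier.py --pdf-path path/to/file.pdf --threshold 0.80
+    python -m src.classifier.pdf_classifier --pdf-path path/to/file.pdf
+    python -m src.classifier.pdf_classifier --pdf-path path/to/file.pdf --model-dir src/classifier/models
+    python -m src.classifier.pdf_classifier --pdf-path path/to/file.pdf --threshold 0.80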
 """
 
@@ -20,8 +20,7 @@
 import joblib
 import xgboost as xgb
 
-sys.path.append(str(Path(__file__).resolve().parents[2]))
-from src.preprocessing.pdf_text_extraction import extract_text_from_pdf
+from src.io.pdf_text_extraction import extract_text_from_pdf
 from src.utils.logger import setup_logging
 
 import warnings
@@ -32,7 +31,7 @@
 log = logging.getLogger(__name__)
 
 
-def load_classifier(model_dir: str = "src/model/models") -> Tuple:
+def load_classifier(model_dir: str = "src/classifier/models") -> Tuple:
     """Load the trained classifier artifacts from disk.
 
     Args:
@@ -92,7 +91,7 @@ def classify_text(
 
 def classify_pdf(
     pdf_path: str,
-    model_dir: str = "src/model/models",
+    model_dir: str = "src/classifier/models",
     threshold: float = 0.70,
 ) -> Tuple[str, float, float]:
     """Convenience wrapper: extract text from a PDF and classify it."""
@@ -123,7 +122,7 @@ def classify_pdf(
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Classify a PDF as useful or not useful.")
     parser.add_argument("--pdf-path", type=str, required=True, help="Path to the PDF file to classify.")
-    parser.add_argument("--model-dir", type=str, default="src/model/models", help="Directory containing the trained model artifacts (default: src/model/models).")
+    parser.add_argument("--model-dir", type=str, default="src/classifier/models", help="Directory containing the trained model artifacts (default: src/classifier/models).")
     parser.add_argument("--threshold", type=float, default=0.70, help="Probability threshold for the 'useful' class (default: 0.70).")
     args = parser.parse_args()
 
diff --git a/src/model/train_model.py b/src/classifier/train_model.py
similarity index 95%
rename from src/model/train_model.py
rename to src/classifier/train_model.py
index a557e4f..9ff7b1f 100644
--- a/src/model/train_model.py
+++ b/src/classifier/train_model.py
@@ -19,8 +19,6 @@
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 
-sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
-
 from src.utils.logger import setup_logging
 
 log = logging.getLogger(__name__)
@@ -47,7 +45,7 @@ def load_labeled_data(data_dir="data/processed-text", labels_file="data/labels.j
     return texts, labels, filenames
 
 
-def train_pdf_classifier(texts, labels, output_dir="src/model/models"):
+def train_pdf_classifier(texts, labels, output_dir="src/classifier/models"):
 
     if not texts or not labels:
         print("[ERROR] No training samples found.")
@@ -145,7 +143,7 @@ def train_pdf_classifier(texts, labels, output_dir="src/model/models"):
     setup_logging()
 
     texts, labels, _ = load_labeled_data()
-    result = train_pdf_classifier(texts, labels, "src/model/models")
+    result = train_pdf_classifier(texts, labels, "src/classifier/models")
     if result is None:
         sys.exit(1)
     print(f"Model trained successfully! Accuracy: {result['accuracy']:.2f}")
diff --git a/src/extraction/__init__.py b/src/extraction/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/llm/chunked_extraction.py b/src/extraction/chunked_extraction.py
similarity index 87%
rename from src/llm/chunked_extraction.py
rename to src/extraction/chunked_extraction.py
index 5c8238b..6ebb8b6 100644
--- a/src/llm/chunked_extraction.py
+++ b/src/extraction/chunked_extraction.py
@@ -6,18 +6,15 @@
 import xgboost as xgb
 from pathlib import Path
 from collections import Counter
+from typing import Any, Optional
 
-# Add project root to path
-project_root = Path(__file__).parent.parent.parent
-sys.path.insert(0, str(project_root))
+from src.extraction.llm_client import extract_metrics_from_text
+from src.classifier.pdf_classifier import load_classifier
 
-from src.llm.llm_client import extract_metrics_from_text
-from src.model.pdf_classifier import load_classifier
 
-
-def chunk_text(text, chunk_size=3000, overlap=300):
+def chunk_text(text: str, chunk_size: int = 3000, overlap: int = 300) -> list[str]:
     """Split text into overlapping chunks."""
-    chunks = []
+    chunks: list[str] = []
     start = 0
 
     while start < len(text):
@@ -41,7 +38,7 @@ def chunk_text(text, chunk_size=3000, overlap=300):
     return chunks
 
 
-def score_chunk(chunk, model, vectorizer):
+def score_chunk(chunk: str, model: xgb.Booster, vectorizer: Any) -> float:
     """Score a chunk using XGBoost classifier."""
     X_vec = vectorizer.transform([chunk])
     dtest = xgb.DMatrix(X_vec)
@@ -49,14 +46,14 @@ def score_chunk(chunk, model, vectorizer):
     return float(score)
 
 
-def merge_results(results):
+def merge_results(results: list[Optional[dict[str, Any]]]) -> dict[str, Any]:
     """Merge extraction results from multiple chunks using voting."""
     results = [r for r in results if r is not None]
 
     if not results:
         return {}
 
-    merged = {}
+    merged: dict[str, Any] = {}
     fields = ['species_name', 'study_location', 'study_date', 'num_empty_stomachs', 'num_nonempty_stomachs', 'sample_size']
 
     for field in fields:
@@ -82,14 +79,14 @@ def merge_results(results):
 
 
 def extract_with_chunking(
-    text,
-    model_dir="src/model/models",
-    llm_model="qwen2.5:7b",  # Changed from biomistral
-    num_ctx=8192,
-    top_n=3,
-    chunk_size=3000,
-    overlap=300,
-):
+    text: str,
+    model_dir: str = "src/classifier/models",
+    llm_model: str = "qwen2.5:7b",
+    num_ctx: int = 8192,
+    top_n: int = 3,
+    chunk_size: int = 3000,
+    overlap: int = 300,
+) -> dict[str, Any]:
     """Main extraction with chunking pipeline."""
 
     print("  [CHUNK] Loading classifier...", file=sys.stderr)
diff --git a/src/llm/llm_client.py b/src/extraction/llm_client.py
similarity index 97%
rename from src/llm/llm_client.py
rename to src/extraction/llm_client.py
index 1103c97..88df016 100644
--- a/src/llm/llm_client.py
+++ b/src/extraction/llm_client.py
@@ -7,7 +7,7 @@
 Usage (standalone):
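-    python llm_client.py path/to/file.pdf
-    python llm_client.py path/to/file.txt
-    python llm_client.py path/to/file.pdf --model llama3.1:8b
-    python llm_client.py path/to/file.txt --output-dir results/
+    python -m src.extraction.llm_client path/to/file.pdf
+    python -m src.extraction.llm_client path/to/file.txt
+    python -m src.extraction.llm_client path/to/file.pdf --model qwen2.5:7b
+    python -m src.extraction.llm_client path/to/file.txt --output-dir results/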
 """
 
@@ -22,11 +22,8 @@
 
 from ollama import chat
 
-project_root = Path(__file__).parent.parent.parent
-sys.path.insert(0, str(project_root))
-
-from src.llm.models import PredatorDietMetrics
-from src.llm.llm_text import extract_key_sections, load_document
+from src.extraction.models import PredatorDietMetrics
+from src.extraction.llm_text import extract_key_sections, load_document
 from src.utils.logger import setup_logging
 
 log = logging.getLogger(__name__)
@@ -60,7 +57,7 @@ def _call_ollama_with_retry(model, messages, format, options):
 
 def extract_metrics_from_text(
     text: str,
-    # model: str = "llama3.1:8b",
+    # Default Ollama model tag; keep in sync with the --model default in main().
     model: str = "qwen2.5:7b",
     num_ctx: int = 8192,
     _retry: bool = False,
@@ -283,7 +280,7 @@ def save_extraction_result(
 def main():
     parser = argparse.ArgumentParser(description="Extract predator diet metrics from PDFs or text files using LLM")
     parser.add_argument("input_file", type=str, help="Path to the input file (.pdf or .txt)")
-    # parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)")
+    # Keep this default in sync with extract_metrics_from_text()'s model default.
     parser.add_argument("--model", type=str, default="qwen2.5:7b", help="Ollama model to use (default: qwen2.5:7b)")
     parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results/metrics)")
     parser.add_argument("--max-chars", type=int, default=12000, help="Maximum characters of text to send to the model (default: 12000). Reduce if you hit CUDA/OOM errors.")
diff --git a/src/llm/llm_text.py b/src/extraction/llm_text.py
similarity index 99%
rename from src/llm/llm_text.py
rename to src/extraction/llm_text.py
index 9bc8862..dbc377c 100644
--- a/src/llm/llm_text.py
+++ b/src/extraction/llm_text.py
@@ -412,7 +412,7 @@ def load_document(file_path: Path) -> str:
 
     if suffix == '.pdf':
         print("[INFO] Reading PDF file...", file=sys.stderr)
-        from src.preprocessing.pdf_text_extraction import extract_text_from_pdf
+        from src.io.pdf_text_extraction import extract_text_from_pdf
 
         return extract_text_from_pdf(str(file_path))
     elif suffix in ['.txt', '.text']:
diff --git a/src/llm/models.py b/src/extraction/models.py
similarity index 100%
rename from src/llm/models.py
rename to src/extraction/models.py
diff --git a/src/io/__init__.py b/src/io/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/preprocessing/data_loader.py b/src/io/data_loader.py
similarity index 100%
rename from src/preprocessing/data_loader.py
rename to src/io/data_loader.py
diff --git a/src/preprocessing/generate_labels.py b/src/io/generate_labels.py
similarity index 100%
rename from src/preprocessing/generate_labels.py
rename to src/io/generate_labels.py
diff --git a/src/preprocessing/pdf_text_extraction.py b/src/io/pdf_text_extraction.py
similarity index 100%
rename from src/preprocessing/pdf_text_extraction.py
rename to src/io/pdf_text_extraction.py
diff --git a/src/preprocessing/section_filter.py b/src/io/section_filter.py
similarity index 100%
rename from src/preprocessing/section_filter.py
rename to src/io/section_filter.py
diff --git a/src/preprocessing/text_cleaner.py b/src/io/text_cleaner.py
similarity index 100%
rename from src/preprocessing/text_cleaner.py
rename to src/io/text_cleaner.py
diff --git a/src/llm/chunked_biomistral_llm.py b/src/llm/chunked_biomistral_llm.py
deleted file mode 100644
index 9064fc1..0000000
--- a/src/llm/chunked_biomistral_llm.py
+++ /dev/null
@@ -1,197 +0,0 @@
-"""Chunked extraction pipeline — split papers, score chunks, extract from top-N, merge.
-
-Instead of sending one big trimmed blob to the LLM, this module:
-  1. Splits the document into overlapping character-level chunks
-  2. Scores each chunk with the XGBoost classifier (higher = more likely "useful" content)
-  3. Sends the top-N scoring chunks through the LLM independently
-  4. Merges the per-chunk results via majority voting
-
-This improves recall on long papers where the single-pass trim can miss
-data-rich paragraphs buried in the middle of the document.
-"""
-
-import logging
-import sys
-from collections import Counter
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-project_root = Path(__file__).parent.parent.parent
-sys.path.insert(0, str(project_root))
-
-import xgboost as xgb
-
-from src.llm.llm_client import extract_metrics_from_text
-from src.llm.models import PredatorDietMetrics
-from src.model.pdf_classifier import load_classifier
-
-log = logging.getLogger(__name__)
-
-# fields we try to merge across chunks
-_MERGE_FIELDS = [
-    "species_name",
-    "study_location",
-    "study_date",
-    "num_empty_stomachs",
-    "num_nonempty_stomachs",
-    "sample_size",
-]
-
-
-def chunk_text(
-    text: str,
-    chunk_size: int = 4000,
-    overlap: int = 500,
-    min_chunk_len: int = 100,
-) -> List[str]:
-    """Split *text* into overlapping character-level chunks.
-
-    Tries to break at paragraph boundaries first, then sentence boundaries,
-    so chunks don't start/end mid-word.
-    """
-    chunks = []
-    start = 0
-
-    while start < len(text):
-        end = start + chunk_size
-
-        # try to snap to a paragraph break
-        if end < len(text):
-            para_break = text.rfind("\n\n", start, end)
-            if para_break > start + chunk_size // 2:
-                end = para_break
-            else:
-                sent_break = text.rfind(". ", start, end)
-                if sent_break > start + chunk_size // 2:
-                    end = sent_break + 1
-
-        chunk = text[start:end].strip()
-        if len(chunk) >= min_chunk_len:
-            chunks.append(chunk)
-
-        start = end - overlap
-
-    return chunks
-
-
-def score_chunks(
-    chunks: List[str],
-    model_dir: str = "src/model/models",
-) -> List[Tuple[str, float]]:
-    """Score each chunk using the XGBoost classifier.
-
-    Returns a list of (chunk, score) sorted by score descending.
-    Higher score = chunk looks more like a "useful" paper section.
-    """
-    model, vectorizer, _encoder = load_classifier(model_dir)
-
-    scored = []
-    for chunk in chunks:
-        X_vec = vectorizer.transform([chunk])
-        dtest = xgb.DMatrix(X_vec)
-        score = float(model.predict(dtest)[0])
-        scored.append((chunk, score))
-
-    scored.sort(key=lambda x: x[1], reverse=True)
-    return scored
-
-
-def merge_results(results: List[dict]) -> dict:
-    """Merge extraction dicts from multiple chunks via majority voting.
-
-    For each field, the value that appears most often across chunks wins.
-    A confidence score (votes / total) is stored alongside each field.
-    """
-    if not results:
-        return {}
-
-    merged = {}
-    for field in _MERGE_FIELDS:
-        values = [r.get(field) for r in results if r.get(field) is not None]
-
-        if not values:
-            merged[field] = None
-            merged[f"{field}_confidence"] = 0.0
-        else:
-            counter = Counter(values)
-            most_common_val, most_common_count = counter.most_common(1)[0]
-            merged[field] = most_common_val
-            merged[f"{field}_confidence"] = round(most_common_count / len(values), 2)
-
-    return merged
-
-
-def extract_with_chunking(
-    text: str,
-    model_dir: str = "src/model/models",
-    llm_model: str = "qwen2.5:7b",
-    num_ctx: int = 8192,
-    top_n: int = 3,
-    chunk_size: int = 4000,
-    overlap: int = 500,
-) -> dict:
-    """Full chunked extraction pipeline.
-
-    Args:
-        text:       Full document text.
-        model_dir:  Path to XGBoost model artifacts.
-        llm_model:  Ollama model name for extraction.
-        num_ctx:    Context window size for Ollama.
-        top_n:      Number of highest-scoring chunks to extract from.
-        chunk_size: Character size per chunk.
-        overlap:    Overlap between consecutive chunks.
-
-    Returns:
-        Merged metrics dict with per-field confidence scores and
-        fraction_feeding computed from the merged counts.
-    """
-    # chunk
-    chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
-    print(f"  [CHUNK] Split into {len(chunks)} chunks", file=sys.stderr)
-
-    if not chunks:
-        log.warning("No chunks produced from text of length %d", len(text))
-        return {}
-
-    # score
-    scored = score_chunks(chunks, model_dir=model_dir)
-    top_chunks = scored[:top_n]
-    scores_str = ", ".join(f"{s:.3f}" for _, s in top_chunks)
-    print(f"  [CHUNK] Top {len(top_chunks)} chunk scores: [{scores_str}]", file=sys.stderr)
-
-    # extract from each chunk
-    results = []
-    for i, (chunk, score) in enumerate(top_chunks):
-        print(f"  [CHUNK] Extracting from chunk {i + 1}/{len(top_chunks)} (score={score:.3f})...", file=sys.stderr)
-        try:
-            metrics = extract_metrics_from_text(
-                text=chunk,
-                model=llm_model,
-                num_ctx=num_ctx,
-            )
-            result = metrics.model_dump()
-            results.append(result)
-            print(
-                f"    Got: species={result.get('species_name')}, " f"n={result.get('sample_size')}",
-                file=sys.stderr,
-            )
-        except Exception as e:
-            print(f"    Failed: {e}", file=sys.stderr)
-            log.error("Chunk %d extraction failed: %s", i + 1, e)
-
-    if not results:
-        log.warning("All chunk extractions failed")
-        return {}
-
-    # merge via voting
-    merged = merge_results(results)
-
-    # compute fraction_feeding from merged counts
-    nonempty = merged.get("num_nonempty_stomachs")
-    sample = merged.get("sample_size")
-    if nonempty is not None and sample is not None and sample > 0:
-        merged["fraction_feeding"] = round(nonempty / sample, 4)
-    else:
-        merged["fraction_feeding"] = None
-
-    return merged
diff --git a/src/pipeline/__init__.py b/src/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/classify_extract.py b/src/pipeline/classify_extract.py
similarity index 93%
rename from classify_extract.py
rename to src/pipeline/classify_extract.py
index 2c8b5fa..adc7d63 100644
--- a/classify_extract.py
+++ b/src/pipeline/classify_extract.py
@@ -6,15 +6,15 @@
 
 Usage:
     # Single PDF
-    python classify-extract.py path/to/file.pdf
+    python src/pipeline/classify_extract.py path/to/file.pdf
 
     # Folder of PDFs
-    python classify-extract.py path/to/folder/
+    python src/pipeline/classify_extract.py path/to/folder/
 
     # Custom options
-    python classify-extract.py path/to/folder/ \\
-        --model-dir src/model/models \\
-        --llm-model llama3.1:8b \\
+    python src/pipeline/classify_extract.py path/to/folder/ \\
+        --model-dir src/classifier/models \\
+        --llm-model qwen2.5:7b \\
         --output-dir results/ \\
         --confidence-threshold 0.70 \\
         --max-chars 12000 \\
@@ -34,10 +34,10 @@
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from pathlib import Path
 
-from src.preprocessing.pdf_text_extraction import extract_text_from_pdf
-from src.model.pdf_classifier import load_classifier, classify_text
-from src.llm.llm_text import extract_key_sections
-from src.llm.llm_client import extract_metrics_from_text, save_extraction_result
+from src.io.pdf_text_extraction import extract_text_from_pdf
+from src.classifier.pdf_classifier import load_classifier, classify_text
+from src.extraction.llm_text import extract_key_sections
+from src.extraction.llm_client import extract_metrics_from_text, save_extraction_result
 from src.utils.logger import setup_logging
 
 log = logging.getLogger(__name__)
@@ -310,16 +310,16 @@ def main():
         epilog="""
 Examples:
   Single PDF:
-    python classify-extract.py paper.pdf
+    python src/pipeline/classify_extract.py paper.pdf
 
   Folder of PDFs:
-    python classify-extract.py data/pdfs/
+    python src/pipeline/classify_extract.py data/pdfs/
 
   Custom options:
-    python classify-extract.py data/pdfs/ \\
-        --model-dir src/model/models \\
+    python src/pipeline/classify_extract.py data/pdfs/ \\
+        --model-dir src/classifier/models \\
         --output-dir results/ \\
-        --llm-model llama3.1:8b \\
+        --llm-model qwen2.5:7b \\
         --confidence-threshold 0.70
         """,
     )
@@ -331,14 +331,14 @@ def main():
     parser.add_argument(
         "--model-dir",
         type=str,
-        default="src/model/models",
-        help="Directory containing classifier model artifacts (default: src/model/models).",
+        default="src/classifier/models",
+        help="Directory containing classifier model artifacts (default: src/classifier/models).",
     )
     parser.add_argument(
         "--llm-model",
         type=str,
-        default="llama3.1:8b",
-        help="Ollama model to use for extraction (default: llama3.1:8b).",
+        default="qwen2.5:7b",
+        help="Ollama model to use for extraction (default: qwen2.5:7b).",
     )
     parser.add_argument(
         "--output-dir",
diff --git a/extract-from-txt.py b/src/pipeline/extract_from_txt.py
similarity index 92%
rename from extract-from-txt.py
rename to src/pipeline/extract_from_txt.py
index 52c636a..9cee179 100644
--- a/extract-from-txt.py
+++ b/src/pipeline/extract_from_txt.py
@@ -5,32 +5,32 @@
 classifier entirely.
 
 Every .txt file fed to this script is assumed to have already been confirmed
-as useful (e.g. by the classifier in classify-extract.py or by manual review).
+as useful (e.g. by the classifier in src/pipeline/classify_extract.py or by manual review).
 The pipeline:
 
   1. Read raw .txt file
   2. Strip noise (references, acknowledgements, affiliations, captions, …)
-     via src/preprocessing/text_cleaner.py
+     via src/io/text_cleaner.py
   3. Drop irrelevant paragraphs (taxonomy, morphometrics, stats methods, …)
-     via src/preprocessing/section_filter.py
+     via src/io/section_filter.py
   4. Trim to the character budget using section-priority ranking
-     via src/llm/llm_text.py::extract_key_sections()
-  5. Call Ollama for structured extraction via src/llm/llm_client.py
+     via src/extraction/llm_text.py::extract_key_sections()
+  5. Call Ollama for structured extraction via src/extraction/llm_client.py
   6. Save result JSON per file and a summary CSV
 
 Usage::
 
     # Process the default directory (data/processed-text/)
-    python extract-from-txt.py
+    python src/pipeline/extract_from_txt.py
 
     # Custom input directory
-    python extract-from-txt.py --input-dir path/to/txt_files/
+    python src/pipeline/extract_from_txt.py --input-dir path/to/txt_files/
 
     # Full options
-    python extract-from-txt.py \\
+    python src/pipeline/extract_from_txt.py \\
         --input-dir  data/processed-text/ \\
         --output-dir data/results/ \\
-        --llm-model  llama3.1:8b \\
+        --llm-model  qwen2.5:7b \\
         --max-chars  10000 \\
         --num-ctx    8192
 
@@ -48,16 +48,11 @@
 from datetime import datetime
 from pathlib import Path
 
-# Ensure the project root is on sys.path regardless of where this script is
-# invoked from.
-_PROJECT_ROOT = Path(__file__).resolve().parent
-sys.path.insert(0, str(_PROJECT_ROOT))
-
-from src.preprocessing.text_cleaner import clean_text
-from src.preprocessing.section_filter import filter_relevant_sections
-from src.llm.llm_text import extract_key_sections
-from src.llm.llm_client import extract_metrics_from_text, save_extraction_result
-from src.llm.chunked_extraction import extract_with_chunking
+from src.io.text_cleaner import clean_text
+from src.io.section_filter import filter_relevant_sections
+from src.extraction.llm_text import extract_key_sections
+from src.extraction.llm_client import extract_metrics_from_text, save_extraction_result
+from src.extraction.chunked_extraction import extract_with_chunking
 
 
 # ---------------------------------------------------------------------------
@@ -77,7 +72,7 @@ def run_txt_pipeline(
     top_chunks: int = 3,
     chunk_size: int = 4000,
     chunk_overlap: int = 500,
-    model_dir: str = "src/model/models",
+    model_dir: str = "src/classifier/models",
 ) -> None:
     """Process every .txt file in *input_dir* through clean → filter → trim → extract.
 
@@ -85,7 +80,7 @@ def run_txt_pipeline(
         input_dir:   Directory containing pre-classified useful .txt files.
                      Ignored when *single_file* is provided.
         output_dir:  Root output directory for JSON results and summary CSV.
-        llm_model:   Ollama model name (e.g. ``"llama3.1:8b"``).
+        llm_model:   Ollama model name (e.g. ``"qwen2.5:7b"``).
         max_chars:   Character budget for the text sent to Ollama.
         num_ctx:     Context window size requested from Ollama.
         single_file: If set, process only this one .txt file.
@@ -337,13 +332,13 @@ def main() -> None:
         epilog="""
 Examples:
   Default (data/processed-text/ → data/results/):
-    python extract-from-txt.py
+    python src/pipeline/extract_from_txt.py
 
   Custom directories:
-    python extract-from-txt.py --input-dir data/useful-txt/ --output-dir out/
+    python src/pipeline/extract_from_txt.py --input-dir data/useful-txt/ --output-dir out/
 
   Different model / tighter budget:
-    python extract-from-txt.py --llm-model mistral:7b --max-chars 4500
+    python src/pipeline/extract_from_txt.py --llm-model mistral:7b --max-chars 4500
         """,
     )
     parser.add_argument(
@@ -367,7 +362,7 @@ def main() -> None:
     parser.add_argument(
         "--llm-model",
         type=str,
-        # default="llama3.1:8b",
+        # Keep in sync with the --llm-model default in classify_extract.py.
         default="qwen2.5:7b",
         help="Ollama model name (default: qwen2.5:7b).",
     )
@@ -416,8 +411,8 @@ def main() -> None:
     parser.add_argument(
         "--model-dir",
         type=str,
-        default="src/model/models",
-        help="Directory containing XGBoost model artifacts (default: src/model/models). Only used with --chunked.",
+        default="src/classifier/models",
+        help="Directory containing XGBoost model artifacts (default: src/classifier/models). Only used with --chunked.",
     )
 
     args = parser.parse_args()
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py
index 9ebc47f..4d272f9 100644
--- a/tests/test_data_loader.py
+++ b/tests/test_data_loader.py
@@ -1,6 +1,6 @@
 import pytest
 from pathlib import Path
-from src.preprocessing.data_loader import load_processed_text
+from src.io.data_loader import load_processed_text
 import subprocess
 import shutil
 
@@ -60,7 +60,7 @@ def test_load_processed_text_encoding(tmp_path):
 
 
 def test_main_prints_summary(monkeypatch, tmp_path, capsys):
-    from src.preprocessing import data_loader
+    from src.io import data_loader
 
     data_dir = tmp_path / "data" / "processed-text"
     data_dir.mkdir(parents=True)
@@ -84,7 +84,7 @@ def test_data_loader_main_executes(tmp_path):
     data_dir.mkdir(parents=True)
     (data_dir / "sample.txt").write_text("Predator diet study data", encoding="utf-8")
 
-    script_path = Path("src/preprocessing/data_loader.py")
+    script_path = Path("src/io/data_loader.py")
     tmp_script_path = tmp_path / "data_loader.py"
     shutil.copy(script_path, tmp_script_path)
 
diff --git a/tests/test_generate_labels.py b/tests/test_generate_labels.py
index 692a112..6b3bea1 100644
--- a/tests/test_generate_labels.py
+++ b/tests/test_generate_labels.py
@@ -2,7 +2,7 @@
 import json
 import subprocess
 from pathlib import Path
-from src.preprocessing.generate_labels import generate_labels
+from src.io.generate_labels import generate_labels
 
 
 def test_generate_labels_creates_json(tmp_path):
@@ -85,7 +85,7 @@ def test_generate_labels_cli(tmp_path):
     shutil.copytree(repo_src, tmp_src)
 
     result = subprocess.run(
-        ["python", "src/preprocessing/generate_labels.py"],
+        ["python", "src/io/generate_labels.py"],
         capture_output=True,
         text=True,
         cwd=tmp_path,
diff --git a/tests/test_llm_text.py b/tests/test_llm_text.py
index b22c98b..c9c006d 100644
--- a/tests/test_llm_text.py
+++ b/tests/test_llm_text.py
@@ -1,4 +1,4 @@
-"""Unit tests for src/llm/llm_text.py — section extraction and text preprocessing."""
+"""Unit tests for src/extraction/llm_text.py — section extraction and text preprocessing."""
 
 import sys
 import tempfile
@@ -8,7 +8,7 @@
 
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
 
-from src.llm.llm_text import (
+from src.extraction.llm_text import (
     _section_priority,
     _score_paragraph,
     _truncate_at_sentence,
@@ -331,10 +331,10 @@ def test_pdf_extension_triggers_pdf_path(self, tmp_path, monkeypatch):
         pdf_file = tmp_path / "paper.pdf"
         pdf_file.write_bytes(b"%PDF-1.4 fake content")
 
-        import src.llm.llm_text as llm_text_module
+        import src.extraction.llm_text as llm_text_module
 
         monkeypatch.setattr(
-            "src.llm.llm_text.extract_text_from_pdf",
+            "src.extraction.llm_text.extract_text_from_pdf",
             lambda path: f"extracted from {Path(path).name}",
             raising=False,
         )
@@ -342,9 +342,9 @@ def test_pdf_extension_triggers_pdf_path(self, tmp_path, monkeypatch):
         import importlib
         import sys as _sys
 
-        fake_module = type(_sys)("src.preprocessing.pdf_text_extraction")
+        fake_module = type(_sys)("src.io.pdf_text_extraction")
         fake_module.extract_text_from_pdf = lambda path: f"extracted from {Path(path).name}"
-        monkeypatch.setitem(_sys.modules, "src.preprocessing.pdf_text_extraction", fake_module)
+        monkeypatch.setitem(_sys.modules, "src.io.pdf_text_extraction", fake_module)
 
         result = load_document(pdf_file)
         assert "extracted from paper.pdf" in result
diff --git a/tests/test_models.py b/tests/test_models.py
index fe2c935..cd46dad 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1,4 +1,4 @@
-"""Unit tests for src/llm/models.py — PredatorDietMetrics Pydantic schema."""
+"""Unit tests for src/extraction/models.py — PredatorDietMetrics Pydantic schema."""
 
 import sys
 from pathlib import Path
@@ -8,7 +8,7 @@
 
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
 
-from src.llm.models import PredatorDietMetrics
+from src.extraction.models import PredatorDietMetrics
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/test_pdf_classifier.py b/tests/test_pdf_classifier.py
index 659d6c4..b9d7215 100644
--- a/tests/test_pdf_classifier.py
+++ b/tests/test_pdf_classifier.py
@@ -6,7 +6,7 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.preprocessing import LabelEncoder
 from unittest.mock import patch
-from src.model.pdf_classifier import classify_pdf
+from src.classifier.pdf_classifier import classify_pdf
 
 
 @pytest.fixture
@@ -40,7 +40,7 @@ def model_dir_with_mock_model(tmp_path):
     return model_dir
 
 
-@patch("src.model.pdf_classifier.extract_text_from_pdf", return_value="predator stomach content analysis")
+@patch("src.classifier.pdf_classifier.extract_text_from_pdf", return_value="predator stomach content analysis")
 def test_classify_pdf_valid_case(mock_extract, model_dir_with_mock_model, capsys):
     """Happy path: result header, prediction label, confidence percentage all present."""
     classify_pdf(Path("tests/test.pdf"), model_dir_with_mock_model)
@@ -69,7 +69,7 @@ def test_classify_pdf_missing_model(capsys, tmp_path):
     assert "[ERROR]" in capsys.readouterr().err
 
 
-@patch("src.model.pdf_classifier.extract_text_from_pdf", return_value="")
+@patch("src.classifier.pdf_classifier.extract_text_from_pdf", return_value="")
 def test_classify_pdf_no_text(mock_extract, model_dir_with_mock_model, capsys):
     classify_pdf(Path("tests/empty.pdf"), model_dir_with_mock_model)
 
diff --git a/tests/test_pdf_extraction.py b/tests/test_pdf_extraction.py
index 9ed95c2..7d3e022 100644
--- a/tests/test_pdf_extraction.py
+++ b/tests/test_pdf_extraction.py
@@ -3,7 +3,7 @@
 import fitz
 from pathlib import Path
 import sys
-from src.preprocessing.pdf_text_extraction import extract_text_from_pdf, save_to_file, main
+from src.io.pdf_text_extraction import extract_text_from_pdf, save_to_file, main
 
 
 def test_extract_text_exists():
@@ -75,7 +75,7 @@ def test_main_cli(tmp_path):
     output_file = output_dir / "test.txt"
 
     result = subprocess.run(
-        ["python", "src/preprocessing/pdf_text_extraction.py", input_pdf],
+        ["python", "src/io/pdf_text_extraction.py", input_pdf],
         capture_output=True,
         text=True,
     )
diff --git a/tests/test_section_filter.py b/tests/test_section_filter.py
index 423bbae..06bc5c2 100644
--- a/tests/test_section_filter.py
+++ b/tests/test_section_filter.py
@@ -1,4 +1,4 @@
-"""Unit tests for src/preprocessing/section_filter.py"""
+"""Unit tests for src/io/section_filter.py"""
 
 import sys
 from pathlib import Path
@@ -7,7 +7,7 @@
 
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
 
-from src.preprocessing.section_filter import (
+from src.io.section_filter import (
     filter_relevant_sections,
     _has_positive_signal,
     _has_negative_signal,
diff --git a/tests/test_text_cleaner.py b/tests/test_text_cleaner.py
index da6798d..3aae784 100644
--- a/tests/test_text_cleaner.py
+++ b/tests/test_text_cleaner.py
@@ -1,4 +1,4 @@
-"""Unit tests for src/preprocessing/text_cleaner.py"""
+"""Unit tests for src/io/text_cleaner.py"""
 
 import sys
 from pathlib import Path
@@ -7,7 +7,7 @@
 
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
 
-from src.preprocessing.text_cleaner import clean_text
+from src.io.text_cleaner import clean_text
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index 37d8418..b6644dd 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -2,7 +2,7 @@
 import json
 import joblib
 from pathlib import Path
-from src.model.train_model import load_labeled_data, train_pdf_classifier
+from src.classifier.train_model import load_labeled_data, train_pdf_classifier
 
 
 @pytest.fixture