NovakLabOSU · SeanClay10 · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/.env.example b/.env.example
@@ -0,0 +1,14 @@
+# ── Google Drive integration ────────────────────────────────────────────────
+# Required by scripts/full_pipeline.py (--api mode) and scripts/ci_pipeline.py.
+# Obtain from the project's Google Cloud service account (ask the project owner).
+
+# The full JSON key for the Google service account, as a single-line string.
+GOOGLE_SERVICE_ACCOUNT_JSON=
+
+# The ID of the root Google Drive folder containing 'useful' and 'not-useful' subfolders.
+# Found in the folder's URL: drive.google.com/drive/folders/<ID>
+GOOGLE_DRIVE_ROOT_FOLDER_ID=
+
+# Set to "true" if the Drive folder is a shared drive (Team Drive).
+# Leave blank or omit for standard My Drive folders.
+GOOGLE_DRIVE_USE_SHARED_DRIVE=
diff --git a/.github/workflows/working_sw.yml b/.github/workflows/working_sw.yml
@@ -20,7 +20,7 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y tesseract-ocr
           python -m pip install --upgrade pip
-          pip install -r requirements.txt
+          pip install ".[dev]"
 
       # CI pipeline (streams 20 PDFs per class from Drive and trains model)
       - name: CI pipeline (Drive stream)
@@ -36,10 +36,10 @@ jobs:
         uses: actions/cache@v3
         with:
           path: |
-            src/model/models
+            src/classifier/models
             ~/.joblib
             ~/.sklearn
-          key: ${{ runner.os }}-sklearn-model-${{ hashFiles('src/model/**/*.py') }}
+          key: ${{ runner.os }}-sklearn-model-${{ hashFiles('src/classifier/**/*.py') }}
           restore-keys: |
             ${{ runner.os }}-sklearn-model-
 
@@ -60,17 +60,17 @@ jobs:
       - name: Run PDF extraction script (repo test asset)
         run: |
           INPUT_PDF="tests/test.pdf"
-          python src/preprocessing/pdf_text_extraction.py "$INPUT_PDF"
+          python -m src.io.pdf_text_extraction "$INPUT_PDF"
 
       # - name: Generate labels
       #   run: |
       #     echo "Generating labels for extracted text..."
-      #     python src/preprocessing/generate_labels.py
+      #     python -m src.io.generate_labels
 
       # - name: Load and preprocess data
       #   run: |
       #     echo "Loading and preprocessing dataset..."
-      #     python src/preprocessing/data_loader.py
+      #     python -m src.io.data_loader
       # No full pipeline here; use full_training_pipeline.py on main or a scheduled workflow
 
       - name: Validate preprocessing pipeline
@@ -113,8 +113,9 @@ jobs:
         with:
           name: trained-model
           path: |
-            src/model/models/pdf_classifier_model.pkl
-            src/model/models/tfidf_vectorizer.pkl
+            src/classifier/models/pdf_classifier.json
+            src/classifier/models/tfidf_vectorizer.pkl
+            src/classifier/models/label_encoder.pkl
 
       - name: Show pipeline summary
         run: |
@@ -124,4 +125,4 @@ jobs:
           echo "2. Label Generation Results:"
           ls -l data/labels.json
           echo "3. Model Training Results:"
-          ls -l src/model/models/
+          ls -l src/classifier/models/
diff --git a/.gitignore b/.gitignore
@@ -211,6 +211,7 @@ data/needs-check
 data/not-useful
 data/processed-text
 data/useful
-src/model/models/*.pkl
+data/results/
+src/classifier/models/*.pkl
 
 
diff --git a/README.md b/README.md
@@ -125,7 +125,7 @@ git clone https://github.com/NovakLabOSU/FracFeedExtractor.git
 cd FracFeedExtractor
 python3 -m venv venv
 source venv/bin/activate
-pip install -r requirements.txt
+pip install -e ".[dev]"
 ```
 
 ```bash
@@ -134,17 +134,17 @@ git clone https://github.com/NovakLabOSU/FracFeedExtractor.git
 cd FracFeedExtractor
 py -m venv venv
 ./venv/Scripts/activate
-pip install -r requirements.txt
+pip install -e ".[dev]"
 ```
 
 ### Quick Start
 
 ```bash
 # Classify and extract from a folder of PDFs
-python classify_extract.py path/to/pdfs/
+python src/pipeline/classify_extract.py path/to/pdfs/
 
 # Adjust the LLM model or confidence threshold
-python classify_extract.py path/to/pdfs/ --llm-model llama3.1:8b --confidence-threshold 0.70
+python src/pipeline/classify_extract.py path/to/pdfs/ --llm-model qwen2.5:7b --confidence-threshold 0.70
 ```
 
 Results are written to `data/results/metrics/` (per-paper JSON) and `data/results/summaries/` (pipeline CSV).

diff --git a/data/results/Adams_1989_results.json b/data/results/Adams_1989_results.json
diff --git a/data/results/Ferreira_1999_results.json b/data/results/Ferreira_1999_results.json
diff --git a/data/results/Fisher_2008_results.json b/data/results/Fisher_2008_results.json
diff --git a/data/results/Sousa_2015_results.json b/data/results/Sousa_2015_results.json
diff --git a/data/results/classifications.csv b/data/results/classifications.csv
diff --git a/data/results/classifications.json b/data/results/classifications.json
diff --git a/data/results/metrics/Adams_1989_results.json b/data/results/metrics/Adams_1989_results.json
diff --git a/data/results/summaries/txt_pipeline_summary_20260227_211255.csv b/data/results/summaries/txt_pipeline_summary_20260227_211255.csv
diff --git a/data/results/test_biomistral_results.json b/data/results/test_biomistral_results.json
diff --git a/data/results/test_quick_results.json b/data/results/test_quick_results.json
diff --git a/data/results/test_results.json b/data/results/test_results.json