diff --git a/.env.example b/.env.example index 6ed04f5..7ea1b1f 100644 --- a/.env.example +++ b/.env.example @@ -7,6 +7,9 @@ OLLAMA_CHAT_MODEL=llama3.2:3b OLLAMA_EMBEDDING_MODEL=nomic-embed-text EMBEDDING_DIM=768 TOP_K=3 +# Page list for `make ingest`. Point at data/webpages/test_list.json for a +# faster local run against a small subset. +INGEST_URL_LIST=data/webpages/list.json TEST_DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5445/cs_assistant_test DISCORD_BOT_TOKEN= DISCORD_GUILD_ID= diff --git a/README.md b/README.md index a6b031a..17ea649 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,15 @@ Scrapes, chunks, embeds, and stores all URLs listed in `data/webpages/list.json`. You'll see structured log output for each URL. Re-running is safe — unchanged content is skipped. +> **Local dev tip:** embedding the full list with a local model can be slow. For +> faster iteration, set `INGEST_URL_LIST` in your `.env` to the smaller curated +> subset (1-2 pages per category, exercising every page type), then `make ingest` +> as usual: +> +> ```bash +> INGEST_URL_LIST=data/webpages/test_list.json +> ``` + ### 9. Ask a question ```bash diff --git a/data/webpages/list.json b/data/webpages/list.json index 94b16d0..bcead94 100644 --- a/data/webpages/list.json +++ b/data/webpages/list.json @@ -1,5 +1,109 @@ [ + "https://ccss.carleton.ca/resources/articles/reducing-second-year-workload/", + "https://ccss.carleton.ca/resources/articles/should-you-take-z-section/", + "https://ccss.carleton.ca/resources/articles/things-to-consider-while-making-schedule/", "https://ccss.carleton.ca/resources/articles/which-electives-should-i-take/", + "https://ccss.carleton.ca/resources/articles/why-you-should-consider-a-minor/", + "https://ccss.carleton.ca/resources/faqs/are-all-sections-the-same/", + "https://ccss.carleton.ca/resources/faqs/can-i-take-2000-level-courses-in-first-year/", + "https://ccss.carleton.ca/resources/faqs/comp-1005-1006-vs-1405-1406/", "https://ccss.carleton.ca/resources/faqs/comp1405-z-section/", - "https://carleton.ca/scs/current-students/undergraduate-students/" + "https://ccss.carleton.ca/resources/faqs/contact-for-registration-help/", + "https://ccss.carleton.ca/resources/faqs/course-does-not-have-instructor/", + "https://ccss.carleton.ca/resources/faqs/first-year-courses-order/", + "https://ccss.carleton.ca/resources/faqs/free-vs-breadth-electives/", + "https://ccss.carleton.ca/resources/faqs/how-many-courses-in-a-semester/", + "https://ccss.carleton.ca/resources/faqs/lectures-vs-tutorials/", + "https://ccss.carleton.ca/resources/faqs/override-request/", + "https://ccss.carleton.ca/resources/faqs/scheduled-vs-unscheduled-courses/", + "https://ccss.carleton.ca/resources/faqs/should-i-register-for-both-terms/", + "https://ccss.carleton.ca/resources/faqs/stat-2507-in-first-year/", + "https://ccss.carleton.ca/resources/faqs/switch-courses-after-registering/", + "https://ccss.carleton.ca/resources/faqs/what-are-time-tickets/", + "https://ccss.carleton.ca/resources/faqs/which-courses-can-i-take-as-electives/", + "https://ccss.carleton.ca/resources/faqs/which-courses-in-first-year/", + + "https://carleton.ca/scs/bcyber/bachelor-of-cybersecurity/", + "https://carleton.ca/scs/bcyber/undergraduate-studies-bcyber/", + "https://carleton.ca/scs/current-students/bachelor-of-computer-science/", + "https://carleton.ca/scs/current-students/graduate-students/", + "https://carleton.ca/scs/future-students/bcs-internship/", + "https://carleton.ca/scs/future-students/graduate-studies/masters-program/", + "https://carleton.ca/scs/future-students/undergraduate-studies-bcs/", + "https://carleton.ca/scs/opportunities/", + "https://carleton.ca/scs/opportunities/undergraduate-opportunities/apply-to-be-a-ta/", + "https://carleton.ca/scs/program-information-and-admissions-phd/", + + "https://carleton.ca/academicadvising/academic-standing/", + "https://carleton.ca/academicadvising/academic-status-report/", + "https://carleton.ca/academicadvising/adding-or-dropping-minorsconcentration/", + "https://carleton.ca/academicadvising/audit-faqs/", + "https://carleton.ca/academicadvising/cgpas/", + "https://carleton.ca/academicadvising/changing-your-degree/", + "https://carleton.ca/academicadvising/changing-your-major/", + "https://carleton.ca/academicadvising/cumulative-grade-point-average-cgpa/", + "https://carleton.ca/academicadvising/degree-requirements/", + "https://carleton.ca/academicadvising/departmental-advisors-a-z/", + "https://carleton.ca/academicadvising/how-to-read-your-audit/", + "https://carleton.ca/academicadvising/making-program-changes/", + "https://carleton.ca/academicadvising/reduced-course-load/", + "https://carleton.ca/academicadvising/student-support-services/", + "https://carleton.ca/academicadvising/what-if-audit/", + "https://carleton.ca/academicadvising/why-add-planned-courses/", + "https://carleton.ca/academicadvising/your-academic-audit/", + + "https://carleton.ca/co-op/about/", + "https://carleton.ca/co-op/apply/undergraduate-students/", + "https://carleton.ca/co-op/co-op-awards/", + "https://carleton.ca/co-op/coop-1000/", + "https://carleton.ca/co-op/eligibility/graduate/", + "https://carleton.ca/co-op/faqs/current-co-op-students-faqs/co-op-job-search-faqs/", + "https://carleton.ca/co-op/faqs/prospective-co-op-students-faqs/", + "https://carleton.ca/co-op/important-dates/", + "https://carleton.ca/co-op/program-cost/", + "https://carleton.ca/co-op/programs/undegraduate/", + "https://carleton.ca/co-op/rules-regulations/co-op-participation-agreement/", + "https://carleton.ca/co-op/work-study-sequences/undergraduate/", + + "https://carleton.ca/registration/academic-audit/", + "https://carleton.ca/registration/access-to-courses/", + "https://carleton.ca/registration/block-registration/", + "https://carleton.ca/registration/course-selection-guide/", + "https://carleton.ca/registration/dates/timetickets/", + "https://carleton.ca/registration/new-ug/new-student-checklist/", + "https://carleton.ca/registration/override-requests/", + "https://carleton.ca/registration/registration-steps/", + "https://carleton.ca/registration/registration-support/", + "https://carleton.ca/registration/terminology/", + "https://carleton.ca/registration/waitlisting/", + + "https://calendar.carleton.ca/academicyear/", + "https://calendar.carleton.ca/undergrad/courses/COMP/", + "https://calendar.carleton.ca/undergrad/courses/CSEC/", + "https://calendar.carleton.ca/undergrad/undergradprograms/computerscience/", + + "https://calendar.carleton.ca/undergrad/regulations/academicregulationsoftheuniversity/academic-integrity-and-offenses-of-conduct/", + "https://calendar.carleton.ca/undergrad/regulations/academicregulationsoftheuniversity/examinations/", + "https://calendar.carleton.ca/undergrad/regulations/academicregulationsoftheuniversity/grading/", + "https://calendar.carleton.ca/undergrad/regulations/academicregulationsoftheuniversity/registration-evaluation-records/", + "https://calendar.carleton.ca/undergrad/regulations/academicregulationsoftheuniversity/regulations-for-degree-students/", + "https://calendar.carleton.ca/undergrad/regulations/academicregulationsoftheuniversity/regulations-for-students-with-disabilities/", + "https://calendar.carleton.ca/undergrad/regulations/academicregulationsoftheuniversity/regulations-for-students-with-religious-obligations/", + "https://calendar.carleton.ca/undergrad/regulations/academicregulationsoftheuniversity/student-responsibility/", + + "https://admissions.carleton.ca/apply/transfer-credit/", + "https://admissions.carleton.ca/deadlines/", + "https://admissions.carleton.ca/esl/", + "https://admissions.carleton.ca/minors/", + "https://admissions.carleton.ca/scholarships/", + "https://admissions.carleton.ca/scholarships/domestic-costs/", + "https://admissions.carleton.ca/scholarships/usa-and-international-costs/", + + "https://students.carleton.ca/services/academic-status-report/", + "https://students.carleton.ca/services/book-a-study-room/", + "https://students.carleton.ca/services/calculate-your-cgpa/", + "https://students.carleton.ca/services/counselling-services/", + "https://students.carleton.ca/services/cusa-health-plan/", + + "https://carleton.ca/ydyf/computer-science/" ] diff --git a/data/webpages/test_list.json b/data/webpages/test_list.json new file mode 100644 index 0000000..8c39193 --- /dev/null +++ b/data/webpages/test_list.json @@ -0,0 +1,23 @@ +[ + "https://ccss.carleton.ca/resources/articles/which-electives-should-i-take/", + "https://ccss.carleton.ca/resources/faqs/comp-1005-1006-vs-1405-1406/", + + "https://carleton.ca/scs/current-students/bachelor-of-computer-science/", + + "https://carleton.ca/academicadvising/your-academic-audit/", + + "https://carleton.ca/co-op/about/", + + "https://carleton.ca/registration/registration-steps/", + + "https://calendar.carleton.ca/undergrad/undergradprograms/computerscience/", + "https://calendar.carleton.ca/undergrad/courses/COMP/", + + "https://calendar.carleton.ca/undergrad/regulations/academicregulationsoftheuniversity/grading/", + + "https://admissions.carleton.ca/deadlines/", + + "https://students.carleton.ca/services/calculate-your-cgpa/", + + "https://carleton.ca/ydyf/computer-science/" +] diff --git a/scripts/ingest.py b/scripts/ingest.py index f099fa7..99b75f9 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -2,12 +2,15 @@ import json from pathlib import Path +from src.config import settings from src.config.logger import get_logger from src.ingestion.services.ingestion_service import ingest_url log = get_logger(__name__) -URL_LIST_PATH = Path("data/webpages/list.json") +# Defaults to the full curated list. Override via .env to ingest a smaller subset +# for faster local iteration: INGEST_URL_LIST=data/webpages/test_list.json +URL_LIST_PATH = Path(settings.ingest_url_list) async def main() -> None: diff --git a/src/config/__init__.py b/src/config/__init__.py index 7dcf1ff..897f0e9 100644 --- a/src/config/__init__.py +++ b/src/config/__init__.py @@ -11,6 +11,7 @@ class Settings(BaseSettings): ollama_embedding_model: str embedding_dim: int top_k: int = 3 + ingest_url_list: str = "data/webpages/list.json" test_database_url: str | None = None discord_bot_token: str | None = None discord_guild_id: int | None = None