CarletonComputerScienceSociety · mikesiez · Jun 15, 2026 · Jun 15, 2026 · Jun 20, 2026 · Jun 20, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
     "celery",
     "redis",
     "discord-py>=2.7.1",
+    "beautifulsoup4>=4.15.0",
 ]
 
 [dependency-groups]

diff --git a/src/ingestion/scrapers/html_scraper.py b/src/ingestion/scrapers/html_scraper.py
@@ -1,43 +1,46 @@
-import httpx
-import trafilatura
-
-from src.config.logger import get_logger
-
-log = get_logger(__name__)
-
-
-def scrape(url: str) -> tuple[str, str | None]:
-    """Fetch a URL and extract clean text + title.
-
-    Returns (text, title). Title may be None if not found. Text may be empty
-    if trafilatura couldn't extract anything; the caller should check and skip.
-    """
-    log.info("scrape_started", url=url)
-    with httpx.Client(timeout=30, follow_redirects=True) as client:
-        response = client.get(url)
-        response.raise_for_status()
-
-    # Known limitations (tracked as a separate ticket):
-    #  - Misses content inside aria-hidden="true" accordions (common on FAQ pages)
-    #  - Misses question text but gets answer text on other FAQ pages, we should get both
-    #  - Link formatting is markdown-style and may need cleanup downstream
-    # Improving extraction quality is a tuning ticket, not a blocker for development.
-    text = (
-        trafilatura.extract(
-            response.text,
-            favor_recall=True,
-            include_links=True,
-            include_tables=True,
-            include_formatting=False,
-            include_comments=False,
-            deduplicate=True,
-            output_format="txt",
-        )
-        or ""
-    )
-
-    metadata = trafilatura.extract_metadata(response.text)
-    title = metadata.title if metadata else None
-
-    log.info("scrape_complete", url=url, chars=len(text), title=title)
-    return text, title
+import httpx
+from bs4 import BeautifulSoup
+
+
+def _extract_from_html(html: str) -> tuple[str, str | None]:
+    """Extract readable text and the page title from raw HTML.
+
+    Everything from the ``<!-- close main-wrapper`` marker onward is dropped
+    before parsing, then footer/navigation clutter is removed by class and by
+    tag. Each remaining link that does not point at an in-page anchor has its
+    target appended to its text as ``[PDF: href]`` or ``[Link: href]``.
+
+    Args:
+        html: Raw HTML markup of the page.
+
+    Returns:
+        ``(text, title)``. ``text`` is the remaining page text, one stripped
+        string per line. ``title`` is the text of the first ``<title>``
+        element, or ``None`` when the page has none.
+    """
+    # Keep only the markup before the marker; without it the page is kept whole.
+    markup = html.partition("<!-- close main-wrapper")[0]
+    soup = BeautifulSoup(markup, "html.parser")
+
+    # removing elements by class
+    clutter = ".footer, .global-nav, .navigation, .topbar, .content__meta, .visuallyhidden"
+    for el in soup.select(clutter):
+        el.decompose()
+
+    # removing elements by tag
+    for tag in soup(["header", "footer", "nav"]):
+        tag.decompose()
+
+    # formatting links after having cleared clutter
+    for link in soup.find_all("a"):
+        href = link.get("href")
+        if not href or href.startswith("#"):
+            continue
+
+        # ``link.string`` is None when the anchor wraps several nodes (or only
+        # an image), which would render as the literal "None"; use the
+        # combined text of the anchor instead.
+        label = link.get_text(" ", strip=True)
+
+        # Besides the plain "ends in pdf" check, match regardless of letter
+        # case and when a query string or fragment follows the ".pdf" path.
+        bare = href.split("#", 1)[0].split("?", 1)[0]
+        is_pdf = href.lower().endswith("pdf") or bare.lower().endswith(".pdf")
+        kind = "PDF" if is_pdf else "Link"
+
+        # ``strip`` drops the leading space when the anchor had no text.
+        link.string = f"{label} [{kind}: {href}]".strip()
+
+    text = soup.get_text("\n", strip=True)
+
+    # A page without a <title> yields None instead of raising IndexError.
+    title_tag = soup.find("title")
+    title = title_tag.get_text() if title_tag is not None else None
+
+    return text, title
+
+
+def scrape(url: str) -> tuple[str, str | None]:
+    """Download a page and run it through the HTML extractor.
+
+    Issues a GET for ``url`` (30 second timeout, redirects followed), calls
+    ``raise_for_status()`` on the response, and returns whatever
+    ``_extract_from_html`` produces for the response body as a
+    ``(text, title)`` pair.
+    """
+    with httpx.Client(follow_redirects=True, timeout=30) as http:
+        page = http.get(url)
+        page.raise_for_status()
+        body = page.text
+
+    return _extract_from_html(body)
diff --git a/tests/ingestion/fixtures/courseTest.html b/tests/ingestion/fixtures/courseTest.html
diff --git a/tests/ingestion/fixtures/faqTest.html b/tests/ingestion/fixtures/faqTest.html
diff --git a/tests/ingestion/scraper_offline.py b/tests/ingestion/scraper_offline.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+
+from src.ingestion.scrapers.html_scraper import _extract_from_html
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+# run with python -m tests.ingestion.scraper_offline
+
+
+def load_fixture(name: str) -> str:
+    """Read the named file under ``FIXTURES`` and return it as UTF-8 text."""
+    fixture_path = FIXTURES / name
+    return fixture_path.read_text(encoding="utf-8")
+
+
+def test_bcyber_page():
+    """Check title, kept content, stripped chrome and PDF tags for courseTest.html."""
+    text, title = _extract_from_html(load_fixture("courseTest.html"))
+
+    assert title == "Courses and Registration (B.Cyber.) - School of Computer Science"
+
+    # Body content is kept; the skip link and site search are expected to be gone.
+    assert "Electives and Prohibited Courses" in text
+    for chrome in ("Skip to Main Content", "Search this website"):
+        assert chrome not in text
+
+    # PDF links carry a "[PDF: <href>]" annotation.
+    expected_tags = (
+        "[PDF:",
+        "[PDF: https://carleton.ca/scs/wp-content/uploads/BCyber-Course-Map-202630-3.pdf]",
+        "[PDF: https://carleton.ca/scs/wp-content/uploads/FINAL2-BCyber-Course-Map-202530.pdf]",
+    )
+    for tag in expected_tags:
+        assert tag in text
+
+
+def test_new_student_faq():
+    """Check faqTest.html keeps questions and answers, in order, without the skip link."""
+    text, _title = _extract_from_html(load_fixture("faqTest.html"))
+
+    question, answer = "How do I build a timetable?", "Log into Carleton Central"
+    assert question in text
+    assert answer in text
+    # The question must come before its answer.
+    assert text.index(question) < text.index(answer)
+
+    # Second pair: the question heading plus a phrase from its answer block.
+    for expected in (
+        "How do I view my grades or exam schedule",
+        "Information about how to view your grades or exam schedule",
+    ):
+        assert expected in text
+
+    assert "Skip to Main Content" not in text
+
+# Run the checks only when executed as a script
+# (python -m tests.ingestion.scraper_offline); merely importing this module
+# must not trigger them.
+if __name__ == "__main__":
+    test_new_student_faq()
+    test_bcyber_page()
diff --git a/uv.lock b/uv.lock