diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e610d47..5d7d16e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -70,3 +70,41 @@ jobs: if: steps.changes.outputs.python == 'true' run: | uv run pytest -v tests/ + + deploy-docs: + name: "deploy docs" + if: github.ref == 'refs/heads/main' + runs-on: ubuntu-latest + # the gh-pages push in the last step needs a GITHUB_TOKEN with write access to repo contents + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - name: "Install uv" + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: "Install project with docs extras" + run: | + uv sync --group docs + + - name: "Build Sphinx docs" + run: | + uv run sphinx-build -b html docs docs/_build + + - name: "Upload Pages artifact" + uses: actions/upload-pages-artifact@v3 + with: + path: docs/_build + + - name: "Deploy to GitHub Pages" + uses: peaceiris/actions-gh-pages@v3 + if: github.ref == 'refs/heads/main' + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: docs/_build/ diff --git a/.gitignore b/.gitignore index 2509593..b1f5894 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ uv.lock # dirs tmp +docs/_build/ diff --git a/docs/_static/.gitkeep b/docs/_static/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..43e56d5 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,10 @@ +API Layer +========= + +api.routers.transcript +---------------------- + +.. 
automodule:: lingua_loop.api.routers.transcript + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..847569b --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,23 @@ +"""Sphinx configuration for lingua-loop.""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src")) + +project = "lingua-loop" +copyright = "2024, Jared Frazier" +author = "Jared Frazier" +release = "0.1.4" + +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", +] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] diff --git a/docs/db.rst b/docs/db.rst new file mode 100644 index 0000000..cb32a33 --- /dev/null +++ b/docs/db.rst @@ -0,0 +1,27 @@ +Database Layer +============== + +db.models +--------- + +.. automodule:: lingua_loop.db.models + :members: + :undoc-members: + :show-inheritance: + :exclude-members: Base + +db.session +---------- + +.. automodule:: lingua_loop.db.session + :members: + :undoc-members: + :show-inheritance: + +db.transcript +------------- + +.. automodule:: lingua_loop.db.transcript + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/exceptions.rst b/docs/exceptions.rst new file mode 100644 index 0000000..c34c2e0 --- /dev/null +++ b/docs/exceptions.rst @@ -0,0 +1,7 @@ +Exceptions +========== + +.. automodule:: lingua_loop.exceptions + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..b89868f --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,23 @@ +Welcome to lingua-loop's documentation! +========================================= + +A web application to train your listening skills by transcribing real speech +from YouTube videos. + +.. 
toctree:: + :maxdepth: 1 + :caption: Contents: + + api + schemas + services + db + exceptions + integrations + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/integrations.rst b/docs/integrations.rst new file mode 100644 index 0000000..d130e37 --- /dev/null +++ b/docs/integrations.rst @@ -0,0 +1,18 @@ +Integrations +============ + +integrations.youtube.types +-------------------------- + +.. automodule:: lingua_loop.integrations.youtube.types + :members: + :undoc-members: + :show-inheritance: + +integrations.youtube.wrapper +---------------------------- + +.. automodule:: lingua_loop.integrations.youtube.wrapper + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/schemas.rst b/docs/schemas.rst new file mode 100644 index 0000000..7391c45 --- /dev/null +++ b/docs/schemas.rst @@ -0,0 +1,7 @@ +Schemas +======= + +.. automodule:: lingua_loop.schemas.transcript + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/services.rst b/docs/services.rst new file mode 100644 index 0000000..25adc56 --- /dev/null +++ b/docs/services.rst @@ -0,0 +1,18 @@ +Services Layer +============== + +services.text_normalization +--------------------------- + +.. automodule:: lingua_loop.services.text_normalization + :members: + :undoc-members: + :show-inheritance: + +services.transcript +------------------- + +.. 
automodule:: lingua_loop.services.transcript + :members: + :undoc-members: + :show-inheritance: diff --git a/pyproject.toml b/pyproject.toml index 95c3b86..8661e62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,3 +53,8 @@ dev = [ "pytest-skip-slow>=0.0.5", "twine>=6.2.0", ] + +docs = [ + "sphinx>=8.2.1", + "sphinx-rtd-theme>=3.0.2", +] diff --git a/src/lingua_loop/api/routers/transcript.py b/src/lingua_loop/api/routers/transcript.py index 4419664..97f0c9b 100644 --- a/src/lingua_loop/api/routers/transcript.py +++ b/src/lingua_loop/api/routers/transcript.py @@ -1,3 +1,5 @@ +"""API routes for transcript operations.""" + from typing import List from fastapi import APIRouter @@ -29,6 +31,7 @@ async def get_transcript( language_code: SupportedLanguageCodes, session=Depends(get_async_session), ): + """Get a transcript for the given video ID and language code.""" transcript = await get_or_create_transcript_with_segments( video_id=video_id, language_code=language_code, session=session ) @@ -44,6 +47,7 @@ async def get_transcript( def _segments_to_schema(segments: List[Segment]) -> List[SegmentSchema]: + """Convert Segment ORM models to SegmentSchema instances.""" segments_as_schema = [ SegmentSchema( start=segment.start, duration=segment.duration, text=segment.text @@ -58,9 +62,9 @@ async def score_transcription( request: ScoreRequest, session: AsyncSession = Depends(get_async_session), ): + """Score a user's transcription against the reference text.""" await _validate_score_request(request=request, session=session) - # Score the request score, reference_text = await compute_score( video_id=request.video_id, segment_indices=request.segment_indices, @@ -75,6 +79,7 @@ async def score_transcription( async def _validate_score_request( request: ScoreRequest, session: AsyncSession ) -> None: + """Validate the score request against available segments.""" transcript = await get_or_create_transcript_with_segments( video_id=request.video_id, session=session, diff --git 
a/src/lingua_loop/constants.py b/src/lingua_loop/constants.py index 7e05775..2d96615 100644 --- a/src/lingua_loop/constants.py +++ b/src/lingua_loop/constants.py @@ -1,3 +1,5 @@ +"""Constants for Lingua Loop.""" + from pathlib import Path # Directories diff --git a/src/lingua_loop/db/models.py b/src/lingua_loop/db/models.py index 92110f8..b05d05e 100644 --- a/src/lingua_loop/db/models.py +++ b/src/lingua_loop/db/models.py @@ -1,3 +1,5 @@ +"""SQLAlchemy ORM models for Lingua Loop.""" + from datetime import UTC from datetime import datetime from typing import List @@ -14,7 +16,7 @@ class Base(DeclarativeBase): - pass + """Base class for all ORM models.""" class Transcript(Base): @@ -40,6 +42,8 @@ class Transcript(Base): class Segment(Base): + """Represents a segment within a transcript.""" + __tablename__ = "segment" id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) start: Mapped[float] diff --git a/src/lingua_loop/db/session.py b/src/lingua_loop/db/session.py index 701ac1d..1a90058 100644 --- a/src/lingua_loop/db/session.py +++ b/src/lingua_loop/db/session.py @@ -1,3 +1,5 @@ +"""Database session management utilities.""" + from collections.abc import AsyncGenerator from pathlib import Path from typing import Tuple @@ -17,6 +19,7 @@ def get_engine_and_session_maker( db_driver: str = DEFAULT_DB_DRIVER, database_path: Path | str = DEFAULT_DATABASE_PATH, ) -> Tuple[AsyncEngine, async_sessionmaker[AsyncSession]]: + """Create and return the async engine and session maker.""" sqlalchemy_database_url = f"{db_driver}:///{database_path}" async_engine = create_async_engine(sqlalchemy_database_url) async_session_maker = async_sessionmaker( @@ -26,18 +29,22 @@ def get_engine_and_session_maker( async def create_db_and_tables(async_engine: AsyncEngine): + """Create all database tables defined in models.""" async with async_engine.begin() as conn: await conn.run_sync(Base.metadata.create_all) async def shutdown(async_engine: AsyncEngine): + """Dispose of the 
async engine.""" await async_engine.dispose() async def get_async_session( request: Request, ) -> AsyncGenerator[AsyncSession, None]: - # `request` and state populated by lifespan(app) + """Provide an async database session for dependency injection.""" + + # `state` property of `request` gets populated by main.py::lifespan async_session_maker = request.state.async_session_maker async with async_session_maker() as session: diff --git a/src/lingua_loop/db/transcript.py b/src/lingua_loop/db/transcript.py index 57f9c3a..25a6a6f 100644 --- a/src/lingua_loop/db/transcript.py +++ b/src/lingua_loop/db/transcript.py @@ -1,3 +1,5 @@ +"""CRUD operations for transcripts.""" + from typing import List from sqlalchemy import select @@ -19,6 +21,7 @@ async def read_or_create_transcript_with_segments( video_id: str, language_code: SupportedLanguageCodes, session: AsyncSession ) -> Transcript: + """Get or create a transcript with all segments for the given video.""" transcript = await _read_transcript_with_segments( video_id=video_id, session=session ) @@ -49,7 +52,7 @@ async def _read_transcript_with_segments( async def _create_transcript( video_id: str, language_code: SupportedLanguageCodes, session: AsyncSession ) -> Transcript: - + """Create a new transcript record with segments from YouTube.""" transcript_list = list_transcripts(video_id=video_id) has_transcript = video_has_transcript_in_language( transcript_list=transcript_list, language_code=language_code @@ -78,6 +81,7 @@ async def _create_transcript( def _get_segments(fetched_transcript: FetchedTranscript) -> List[Segment]: + """Convert a FetchedTranscript to a list of Segment ORM objects.""" segments: List[Segment] = [] snippets = fetched_transcript.snippets for snippet in snippets: diff --git a/src/lingua_loop/exceptions.py b/src/lingua_loop/exceptions.py index c6bbc2f..e27125a 100644 --- a/src/lingua_loop/exceptions.py +++ b/src/lingua_loop/exceptions.py @@ -1,15 +1,23 @@ +"""Custom exceptions for Lingua Loop.""" + from 
fastapi import status from fastapi.exceptions import HTTPException class TranscriptNotFoundError(Exception): + """Raised when a transcript is not found for a given video ID.""" + def __init__(self, video_id: str): + """Initialize the exception with the video_id.""" self.video_id = video_id super().__init__(f"Transcript not found for video_id={video_id}") class SegmentIndicesError(HTTPException): + """Raised when invalid segment indices are provided.""" + def __init__(self, segment_indices: list[int]): + """Initialize the exception with the invalid segment_indices.""" status_code = status.HTTP_400_BAD_REQUEST detail = f"Invalid segment indices, got {segment_indices}" super().__init__(status_code=status_code, detail=detail) diff --git a/src/lingua_loop/integrations/youtube/types.py b/src/lingua_loop/integrations/youtube/types.py index 5630c64..cf03a42 100644 --- a/src/lingua_loop/integrations/youtube/types.py +++ b/src/lingua_loop/integrations/youtube/types.py @@ -1,8 +1,12 @@ +"""YouTube integration type definitions.""" + from enum import Enum from typing import Dict class SupportedLanguageCodes(str, Enum): + """Supported language codes for YouTube transcripts.""" + DUTCH = "nl" ENGLISH = "en" GERMAN = "de" @@ -10,6 +14,8 @@ class SupportedLanguageCodes(str, Enum): class SupportedLanguages(str, Enum): + """Human-readable language names.""" + DUTCH = "Dutch" ENGLISH = "English" GERMAN = "German" diff --git a/src/lingua_loop/integrations/youtube/wrapper.py b/src/lingua_loop/integrations/youtube/wrapper.py index 96797f2..ca684ea 100644 --- a/src/lingua_loop/integrations/youtube/wrapper.py +++ b/src/lingua_loop/integrations/youtube/wrapper.py @@ -1,3 +1,5 @@ +"""Wrapper around YouTube Transcript API.""" + from youtube_transcript_api import FetchedTranscript from youtube_transcript_api import NoTranscriptFound from youtube_transcript_api import TranscriptList @@ -11,16 +13,19 @@ def fetch_transcript( video_id: str, language_code: SupportedLanguageCodes ) -> 
FetchedTranscript: + """Fetch a transcript for a video in a specific language.""" return ytt_api.fetch(video_id=video_id, languages=[language_code]) def list_transcripts(video_id: str) -> TranscriptList: + """List all available transcripts for a video.""" return ytt_api.list(video_id) def video_has_transcript_in_language( transcript_list: TranscriptList, language_code: SupportedLanguageCodes ) -> bool: + """Check if a video has a transcript in the specified language.""" transcript_found = True try: transcript = transcript_list.find_transcript( diff --git a/src/lingua_loop/main.py b/src/lingua_loop/main.py index 55d151e..552d61b 100644 --- a/src/lingua_loop/main.py +++ b/src/lingua_loop/main.py @@ -1,3 +1,5 @@ +"""FastAPI application factory and entry point.""" + from contextlib import asynccontextmanager from os import getenv from os import remove @@ -26,11 +28,14 @@ class State(TypedDict): + """Application state type definition.""" + async_session_maker: async_sessionmaker[AsyncSession] @asynccontextmanager async def lifespan(app: FastAPI): + """Application lifespan context manager, setting up and tearing down resources.""" async_engine, async_session_maker = get_engine_and_session_maker() await create_db_and_tables(async_engine=async_engine) yield {"async_session_maker": async_session_maker} @@ -41,6 +46,7 @@ async def lifespan(app: FastAPI): def create_app() -> FastAPI: + """Create and configure the FastAPI application.""" app = FastAPI(lifespan=lifespan) @app.exception_handler(TranscriptNotFoundError) diff --git a/src/lingua_loop/schemas/transcript.py b/src/lingua_loop/schemas/transcript.py index cab6596..bbadad2 100644 --- a/src/lingua_loop/schemas/transcript.py +++ b/src/lingua_loop/schemas/transcript.py @@ -1,3 +1,5 @@ +"""Pydantic models for transcript requests and responses.""" + from typing import List from pydantic import BaseModel @@ -9,18 +11,24 @@ class SegmentSchema(BaseModel): + """Schema for a transcript segment.""" + start: float = Field(ge=0.0) 
duration: float = Field(gt=0.0) text: str = Field(min_length=1) class TranscriptResponse(BaseModel): + """Response model for a transcript request.""" + video_id: str = Field(min_length=1) segments: List[SegmentSchema] is_generated: bool class ScoreRequest(BaseModel): + """Request model for scoring a transcription.""" + video_id: str = Field(min_length=1) segment_indices: List[int] = Field(min_length=1) user_text: str = Field(min_length=1) @@ -28,5 +36,7 @@ class ScoreRequest(BaseModel): class ScoreResponse(BaseModel): + """Response model for a scored transcription.""" + score: float = Field(ge=MIN_SCORE, le=MAX_SCORE) reference_text: str = Field(min_length=1) diff --git a/src/lingua_loop/scripts/run.py b/src/lingua_loop/scripts/run.py index 18bb95f..dbb4dd6 100644 --- a/src/lingua_loop/scripts/run.py +++ b/src/lingua_loop/scripts/run.py @@ -1,3 +1,5 @@ +"""Script entry point for running the Lingua Loop server.""" + import threading import webbrowser from os import mkdir diff --git a/src/lingua_loop/services/text_normalization.py b/src/lingua_loop/services/text_normalization.py index 42dcac6..a0a6fef 100644 --- a/src/lingua_loop/services/text_normalization.py +++ b/src/lingua_loop/services/text_normalization.py @@ -1,3 +1,5 @@ +"""Text normalization utilities for scoring comparisons.""" + from abc import ABC from abc import abstractmethod from re import sub @@ -25,6 +27,7 @@ class TextNormalizer(ABC): form: NormalizationForm = "NFKD" def normalize(self, text: str) -> str: + """Normalize text through all normalization steps.""" text = self.normalize_special_characters(text) text = self.normalize_case(text) text = self.normalize_punctuation(text) @@ -44,9 +47,11 @@ def normalize_special_characters(self, text: str) -> str: pass def normalize_case(self, text: str) -> str: + """Normalize text to lowercase.""" return text.lower() def normalize_punctuation(self, text: str) -> str: + """Remove punctuation from text.""" all_chars_except_words_and_single_spaces = r"[^\w\s]" 
space = " " return sub( @@ -56,25 +61,35 @@ def normalize_punctuation(self, text: str) -> str: ) def normalize_whitespace(self, text: str) -> str: + """Collapse whitespace and strip.""" one_or_more_spaces = r"\s+" space = " " return sub(pattern=one_or_more_spaces, repl=space, string=text).strip() class GenericNormalizer(TextNormalizer): + """Normalizer that performs no special character normalization.""" + def normalize_special_characters(self, text: str) -> str: + """Return text unchanged.""" return text class DutchNormalizer(TextNormalizer): + """Normalizer for Dutch text.""" + def normalize_special_characters(self, text: str) -> str: + """Remove combining characters via NFKD normalization.""" text = normalize(self.form, text) text = "".join(c for c in text if not combining(c)) return text class GermanNormalizer(TextNormalizer): + """Normalizer for German text.""" + def normalize_special_characters(self, text: str) -> str: + """Replace German special characters with ASCII equivalents.""" text = ( text.replace("ß", "ss") .replace("ä", "ae") @@ -85,12 +100,15 @@ def normalize_special_characters(self, text: str) -> str: class TextNormalizerFactory: + """Factory for creating appropriate TextNormalizer instances.""" + _language_code_to_normalizer = { SupportedLanguageCodes.DUTCH: DutchNormalizer, SupportedLanguageCodes.GERMAN: GermanNormalizer, } def __call__(self, language_code: SupportedLanguageCodes) -> TextNormalizer: + """Get the appropriate normalizer for the given language code.""" normalizer_cls = self._language_code_to_normalizer.get( language_code, GenericNormalizer ) diff --git a/src/lingua_loop/services/transcript.py b/src/lingua_loop/services/transcript.py index 10a91f1..8cef47f 100644 --- a/src/lingua_loop/services/transcript.py +++ b/src/lingua_loop/services/transcript.py @@ -1,3 +1,5 @@ +"""Service layer for transcript scoring.""" + from difflib import SequenceMatcher from typing import List from typing import Tuple @@ -31,7 +33,7 @@ async def 
compute_score( language_code: SupportedLanguageCodes, session: AsyncSession, ) -> Tuple[float, str]: - + """Compute a score comparing user text to reference text.""" transcript = await read_or_create_transcript_with_segments( video_id=video_id, session=session, language_code=language_code ) @@ -56,6 +58,7 @@ async def compute_score( def _get_transcript_segments_by_indices( transcript: Transcript, segment_indices: List[int] ) -> List[Segment]: + """Get transcript segments at the specified indices.""" assert _is_monotonically_increasing(segment_indices) segments = transcript.segments segments = [segments[ix] for ix in segment_indices] @@ -63,6 +66,7 @@ def _get_transcript_segments_by_indices( def _is_monotonically_increasing(indices: List[int]) -> bool: + """Check if the given indices are strictly increasing.""" assert len(indices) >= 1 monotonically_increasing: bool = True for ix in range(0, len(indices) - 1): @@ -75,7 +79,8 @@ def _is_monotonically_increasing(indices: List[int]) -> bool: def _score_text(reference_text: str, user_text: str): - """ + """Compute score using Gestalt pattern matching algorithm. + TODO: At some point, you may want to give the user information about word level mismatches so that they can see roughly what they missed... the current approach just gives an overall score. It also performs