Skip to content

Commit 5f3b453

Browse files
authored
Merge pull request #1413 from Open-Source-Legal/pr1399-e2e-extract-test
E2E Playwright spec for extract pipeline + VCR replay (rebased onto PR #1399)
2 parents 1e67b38 + 815d509 commit 5f3b453

17 files changed

Lines changed: 2142 additions & 4 deletions

File tree

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
name: Frontend E2E Extract Pipeline (VCR)
2+
3+
# Drives the full extract pipeline end-to-end via Playwright:
4+
# login → upload two PDFs → ingest+embed → create extract → run → CSV.
5+
# The LLM call inside `doc_extract_query_task` is wrapped in a VCR.py
6+
# cassette so no OpenAI traffic is generated. See:
7+
# docs/development/e2e_vcr.md
8+
#
9+
# *** CURRENTLY MANUAL-TRIGGER ONLY ***
10+
#
11+
# This workflow is parked behind `workflow_dispatch` because LlamaParse —
12+
# the default PDF parser used during ingest — is NOT yet covered by VCR.
13+
# Until `_LLM_HOSTS` in `opencontractserver/utils/vcr_replay.py` is
14+
# extended to include `api.cloud.llamaindex.ai` AND the cassette is
15+
# re-recorded to capture the parse-and-poll calls, this workflow would
16+
# call LlamaParse for real on every CI run. Triggering it manually is
17+
# OK for spot-checks (LlamaParse calls cost cents per doc); enabling it
18+
# on every PR is not.
19+
#
20+
# To enable on PRs: replace the `on:` block with the standard
21+
# pull_request paths filter (see frontend-e2e.yml for the template).
22+
23+
env:
24+
DOCKER_BUILDKIT: 1
25+
COMPOSE_DOCKER_CLI_BUILD: 1
26+
27+
defaults:
28+
run:
29+
working-directory: ./
30+
31+
on:
32+
workflow_dispatch:
33+
34+
concurrency:
35+
group: frontend-e2e-extract-${{ github.head_ref || github.run_id }}
36+
cancel-in-progress: true
37+
38+
jobs:
39+
e2e-extract:
40+
name: Extract pipeline (PDF upload → run → CSV)
41+
runs-on: ubuntu-latest
42+
timeout-minutes: 30
43+
permissions:
44+
contents: read
45+
46+
steps:
47+
- name: Checkout
48+
uses: actions/checkout@v6
49+
50+
- name: Setup Node.js
51+
uses: actions/setup-node@v6
52+
with:
53+
node-version: "20"
54+
55+
- name: Install Yarn
56+
run: npm install -g yarn
57+
58+
- name: Install frontend dependencies
59+
working-directory: ./frontend
60+
run: yarn install --frozen-lockfile
61+
62+
- name: Install Playwright browsers
63+
working-directory: ./frontend
64+
run: yarn playwright install --with-deps chromium
65+
66+
# ────────────────────────────────────────────────────────────────
67+
# Bring up the full local stack (postgres, redis, embedder,
68+
# docling-parser, django, celeryworker). We use `local.yml` rather
69+
# than `test.yml` because the extract pipeline needs a celery
70+
# worker, and `local.yml` is the only compose file that ships one.
71+
#
72+
# The cassette replays every OpenAI call so OPENAI_API_KEY can be
73+
# anything. LLAMAPARSE_API_KEY is still required (real calls,
74+
# see file-level note above).
75+
# ────────────────────────────────────────────────────────────────
76+
- name: Build django image
77+
run: docker compose -f local.yml build django
78+
79+
- name: Start backend stack with VCR replay
80+
env:
81+
OC_LLM_VCR_MODE: replay
82+
OC_LLM_VCR_CASSETTE: /app/opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml
83+
# CI provisions a fake OpenAI key — the cassette intercepts
84+
# every request so this is never sent. We pin it to a clearly
85+
# bogus value to make accidental real calls fail loudly.
86+
OPENAI_API_KEY: sk-FAKE-VCR-CI-NOT-REAL
87+
# LlamaParse is NOT yet under VCR; the workflow secret is
88+
# required for real PDF parsing. See file-level note.
89+
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
90+
run: |
91+
docker compose -f local.yml up -d
92+
echo "Waiting for django to become healthy…"
93+
for i in {1..60}; do
94+
state=$(docker inspect -f '{{.State.Health.Status}}' \
95+
opencontracts-django-1 2>/dev/null || echo "starting")
96+
if [ "$state" = "healthy" ]; then echo "django healthy"; break; fi
97+
if [ "$i" = "60" ]; then
98+
echo "django did not become healthy in time"
99+
docker compose -f local.yml logs django | tail -100
100+
exit 1
101+
fi
102+
sleep 2
103+
done
104+
105+
- name: Run Playwright extract spec
106+
working-directory: ./frontend
107+
env:
108+
CI: "true"
109+
E2E_RUN_LLM_TESTS: "true"
110+
E2E_TEST_USERNAME: admin
111+
# The local stack creates the superuser from
112+
# DJANGO_SUPERUSER_PASSWORD in `.envs/.local/.django`. CI must
113+
# supply the same value to the playwright fixture.
114+
E2E_TEST_PASSWORD: ${{ secrets.E2E_LOCAL_SUPERUSER_PASSWORD }}
115+
run: yarn playwright test --grep "Extract PDF workflow" --reporter=list
116+
117+
- name: Capture backend logs on failure
118+
if: failure()
119+
run: |
120+
mkdir -p artifacts
121+
docker compose -f local.yml ps > artifacts/docker-ps.txt || true
122+
docker compose -f local.yml logs --no-color > artifacts/docker-compose-logs.txt || true
123+
124+
- name: Upload backend logs on failure
125+
if: failure()
126+
uses: actions/upload-artifact@v7
127+
with:
128+
name: e2e-extract-backend-logs
129+
path: artifacts/
130+
131+
- name: Upload Playwright HTML report on failure
132+
if: failure()
133+
uses: actions/upload-artifact@v7
134+
with:
135+
name: e2e-extract-playwright-report
136+
path: frontend/playwright-report-e2e/
137+
if-no-files-found: ignore
138+
139+
- name: Tear down backend stack
140+
if: always()
141+
run: docker compose -f local.yml down -v

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
### Added
1111

12+
- **VCR.py wrapper for LLM calls in `doc_extract_query_task`** — `opencontractserver/utils/vcr_replay.py` exposes a `maybe_vcr_cassette()` context manager that, when `OC_LLM_VCR_MODE` and `OC_LLM_VCR_CASSETTE` are set on the celery worker, records or replays every HTTP call to LLM provider hosts (currently `api.openai.com` / `api.anthropic.com`). A custom request-body matcher strips volatile values (millisecond timestamps, Django document PKs, OpenAI tool-call IDs, UUIDs) so a cassette recorded against one DB replays cleanly against another. With the env vars unset the wrapper is a no-op — production behavior is unchanged. Pre-recorded cassette for the E2E extract spec lives at `opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml`. Replay was verified end-to-end against a deliberately-fake `OPENAI_API_KEY` to confirm no real network call is made. See `docs/development/e2e_vcr.md` for record / replay / debug instructions.
13+
- **`.github/workflows/frontend-e2e-extract.yml`** — CI workflow (currently `workflow_dispatch` only) that runs the new E2E extract spec against the full `local.yml` stack with `OC_LLM_VCR_MODE=replay` and a fake `OPENAI_API_KEY`. Manual-only because LlamaParse is not yet covered by the cassette and would otherwise be called for real on each run; activating on every PR is a follow-up that needs to extend `_LLM_HOSTS` and re-record.
14+
- **`frontend/tests/e2e/extract-pdf-workflow.spec.ts`** — full-stack Playwright E2E spec for the extract pipeline: login → create corpus → upload two PDFs (`frontend/tests/fixtures/{usc-title-1,eton-agreement}.pdf`) → wait for parse + embedding → create extract with one column → run with a real OpenAI call → CSV export → assert non-empty cells. Adds new helpers to `frontend/tests/e2e/helpers.ts` (`uploadPdfViaUI`, `waitForDocumentReady`, `createExtractViaUI`, `openExtractByName`, `addColumnViaUI`, `addDocumentsToExtractViaUI`, `runExtractAndWaitForFinish`). Gated on `E2E_RUN_LLM_TESTS=true`; skipped in CI until LLM responses can be mocked over the wire. Runs on the live `local.yml` stack; required tweaks to disable Auth0 (`.envs/.local/.django USE_AUTH0=false`) and to widen the celeryworker `watchfiles --ignore-paths` (in `compose/local/django/celery/worker/start` and the `local.yml` command pointer) so editor / Playwright artifact writes don't hot-reload the worker mid-task. Also adds `data-testid="document-card"` (+ `data-processing` on the `/documents`-view variants) to `frontend/src/views/Documents.tsx` and `data-testid="document-card"` to `frontend/src/components/documents/ModernDocumentItem.tsx`, so tests can poll for the `backendLock` UI signal without depending on hover-only action menus. Cards are matched by `[data-testid="document-card"]` filtered with the visible title text — the standard Playwright pattern.
1215
- **Mypy: type analyzer, shared, agents, badges, worker_uploads; introduce shared protocols** (Issue #1335): Brought the five smaller, interface-rich target packages over the ≥70% return-annotation bar called for by the issue and seeded `opencontractserver/types/protocols.py` with the four protocols requested in the scope:
1316
- `VectorStoreProtocol` — minimum surface (`search` / `async_search`) implemented by `CoreAnnotationVectorStore` (`opencontractserver/llms/vector_stores/core_vector_stores.py`); imported and re-exported from that module so consumers can annotate against the protocol rather than the concrete dataclass.
1417
- `PipelineComponentProtocol` — `title` / `description` / `author` / `dependencies` surface that the pipeline registry duck-types against; imported from `opencontractserver/pipeline/base/base_component.py` so any future parser/embedder/thumbnailer registered outside the inheritance hierarchy still type-checks against the same contract.

compose/local/django/celery/worker/start

100644100755
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ set -o errexit
44
set -o nounset
55

66

7-
watchfiles --target-type command "celery -A config.celery_app worker -l INFO --concurrency=1 -Q celery,worker_uploads"
7+
watchfiles --ignore-paths frontend,node_modules,.git,__pycache__,.mypy_cache,.pytest_cache,media,staticfiles,docs,.claude,.playwright-mcp,playwright-report-e2e,test-results --target-type command "celery -A config.celery_app worker -l INFO --concurrency=1 -Q celery,worker_uploads"

docs/development/e2e_vcr.md

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Recording and replaying LLM cassettes for the extract E2E spec
2+
3+
The E2E Playwright spec at `frontend/tests/e2e/extract-pdf-workflow.spec.ts` drives the full extract pipeline including a real LLM call. To keep CI fast, deterministic, and free of LLM-API spend, the call can be wrapped in a [VCR.py](https://vcrpy.readthedocs.io/) cassette so the recorded HTTP interaction is replayed instead of hitting the provider.
4+
5+
## How it works
6+
7+
`opencontractserver/utils/vcr_replay.py` exposes a `maybe_vcr_cassette()` context manager that is wrapped around the agent invocation in `opencontractserver/tasks/data_extract_tasks.py::doc_extract_query_task`. When the relevant env vars are unset, the manager is a no-op — production behavior is unchanged.
8+
9+
When the env vars are set:
10+
- `OC_LLM_VCR_MODE=record` — every LLM HTTP call is captured to the cassette file (overwriting any existing one).
11+
- `OC_LLM_VCR_MODE=once` — record if missing, replay if present.
12+
- `OC_LLM_VCR_MODE=replay` — replay only; an unmatched request raises `CannotOverwriteExistingCassetteException` and bubbles up as an extraction failure.
13+
14+
The cassette path is supplied via `OC_LLM_VCR_CASSETTE` (filesystem path, must be visible inside the celery worker container).
15+
16+
A custom matcher strips volatile fields from request bodies so a cassette recorded against one DB (with timestamp `1777504812606` and document PK `56`, etc.) replays cleanly against another (with different IDs and timestamps). The patterns are in `_VOLATILE_PATTERNS` at the top of `vcr_replay.py` — extend them as new volatile values surface.
17+
18+
## Recording a fresh cassette
19+
20+
1. Bring up the local stack with the VCR env vars set so the worker container picks them up:
21+
22+
```bash
23+
OC_LLM_VCR_MODE=record \
24+
OC_LLM_VCR_CASSETTE=/app/opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml \
25+
docker compose -f local.yml up -d --no-deps --force-recreate celeryworker
26+
```
27+
28+
2. Run the E2E spec end-to-end against a real OpenAI key:
29+
30+
```bash
31+
cd frontend
32+
E2E_RUN_LLM_TESTS=true E2E_TEST_PASSWORD="<your superuser password>" \
33+
yarn test:e2e --grep "Extract PDF workflow" --reporter=list
34+
```
35+
36+
3. Verify the cassette landed under `opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml`. Commit it.
37+
38+
## Replaying
39+
40+
1. Bring up the worker in replay mode (no real key required — VCR intercepts every request to the LLM provider):
41+
42+
```bash
43+
OC_LLM_VCR_MODE=replay \
44+
OC_LLM_VCR_CASSETTE=/app/opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml \
45+
docker compose -f local.yml up -d --no-deps --force-recreate celeryworker
46+
```
47+
48+
2. Run the spec normally. Total runtime stays roughly the same (~1.6 min) — the bulk of the wall-clock time is parser ingest + Playwright interaction, not the LLM call, so replay does not noticeably speed up the spec.
49+
50+
To prove no real LLM call happens, replace `OPENAI_API_KEY` in `.envs/.local/.django` with a deliberately-fake string before re-running. The spec still passes.
51+
52+
## Debugging matcher failures
53+
54+
Set `OC_LLM_VCR_DEBUG=1` (also forwarded through `local.yml` to the worker). On every body-mismatch the matcher writes a JSON line with the first byte of difference + 200 chars of context to `/tmp/vcr-mismatch-<pid>.log` inside the container. Tail it to see exactly what's volatile and isn't being normalized:
55+
56+
```bash
57+
docker exec celeryworker bash -c 'tail -1 /tmp/vcr-mismatch-*.log | python3 -m json.tool'
58+
```
59+
60+
Add new patterns to `_VOLATILE_PATTERNS` until the matcher succeeds.
61+
62+
## Limitations / follow-ups
63+
64+
- **LlamaParse is not yet covered.** PDF ingest still calls `https://api.cloud.llamaindex.ai`. To run the spec in CI without any external network, `_LLM_HOSTS` in `vcr_replay.py` needs to grow to include the LlamaParse host AND the wrapper needs to be applied around `ingest_doc` too. That's a separate PR.
65+
- **Cassette goes stale on prompt changes.** Any change to the structured-extraction system prompt, the column query, or the tool schemas will produce a new request body and require re-recording.
66+
- **One cassette per spec.** The current cassette is named `e2e_extract_pdf_workflow/extract.yaml`. If you add a second LLM-using spec, give it its own cassette directory.
67+
68+
## Why VCR rather than `pydantic_ai.models.test.TestModel`
69+
70+
A `TestModel` would be cheaper (no HTTP at all) but it would not exercise the openai-SDK + httpx + pydantic-ai integration path. PR #1399's `failure_mode=no_final_response` classifier specifically targets that integration path — bypassing it with a pure-Python test double would mask the very class of bugs the spec exists to catch.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import baseConfig from "./playwright.config";
2+
import { defineConfig } from "@playwright/test";
3+
4+
/**
5+
* Playwright config override that forces video recording for every test.
6+
* Use only for capturing demo footage (`yarn playwright test -c playwright.video.config.ts ...`);
7+
* the default config keeps `video: "retain-on-failure"` to avoid bloat.
8+
*/
9+
export default defineConfig({
10+
...baseConfig,
11+
use: {
12+
...baseConfig.use,
13+
video: "on",
14+
viewport: { width: 1280, height: 800 },
15+
},
16+
});

frontend/src/components/documents/ModernDocumentItem.tsx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,6 +1286,7 @@ export const ModernDocumentItem: React.FC<ModernDocumentItemProps> = ({
12861286
<>
12871287
<CardContainer
12881288
ref={setNodeRef}
1289+
data-testid="document-card"
12891290
className={`${is_selected ? "is-selected" : ""} ${
12901291
isProcessing ? "backend-locked" : ""
12911292
} ${isFailed ? "failed" : ""} ${
@@ -1497,6 +1498,7 @@ export const ModernDocumentItem: React.FC<ModernDocumentItemProps> = ({
14971498
<>
14981499
<ListContainer
14991500
ref={setNodeRef}
1501+
data-testid="document-card"
15001502
className={`${is_selected ? "is-selected" : ""} ${
15011503
isProcessing ? "backend-locked" : ""
15021504
} ${isFailed ? "failed" : ""} ${isLongPressing ? "long-pressing" : ""}`}

frontend/src/views/Documents.tsx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,6 +1380,8 @@ export const Documents = () => {
13801380
key={doc.id}
13811381
role="button"
13821382
tabIndex={0}
1383+
data-testid="document-card"
1384+
data-processing={String(Boolean(doc.backendLock))}
13831385
aria-label={`Open document ${doc.title || "Untitled"}`}
13841386
$selected={selected_document_ids.includes(doc.id)}
13851387
onClick={() => handleDocumentClick(doc)}
@@ -1502,6 +1504,8 @@ export const Documents = () => {
15021504
key={doc.id}
15031505
role="row"
15041506
tabIndex={0}
1507+
data-testid="document-card"
1508+
data-processing={String(Boolean(doc.backendLock))}
15051509
aria-label={`Open document ${doc.title || "Untitled"}`}
15061510
$selected={selected_document_ids.includes(doc.id)}
15071511
onClick={() => handleDocumentClick(doc)}
@@ -1569,6 +1573,8 @@ export const Documents = () => {
15691573
key={doc.id}
15701574
role="listitem"
15711575
tabIndex={0}
1576+
data-testid="document-card"
1577+
data-processing={String(Boolean(doc.backendLock))}
15721578
aria-label={`Open document ${doc.title || "Untitled"}`}
15731579
$selected={selected_document_ids.includes(doc.id)}
15741580
onClick={() => handleDocumentClick(doc)}

0 commit comments

Comments
 (0)