Skip to content

Commit 5f3b453

Browse files
authored
Merge pull request #1413 from Open-Source-Legal/pr1399-e2e-extract-test
E2E Playwright spec for extract pipeline + VCR replay (rebased onto PR #1399)
2 parents 1e67b38 + 815d509 commit 5f3b453

17 files changed

Lines changed: 2142 additions & 4 deletions

File tree

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
name: Frontend E2E Extract Pipeline (VCR)
2+
3+
# Drives the full extract pipeline end-to-end via Playwright:
4+
# login → upload two PDFs → ingest+embed → create extract → run → CSV.
5+
# The LLM call inside `doc_extract_query_task` is wrapped in a VCR.py
6+
# cassette so no OpenAI traffic is generated. See:
7+
# docs/development/e2e_vcr.md
8+
#
9+
# *** CURRENTLY MANUAL-TRIGGER ONLY ***
10+
#
11+
# This workflow is parked behind `workflow_dispatch` because LlamaParse —
12+
# the default PDF parser used during ingest — is NOT yet covered by VCR.
13+
# Until `_LLM_HOSTS` in `opencontractserver/utils/vcr_replay.py` is
14+
# extended to include `api.cloud.llamaindex.ai` AND the cassette is
15+
# re-recorded to capture the parse-and-poll calls, this workflow would
16+
# call LlamaParse for real on every CI run. Triggering it manually is
17+
# OK for spot-checks (LlamaParse calls cost cents per doc); enabling it
18+
# on every PR is not.
19+
#
20+
# To enable on PRs: replace the `on:` block with the standard
21+
# pull_request paths filter (see frontend-e2e.yml for the template).
22+
23+
env:
24+
DOCKER_BUILDKIT: 1
25+
COMPOSE_DOCKER_CLI_BUILD: 1
26+
27+
defaults:
28+
run:
29+
working-directory: ./
30+
31+
on:
32+
workflow_dispatch:
33+
34+
concurrency:
35+
group: frontend-e2e-extract-${{ github.head_ref || github.run_id }}
36+
cancel-in-progress: true
37+
38+
jobs:
39+
e2e-extract:
40+
name: Extract pipeline (PDF upload → run → CSV)
41+
runs-on: ubuntu-latest
42+
timeout-minutes: 30
43+
permissions:
44+
contents: read
45+
46+
steps:
47+
- name: Checkout
48+
uses: actions/checkout@v6
49+
50+
- name: Setup Node.js
51+
uses: actions/setup-node@v6
52+
with:
53+
node-version: "20"
54+
55+
- name: Install Yarn
56+
run: npm install -g yarn
57+
58+
- name: Install frontend dependencies
59+
working-directory: ./frontend
60+
run: yarn install --frozen-lockfile
61+
62+
- name: Install Playwright browsers
63+
working-directory: ./frontend
64+
run: yarn playwright install --with-deps chromium
65+
66+
# ────────────────────────────────────────────────────────────────
67+
# Bring up the full local stack (postgres, redis, embedder,
68+
# docling-parser, django, celeryworker). We use `local.yml` rather
69+
# than `test.yml` because the extract pipeline needs a celery
70+
# worker, and `local.yml` is the only compose file that ships one.
71+
#
72+
# The cassette replays every OpenAI call so OPENAI_API_KEY can be
73+
# anything. LLAMAPARSE_API_KEY is still required (real calls,
74+
# see file-level note above).
75+
# ────────────────────────────────────────────────────────────────
76+
- name: Build django image
77+
run: docker compose -f local.yml build django
78+
79+
- name: Start backend stack with VCR replay
80+
env:
81+
OC_LLM_VCR_MODE: replay
82+
OC_LLM_VCR_CASSETTE: /app/opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml
83+
# CI provisions a fake OpenAI key — the cassette intercepts
84+
# every request so this is never sent. We pin it to a clearly
85+
# bogus value to make accidental real calls fail loudly.
86+
OPENAI_API_KEY: sk-FAKE-VCR-CI-NOT-REAL
87+
# LlamaParse is NOT yet under VCR; the workflow secret is
88+
# required for real PDF parsing. See file-level note.
89+
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
90+
run: |
91+
docker compose -f local.yml up -d
92+
echo "Waiting for django to become healthy…"
93+
for i in {1..60}; do
94+
state=$(docker inspect -f '{{.State.Health.Status}}' \
95+
opencontracts-django-1 2>/dev/null || echo "starting")
96+
if [ "$state" = "healthy" ]; then echo "django healthy"; break; fi
97+
if [ "$i" = "60" ]; then
98+
echo "django did not become healthy in time"
99+
docker compose -f local.yml logs django | tail -100
100+
exit 1
101+
fi
102+
sleep 2
103+
done
104+
105+
- name: Run Playwright extract spec
106+
working-directory: ./frontend
107+
env:
108+
CI: "true"
109+
E2E_RUN_LLM_TESTS: "true"
110+
E2E_TEST_USERNAME: admin
111+
# The local stack creates the superuser from
112+
# DJANGO_SUPERUSER_PASSWORD in `.envs/.local/.django`. CI must
113+
# supply the same value to the playwright fixture.
114+
E2E_TEST_PASSWORD: ${{ secrets.E2E_LOCAL_SUPERUSER_PASSWORD }}
115+
run: yarn playwright test --grep "Extract PDF workflow" --reporter=list
116+
117+
- name: Capture backend logs on failure
118+
if: failure()
119+
run: |
120+
mkdir -p artifacts
121+
docker compose -f local.yml ps > artifacts/docker-ps.txt || true
122+
docker compose -f local.yml logs --no-color > artifacts/docker-compose-logs.txt || true
123+
124+
- name: Upload backend logs on failure
125+
if: failure()
126+
uses: actions/upload-artifact@v7
127+
with:
128+
name: e2e-extract-backend-logs
129+
path: artifacts/
130+
131+
- name: Upload Playwright HTML report on failure
132+
if: failure()
133+
uses: actions/upload-artifact@v7
134+
with:
135+
name: e2e-extract-playwright-report
136+
path: frontend/playwright-report-e2e/
137+
if-no-files-found: ignore
138+
139+
- name: Tear down backend stack
140+
if: always()
141+
run: docker compose -f local.yml down -v

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
### Added
1111

12+
- **VCR.py wrapper for LLM calls in `doc_extract_query_task`** — `opencontractserver/utils/vcr_replay.py` exposes a `maybe_vcr_cassette()` context manager that, when `OC_LLM_VCR_MODE` and `OC_LLM_VCR_CASSETTE` are set on the celery worker, records or replays every HTTP call to LLM provider hosts (currently `api.openai.com` / `api.anthropic.com`). A custom request-body matcher strips volatile values (millisecond timestamps, Django document PKs, OpenAI tool-call IDs, UUIDs) so a cassette recorded against one DB replays cleanly against another. With the env vars unset the wrapper is a no-op — production behavior is unchanged. Pre-recorded cassette for the E2E extract spec lives at `opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml`. Replay was verified end-to-end against a deliberately-fake `OPENAI_API_KEY` to confirm no real network call is made. See `docs/development/e2e_vcr.md` for record / replay / debug instructions.
13+
- **`.github/workflows/frontend-e2e-extract.yml`** — CI workflow (currently `workflow_dispatch` only) that runs the new E2E extract spec against the full `local.yml` stack with `OC_LLM_VCR_MODE=replay` and a fake `OPENAI_API_KEY`. Manual-only because LlamaParse is not yet covered by the cassette and would otherwise be called for real on each run; activating on every PR is a follow-up that needs to extend `_LLM_HOSTS` and re-record.
14+
- **`frontend/tests/e2e/extract-pdf-workflow.spec.ts`** — full-stack Playwright E2E spec for the extract pipeline: login → create corpus → upload two PDFs (`frontend/tests/fixtures/{usc-title-1,eton-agreement}.pdf`) → wait for parse + embedding → create extract with one column → run with a real OpenAI call → CSV export → assert non-empty cells. Adds new helpers to `frontend/tests/e2e/helpers.ts` (`uploadPdfViaUI`, `waitForDocumentReady`, `createExtractViaUI`, `openExtractByName`, `addColumnViaUI`, `addDocumentsToExtractViaUI`, `runExtractAndWaitForFinish`). Gated on `E2E_RUN_LLM_TESTS=true`; skipped in CI until LLM responses can be mocked over the wire. Runs on the live `local.yml` stack; required tweaks to disable Auth0 (`.envs/.local/.django USE_AUTH0=false`) and to widen the celeryworker `watchfiles --ignore-paths` (in `compose/local/django/celery/worker/start` and the `local.yml` command pointer) so editor / Playwright artifact writes don't hot-reload the worker mid-task. Also adds `data-testid="document-card"` (+ `data-processing` on the `/documents`-view variants) to `frontend/src/views/Documents.tsx` and `data-testid="document-card"` to `frontend/src/components/documents/ModernDocumentItem.tsx`, so tests can poll for the `backendLock` UI signal without depending on hover-only action menus. Cards are matched by `[data-testid="document-card"]` filtered with the visible title text — the standard Playwright pattern.
1215
- **Mypy: type analyzer, shared, agents, badges, worker_uploads; introduce shared protocols** (Issue #1335): Brought the five smaller, interface-rich target packages over the ≥70% return-annotation bar called for by the issue and seeded `opencontractserver/types/protocols.py` with the four protocols requested in the scope:
1316
- `VectorStoreProtocol` — minimum surface (`search` / `async_search`) implemented by `CoreAnnotationVectorStore` (`opencontractserver/llms/vector_stores/core_vector_stores.py`); imported and re-exported from that module so consumers can annotate against the protocol rather than the concrete dataclass.
1417
- `PipelineComponentProtocol` — `title` / `description` / `author` / `dependencies` surface that the pipeline registry duck-types against; imported from `opencontractserver/pipeline/base/base_component.py` so any future parser/embedder/thumbnailer registered outside the inheritance hierarchy still type-checks against the same contract.

compose/local/django/celery/worker/start

100644100755
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ set -o errexit
44
set -o nounset
55

66

7-
watchfiles --target-type command "celery -A config.celery_app worker -l INFO --concurrency=1 -Q celery,worker_uploads"
7+
watchfiles --ignore-paths frontend,node_modules,.git,__pycache__,.mypy_cache,.pytest_cache,media,staticfiles,docs,.claude,.playwright-mcp,playwright-report-e2e,test-results --target-type command "celery -A config.celery_app worker -l INFO --concurrency=1 -Q celery,worker_uploads"

docs/development/e2e_vcr.md

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Recording and replaying LLM cassettes for the extract E2E spec
2+
3+
The E2E Playwright spec at `frontend/tests/e2e/extract-pdf-workflow.spec.ts` drives the full extract pipeline including a real LLM call. To keep CI fast, deterministic, and free of LLM-API spend, the call can be wrapped in a [VCR.py](https://vcrpy.readthedocs.io/) cassette so the recorded HTTP interaction is replayed instead of hitting the provider.
4+
5+
## How it works
6+
7+
`opencontractserver/utils/vcr_replay.py` exposes a `maybe_vcr_cassette()` context manager that is wrapped around the agent invocation in `opencontractserver/tasks/data_extract_tasks.py::doc_extract_query_task`. When the relevant env vars are unset, the manager is a no-op — production behavior is unchanged.
8+
9+
When the env vars are set:
10+
- `OC_LLM_VCR_MODE=record` — every LLM HTTP call is captured to the cassette file (overwriting any existing one).
11+
- `OC_LLM_VCR_MODE=once` — record if missing, replay if present.
12+
- `OC_LLM_VCR_MODE=replay` — replay only; an unmatched request raises `CannotOverwriteExistingCassetteException` and bubbles up as an extraction failure.
13+
14+
The cassette path is supplied via `OC_LLM_VCR_CASSETTE` (filesystem path, must be visible inside the celery worker container).
15+
16+
A custom matcher strips volatile fields from request bodies so a cassette recorded against one DB (with timestamp `1777504812606` and document PK `56`, etc.) replays cleanly against another (with different IDs and timestamps). The patterns are in `_VOLATILE_PATTERNS` at the top of `vcr_replay.py` — extend them as new volatile values surface.
17+
18+
## Recording a fresh cassette
19+
20+
1. Bring up the local stack with the VCR env vars set so the worker container picks them up:
21+
22+
```bash
23+
OC_LLM_VCR_MODE=record \
24+
OC_LLM_VCR_CASSETTE=/app/opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml \
25+
docker compose -f local.yml up -d --no-deps --force-recreate celeryworker
26+
```
27+
28+
2. Run the E2E spec end-to-end against a real OpenAI key:
29+
30+
```bash
31+
cd frontend
32+
E2E_RUN_LLM_TESTS=true E2E_TEST_PASSWORD="<your superuser password>" \
33+
yarn test:e2e --grep "Extract PDF workflow" --reporter=list
34+
```
35+
36+
3. Verify the cassette landed under `opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml`. Commit it.
37+
38+
## Replaying
39+
40+
1. Bring up the worker in replay mode (no real key required — VCR intercepts every request to the LLM provider):
41+
42+
```bash
43+
OC_LLM_VCR_MODE=replay \
44+
OC_LLM_VCR_CASSETTE=/app/opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml \
45+
docker compose -f local.yml up -d --no-deps --force-recreate celeryworker
46+
```
47+
48+
2. Run the spec normally. Total runtime stays roughly the same (~1.6 min) — the bulk of the wall-clock time is parser ingest + Playwright interaction, not the LLM call, so replay does not noticeably speed up the spec.
49+
50+
To prove no real LLM call happens, replace `OPENAI_API_KEY` in `.envs/.local/.django` with a deliberately-fake string before re-running. The spec still passes.
51+
52+
## Debugging matcher failures
53+
54+
Set `OC_LLM_VCR_DEBUG=1` (also forwarded through `local.yml` to the worker). On every body-mismatch the matcher writes a JSON line with the first byte of difference + 200 chars of context to `/tmp/vcr-mismatch-<pid>.log` inside the container. Tail it to see exactly what's volatile and isn't being normalized:
55+
56+
```bash
57+
docker exec celeryworker bash -c 'tail -1 /tmp/vcr-mismatch-*.log | python3 -m json.tool'
58+
```
59+
60+
Add new patterns to `_VOLATILE_PATTERNS` until the matcher succeeds.
61+
62+
## Limitations / follow-ups
63+
64+
- **LlamaParse is not yet covered.** PDF ingest still calls `https://api.cloud.llamaindex.ai`. To run the spec in CI without any external network, `_LLM_HOSTS` in `vcr_replay.py` needs to grow to include the LlamaParse host AND the wrapper needs to be applied around `ingest_doc` too. That's a separate PR.
65+
- **Cassette goes stale on prompt changes.** Any change to the structured-extraction system prompt, the column query, or the tool schemas will produce a new request body and require re-recording.
66+
- **One cassette per spec.** The current cassette is named `e2e_extract_pdf_workflow/extract.yaml`. If you add a second LLM-using spec, give it its own cassette directory.
67+
68+
## Why VCR rather than `pydantic_ai.models.test.TestModel`
69+
70+
A `TestModel` would be cheaper (no HTTP at all) but it would not exercise the openai-SDK + httpx + pydantic-ai integration path. PR #1399's `failure_mode=no_final_response` classifier specifically targets that integration path — bypassing it with a pure-Python test double would mask the very class of bugs the spec exists to catch.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import baseConfig from "./playwright.config";
2+
import { defineConfig } from "@playwright/test";
3+
4+
/**
5+
* Playwright config override that forces video recording for every test.
6+
* Use only for capturing demo footage (`yarn playwright test -c playwright.video.config.ts ...`);
7+
* the default config keeps `video: "retain-on-failure"` to avoid bloat.
8+
*/
9+
export default defineConfig({
10+
...baseConfig,
11+
use: {
12+
...baseConfig.use,
13+
video: "on",
14+
viewport: { width: 1280, height: 800 },
15+
},
16+
});

frontend/src/components/documents/ModernDocumentItem.tsx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,6 +1286,7 @@ export const ModernDocumentItem: React.FC<ModernDocumentItemProps> = ({
12861286
<>
12871287
<CardContainer
12881288
ref={setNodeRef}
1289+
data-testid="document-card"
12891290
className={`${is_selected ? "is-selected" : ""} ${
12901291
isProcessing ? "backend-locked" : ""
12911292
} ${isFailed ? "failed" : ""} ${
@@ -1497,6 +1498,7 @@ export const ModernDocumentItem: React.FC<ModernDocumentItemProps> = ({
14971498
<>
14981499
<ListContainer
14991500
ref={setNodeRef}
1501+
data-testid="document-card"
15001502
className={`${is_selected ? "is-selected" : ""} ${
15011503
isProcessing ? "backend-locked" : ""
15021504
} ${isFailed ? "failed" : ""} ${isLongPressing ? "long-pressing" : ""}`}

frontend/src/views/Documents.tsx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,6 +1380,8 @@ export const Documents = () => {
13801380
key={doc.id}
13811381
role="button"
13821382
tabIndex={0}
1383+
data-testid="document-card"
1384+
data-processing={String(Boolean(doc.backendLock))}
13831385
aria-label={`Open document ${doc.title || "Untitled"}`}
13841386
$selected={selected_document_ids.includes(doc.id)}
13851387
onClick={() => handleDocumentClick(doc)}
@@ -1502,6 +1504,8 @@ export const Documents = () => {
15021504
key={doc.id}
15031505
role="row"
15041506
tabIndex={0}
1507+
data-testid="document-card"
1508+
data-processing={String(Boolean(doc.backendLock))}
15051509
aria-label={`Open document ${doc.title || "Untitled"}`}
15061510
$selected={selected_document_ids.includes(doc.id)}
15071511
onClick={() => handleDocumentClick(doc)}
@@ -1569,6 +1573,8 @@ export const Documents = () => {
15691573
key={doc.id}
15701574
role="listitem"
15711575
tabIndex={0}
1576+
data-testid="document-card"
1577+
data-processing={String(Boolean(doc.backendLock))}
15721578
aria-label={`Open document ${doc.title || "Untitled"}`}
15731579
$selected={selected_document_ids.includes(doc.id)}
15741580
onClick={() => handleDocumentClick(doc)}

0 commit comments

Comments
 (0)