name: Frontend E2E Extract Pipeline (VCR)

# Drives the full extract pipeline end-to-end via Playwright:
# login → upload two PDFs → ingest+embed → create extract → run → CSV.
# The LLM call inside `doc_extract_query_task` is wrapped in a VCR.py
# cassette so no OpenAI traffic is generated. See:
# docs/development/e2e_vcr.md
#
# *** CURRENTLY MANUAL-TRIGGER ONLY ***
#
# This workflow is parked behind `workflow_dispatch` because LlamaParse —
# the default PDF parser used during ingest — is NOT yet covered by VCR.
# Until `_LLM_HOSTS` in `opencontractserver/utils/vcr_replay.py` is
# extended to include `api.cloud.llamaindex.ai` AND the cassette is
# re-recorded to capture the parse-and-poll calls, this workflow would
# call LlamaParse for real on every CI run. Triggering it manually is
# OK for spot-checks (LlamaParse calls cost cents per doc); enabling it
# on every PR is not.
#
# To enable on PRs: replace the `on:` block with the standard
# pull_request paths filter (see frontend-e2e.yml for the template).

env:
  DOCKER_BUILDKIT: 1
  COMPOSE_DOCKER_CLI_BUILD: 1

defaults:
  run:
    working-directory: ./

on:
  workflow_dispatch:

concurrency:
  group: frontend-e2e-extract-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  e2e-extract:
    name: Extract pipeline (PDF upload → run → CSV)
    runs-on: ubuntu-latest
    timeout-minutes: 30
    permissions:
      contents: read

    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
          node-version: "20"

      - name: Install Yarn
        run: npm install -g yarn

      - name: Install frontend dependencies
        working-directory: ./frontend
        run: yarn install --frozen-lockfile

      - name: Install Playwright browsers
        working-directory: ./frontend
        run: yarn playwright install --with-deps chromium

      # ────────────────────────────────────────────────────────────────
      # Bring up the full local stack (postgres, redis, embedder,
      # docling-parser, django, celeryworker). We use `local.yml` rather
      # than `test.yml` because the extract pipeline needs a celery
      # worker, and `local.yml` is the only compose file that ships one.
      #
      # The cassette replays every OpenAI call so OPENAI_API_KEY can be
      # anything. LLAMAPARSE_API_KEY is still required (real calls,
      # see file-level note above).
      # ────────────────────────────────────────────────────────────────
      - name: Build django image
        run: docker compose -f local.yml build django

      - name: Start backend stack with VCR replay
        env:
          OC_LLM_VCR_MODE: replay
          OC_LLM_VCR_CASSETTE: /app/opencontractserver/tests/fixtures/cassettes/e2e_extract_pdf_workflow/extract.yaml
          # CI provisions a fake OpenAI key — the cassette intercepts
          # every request so this is never sent. We pin it to a clearly
          # bogus value to make accidental real calls fail loudly.
          OPENAI_API_KEY: sk-FAKE-VCR-CI-NOT-REAL
          # LlamaParse is NOT yet under VCR; the workflow secret is
          # required for real PDF parsing. See file-level note.
          LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
        run: |
          docker compose -f local.yml up -d
          echo "Waiting for django to become healthy…"
          # Resolve the container id via compose instead of hardcoding
          # `opencontracts-django-1` — the generated container name
          # depends on the compose project name (derived from the
          # checkout directory), so the hardcoded form is fragile.
          cid=$(docker compose -f local.yml ps -q django)
          for i in {1..60}; do
            state=$(docker inspect -f '{{.State.Health.Status}}' \
              "$cid" 2>/dev/null || echo "starting")
            if [ "$state" = "healthy" ]; then echo "django healthy"; break; fi
            if [ "$i" = "60" ]; then
              echo "django did not become healthy in time"
              docker compose -f local.yml logs django | tail -100
              exit 1
            fi
            sleep 2
          done

      - name: Run Playwright extract spec
        working-directory: ./frontend
        env:
          CI: "true"
          E2E_RUN_LLM_TESTS: "true"
          E2E_TEST_USERNAME: admin
          # The local stack creates the superuser from
          # DJANGO_SUPERUSER_PASSWORD in `.envs/.local/.django`. CI must
          # supply the same value to the playwright fixture.
          E2E_TEST_PASSWORD: ${{ secrets.E2E_LOCAL_SUPERUSER_PASSWORD }}
        run: yarn playwright test --grep "Extract PDF workflow" --reporter=list

      - name: Capture backend logs on failure
        if: failure()
        run: |
          mkdir -p artifacts
          docker compose -f local.yml ps > artifacts/docker-ps.txt || true
          docker compose -f local.yml logs --no-color > artifacts/docker-compose-logs.txt || true

      - name: Upload backend logs on failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: e2e-extract-backend-logs
          path: artifacts/

      - name: Upload Playwright HTML report on failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: e2e-extract-playwright-report
          path: frontend/playwright-report-e2e/
          if-no-files-found: ignore

      - name: Tear down backend stack
        if: always()
        run: docker compose -f local.yml down -v