Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
14 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Actor images build from the monorepo root (dockerContextDir in .actor/actor.json).
# Keep the context lean: ship sources + manifests + the pnpm lockfile only.
node_modules
**/node_modules
.git
.turbo
**/dist
**/storage
apify_storage
crawlee_storage
*.log
.DS_Store
.idea
.vscode
2 changes: 2 additions & 0 deletions .github/workflows/on-pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ jobs:
name: Build & Test
if: (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'docs:'))
runs-on: ubuntu-22.04
timeout-minutes: 30

steps:
- name: Checkout repository
Expand All @@ -38,6 +39,7 @@ jobs:
lint:
name: Lint
runs-on: ubuntu-22.04
timeout-minutes: 30
steps:
- name: Checkout repository
uses: actions/checkout@v6
Expand Down
62 changes: 47 additions & 15 deletions .github/workflows/release-generic-actors.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
name: Build generic Actors
description: Build generic Actors and push them to Apify

on:
workflow_dispatch:
Expand Down Expand Up @@ -34,6 +33,11 @@ on:
type: boolean
required: false
default: true
sitemap-scraper:
description: apify/sitemap-extractor
type: boolean
required: false
default: true
build-channel:
description: Build channel
type: choice
Expand All @@ -60,43 +64,68 @@ jobs:
matrix:
settings:
- actor: web-scraper
apify-actor: apify/web-scraper
stable-version: '3.0'
stable-build-tag: version-3
development-version: '0.0'
development-build-tag: development
should-build: ${{ github.event.inputs.web-scraper }}
- actor: cheerio-scraper
apify-actor: apify/cheerio-scraper
stable-version: '3.0'
stable-build-tag: version-3
development-version: '0.0'
development-build-tag: development
should-build: ${{ github.event.inputs.cheerio-scraper }}
- actor: playwright-scraper
apify-actor: apify/playwright-scraper
stable-version: '1.0'
stable-build-tag: version-1
development-version: '0.0'
development-build-tag: development
should-build: ${{ github.event.inputs.playwright-scraper }}
- actor: puppeteer-scraper
apify-actor: apify/puppeteer-scraper
stable-version: '3.0'
stable-build-tag: version-3
development-version: '0.0'
development-build-tag: development
should-build: ${{ github.event.inputs.puppeteer-scraper }}
- actor: jsdom-scraper
apify-actor: apify/jsdom-scraper
stable-version: '0.2'
stable-build-tag: version-0
development-version: '0.0'
development-build-tag: development
should-build: ${{ github.event.inputs.jsdom-scraper }}
- actor: camoufox-scraper
apify-actor: apify/camoufox-scraper
stable-version: '3.0'
stable-build-tag: version-3
development-version: '0.0'
development-build-tag: development
should-build: ${{ github.event.inputs.camoufox-scraper }}
- actor: sitemap-scraper
apify-actor: apify/sitemap-extractor
stable-version: '0.1'
stable-build-tag: latest
development-version: '0.0'
development-build-tag: development
should-build: ${{ github.event.inputs.sitemap-scraper }}
steps:
- uses: actions/checkout@v6
- name: Check out current SHA
if: matrix.settings.should-build == 'true'
uses: actions/checkout@v6

- name: Set up Node.js
if: matrix.settings.should-build == 'true'
uses: actions/setup-node@v6
with:
node-version: 24

- name: Install pnpm and dependencies
if: matrix.settings.should-build == 'true'
uses: apify/actions/pnpm-install@v1.1.2

- name: Log matrix
run: |
Expand All @@ -108,21 +137,24 @@ jobs:
if: matrix.settings.should-build == 'true'
run: |
if [ "${{ github.event.inputs.build-channel }}" = "stable" ]; then
echo "version=${{ matrix.settings.stable-version }}" >> $GITHUB_ENV
echo "build-tag=${{ matrix.settings.stable-build-tag }}" >> $GITHUB_ENV
echo "version=${{ matrix.settings.stable-version }}" >> "$GITHUB_ENV"
echo "build-tag=${{ matrix.settings.stable-build-tag }}" >> "$GITHUB_ENV"
elif [ "${{ github.event.inputs.build-channel }}" = "development" ]; then
echo "version=${{ matrix.settings.development-version }}" >> $GITHUB_ENV
echo "build-tag=${{ matrix.settings.development-build-tag }}" >> $GITHUB_ENV
echo "version=${{ matrix.settings.development-version }}" >> "$GITHUB_ENV"
echo "build-tag=${{ matrix.settings.development-build-tag }}" >> "$GITHUB_ENV"
else
echo "version=${{ github.event.inputs.version }}" >> $GITHUB_ENV
echo "build-tag=${{ github.event.inputs.build-tag }}" >> $GITHUB_ENV
echo "version=${{ github.event.inputs.version }}" >> "$GITHUB_ENV"
echo "build-tag=${{ github.event.inputs.build-tag }}" >> "$GITHUB_ENV"
fi

- name: Build ${{ matrix.settings.actor }}
uses: apify/push-actor-action@master
- name: Build ${{ matrix.settings.actor }} on Apify (Git source)
if: matrix.settings.should-build == 'true'
with:
token: ${{ secrets.APIFY_ACTOR_BUILD_TOKEN }}
build-tag: ${{ env.build-tag }}
version: ${{ env.version }}
working-directory: packages/actor-scraper/${{ matrix.settings.actor }}
env:
APIFY_TOKEN: ${{ secrets.APIFY_ACTOR_BUILD_TOKEN }}
APIFY_ACTOR: ${{ matrix.settings.apify-actor }}
BUILD_VERSION: ${{ env.version }}
BUILD_TAG: ${{ env.build-tag }}
APIFY_RELEASE_BUILD_TIMEOUT_SECS: "900"
run: |
set -euo pipefail
node scripts/trigger-apify-build.mjs
6 changes: 6 additions & 0 deletions .github/workflows/test-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ jobs:
build_and_test:
name: Build & Test
runs-on: ubuntu-22.04
timeout-minutes: 60

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this won't fix the e2e tests that are timing out. The timeouts are likely caused by microsoft/playwright#40724 (bumping Playwright or reverting the Node version might help).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not the fix 😄 — it's just a safeguard for the future (if something goes wrong, failing is better than getting stuck). The actual fix is the browser-install step.


steps:
- name: Cancel Workflow Action
Expand Down Expand Up @@ -40,6 +41,11 @@ jobs:
- name: Install pnpm and dependencies
uses: apify/actions/pnpm-install@v1.1.2

- name: Install browsers
run: |
pnpm exec puppeteer browsers install chrome
pnpm exec playwright install chromium

- name: Build
run: pnpm ci:build

Expand Down
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ Please provide steps to reproduce if you found a bug or ideally fork the reposit

Before you submit your pull request, consider the following guidelines:

- Search [GitHub](https://github.com/apify/apify-sdk-js/pulls) for an open or closed PR that relates to your submission. You don't want to duplicate effort.
- Search [GitHub](https://github.com/apify/actor-scraper/pulls) for an open or closed PR that relates to your submission. You don't want to duplicate effort.

- Fork the project and install NPM dependencies.
- Fork the project and install dependencies with pnpm.

- Run tests before you start working, to be sure they all pass and your setup is working correctly:

```sh
npm run test
pnpm test
```

- Be sure to **include appropriate test cases**. Tests help make it clear what the PR is fixing and also make sure the changes won't break over time.
Expand Down
16 changes: 8 additions & 8 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "apify-sdk-js",
"name": "actor-scraper",
"private": true,
"description": "Apify SDK monorepo",
"description": "Apify generic scrapers monorepo",
"keywords": [
"apify",
"headless",
Expand All @@ -23,12 +23,12 @@
"license": "Apache-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/apify/apify-ts"
"url": "git+https://github.com/apify/actor-scraper.git"
},
"bugs": {
"url": "https://github.com/apify/apify-ts/issues"
"url": "https://github.com/apify/actor-scraper/issues"
},
"homepage": "https://sdk.apify.com",
"homepage": "https://github.com/apify/actor-scraper",
"scripts": {
"prepare": "husky",
"prepublishOnly": "turbo run copy",
Expand Down Expand Up @@ -61,7 +61,7 @@
"@apify/tsconfig": "^0.1.2",
"@commitlint/config-conventional": "^20.0.0",
"@isaacs/brace-expansion": "^5.0.1",
"@playwright/browser-chromium": "^1.46.0",
"@playwright/browser-chromium": "^1.61.0",
"@types/content-type": "^1.1.8",
"@types/fs-extra": "^11.0.4",
"@types/node": "^24.0.0",
Expand All @@ -81,8 +81,8 @@
"oxfmt": "0.46.0",
"oxlint": "1.62.0",
"oxlint-tsgolint": "0.22.0",
"playwright": "^1.46.0",
"puppeteer": "^24.0.0",
"playwright": "^1.61.0",
"puppeteer": "25.2.0",
"rimraf": "^6.0.1",
"tsx": "^4.16.5",
"turbo": "2.9.1",
Expand Down
2 changes: 2 additions & 0 deletions packages/actor-scraper/camoufox-scraper/.actor/actor.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"name": "camoufox-scraper",
"version": "0.1",
"buildTag": "latest",
"dockerContextDir": "../../../..",
"dockerfile": "../Dockerfile",
Comment on lines +6 to +7

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same in every actor

This points the Docker build context at the monorepo root so the build can use the root pnpm-lock.yaml for a frozen, deterministic install. An apify push of a single folder can't do this (there is no root lockfile in the upload), which is why builds run from the Git source.

inspired by actor-monorepo-example

"storages": {
"dataset": {
"actorSpecification": 1,
Expand Down
12 changes: 0 additions & 12 deletions packages/actor-scraper/camoufox-scraper/.dockerignore

This file was deleted.

52 changes: 33 additions & 19 deletions packages/actor-scraper/camoufox-scraper/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,31 +1,45 @@
FROM apify/actor-node-playwright-camoufox:22 AS builder
# Build context is the monorepo root (dockerContextDir in .actor/actor.json), so the build uses the
# root pnpm-lock.yaml for a deterministic install.
#
# The base is pinned by digest (reproducible Camoufox binary + OS deps). The actor pins playwright and
# camoufox-js to the versions this base ships (24-1.59.1 -> playwright 1.59.1, camoufox-js 0.11.1) and
# bundles them, so the bundled drivers match the Camoufox binary they drive. The binary itself lives in
# ~/.cache/camoufox (outside node_modules) and stays in the base. Tag 24-1.59.1 is the newest Camoufox
# base that survives page JS errors (24-1.60.0 crashes the process on them).
FROM apify/actor-node-playwright-camoufox:24-1.59.1@sha256:eadc96fa9492284eb45ef70b6b91c841fae7f142d25a22ae2887a21bb78b3469 AS builder

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same for each browser. Base is pinned by sha256 digest, not just a tag, on purpose: tags are mutable, so a bare tag wouldn't give a reproducible browser binary. Since the whole runtime browser stack comes from the base (see the driver note), the digest is what actually makes the browser deterministic. Renovate bumps the digest.

Also, this is deliberately not the newest base. I've tested the latest camoufox base (24-1.60.0, Camoufox 150): it crashes the whole process on any page that emits a JS error (playwright's Firefox pageError handler). I also tried the latest Playwright (1.61.x), which won't even launch this Camoufox binary (Browser.setDefaultViewport protocol error). 24-1.59.1 is the newest camoufox base that survives page errors.


COPY --chown=myuser package*.json ./
# The browser base runs as `myuser`; the build stage needs root for corepack + a writable workdir.
USER root
WORKDIR /app

RUN npm install --include=dev --audit=false
RUN corepack enable

COPY --chown=myuser . ./
# Browsers ship with the base image; never let an npm postinstall download them.
ENV PUPPETEER_SKIP_DOWNLOAD=true \
PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1

RUN npm run build
# Whole workspace (root .dockerignore keeps node_modules/.git/dist out of the context).
COPY . ./

FROM apify/actor-node-playwright-camoufox:22
# Deterministic install (frozen lockfile, honors minimumReleaseAge) of the actor + its workspace deps.
RUN pnpm install --frozen-lockfile --filter actor-camoufox-scraper...

COPY --from=builder --chown=myuser /home/myuser/dist ./dist
# Build the actor and its workspace dependency @apify/scraper-tools.
RUN pnpm --filter actor-camoufox-scraper... build

COPY --chown=myuser package*.json ./
# Self-contained production bundle; inject-workspace-packages copies the built @apify/scraper-tools
# into node_modules instead of symlinking it. The native better-sqlite3 binding (a camoufox-js dep) is
# compiled here against the base's Node 24, and the runtime stage uses that same base, so it matches.
RUN pnpm --config.inject-workspace-packages=true --filter actor-camoufox-scraper deploy --prod /deploy

RUN npm --quiet set progress=false \
&& npm install --omit=dev \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
FROM apify/actor-node-playwright-camoufox:24-1.59.1@sha256:eadc96fa9492284eb45ef70b6b91c841fae7f142d25a22ae2887a21bb78b3469

COPY --chown=myuser . ./
# Base WORKDIR is /home/myuser and ships the Xvfb entrypoint plus the Camoufox binary in
# ~/.cache/camoufox (outside node_modules). Replace only the template node_modules with the lean
# production bundle (which carries the matching playwright + camoufox-js); the binary stays in the base.
RUN rm -rf node_modules
COPY --from=builder --chown=myuser /deploy ./

ENV APIFY_DISABLE_OUTDATED_WARNING=1

CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
CMD ["node", "dist/main.js"]
10 changes: 5 additions & 5 deletions packages/actor-scraper/camoufox-scraper/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
"@crawlee/playwright": "^3.14.1",
"@crawlee/utils": "^3.14.1",
"apify": "^3.2.6",
"camoufox-js": "^0.9.0",
"camoufox-js": "0.11.1",
"idcac-playwright": "^0.2.0",
"playwright": "*"
"playwright": "1.59.1",
"playwright-core": "1.59.1"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
Expand All @@ -22,19 +23,18 @@
},
"scripts": {
"start": "pnpm start:dev",
"start:prod": "node dist/main.js",
"start:dev": "tsx src/main.ts",
"build": "tsc"
},
"repository": {
"type": "git",
"url": "https://github.com/apify/apify-sdk-js"
"url": "https://github.com/apify/actor-scraper"
},
"author": {
"name": "Apify Technologies",
"email": "support@apify.com",
"url": "https://apify.com"
},
"license": "Apache-2.0",
"homepage": "https://github.com/apify/apify-sdk-js"
"homepage": "https://github.com/apify/actor-scraper"
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import type { ApifyEnv } from 'apify';
import { Actor } from 'apify';
import { launchOptions } from 'camoufox-js';
import { getInjectableScript } from 'idcac-playwright';
import type { Response } from 'playwright';
import { firefox } from 'playwright';

import type { CrawlerSetupOptions, RequestMetadata } from '@apify/scraper-tools';
Expand Down Expand Up @@ -201,7 +200,11 @@ export class CrawlerSetup implements CrawlerSetupOptions {
...this.input,
humanize: this.input.humanize ? Number(this.input.humanize) : 0,
}),
} as PlaywrightLaunchContext,
// `firefox` and @crawlee/playwright's PlaywrightLaunchContext resolve to two different
// playwright-core copies (camoufox pins playwright 1.59.1 to match its base image,
// while the rest of the workspace is on 1.61.0), so the BrowserType identities don't
// line up. This is a workspace dual-install, not a camoufox-js/crawlee issue.
} as unknown as PlaywrightLaunchContext,
useSessionPool: true,
persistCookiesPerSession: true,
sessionPoolOptions: {
Expand Down Expand Up @@ -392,7 +395,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {

private async _handleResult(
request: Request,
response?: Response,
response?: PlaywrightCrawlingContext['response'],
pageFunctionResult?: Dictionary,
isError?: boolean,
) {
Expand Down
Loading
Loading