diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml new file mode 100644 index 000000000000..7fdded9d6a84 --- /dev/null +++ b/.github/workflows/test-integration.yml @@ -0,0 +1,70 @@ +name: Integration tests + +on: + pull_request: + branches: [ master, v4 ] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + remote-browser: + name: Remote browser integration + runs-on: ubuntu-22.04 + + # Side-services provide the remote browser and a deterministic HTTP target. + services: + browserless: + image: ghcr.io/browserless/chromium:latest + ports: + - 3000:3000 + env: + CONCURRENT: 4 + options: >- + --health-cmd "wget -qO- http://localhost:3000/json/version || exit 1" + --health-interval 5s + --health-timeout 5s + --health-retries 12 + httpbin: + # kennethreitz/httpbin is python:3.6-slim and ships without wget/curl, + # so no Docker HEALTHCHECK — httpbin starts in <1s and the first test + # request will surface any real failure. + image: kennethreitz/httpbin:latest + ports: + - 8080:80 + + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Use Node.js 24 + uses: actions/setup-node@v6 + with: + node-version: 24 + package-manager-cache: false + + - name: Turbo cache + uses: actions/cache@v5 + with: + path: .turbo + key: turbo-${{ github.job }}-${{ github.ref_name }}-${{ github.sha }} + restore-keys: | + turbo-${{ github.job }}-${{ github.ref_name }}- + + - uses: apify/workflows/pnpm-install@main + + # No `playwright install` — these tests connect to remote Browserless + # over CDP and never launch a local browser binary. 
+ + - name: Build + run: pnpm ci:build + + - name: Run integration tests + run: pnpm test:integration + env: + BROWSERLESS_URL: http://localhost:3000 + HTTPBIN_URL: http://httpbin + CRAWLEE_DIFFICULT_TESTS: 1 + RETRY_TESTS: 1 diff --git a/docs/guides/remote_browser.mdx b/docs/guides/remote_browser.mdx new file mode 100644 index 000000000000..f02d41be4b64 --- /dev/null +++ b/docs/guides/remote_browser.mdx @@ -0,0 +1,70 @@ +--- +id: remote-browser +title: "Remote browser services" +sidebar_label: "Remote browsers" +description: Connect Crawlee crawlers to remote browser services like Browserbase, Browserless, or Steel. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import RemoteBrowserConfigSource from '!!raw-loader!./remote_browser_config.ts'; +import RemoteBrowserProviderSource from '!!raw-loader!./remote_browser_provider.ts'; +import RemoteBrowserPuppeteerSource from '!!raw-loader!./remote_browser_puppeteer.ts'; + +Instead of launching a local browser, Crawlee can connect to a remote browser service like [Browserbase](https://browserbase.com/), [Browserless](https://browserless.io/), [Steel](https://steel.dev/), or any service that exposes a WebSocket/CDP endpoint. The crawler manages session rotation and the request lifecycle the same way it does locally — only the browser itself runs elsewhere. + +Use this when you need IPs in specific regions, want to offload CPU/memory from your runner, or need stealth features the service provides. + +## How it works + +Set the crawler's `remoteBrowser` option with the connection details. The crawler builds a `RemoteBrowserPool` around its own browser plugin, so the connection is always for the matching browser — there's no plugin to construct and no way to mismatch the pool with the crawler. 
The pool (an `IBrowserPool` wrapping the regular `BrowserPool`) owns everything remote: resolving the endpoint, releasing sessions when browsers close, and capping how many remote browsers run at once. + +## Basic usage + +The simplest form is a static connection URL. Use this when the service exposes a single endpoint and doesn't need per-session setup. + +{RemoteBrowserConfigSource} + +`endpoint` can also be a function returning `{ url, context }`, called once per browser launch. Pair it with a `release` callback (it receives the `context`) to clean up sessions on the service side when the browser closes, crashes, or the pool is destroyed. + +`maxOpenBrowsers` caps the number of concurrent remote browsers — set it to the service's concurrent-session limit to avoid 429 errors. The pool enforces it inside `newPage()`, which waits for a free slot rather than overshooting. + +### Self-hosted + +Some services ship a Docker image you can run locally or on your own infrastructure. For example, [Browserless](https://www.browserless.io/) has an open-source Chromium image: + +```bash +docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium +``` + +Point the crawler at the local endpoint with `remoteBrowser: { endpoint: 'ws://localhost:3000' }`. + +## Custom provider + +For services with a session-create / session-release lifecycle, extend `RemoteBrowserProvider` and pass the instance as `remoteBrowser.endpoint`. `connect()` runs once per browser launch and returns the connection URL plus an optional `context` object passed back to `release()`. `maxOpenBrowsers` set on the provider is adopted by the pool. + +{RemoteBrowserProviderSource} + +## Puppeteer + +`PuppeteerCrawler` works the same way — pass the same `remoteBrowser` option; there is no plugin to construct. 
Puppeteer connects over CDP: + +{RemoteBrowserPuppeteerSource} + +For Playwright you can choose the protocol via the `remoteBrowser.connection.protocol` option: `'cdp'` (default, `connectOverCDP()`) or `'playwright'` (`connect()`, Playwright's own WebSocket protocol). + +## Sharing a pool across crawlers + +`remoteBrowser` builds a pool the crawler owns and tears down. To share one remote pool across multiple crawlers, construct a `RemoteBrowserPool` yourself and pass it as the `browserPool` option instead — a pool supplied that way is never destroyed by the crawler, so you control its lifecycle. Use `remoteBrowser` *or* `browserPool`, not both. + +## Limitations + +- **`headless` and `launchOptions` don't apply.** The remote service controls headless mode and browser flags; configure them on the service side. +- **`useIncognitoPages` is forced to `true`** for Playwright remote connections — `connect()` / `connectOverCDP()` don't accept persistent contexts. For state shared across requests, use the `SessionPool`. +- **`userDataDir` has no effect** — there's no local profile when the browser runs remotely. Use the service's persistence API (e.g. Browserbase Contexts, Steel Profiles). + +## Further reading + +- `RemoteBrowserPool` API reference +- `RemoteBrowserProvider` API reference diff --git a/docs/guides/remote_browser_config.ts b/docs/guides/remote_browser_config.ts new file mode 100644 index 000000000000..41f4e0542fe8 --- /dev/null +++ b/docs/guides/remote_browser_config.ts @@ -0,0 +1,19 @@ +import { PlaywrightCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN!; + +const crawler = new PlaywrightCrawler({ + // Connect to a remote browser instead of launching locally. The crawler builds the right + // pool for its browser — you only supply the connection details. + remoteBrowser: { + endpoint: `wss://production-sfo.browserless.io?token=${token}`, + // Optional — respect the service's concurrent session limit. 
+ maxOpenBrowsers: 5, + }, + async requestHandler({ page, request, log }) { + const title = await page.title(); + log.info(`${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run(['https://crawlee.dev']); diff --git a/docs/guides/remote_browser_provider.ts b/docs/guides/remote_browser_provider.ts new file mode 100644 index 000000000000..45594d0fe4f4 --- /dev/null +++ b/docs/guides/remote_browser_provider.ts @@ -0,0 +1,46 @@ +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.BROWSERBASE_API_KEY!; +const projectId = process.env.BROWSERBASE_PROJECT_ID!; + +class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { + // Respect the service's concurrent session limit to avoid 429s. + override maxOpenBrowsers = 5; + + async connect() { + const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ projectId }), + }); + + if (!response.ok) { + throw new Error(`Failed to create session: ${response.status} ${response.statusText}`); + } + + const session = (await response.json()) as { id: string; connectUrl: string }; + return { url: session.connectUrl, context: { id: session.id } }; + } + + override async release({ id }: { id: string }) { + await fetch(`https://api.browserbase.com/v1/sessions/${id}`, { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ status: 'REQUEST_RELEASE' }), + }); + } +} + +const crawler = new PlaywrightCrawler({ + // Pass the provider as the `endpoint`; the crawler's pool calls connect()/release() per browser. 
+ remoteBrowser: { + endpoint: new BrowserbaseProvider(), + }, + async requestHandler({ page, request, log }) { + const title = await page.title(); + log.info(`${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run(['https://crawlee.dev']); diff --git a/docs/guides/remote_browser_puppeteer.ts b/docs/guides/remote_browser_puppeteer.ts new file mode 100644 index 000000000000..2bfc14be3d65 --- /dev/null +++ b/docs/guides/remote_browser_puppeteer.ts @@ -0,0 +1,16 @@ +import { PuppeteerCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN!; + +const crawler = new PuppeteerCrawler({ + // PuppeteerCrawler connects over CDP. Same `remoteBrowser` option, matching browser guaranteed. + remoteBrowser: { + endpoint: `wss://production-sfo.browserless.io?token=${token}`, + }, + async requestHandler({ page, request, log }) { + const title = await page.title(); + log.info(`${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run(['https://crawlee.dev']); diff --git a/package.json b/package.json index 531e1c73cc1f..85cdeb825206 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,9 @@ "ci:build": "turbo run build --filter=./packages/* --cache-dir=\".turbo\" && node ./scripts/typescript_fixes.mjs", "test": "vitest run --silent", "test:e2e": "node test/e2e/run.mjs", + "test:integration": "cross-env CRAWLEE_DIFFICULT_TESTS=1 vitest run --silent=true test/integration", + "test:integration:services:up": "docker network create crawlee-it 2>/dev/null; docker run -d --rm --name crawlee-it-browserless --network crawlee-it -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium && docker run -d --rm --name crawlee-it-httpbin --network crawlee-it --network-alias httpbin -p 8080:80 kennethreitz/httpbin", + "test:integration:services:down": "docker stop crawlee-it-browserless crawlee-it-httpbin; docker network rm crawlee-it 2>/dev/null; true", "test:full": "cross-env CRAWLEE_DIFFICULT_TESTS=1 vitest run --silent", "tsc-check-tests": "tsc 
--noEmit --project test/tsconfig.json", "coverage": "vitest --coverage", diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index e93c52b3fa20..3d4cc2ecb2aa 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -34,10 +34,11 @@ import type { BrowserPoolHooks, BrowserPoolOptions, CommonPage, + CrawlerRemoteBrowserOptions, InferBrowserPluginArray, LaunchContext, } from '@crawlee/browser-pool'; -import { BrowserPool } from '@crawlee/browser-pool'; +import { BrowserPool, RemoteBrowserPool } from '@crawlee/browser-pool'; import type { BatchAddRequestsResult, Cookie as CookieObject, IBrowserPool, ISession } from '@crawlee/types'; import type { RobotsTxtFile } from '@crawlee/utils'; import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils'; @@ -123,6 +124,19 @@ export interface BrowserCrawlerOptions< */ browserPool?: IBrowserPool; + /** + * Connect to a remote browser service (Browserbase, Browserless, Steel, …) instead of launching locally. + * + * The crawler builds a {@apilink RemoteBrowserPool} around its own browser plugin, so the connection is + * always for the right browser — there is no plugin to construct and no way to mismatch the pool with the + * crawler. Supply the connection details only: a static `endpoint` URL, a function returning one per launch, + * or a {@apilink RemoteBrowserProvider}. + * + * Mutually exclusive with `browserPool`. For sharing a remote pool across crawlers, construct a + * {@apilink RemoteBrowserPool} yourself and pass it as `browserPool` instead. + */ + remoteBrowser?: CrawlerRemoteBrowserOptions; + /** * Function that is called to process each request. 
* @@ -322,12 +336,11 @@ export abstract class BrowserCrawler< browserPool: IBrowserPool; /** - * Set when the crawler constructed its own {@apilink BrowserPool} (no `browserPool` option was provided). - * Holds the same instance as `browserPool`, but typed as the concrete class so the crawler can call - * lifecycle methods (`destroy`) that aren't part of {@apilink IBrowserPool}. A user-supplied pool is - * never owned and never torn down by the crawler. + * Set when the crawler constructed its own pool (a {@apilink BrowserPool}, or a {@apilink RemoteBrowserPool} + * built from the `remoteBrowser` option). Holds the same instance as `browserPool` but is the only reference + * the crawler tears down — a user-supplied `browserPool` is never owned and never destroyed by the crawler. */ - private ownedBrowserPool?: BrowserPool; + private ownedBrowserPool?: { destroy: () => Promise }; launchContext: BrowserLaunchContext; @@ -349,6 +362,7 @@ export abstract class BrowserCrawler< launchContext: ow.optional.object, headless: ow.optional.any(ow.boolean, ow.string), browserPool: ow.optional.object.validate(validators.browserPool), + remoteBrowser: ow.optional.object, browserPoolOptions: ow.optional.object, saveResponseCookies: ow.optional.boolean, proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration), @@ -368,6 +382,7 @@ export abstract class BrowserCrawler< saveResponseCookies = true, launchContext = {}, browserPool, + remoteBrowser, browserPoolOptions, preNavigationHooks = [], postNavigationHooks = [], @@ -422,6 +437,13 @@ export abstract class BrowserCrawler< this.saveResponseCookies = saveResponseCookies; + if (browserPool && remoteBrowser) { + throw new Error( + "Set at most one of 'browserPool' and 'remoteBrowser'. 
To share a remote pool across crawlers, " + + 'build a RemoteBrowserPool yourself and pass it as `browserPool`.', + ); + } + if (browserPool) { this.browserPool = browserPool; return; @@ -435,10 +457,25 @@ export abstract class BrowserCrawler< resolvedBrowserPoolOptions.useFingerprints = false; } - this.ownedBrowserPool = new BrowserPool({ + if (remoteBrowser) { + // The crawler already built the right plugin for its browser — hand it to a RemoteBrowserPool so the + // remote connection is always for the matching browser (no plugin to construct, no way to mismatch). + const { browserPlugins, ...remoteBrowserPoolOptions } = resolvedBrowserPoolOptions; + const remotePool = new RemoteBrowserPool({ + browserPlugins: browserPlugins as BrowserPlugin[], + ...remoteBrowser, + browserPoolOptions: remoteBrowserPoolOptions as any, + }); + this.ownedBrowserPool = remotePool; + this.browserPool = remotePool as IBrowserPool; + return; + } + + const ownedBrowserPool = new BrowserPool({ ...(resolvedBrowserPoolOptions as any), }); - this.browserPool = this.ownedBrowserPool as IBrowserPool; + this.ownedBrowserPool = ownedBrowserPool; + this.browserPool = ownedBrowserPool as IBrowserPool; } protected override buildContextPipeline(): ContextPipeline< diff --git a/packages/browser-pool/src/abstract-classes/browser-controller.ts b/packages/browser-pool/src/abstract-classes/browser-controller.ts index 684f65738a6b..7c5d407416b2 100644 --- a/packages/browser-pool/src/abstract-classes/browser-controller.ts +++ b/packages/browser-pool/src/abstract-classes/browser-controller.ts @@ -208,6 +208,7 @@ export abstract class BrowserController< this.log.debug(`Could not close browser.\nCause: ${(error as Error).message}`, { id: this.id }); } + await this._releaseRemoteBrowser(); this.emit(BROWSER_CONTROLLER_EVENTS.BROWSER_CLOSED, this); setTimeout(() => { @@ -225,9 +226,32 @@ export abstract class BrowserController< async kill(): Promise<void> { await this.hasBrowserPromise; await this._kill(); + await 
this._releaseRemoteBrowser(); this.emit(BROWSER_CONTROLLER_EVENTS.BROWSER_CLOSED, this); } + /** + * Releases the remote browser session (if this controller serves a remote browser) via the plugin's + * {@apilink RemoteConnection}. Safe to call multiple times — the token is cleared after the first call + * and the pool's registry also dedupes, so `release()` fires at most once across close()/kill(). + * A failing release is logged at debug level instead of thrown, so close()/kill() still emit BROWSER_CLOSED. + */ + private async _releaseRemoteBrowser(): Promise<void> { + const token = this.launchContext?._remoteToken; + if (token === undefined) return; + + // Clear so release only fires once (close() schedules kill() after a timeout). + this.launchContext._remoteToken = undefined; + + try { + await this.browserPlugin.remoteConnection?.release(token); + } catch (error) { + // Best-effort, like the _close() handling in close(): a rejected release must not prevent + // BROWSER_CLOSED from being emitted, otherwise the pool keeps counting this browser as open. + this.log.debug(`Could not release remote session.\nCause: ${(error as Error).message}`, { id: this.id }); + } + } + /** * Opens new browser page. * @ignore diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 8d090a8939b5..b08e3b55c38e 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -4,7 +4,8 @@ import merge from 'lodash.merge'; import type { LaunchContextOptions } from '../launch-context.js'; import { LaunchContext } from '../launch-context.js'; -import type { UnwrapPromise } from '../utils.js'; +import type { RemoteConnection, RemoteConnectionParameters } from '../remote-browser-pool.js'; +import { sanitizeEndpointForLog, type UnwrapPromise } from '../utils.js'; import type { BrowserController } from './browser-controller.js'; /** @@ -117,6 +118,18 @@ export abstract class BrowserPlugin< ignoreProxyCertificate?: boolean; + /** + * Set by {@apilink RemoteBrowserPool} when this plugin connects to a remote browser service instead of + * launching locally. Holds the bridge the plugin uses to resolve endpoints and release sessions; all + * remote-session policy lives in the pool, not here. 
+ * + * @internal + */ + remoteConnection?: RemoteConnection; + + /** Static connect() parameters for a remote connection (protocol, headers, …). @internal */ + remoteConnectionParameters?: RemoteConnectionParameters; + constructor(library: Library, options: BrowserPluginOptions = {}) { const { launchOptions = {} as LibraryOptions, @@ -137,6 +150,54 @@ export abstract class BrowserPlugin< this.ignoreProxyCertificate = ignoreProxyCertificate; } + /** + * Configures this plugin to connect to a remote browser using the given {@apilink RemoteConnection}. + * Called by {@apilink RemoteBrowserPool}; subclasses may override to apply library-specific defaults + * (e.g. forcing incognito pages). + * + * @internal + */ + useRemoteConnection(connection: RemoteConnection, parameters: RemoteConnectionParameters = {}): void { + this.remoteConnection = connection; + this.remoteConnectionParameters = parameters; + } + + /** + * Resolves a remote endpoint via the injected {@apilink RemoteConnection}, stores the session token on + * the launch context (so the controller can release it on close), and runs the library-specific `connect`. + * On failure the session is released and the error is wrapped in a {@apilink BrowserLaunchError}. + * + * Subclasses implement only the `connect` callback — the resolve / token / release / error-wrap scaffolding + * lives here so it stays identical across plugins. 
+ */ + protected async _connectToRemoteBrowser( + launchContext: LaunchContext, + connect: (url: string) => Promise, + ): Promise { + const connection = this.remoteConnection!; + + let url: string; + let token: number; + try { + ({ url, token } = await connection.resolve({ proxyUrl: launchContext.proxyUrl })); + } catch (cause) { + throw new BrowserLaunchError('Failed to resolve the remote browser endpoint.', { cause }); + } + + launchContext._remoteToken = token; + + try { + return await connect(url); + } catch (cause) { + await connection.release(token); + throw new BrowserLaunchError( + `Failed to connect to remote browser at "${sanitizeEndpointForLog(url)}". ` + + 'Check that the endpoint is reachable and accepts the configured protocol.', + { cause }, + ); + } + } + /** * Creates a `LaunchContext` with all the information needed * to launch a browser. Aside from library specific launch options, @@ -154,6 +215,7 @@ export abstract class BrowserPlugin< userDataDir = this.userDataDir, browserPerProxy = this.browserPerProxy, ignoreProxyCertificate = this.ignoreProxyCertificate, + isRemote = !!this.remoteConnection, } = options; return new LaunchContext({ @@ -165,6 +227,7 @@ export abstract class BrowserPlugin< userDataDir, browserPerProxy, ignoreProxyCertificate, + isRemote, }); } @@ -188,15 +251,16 @@ export abstract class BrowserPlugin< NewPageResult > = this.createLaunchContext(), ): Promise { + // launchOptions is only used by the local launch path below — remote connections ignore it. launchContext.launchOptions ??= {} as LibraryOptions; const { proxyUrl, launchOptions } = launchContext; - if (proxyUrl) { + if (proxyUrl && !launchContext.isRemote) { await this._addProxyToLaunchOptions(launchContext); } - if (this._isChromiumBasedBrowser(launchContext)) { + if (!launchContext.isRemote && this._isChromiumBasedBrowser(launchContext)) { // This will set the args for chromium based browsers to hide the webdriver. 
(launchOptions as Dictionary).args = this._mergeArgsToHideWebdriver(launchOptions!.args); // When User-Agent is not set, and we're using Chromium in headless mode, @@ -208,6 +272,10 @@ export abstract class BrowserPlugin< } } + if (launchContext.isRemote) { + this.log.info('Connecting to remote browser (skipping local proxy and webdriver stealth configuration).'); + } + return this._launch(launchContext); } diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index 7ec5e1b37ca8..2776913cbe63 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ b/packages/browser-pool/src/browser-pool.ts @@ -307,6 +307,7 @@ export class BrowserPool< { browserPlugins: BrowserPlugins; maxOpenPagesPerBrowser: number; + maxOpenBrowsers: number; retireBrowserAfterPageCount: number; operationTimeoutMillis: number; closeInactiveBrowserAfterMillis: number; @@ -399,6 +400,7 @@ export class BrowserPool< this.browserPlugins = browserPlugins as unknown as BrowserPlugins; this.maxOpenPagesPerBrowser = maxOpenPagesPerBrowser; + this.maxOpenBrowsers = Infinity; this.retireBrowserAfterPageCount = retireBrowserAfterPageCount; this.operationTimeoutMillis = operationTimeoutSecs * 1000; this.closeInactiveBrowserAfterMillis = closeInactiveBrowserAfterSecs * 1000; @@ -931,6 +933,28 @@ export class BrowserPool< } } + /** + * Returns `true` if the pool can accept a new browser launch without exceeding + * {@link BrowserPoolOptions.maxOpenBrowsers}. Counts starting, active, and retired browsers. + */ + hasFreeBrowserSlot(): boolean { + const total = + this.startingBrowserControllers.size + + this.activeBrowserControllers.size + + this.retiredBrowserControllers.size; + return total < this.maxOpenBrowsers; + } + + /** + * Returns `true` if any active browser has room for another page. 
+ */ + hasActiveBrowserWithFreeCapacity(): boolean { + for (const controller of this.activeBrowserControllers) { + if (controller.activePages < this.maxOpenPagesPerBrowser) return true; + } + return false; + } + private _initializeFingerprinting(): void { const { useFingerprintCache = true, fingerprintCacheSize = 10_000 } = this.fingerprintOptions; this.fingerprintGenerator = new FingerprintGenerator(this.fingerprintOptions.fingerprintGeneratorOptions); diff --git a/packages/browser-pool/src/fingerprinting/hooks.ts b/packages/browser-pool/src/fingerprinting/hooks.ts index 07096e416890..9a95752ad079 100644 --- a/packages/browser-pool/src/fingerprinting/hooks.ts +++ b/packages/browser-pool/src/fingerprinting/hooks.ts @@ -34,6 +34,9 @@ export function createFingerprintPreLaunchHook(browserPool: BrowserPool { + // Remote browsers may have their own fingerprinting — skip local fingerprint injection + if (launchContext.isRemote) return; + const { useIncognitoPages } = launchContext; const session = launchContext.session as ISession | undefined; const cacheKey = session?.id ?? 
launchContext.proxyUrl; @@ -75,6 +78,7 @@ export function createFingerprintPreLaunchHook(browserPool: BrowserPool { const { launchContext, browserPlugin } = browserController; + if (launchContext.isRemote) return; const { fingerprint } = launchContext.fingerprint!; if (launchContext.useIncognitoPages && browserPlugin instanceof PlaywrightPlugin && pageOptions) { @@ -93,6 +97,7 @@ export function createPrePageCreateHook() { export function createPostPageCreateHook(fingerprintInjector: FingerprintInjector) { return async (page: any, browserController: BrowserController): Promise => { const { browserPlugin, launchContext } = browserController; + if (launchContext.isRemote) return; const fingerprint = launchContext.fingerprint!; // TODO this will require refactoring, we should use common API instead of branching based on plugin type, diff --git a/packages/browser-pool/src/index.ts b/packages/browser-pool/src/index.ts index f8502a42dbe6..9f3b49455248 100644 --- a/packages/browser-pool/src/index.ts +++ b/packages/browser-pool/src/index.ts @@ -51,6 +51,16 @@ export type { export { BrowserPlugin, BrowserLaunchError, DEFAULT_USER_AGENT } from './abstract-classes/browser-plugin.js'; export type { LaunchContextOptions } from './launch-context.js'; export { LaunchContext } from './launch-context.js'; +export { RemoteBrowserProvider } from './remote-browser-provider.js'; +export { RemoteBrowserPool } from './remote-browser-pool.js'; +export type { + RemoteBrowserPoolOptions, + CrawlerRemoteBrowserOptions, + RemoteBrowserEndpoint, + ResolvedRemoteEndpoint, + RemoteConnection, + RemoteConnectionParameters, +} from './remote-browser-pool.js'; export type { InferBrowserPluginArray, UnwrapPromise } from './utils.js'; export { anonymizeProxySugar, type AnonymizeProxySugarOptions } from './anonymize-proxy.js'; export type { IBrowserPool, NewPageOptions } from '@crawlee/types'; diff --git a/packages/browser-pool/src/launch-context.ts b/packages/browser-pool/src/launch-context.ts index 
b433f8cb20f7..4bbec3236835 100644 --- a/packages/browser-pool/src/launch-context.ts +++ b/packages/browser-pool/src/launch-context.ts @@ -56,6 +56,12 @@ export interface LaunchContextOptions< * This is useful when using HTTPS proxies with self-signed certificates. */ ignoreProxyCertificate?: boolean; + /** + * Whether this launch context represents a connection to a remote browser + * rather than a locally launched one. + * @default false + */ + isRemote?: boolean; } export class LaunchContext< @@ -71,12 +77,21 @@ export class LaunchContext< useIncognitoPages: boolean; browserPerProxy?: boolean; userDataDir: string; + readonly isRemote: boolean; ignoreProxyCertificate?: boolean; private _proxyUrl?: string; private readonly _reservedFieldNames = [...Reflect.ownKeys(this), 'extend']; fingerprint?: BrowserFingerprintWithHeaders; + + /** + * Token identifying the remote browser session this context connected to, set by the plugin and read by + * the {@apilink BrowserController} to release the session on close. Only present for remote connections. + * @internal + */ + _remoteToken?: number; + [K: PropertyKey]: unknown; constructor(options: LaunchContextOptions) { @@ -89,6 +104,7 @@ export class LaunchContext< browserPerProxy, userDataDir = '', ignoreProxyCertificate, + isRemote, } = options; this.id = id; @@ -98,6 +114,7 @@ export class LaunchContext< this.useIncognitoPages = useIncognitoPages ?? false; this.userDataDir = userDataDir; this.ignoreProxyCertificate = ignoreProxyCertificate ?? false; + this.isRemote = isRemote ?? 
false; this._proxyUrl = proxyUrl; } diff --git a/packages/browser-pool/src/playwright/playwright-browser.ts b/packages/browser-pool/src/playwright/playwright-browser.ts index 2b94fce5421e..c1e2c65b8ef8 100644 --- a/packages/browser-pool/src/playwright/playwright-browser.ts +++ b/packages/browser-pool/src/playwright/playwright-browser.ts @@ -21,7 +21,6 @@ export class PlaywrightBrowser extends EventEmitter { const { browserContext, version } = options; this._browserContext = browserContext; - this._version = version; this._browserContext.once('close', () => { diff --git a/packages/browser-pool/src/playwright/playwright-controller.ts b/packages/browser-pool/src/playwright/playwright-controller.ts index 0f7a4c1bf539..aeb927ff5f13 100644 --- a/packages/browser-pool/src/playwright/playwright-controller.ts +++ b/packages/browser-pool/src/playwright/playwright-controller.ts @@ -45,6 +45,11 @@ export class PlaywrightController extends BrowserController< ...contextOptions, }; + // Remote browsers handle their own proxy — don't inject local proxy settings into context + if (this.launchContext.isRemote) { + delete contextOptions?.proxy; + } + if (contextOptions?.proxy) { const [anonymizedProxyUrl, closeProxy] = await anonymizeProxySugar( contextOptions.proxy.server, diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index a48cf1fedfec..bd71092fcf0f 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -7,6 +7,7 @@ import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { createProxyServerForContainers } from '../container-proxy-server.js'; import type { LaunchContext } from '../launch-context.js'; import { getLocalProxyAddress } from '../proxy-server.js'; +import type { RemoteConnection, RemoteConnectionParameters } from '../remote-browser-pool.js'; import type { SafeParameters } from 
'../utils.js'; import { PlaywrightBrowser as PlaywrightBrowserWithPersistentContext } from './playwright-browser.js'; import { PlaywrightController } from './playwright-controller.js'; @@ -19,7 +20,34 @@ export class PlaywrightPlugin extends BrowserPlugin< private _browserVersion?: string; _containerProxyServer?: Awaited>; + /** + * Playwright remote connections only support incognito pages — `connect()` / `connectOverCDP()` don't + * accept persistent contexts. Force it on (and inform the user) when wired for a remote connection. + */ + override useRemoteConnection(connection: RemoteConnection, parameters: RemoteConnectionParameters = {}): void { + super.useRemoteConnection(connection, parameters); + + if (!this.useIncognitoPages) { + this.log.info( + 'Remote Playwright connection — useIncognitoPages forced to true. ' + + 'Pages will not share cookies/storage between each other; use the SessionPool for shared state.', + ); + } + this.useIncognitoPages = true; + } + protected async _launch(launchContext: LaunchContext): Promise { + if (this.remoteConnection) { + return this._connectToRemoteBrowser(launchContext, async (url) => { + const connectOptions = (this.remoteConnectionParameters?.connectOptions ?? 
{}) as any; + if (this.remoteConnectionParameters?.protocol === 'playwright') { + this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); + return this.library.connect(url, connectOptions); + } + this.log.info('Connecting to remote browser via connectOverCDP.'); + return this.library.connectOverCDP(url, connectOptions); + }); + } const { launchOptions, useIncognitoPages, userDataDir, proxyUrl } = launchContext; let browser: PlaywrightBrowser; diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 91ea817d03a9..6e18b22b21cb 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -7,6 +7,7 @@ import type * as PuppeteerTypes from 'puppeteer'; import { BrowserPlugin } from '../abstract-classes/browser-plugin.js'; import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { LaunchContext } from '../launch-context.js'; +import type { RemoteConnection, RemoteConnectionParameters } from '../remote-browser-pool.js'; import { noop } from '../utils.js'; import type { PuppeteerNewPageOptions } from './puppeteer-controller.js'; import { PuppeteerController } from './puppeteer-controller.js'; @@ -19,6 +20,18 @@ export class PuppeteerPlugin extends BrowserPlugin< PuppeteerTypes.Browser, PuppeteerNewPageOptions > { + /** Pages share cookies/storage on the remote browser (Puppeteer defaults to non-incognito). 
*/ + override useRemoteConnection(connection: RemoteConnection, parameters: RemoteConnectionParameters = {}): void { + super.useRemoteConnection(connection, parameters); + + if (!this.useIncognitoPages) { + this.log.info( + 'Remote Puppeteer connection — pages will share cookies and storage on the remote ' + + 'browser instance (useIncognitoPages defaults to false).', + ); + } + } + protected async _launch( launchContext: LaunchContext< typeof Puppeteer, @@ -38,71 +51,74 @@ export class PuppeteerPlugin extends BrowserPlugin< // ignore } - const { - launchOptions, - userDataDir, - useIncognitoPages, - experimentalContainers, - proxyUrl, - ignoreProxyCertificate, - } = launchContext; - - if (experimentalContainers) { - throw new Error('Experimental containers are only available with Playwright'); - } - - launchOptions!.userDataDir = launchOptions!.userDataDir ?? userDataDir; - - if (launchOptions!.headless === false) { - if (Array.isArray(launchOptions!.args)) { - launchOptions!.args.push('--disable-site-isolation-trials'); - } else { - launchOptions!.args = ['--disable-site-isolation-trials']; - } - } - - if (launchOptions!.headless === true && oldPuppeteerVersion) { - launchOptions!.headless = 'new' as any; - } + const { useIncognitoPages, proxyUrl, ignoreProxyCertificate } = launchContext; let browser: PuppeteerTypes.Browser; - { - const [anonymizedProxyUrl, close] = await anonymizeProxySugar(proxyUrl, undefined, undefined, { - ignoreProxyCertificate: launchContext.ignoreProxyCertificate, + if (this.remoteConnection) { + browser = await this._connectToRemoteBrowser(launchContext, async (url) => { + const connectOptions = this.remoteConnectionParameters?.connectOptions ?? 
{}; + this.log.info('Connecting to remote browser via connect (CDP).'); + return this.library.connect({ ...connectOptions, browserWSEndpoint: url }); }); + } else { + const { launchOptions, userDataDir, experimentalContainers } = launchContext; + + if (experimentalContainers) { + throw new Error('Experimental containers are only available with Playwright'); + } - if (proxyUrl) { - const proxyArg = `${PROXY_SERVER_ARG}${anonymizedProxyUrl ?? proxyUrl}`; + launchOptions!.userDataDir = launchOptions!.userDataDir ?? userDataDir; + if (launchOptions!.headless === false) { if (Array.isArray(launchOptions!.args)) { - launchOptions!.args.push(proxyArg); + launchOptions!.args.push('--disable-site-isolation-trials'); } else { - launchOptions!.args = [proxyArg]; + launchOptions!.args = ['--disable-site-isolation-trials']; } } - try { - browser = await this.library.launch(launchOptions); + if (launchOptions!.headless === true && oldPuppeteerVersion) { + launchOptions!.headless = 'new' as any; + } - if (anonymizedProxyUrl) { - browser.on('disconnected', async () => { - await close(); - }); + { + const [anonymizedProxyUrl, close] = await anonymizeProxySugar(proxyUrl, undefined, undefined, { + ignoreProxyCertificate: launchContext.ignoreProxyCertificate, + }); + + if (proxyUrl) { + const proxyArg = `${PROXY_SERVER_ARG}${anonymizedProxyUrl ?? 
proxyUrl}`; + + if (Array.isArray(launchOptions!.args)) { + launchOptions!.args.push(proxyArg); + } else { + launchOptions!.args = [proxyArg]; + } + } + + try { + browser = await this.library.launch(launchOptions); + + if (anonymizedProxyUrl) { + browser.on('disconnected', async () => { + await close(); + }); + } + } catch (error: any) { + await close(); + + this._throwAugmentedLaunchError( + error, + launchContext.launchOptions?.executablePath, + '`apify/actor-node-puppeteer-chrome`', + "Try installing a browser, if it's missing, by running `npx @puppeteer/browsers install chromium --path [path]` and pointing `executablePath` to the downloaded executable (https://pptr.dev/browsers-api)", + ); } - } catch (error: any) { - await close(); - - this._throwAugmentedLaunchError( - error, - launchContext.launchOptions?.executablePath, - '`apify/actor-node-puppeteer-chrome`', - "Try installing a browser, if it's missing, by running `npx @puppeteer/browsers install chromium --path [path]` and pointing `executablePath` to the downloaded executable (https://pptr.dev/browsers-api)", - ); } } - browser.on('targetcreated', async (target: PuppeteerTypes.Target) => { + const targetCreatedHandler = async (target: PuppeteerTypes.Target) => { try { const page = await target.page(); @@ -115,7 +131,16 @@ export class PuppeteerPlugin extends BrowserPlugin< } catch (error: any) { this.log.exception(error, 'Failed to retrieve page from target.'); } - }); + }; + + browser.on('targetcreated', targetCreatedHandler); + + // Clean up the listener when a remote browser disconnects to prevent leaks + if (this.remoteConnection) { + browser.once('disconnected', () => { + browser.off('targetcreated', targetCreatedHandler); + }); + } const boundMethods = ( [ @@ -142,30 +167,35 @@ export class PuppeteerPlugin extends BrowserPlugin< let page: PuppeteerTypes.Page; if (useIncognitoPages) { - const [anonymizedProxyUrl, close] = await anonymizeProxySugar( - proxyUrl, - undefined, - undefined, - { 
ignoreProxyCertificate }, - ); + // Skip proxy setup for remote connections — proxy is managed by the remote service. + const effectiveProxyUrl = this.remoteConnection ? undefined : proxyUrl; + const [anonymizedProxyUrl, close] = effectiveProxyUrl + ? await anonymizeProxySugar(effectiveProxyUrl, undefined, undefined, { + ignoreProxyCertificate, + }) + : ([undefined, noop] as const); + + const proxyServer = anonymizedProxyUrl ?? effectiveProxyUrl; + const contextOptions = proxyServer ? { proxyServer } : {}; + const context = (await (browser as any)[method]( + contextOptions, + )) as PuppeteerTypes.BrowserContext; try { - const context = (await (browser as any)[method]({ - proxyServer: anonymizedProxyUrl ?? proxyUrl, - })) as PuppeteerTypes.BrowserContext; - page = await context.newPage(...args); - - if (anonymizedProxyUrl) { - page.on('close', async () => { - await close(); - }); - } } catch (error) { + await context.close().catch(noop); await close(); throw error; } + + page.once('close', async () => { + if (anonymizedProxyUrl) { + await close(); + } + await context.close().catch(noop); + }); } else { page = await boundMethods.newPage(...args); } diff --git a/packages/browser-pool/src/remote-browser-pool.ts b/packages/browser-pool/src/remote-browser-pool.ts new file mode 100644 index 000000000000..1d27353593cc --- /dev/null +++ b/packages/browser-pool/src/remote-browser-pool.ts @@ -0,0 +1,318 @@ +import { type CrawleeLogger, serviceLocator } from '@crawlee/core'; +import type { IBrowserPool, NewPageOptions, PageState } from '@crawlee/types'; + +import type { BrowserPlugin } from './abstract-classes/browser-plugin.js'; +import { BrowserPool } from './browser-pool.js'; +import type { BrowserPoolHooks, BrowserPoolOptions } from './browser-pool.js'; +import { BROWSER_POOL_EVENTS } from './events.js'; +import { RemoteBrowserProvider } from './remote-browser-provider.js'; + +/** + * The result of resolving a remote browser endpoint: the URL to connect to plus an optional 
opaque + * `context` object that is handed back to `release`. + */ +export interface ResolvedRemoteEndpoint { + /** The browser endpoint URL to connect to. */ + url: string; + /** Opaque metadata passed back to `release()` — e.g. session IDs, API tokens. */ + context?: Record; +} + +/** + * A remote browser endpoint: either a static URL string, or a function called once per browser launch + * that returns a URL (optionally with a `context` for `release`). + * + * The function receives the `proxyUrl` resolved by Crawlee's proxy configuration for the launch, so it + * can forward it to the remote service's proxy API. + */ +export type RemoteBrowserEndpoint = + | string + | ((options?: { proxyUrl?: string }) => string | ResolvedRemoteEndpoint | Promise); + +/** + * The bridge a {@apilink RemoteBrowserPool} injects into a {@apilink BrowserPlugin} so the plugin can + * connect to a remote browser without owning any remote-session policy. + * + * The plugin only knows how to make the library-specific `connect()` call; everything else — resolving + * the endpoint, calling the user's `release()`, and guaranteeing release fires at most once — lives in + * the pool. The plugin calls {@apilink RemoteConnection.resolve|resolve} before connecting, stores the + * returned `token` on its launch context, and the controller later calls + * {@apilink RemoteConnection.release|release} with that token when the browser closes. + * + * @internal + */ +export interface RemoteConnection { + /** Resolves the endpoint for a single browser launch. The `token` identifies the session for release. */ + resolve(options?: { proxyUrl?: string }): Promise<{ url: string; token: number }>; + /** Releases the remote session for `token`. Idempotent — safe to call from both `close()` and `kill()`. 
*/ + release(token: number): Promise; +} + +/** + * Owns the lifecycle of remote browser sessions for a single {@apilink RemoteBrowserPool}: endpoint + * resolution, the user's `release()` callback, and a release-at-most-once guarantee. Implements + * {@apilink RemoteConnection} so it can be injected into a plugin. + */ +class RemoteSessionRegistry implements RemoteConnection { + private readonly sessions = new Map< + number, + { url: string; context?: Record; released: boolean } + >(); + private nextToken = 0; + + constructor( + private readonly endpoint: RemoteBrowserEndpoint, + private readonly onRelease: + | ((info: { endpoint: string; context?: Record }) => unknown) + | undefined, + private readonly log: CrawleeLogger, + ) {} + + async resolve(options?: { proxyUrl?: string }): Promise<{ url: string; token: number }> { + const resolved = typeof this.endpoint === 'function' ? await this.endpoint(options) : this.endpoint; + + let result: ResolvedRemoteEndpoint; + if (typeof resolved === 'string') { + if (!resolved) throw new Error('Remote browser endpoint resolved to an empty string.'); + result = { url: resolved }; + } else if (!resolved?.url) { + throw new Error("Remote browser endpoint() must return a URL string or an object with a non-empty 'url'."); + } else { + result = resolved; + } + + const token = this.nextToken++; + this.sessions.set(token, { url: result.url, context: result.context, released: false }); + return { url: result.url, token }; + } + + async release(token: number): Promise { + const session = this.sessions.get(token); + // Release at most once per session — guards a close()/teardown race (the `released` flag is set + // synchronously before the awaited onRelease, so releaseAll() can't double-fire an in-flight release). 
+ if (!session || session.released) return; + session.released = true; + + try { + await this.onRelease?.({ endpoint: session.url, context: session.context }); + } catch (err) { + this.log.warning('Remote browser release() failed.', { error: (err as Error)?.message }); + } finally { + this.sessions.delete(token); + } + } + + /** Releases every session that is still open. Called on pool teardown so no remote session leaks. */ + async releaseAll(): Promise { + await Promise.all([...this.sessions.keys()].map(async (token) => this.release(token))); + } +} + +/** + * Per-plugin remote connection parameters, passed to {@apilink BrowserPlugin.useRemoteConnection}. + * The endpoint is supplied per-launch via {@apilink RemoteConnection}; these are the static connect() + * parameters (protocol, headers, timeouts, …). + */ +export interface RemoteConnectionParameters { + /** + * Playwright only: which protocol to connect with. `'cdp'` uses `connectOverCDP()` (the default), + * `'playwright'` uses `connect()` (Playwright's own WebSocket protocol). Ignored by Puppeteer. + */ + protocol?: 'cdp' | 'playwright'; + /** Extra options forwarded to the library `connect()` / `connectOverCDP()` call (endpoint excluded). */ + connectOptions?: Record; +} + +export interface RemoteBrowserPoolOptions { + /** + * The browser plugin(s) used to connect to the remote service — e.g. `new PlaywrightPlugin(playwright.chromium)` + * or `new PuppeteerPlugin(puppeteer)`. The pool configures them for remote connection; do not set a local + * `launchOptions` on them. + */ + browserPlugins: BrowserPlugin[]; + /** + * The remote browser endpoint: a static URL, a function returning one per launch, or a + * {@apilink RemoteBrowserProvider} instance encapsulating a session create/release lifecycle. + */ + endpoint: RemoteBrowserEndpoint | RemoteBrowserProvider; + /** + * Cleanup callback invoked when a browser closes, crashes, or the pool is destroyed. 
Receives the + * `context` returned by a function endpoint. Errors are caught and logged. Ignored when `endpoint` + * is a {@apilink RemoteBrowserProvider} (its own `release()` is used instead). + */ + release?: (info: { endpoint: string; context?: Record }) => unknown; + /** + * Maximum number of remote browsers open at once. When reached, {@apilink RemoteBrowserPool.newPage|newPage} + * waits for a browser to close before connecting a new one. Set it to your service's concurrent-session limit + * to avoid `429` errors. Defaults to the {@apilink RemoteBrowserProvider.maxOpenBrowsers|provider's value}, or + * `Infinity`. + */ + maxOpenBrowsers?: number; + /** Static connect() parameters (Playwright protocol selection, headers, timeouts, …). */ + connection?: RemoteConnectionParameters; + /** Extra {@apilink BrowserPool} options (lifecycle hooks, page limits, fingerprinting, …). */ + browserPoolOptions?: Omit & BrowserPoolHooks; + /** Fallback poll interval (ms) while waiting for a free browser slot. The wait is event-driven; this only bounds it. @default 500 */ + slotPollIntervalMillis?: number; +} + +/** + * The remote-connection configuration a browser crawler accepts on its `remoteBrowser` option. It is the + * {@apilink RemoteBrowserPoolOptions} a user supplies *minus* the parts the crawler provides itself — the + * `browserPlugins` (the crawler builds the correct one for its browser) and `browserPoolOptions` (taken from + * the crawler's own `browserPoolOptions`). This is what makes the crawler path both terse and mismatch-proof. + */ +export type CrawlerRemoteBrowserOptions = Omit; + +/** + * An {@apilink IBrowserPool} implementation for remote browser services. 
+ * + * Unlike configuring a remote browser through a crawler's `launchContext`, this pool is the single owner + * of all remote-session concerns: + * - **endpoint resolution** — static URL, per-launch function, or {@apilink RemoteBrowserProvider}; + * - **release lifecycle** — `release()` fires exactly once per session on close/crash/teardown (no leaks, + * no double-release); + * - **concurrency** — {@apilink RemoteBrowserPoolOptions.maxOpenBrowsers|maxOpenBrowsers} is enforced inside + * {@apilink RemoteBrowserPool.newPage|newPage}, which waits for a free slot rather than overshooting. + * + * The wrapped {@apilink BrowserPool} and its plugin only perform the library-specific `connect()` call. + * + * Pass an instance as the crawler's `browserPool` option: + * + * ```typescript + * import { PlaywrightPlugin, RemoteBrowserPool } from '@crawlee/browser-pool'; + * import { PlaywrightCrawler } from 'crawlee'; + * import playwright from 'playwright'; + * + * const browserPool = new RemoteBrowserPool({ + * browserPlugins: [new PlaywrightPlugin(playwright.chromium)], + * endpoint: 'wss://production-sfo.browserless.io?token=xxx', + * maxOpenBrowsers: 2, + * }); + * + * const crawler = new PlaywrightCrawler({ browserPool }); + * ``` + * + * @category Browser management + */ +export class RemoteBrowserPool implements IBrowserPool { + /** The wrapped pool that performs the remote connections and serves pages. */ + readonly browserPool: BrowserPool; + + /** The wrapped pool viewed through the {@apilink IBrowserPool} contract (the bare type widens pages to `never`). */ + private readonly pool: IBrowserPool; + + private readonly registry: RemoteSessionRegistry; + private readonly slotPollIntervalMillis: number; + private readonly log: CrawleeLogger; + + /** Shared by all `newPage` callers waiting for a free slot, so they don't each register their own listeners. 
*/ + private _capacityChange?: Promise; + + constructor(options: RemoteBrowserPoolOptions) { + const { + browserPlugins, + endpoint, + release, + maxOpenBrowsers, + connection = {}, + browserPoolOptions = {}, + slotPollIntervalMillis = 500, + } = options; + + this.log = serviceLocator.getLogger().child({ prefix: 'RemoteBrowserPool' }); + this.slotPollIntervalMillis = slotPollIntervalMillis; + + // A RemoteBrowserProvider carries its own endpoint, release, and maxOpenBrowsers. + const provider = endpoint instanceof RemoteBrowserProvider ? endpoint : undefined; + const resolvedEndpoint: RemoteBrowserEndpoint = provider + ? (opts) => provider.connect(opts) + : (endpoint as RemoteBrowserEndpoint); + const resolvedRelease = provider + ? ({ context }: { context?: Record }) => provider.release(context as any) + : release; + const resolvedMax = maxOpenBrowsers ?? provider?.maxOpenBrowsers; + + this.registry = new RemoteSessionRegistry(resolvedEndpoint, resolvedRelease, this.log); + + // Wire every plugin for remote connection. + for (const plugin of browserPlugins) { + plugin.useRemoteConnection(this.registry, connection); + } + + this.browserPool = new BrowserPool({ ...browserPoolOptions, browserPlugins }) as unknown as BrowserPool; + this.pool = this.browserPool as unknown as IBrowserPool; + + if (resolvedMax !== undefined) { + this.browserPool.maxOpenBrowsers = resolvedMax; + } + } + + /** Maximum number of remote browsers that may be open at the same time. */ + get maxOpenBrowsers(): number { + return this.browserPool.maxOpenBrowsers; + } + + set maxOpenBrowsers(value: number) { + this.browserPool.maxOpenBrowsers = value; + } + + /** + * Opens a new page, waiting first until {@apilink RemoteBrowserPoolOptions.maxOpenBrowsers|maxOpenBrowsers} + * allows it (either a new browser slot is free, or an active browser still has page capacity). 
+ */ + async newPage(options?: NewPageOptions): Promise { + await this._waitForFreeSlot(); + return this.pool.newPage(options); + } + + async closePage(page: Page, options?: { error?: Error }): Promise { + return this.pool.closePage(page, options); + } + + async extractPageState(page: Page): Promise { + return this.pool.extractPageState(page); + } + + async injectPageState(page: Page, state: PageState): Promise { + return this.pool.injectPageState(page, state); + } + + /** Closes all browsers, releases any still-open remote sessions, and tears down the wrapped pool. */ + async destroy(): Promise { + await this.browserPool.destroy(); + // Backstop: release any sessions whose browser never emitted a close (e.g. dropped on teardown). + await this.registry.releaseAll(); + } + + /** Resolves once the wrapped pool can serve another page without exceeding `maxOpenBrowsers`. */ + private async _waitForFreeSlot(): Promise { + while (!this.browserPool.hasFreeBrowserSlot() && !this.browserPool.hasActiveBrowserWithFreeCapacity()) { + await this._nextCapacityChange(); + } + } + + /** + * Resolves on the next browser-retired / page-closed event, or after `slotPollIntervalMillis`. All + * concurrently-waiting `newPage` calls share a single promise (and a single pair of event listeners) + * per tick, so a fleet of saturated callers doesn't fan out into N listener pairs on the pool. 
+ */ + private _nextCapacityChange(): Promise { + this._capacityChange ??= new Promise((resolve) => { + const done = () => { + clearTimeout(timer); + this.browserPool.off(BROWSER_POOL_EVENTS.BROWSER_RETIRED, done); + this.browserPool.off(BROWSER_POOL_EVENTS.PAGE_CLOSED, done); + this._capacityChange = undefined; + resolve(); + }; + + const timer = setTimeout(done, this.slotPollIntervalMillis); + timer.unref?.(); + this.browserPool.once(BROWSER_POOL_EVENTS.BROWSER_RETIRED, done); + this.browserPool.once(BROWSER_POOL_EVENTS.PAGE_CLOSED, done); + }); + + return this._capacityChange; + } +} diff --git a/packages/browser-pool/src/remote-browser-provider.ts b/packages/browser-pool/src/remote-browser-provider.ts new file mode 100644 index 000000000000..425d61ca65b1 --- /dev/null +++ b/packages/browser-pool/src/remote-browser-provider.ts @@ -0,0 +1,79 @@ +/** + * Abstract base class for remote browser service providers. + * + * Implement this class to encapsulate the lifecycle of a remote browser session + * (creation, connection URL resolution, and cleanup). {@apilink RemoteBrowserPool} + * calls {@link connect} once per browser launch and {@link release} when the browser + * closes, crashes, the pool is destroyed, or the connection fails during launch. + * + * Pass the provider instance as the `endpoint` of a {@apilink RemoteBrowserPool}, then + * hand the pool to a crawler via its `browserPool` option: + * + * ```typescript + * const browserPool = new RemoteBrowserPool({ + * browserPlugins: [new PlaywrightPlugin(playwright.chromium)], + * endpoint: new MyProvider(), + * }); + * + * const crawler = new PlaywrightCrawler({ browserPool }); + * ``` + * + * **Example — simple static endpoint (e.g. 
Browserless):** + * ```typescript + * class BrowserlessProvider extends RemoteBrowserProvider { + * maxOpenBrowsers = 2; // respect the service's concurrent session limit + * + * async connect() { + * return { url: `wss://production-sfo.browserless.io?token=${token}` }; + * } + * } + * ``` + * + * **Example — session lifecycle with concurrency limit (e.g. Browserbase):** + * ```typescript + * class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { + * maxOpenBrowsers = 2; // respect the service's concurrent session limit + * + * async connect({ proxyUrl } = {}) { + * const session = await createSession(apiKey, projectId, { + * proxies: proxyUrl ? [{ type: 'external', server: proxyUrl }] : undefined, + * }); + * return { url: session.connectUrl, context: { id: session.id } }; + * } + * + * async release(context: { id: string }) { + * await releaseSession(apiKey, context.id); + * } + * } + * ``` + */ +export abstract class RemoteBrowserProvider = Record> { + /** + * Maximum number of browsers that can be open at the same time. + * Set this to your remote service's concurrent session limit to avoid 429 errors. + */ + maxOpenBrowsers?: number; + + /** + * Called once per browser launch. Return the WebSocket/CDP endpoint URL + * and an optional `context` object that will be passed back to {@link release}. + * + * @param options.proxyUrl - The proxy URL resolved by Crawlee's proxy configuration + * for this browser session. Pass it to your remote service's proxy API if supported. + */ + abstract connect(options?: { + proxyUrl?: string; + }): Promise<{ url: string; context?: TContext }> | { url: string; context?: TContext }; + + /** + * Called when the browser closes, crashes, the pool is destroyed, or the + * connection fails right after {@link connect} succeeds. + * Override this to clean up remote sessions, release API resources, etc. + * + * Errors thrown here are caught and logged as warnings — they never crash the crawler. 
+ * Safe to assume this is called at most once per {@link connect} call. + * + * @param _context The same `context` object returned by {@link connect}. + */ + async release(_context: TContext): Promise {} +} diff --git a/packages/browser-pool/src/utils.ts b/packages/browser-pool/src/utils.ts index ae224fee62e5..ca59455d72c5 100644 --- a/packages/browser-pool/src/utils.ts +++ b/packages/browser-pool/src/utils.ts @@ -6,6 +6,25 @@ export type UnwrapPromise = T extends PromiseLike ? UnwrapPromise export function noop(..._args: unknown[]): void {} +/** + * Strips secrets from a URL so it can be safely included in logs and error messages. Removes userinfo + * credentials and the entire query string and fragment — remote browser services routinely carry tokens + * there (e.g. Browserless `?token=…`), and we can't tell which params are sensitive. Keeps the + * protocol, host, port, and path, which are enough to diagnose connection failures. + */ +export function sanitizeEndpointForLog(endpoint: string): string { + try { + const url = new URL(endpoint); + url.username = ''; + url.password = ''; + url.search = ''; + url.hash = ''; + return url.toString(); + } catch { + return ''; + } +} + /** * This is required when using optional dependencies. 
* Importing a type gives `any`, but `Parameters` gives `unknown[]` instead of `any` diff --git a/packages/browser-pool/test/remote-browser-pool.test.ts b/packages/browser-pool/test/remote-browser-pool.test.ts new file mode 100644 index 000000000000..7e5b48df5a09 --- /dev/null +++ b/packages/browser-pool/test/remote-browser-pool.test.ts @@ -0,0 +1,230 @@ +import { vi } from 'vitest'; + +import { serviceLocator } from '@crawlee/core'; +import type { CrawleeLogger } from '@crawlee/core'; + +import { BROWSER_POOL_EVENTS } from '../src/events.js'; +import type { RemoteConnection } from '../src/remote-browser-pool.js'; +import { RemoteBrowserPool } from '../src/remote-browser-pool.js'; +import { RemoteBrowserProvider } from '../src/remote-browser-provider.js'; + +function createMockLogger(): CrawleeLogger { + const logger: any = { + child: vi.fn(() => logger), + error: vi.fn(), + exception: vi.fn(), + softFail: vi.fn(), + warning: vi.fn(), + warningOnce: vi.fn(), + info: vi.fn(), + debug: vi.fn(), + perf: vi.fn(), + deprecated: vi.fn(), + getOptions: vi.fn(() => ({})), + setOptions: vi.fn(), + setLevel: vi.fn(), + getLevel: vi.fn(), + }; + return logger; +} + +/** + * A stand-in plugin that captures the {@link RemoteConnection} the pool injects, so tests can drive + * endpoint resolution / release directly without launching a real browser. + */ +function createCapturingPlugin() { + let connection: RemoteConnection | undefined; + const plugin: any = { + useRemoteConnection: (conn: RemoteConnection) => { + connection = conn; + }, + }; + return { plugin, getConnection: () => connection! 
}; +} + +beforeEach(() => { + serviceLocator.setLogger(createMockLogger()); +}); + +describe('RemoteBrowserPool — endpoint resolution', () => { + it('resolves a static string endpoint', async () => { + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: 'wss://remote:9222' }); + + const { url, token } = await getConnection().resolve(); + + expect(url).toBe('wss://remote:9222'); + expect(typeof token).toBe('number'); + await pool.destroy(); + }); + + it('resolves a function endpoint and forwards proxyUrl', async () => { + const endpoint = vi.fn(() => 'wss://dynamic:9222'); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint }); + + const { url } = await getConnection().resolve({ proxyUrl: 'http://proxy:8080' }); + + expect(url).toBe('wss://dynamic:9222'); + expect(endpoint).toHaveBeenCalledWith({ proxyUrl: 'http://proxy:8080' }); + await pool.destroy(); + }); + + it('throws when an endpoint resolves to an empty string', async () => { + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: () => '' }); + + await expect(getConnection().resolve()).rejects.toThrow(/empty string/); + await pool.destroy(); + }); + + it('throws when a function endpoint returns an object without a url', async () => { + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: () => ({}) as any }); + + await expect(getConnection().resolve()).rejects.toThrow(/non-empty 'url'/); + await pool.destroy(); + }); +}); + +describe('RemoteBrowserPool — release lifecycle', () => { + it('calls release with the context from a function endpoint, exactly once', async () => { + const release = vi.fn(); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new 
RemoteBrowserPool({ + browserPlugins: [plugin], + endpoint: () => ({ url: 'wss://remote:9222', context: { id: 'sess-1' } }), + release, + }); + + const { token } = await getConnection().resolve(); + await getConnection().release(token); + await getConnection().release(token); // second call must be a no-op (close()+kill()) + + expect(release).toHaveBeenCalledTimes(1); + expect(release).toHaveBeenCalledWith({ endpoint: 'wss://remote:9222', context: { id: 'sess-1' } }); + await pool.destroy(); + }); + + it('releases all still-open sessions on destroy()', async () => { + const release = vi.fn(); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: 'wss://remote:9222', release }); + + await getConnection().resolve(); + await getConnection().resolve(); + + await pool.destroy(); + + expect(release).toHaveBeenCalledTimes(2); + }); + + it('swallows errors thrown by release()', async () => { + const release = vi.fn(() => { + throw new Error('release boom'); + }); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: 'wss://remote:9222', release }); + + const { token } = await getConnection().resolve(); + await expect(getConnection().release(token)).resolves.toBeUndefined(); + await pool.destroy(); + }); +}); + +describe('RemoteBrowserPool — RemoteBrowserProvider endpoint', () => { + class TestProvider extends RemoteBrowserProvider<{ id: string }> { + override maxOpenBrowsers = 3; + connect = vi.fn(async () => ({ url: 'wss://provider:9222', context: { id: 'sess-1' } })); + override release = vi.fn(async () => {}); + } + + it('wires connect/release and adopts the provider maxOpenBrowsers', async () => { + const provider = new TestProvider(); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: provider }); + + 
expect(pool.maxOpenBrowsers).toBe(3);
+
+        const { url, token } = await getConnection().resolve({ proxyUrl: 'http://proxy:8080' });
+        expect(url).toBe('wss://provider:9222');
+        expect(provider.connect).toHaveBeenCalledWith({ proxyUrl: 'http://proxy:8080' });
+
+        await getConnection().release(token);
+        expect(provider.release).toHaveBeenCalledWith({ id: 'sess-1' });
+        await pool.destroy();
+    });
+
+    it('an explicit maxOpenBrowsers overrides the provider value', async () => {
+        const { plugin } = createCapturingPlugin();
+        const pool = new RemoteBrowserPool({
+            browserPlugins: [plugin],
+            endpoint: new TestProvider(),
+            maxOpenBrowsers: 7,
+        });
+
+        expect(pool.maxOpenBrowsers).toBe(7);
+        await pool.destroy();
+    });
+});
+
+describe('RemoteBrowserPool — maxOpenBrowsers throttle', () => {
+    it('proxies maxOpenBrowsers to the wrapped pool', async () => {
+        const { plugin } = createCapturingPlugin();
+        const pool = new RemoteBrowserPool({
+            browserPlugins: [plugin],
+            endpoint: 'wss://remote:9222',
+            maxOpenBrowsers: 2,
+        });
+
+        expect(pool.browserPool.maxOpenBrowsers).toBe(2);
+        pool.maxOpenBrowsers = 5;
+        expect(pool.browserPool.maxOpenBrowsers).toBe(5);
+        await pool.destroy();
+    });
+
+    it('opens immediately when a browser slot is free', async () => {
+        const { plugin } = createCapturingPlugin();
+        const pool = new RemoteBrowserPool({
+            browserPlugins: [plugin],
+            endpoint: 'wss://remote:9222',
+            maxOpenBrowsers: 2,
+        });
+
+        pool.browserPool.hasFreeBrowserSlot = vi.fn(() => true);
+        pool.browserPool.hasActiveBrowserWithFreeCapacity = vi.fn(() => false);
+        const newPage = vi.fn(async () => ({ id: 'p' }));
+        (pool.browserPool as any).newPage = newPage;
+
+        await pool.newPage({ id: 'p' });
+        expect(newPage).toHaveBeenCalledOnce();
+        await pool.destroy();
+    });
+
+    it('waits while at capacity, then opens once a browser is retired', async () => {
+        const { plugin } = createCapturingPlugin();
+        const pool = new RemoteBrowserPool({
+            browserPlugins: [plugin],
+            endpoint: 'wss://remote:9222',
+            maxOpenBrowsers: 1,
+            slotPollIntervalMillis: 50,
+        });
+
+        let atCapacity = true;
+        pool.browserPool.hasFreeBrowserSlot = vi.fn(() => !atCapacity);
+        pool.browserPool.hasActiveBrowserWithFreeCapacity = vi.fn(() => false);
+        const newPage = vi.fn(async () => ({ id: 'p' }));
+        (pool.browserPool as any).newPage = newPage;
+
+        const pagePromise = pool.newPage();
+        await new Promise((r) => setTimeout(r, 20));
+        expect(newPage).not.toHaveBeenCalled();
+
+        atCapacity = false;
+        pool.browserPool.emit(BROWSER_POOL_EVENTS.BROWSER_RETIRED, {} as any);
+
+        await pagePromise;
+        expect(newPage).toHaveBeenCalledOnce();
+        await pool.destroy();
+    });
+});
diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts
new file mode 100644
index 000000000000..d4d24a4fac46
--- /dev/null
+++ b/packages/browser-pool/test/remote-browser.test.ts
@@ -0,0 +1,216 @@
+import { vi } from 'vitest';
+
+import { serviceLocator } from '@crawlee/core';
+import type { CrawleeLogger } from '@crawlee/core';
+
+import { PlaywrightPlugin } from '../src/playwright/playwright-plugin.js';
+import { PuppeteerPlugin } from '../src/puppeteer/puppeteer-plugin.js';
+import type { RemoteConnection } from '../src/remote-browser-pool.js';
+
+// ---------------------------------------------------------------------------
+// Mock helpers
+// ---------------------------------------------------------------------------
+
+/** Minimal page stub: close() resolves, url() returns 'about:blank', on/once are plain spies. */
+function createMockPage() {
+    return {
+        close: vi.fn().mockResolvedValue(undefined),
+        url: vi.fn(() => 'about:blank'),
+        on: vi.fn(),
+        once: vi.fn(),
+    };
+}
+
+/** Browser stub: every context factory resolves to one shared mock context, and process() returns null. */
+function createMockBrowser() {
+    const page = createMockPage();
+    const mockContext = {
+        newPage: vi.fn().mockResolvedValue(page),
+        close: vi.fn().mockResolvedValue(undefined),
+        on: vi.fn(),
+        once: vi.fn(),
+    };
+    return {
+        newPage: vi.fn().mockResolvedValue(createMockPage()),
+        close: vi.fn().mockResolvedValue(undefined),
+        contexts: vi.fn(() => [mockContext]),
+        on: vi.fn(),
+        off: vi.fn(),
+        once: vi.fn(),
+        version: vi.fn(() => '120.0.0'),
+        pages: vi.fn(() => []),
+        process: vi.fn(() => null),
+        userAgent: vi.fn().mockResolvedValue('mock-ua'),
+        createBrowserContext: vi.fn().mockResolvedValue(mockContext),
+        createIncognitoBrowserContext: vi.fn().mockResolvedValue(mockContext),
+    };
+}
+
+/** Playwright library stub whose launch/connect/connectOverCDP all resolve to the given browser. */
+function createMockPlaywrightLibrary(browser = createMockBrowser()) {
+    return {
+        launch: vi.fn().mockResolvedValue(browser),
+        connect: vi.fn().mockResolvedValue(browser),
+        connectOverCDP: vi.fn().mockResolvedValue(browser),
+        name: vi.fn(() => 'chromium'),
+        launchPersistentContext: vi.fn().mockResolvedValue(browser),
+    };
+}
+
+/** Puppeteer library stub whose launch/connect resolve to the given browser. */
+function createMockPuppeteerLibrary(browser = createMockBrowser()) {
+    return {
+        launch: vi.fn().mockResolvedValue(browser),
+        connect: vi.fn().mockResolvedValue(browser),
+        product: 'chrome',
+    };
+}
+
+/** Logger stub in which every method is a vitest spy; child() returns the same logger object. */
+function createMockLogger(): CrawleeLogger & { warning: ReturnType<typeof vi.fn>; info: ReturnType<typeof vi.fn> } {
+    const logger: any = {
+        child: vi.fn(() => logger),
+        error: vi.fn(),
+        exception: vi.fn(),
+        softFail: vi.fn(),
+        warning: vi.fn(),
+        warningOnce: vi.fn(),
+        info: vi.fn(),
+        debug: vi.fn(),
+        perf: vi.fn(),
+        deprecated: vi.fn(),
+        getOptions: vi.fn(() => ({})),
+        setOptions: vi.fn(),
+        setLevel: vi.fn(),
+        getLevel: vi.fn(),
+    };
+    return logger;
+}
+
+/** A fake {@link RemoteConnection} that resolves to a fixed URL and records release() calls. */
+function createConnection(url = 'wss://remote:9222', context?: Record<string, unknown>): RemoteConnection & {
+    resolve: ReturnType<typeof vi.fn>;
+    release: ReturnType<typeof vi.fn>;
+} {
+    return {
+        resolve: vi.fn(async (_options?: { proxyUrl?: string }) => ({ url, token: 42, context })),
+        release: vi.fn(async () => {}),
+    } as any;
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+let mockLogger: ReturnType<typeof createMockLogger>;
+
+beforeEach(() => {
+    mockLogger = createMockLogger();
+    serviceLocator.setLogger(mockLogger);
+});
+
+describe('Remote connection — PlaywrightPlugin', () => {
+    it('useRemoteConnection forces incognito pages on and marks the launch context remote', () => {
+        const plugin = new PlaywrightPlugin(createMockPlaywrightLibrary() as any, { useIncognitoPages: false });
+        plugin.useRemoteConnection(createConnection());
+
+        expect(plugin.useIncognitoPages).toBe(true);
+        expect(plugin.createLaunchContext().isRemote).toBe(true);
+    });
+
+    it('connects via connectOverCDP by default and skips a local launch', async () => {
+        const lib = createMockPlaywrightLibrary();
+        const plugin = new PlaywrightPlugin(lib as any);
+        const connection = createConnection('http://remote:9222');
+        plugin.useRemoteConnection(connection, { connectOptions: { timeout: 5000 } });
+
+        const ctx = plugin.createLaunchContext();
+        await plugin.launch(ctx);
+
+        expect(connection.resolve).toHaveBeenCalledTimes(1);
+        expect(lib.connectOverCDP).toHaveBeenCalledWith('http://remote:9222', { timeout: 5000 });
+        expect(lib.connect).not.toHaveBeenCalled();
+        expect(lib.launch).not.toHaveBeenCalled();
+        expect(ctx._remoteToken).toBe(42);
+    });
+
+    it("connects via connect() when protocol is 'playwright'", async () => {
+        const lib = createMockPlaywrightLibrary();
+        const plugin = new PlaywrightPlugin(lib as any);
+        plugin.useRemoteConnection(createConnection('ws://remote:3000'), { protocol: 'playwright' });
+
+        await plugin.launch(plugin.createLaunchContext());
+
+        expect(lib.connect).toHaveBeenCalledWith('ws://remote:3000', {});
+        expect(lib.connectOverCDP).not.toHaveBeenCalled();
+    });
+
+    it('releases the session and throws BrowserLaunchError when connect fails', async () => {
+        const lib = createMockPlaywrightLibrary();
+        lib.connectOverCDP.mockRejectedValueOnce(new Error('ECONNREFUSED'));
+        const plugin = new PlaywrightPlugin(lib as any);
+        const connection = createConnection();
+        plugin.useRemoteConnection(connection);
+
+        await expect(plugin.launch(plugin.createLaunchContext())).rejects.toThrow(/Failed to connect to remote browser/);
+        expect(connection.release).toHaveBeenCalledWith(42);
+    });
+
+    it('throws BrowserLaunchError (without connecting) when endpoint resolution fails', async () => {
+        const lib = createMockPlaywrightLibrary();
+        const plugin = new PlaywrightPlugin(lib as any);
+        const connection = createConnection();
+        connection.resolve.mockRejectedValueOnce(new Error('no session'));
+        plugin.useRemoteConnection(connection);
+
+        await expect(plugin.launch(plugin.createLaunchContext())).rejects.toThrow(/resolve the remote browser endpoint/);
+        expect(lib.connectOverCDP).not.toHaveBeenCalled();
+        expect(connection.release).not.toHaveBeenCalled();
+    });
+
+    it('a plain plugin (no remote connection) launches locally', async () => {
+        const lib = createMockPlaywrightLibrary();
+        const plugin = new PlaywrightPlugin(lib as any);
+
+        await plugin.launch(plugin.createLaunchContext());
+
+        expect(lib.launch).toHaveBeenCalledTimes(1);
+        expect(lib.connect).not.toHaveBeenCalled();
+        expect(lib.connectOverCDP).not.toHaveBeenCalled();
+    });
+});
+
+describe('Remote connection — PuppeteerPlugin', () => {
+    it('connects via connect() with the resolved endpoint and skips a local launch', async () => {
+        const lib = createMockPuppeteerLibrary();
+        const plugin = new PuppeteerPlugin(lib as any);
+        const connection = createConnection('ws://remote:9222');
+        plugin.useRemoteConnection(connection, { connectOptions: { protocolTimeout: 1000 } });
+
+        const ctx = plugin.createLaunchContext();
+        await plugin.launch(ctx);
+
+        expect(connection.resolve).toHaveBeenCalledTimes(1);
+        expect(lib.connect).toHaveBeenCalledWith({ protocolTimeout: 1000, browserWSEndpoint: 'ws://remote:9222' });
+        expect(lib.launch).not.toHaveBeenCalled();
+        expect(ctx._remoteToken).toBe(42);
+    });
+
+    it('releases the session and throws BrowserLaunchError when connect fails', async () => {
+        const lib = createMockPuppeteerLibrary();
+        lib.connect.mockRejectedValueOnce(new Error('ECONNREFUSED'));
+        const plugin = new PuppeteerPlugin(lib as any);
+        const connection = createConnection();
+        plugin.useRemoteConnection(connection);
+
+        await expect(plugin.launch(plugin.createLaunchContext())).rejects.toThrow(/Failed to connect to remote browser/);
+        expect(connection.release).toHaveBeenCalledWith(42);
+    });
+
+    it('marks the launch context remote', () => {
+        const plugin = new PuppeteerPlugin(createMockPuppeteerLibrary() as any);
+        plugin.useRemoteConnection(createConnection());
+
+        expect(plugin.createLaunchContext().isRemote).toBe(true);
+    });
+});
diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml
index d687fbfdce24..f21d9ea663dc 100644
--- a/pnpm-workspace.yaml
+++ b/pnpm-workspace.yaml
@@ -37,7 +37,7 @@ allowBuilds:
   better-sqlite3: true
   bufferutil: true
   core-js: true
-  core-js-pure: set this to true or false
+  core-js-pure: false
   esbuild: true
   nx: true
   protobufjs: true
diff --git a/test/core/browser_launchers/playwright_launcher.test.ts b/test/core/browser_launchers/playwright_launcher.test.ts
index e2008b3458e0..ba29d0408c28 100644
--- a/test/core/browser_launchers/playwright_launcher.test.ts
+++ b/test/core/browser_launchers/playwright_launcher.test.ts
@@ -288,4 +288,5 @@ describe('launchPlaywright()', () => {
             recursive: true,
         });
     });
+
 });
diff --git a/test/core/browser_launchers/puppeteer_launcher.test.ts b/test/core/browser_launchers/puppeteer_launcher.test.ts
index 0963cd62de6b..4b36679a0ffe 100644
--- a/test/core/browser_launchers/puppeteer_launcher.test.ts
+++ b/test/core/browser_launchers/puppeteer_launcher.test.ts
@@ -308,4 +308,5 @@ describe('launchPuppeteer()', () => {
             recursive: true,
         });
     });
+
 });
diff --git a/test/core/crawlers/browser_crawler.test.ts b/test/core/crawlers/browser_crawler.test.ts
index 388b45781111..c0ca30eeba07 100644
--- a/test/core/crawlers/browser_crawler.test.ts
+++ b/test/core/crawlers/browser_crawler.test.ts
@@ -6,6 +6,7 @@ import {
     BrowserPool as BrowserPoolClass,
     OperatingSystemsName,
     PuppeteerPlugin,
+    RemoteBrowserPool,
 } from '@crawlee/browser-pool';
 import { bindMethodsToServiceLocator, BLOCKED_STATUS_CODES, ServiceLocator, SessionPool } from '@crawlee/core';
 import type { PuppeteerGoToOptions } from '@crawlee/puppeteer';
@@ -176,6 +177,46 @@ describe('BrowserCrawler', () => {
         }
     });
 
+    test.concurrent('builds and owns a RemoteBrowserPool from the remoteBrowser option', async () => {
+        const localStorageEmulator = new MemoryStorageEmulator();
+        await localStorageEmulator.init();
+
+        try {
+            const crawler = new BrowserCrawlerTest({
+                remoteBrowser: { endpoint: 'ws://remote:9222', maxOpenBrowsers: 2 },
+                browserPoolOptions: { browserPlugins: [new PuppeteerPlugin(puppeteer)] },
+                requestHandler: async () => {},
+            });
+
+            expect(crawler.browserPool).toBeInstanceOf(RemoteBrowserPool);
+            expect((crawler.browserPool as RemoteBrowserPool).maxOpenBrowsers).toBe(2);
+
+            await (crawler.browserPool as RemoteBrowserPool).destroy();
+        } finally {
+            await localStorageEmulator.destroy();
+        }
+    });
+
+    test.concurrent('throws when both browserPool and remoteBrowser are set', async () => {
+        const localStorageEmulator = new MemoryStorageEmulator();
+        await localStorageEmulator.init();
+        const externalPool = new BrowserPoolClass({ browserPlugins: [new PuppeteerPlugin(puppeteer)] });
+
+        try {
+            expect(
+                () =>
+                    new BrowserCrawlerTest({
+
browserPool: externalPool,
+                        remoteBrowser: { endpoint: 'ws://remote:9222' },
+                        requestHandler: async () => {},
+                    }),
+            ).toThrow(/at most one of 'browserPool' and 'remoteBrowser'/);
+        } finally {
+            await externalPool.destroy();
+            await localStorageEmulator.destroy();
+        }
+    });
+
     test.concurrent('should retire session after TimeoutError', async () => {
         const localStorageEmulator = new MemoryStorageEmulator();
         await localStorageEmulator.init();
diff --git a/test/integration/helpers.ts b/test/integration/helpers.ts
new file mode 100644
index 000000000000..c680a62c5a75
--- /dev/null
+++ b/test/integration/helpers.ts
@@ -0,0 +1,34 @@
+/**
+ * Helpers for remote-browser integration tests.
+ *
+ * These tests require a running Browserless instance and a deterministic HTTP
+ * target (httpbin). In CI both are provided as GitHub Actions service
+ * containers on a shared network. Locally, start them via
+ * `pnpm test:integration:services:up`.
+ *
+ * Network model: HTTPBIN_URL is consumed by the REMOTE browser (not the test
+ * runner). The browser lives in the Browserless container, so the URL must
+ * resolve inside that container's Docker network — typically `http://httpbin`
+ * via service name/alias.
+ *
+ * Env vars:
+ *   BROWSERLESS_URL   default: http://localhost:3000   (host-side; how the test
+ *                                                       runner reaches CDP)
+ *   HTTPBIN_URL       default: http://httpbin          (browser-side; how the
+ *                                                       remote browser reaches
+ *                                                       httpbin via Docker DNS)
+ */
+
+export const BROWSERLESS_URL = process.env.BROWSERLESS_URL ?? 'http://localhost:3000';
+export const HTTPBIN_URL = process.env.HTTPBIN_URL ?? 'http://httpbin';
+
+/**
+ * Build a URL on the httpbin service from a path (e.g. '/cookies').
+ *
+ * Trailing slashes on HTTPBIN_URL are dropped and a missing leading slash on
+ * `path` is added, so 'http://httpbin/' + 'cookies' yields 'http://httpbin/cookies'.
+ */
+export function httpbin(path: string): string {
+    const base = HTTPBIN_URL.replace(/\/+$/, '');
+    return `${base}${path.startsWith('/') ? path : `/${path}`}`;
+}
diff --git a/test/integration/remote-browser-incognito.test.ts b/test/integration/remote-browser-incognito.test.ts
new file mode 100644
index 000000000000..4c176870df99
--- /dev/null
+++ b/test/integration/remote-browser-incognito.test.ts
@@ -0,0 +1,64 @@
+/**
+ * Integration test: PlaywrightCrawler against a remote Browserless CDP endpoint
+ * forces useIncognitoPages: true, so two pages on the same remote browser do
+ * NOT share cookies.
+ *
+ * Mirrors temp-examples/examples/cookie-sharing-pages-same-remote-browser.ts:
+ * - retireBrowserAfterPageCount: 10 → both requests stay on the same browser
+ * - saveResponseCookies: false → Session cannot carry cookies across requests
+ * - Request 1 → /cookies/set?TOKEN=… (httpbin Set-Cookie)
+ * - Request 2 → /cookies (httpbin echoes received cookies in body)
+ *
+ * With the wrapper removed, request 2's body should report no cookies.
+ */
+import { PlaywrightCrawler } from 'crawlee';
+import { expect, test } from 'vitest';
+
+import { BROWSERLESS_URL, httpbin } from './helpers.js';
+
+// Gate on CRAWLEE_DIFFICULT_TESTS so plain `pnpm test` skips integration tests
+// (no Docker required); `pnpm test:integration` and `pnpm test:full` set the flag.
+test.skipIf(!process.env.CRAWLEE_DIFFICULT_TESTS)(
+    'remote Playwright CDP: pages on the same browser do not share cookies',
+    async () => {
+        const observations: { controllerId: string; body: { cookies: Record<string, string> } }[] = [];
+        const controllerIdByPage = new WeakMap<object, string>(); // page → id of the browser controller passed to the post-create hook
+
+        const crawler = new PlaywrightCrawler({
+            remoteBrowser: {
+                endpoint: BROWSERLESS_URL,
+                maxOpenBrowsers: 1,
+            },
+            browserPoolOptions: {
+                retireBrowserAfterPageCount: 10, // keep the same browser across both requests
+                maxOpenPagesPerBrowser: 2,
+                postPageCreateHooks: [
+                    (page: object, browserController: { id: string }) => {
+                        controllerIdByPage.set(page, browserController.id);
+                    },
+                ],
+            },
+            saveResponseCookies: false, // remove Session-based propagation
+            maxConcurrency: 1,
+            maxRequestsPerCrawl: 2,
+            async requestHandler({ page }) {
+                const body = await page.evaluate(() => document.body.textContent?.trim());
+                observations.push({
+                    controllerId: controllerIdByPage.get(page)!,
+                    body: body ? JSON.parse(body) : null,
+                });
+            },
+        });
+
+        await crawler.run([httpbin('/cookies/set?TOKEN=integration-test'), httpbin('/cookies')]);
+
+        expect(observations).toHaveLength(2);
+        // Same browser handled both requests — otherwise the assertion below proves nothing.
+        expect(observations[0].controllerId).toBe(observations[1].controllerId);
+        // Request 1 actually got the cookie (else request 2's emptiness proves nothing).
+        expect(observations[0].body.cookies).toEqual({ TOKEN: 'integration-test' });
+        // Request 2 (the /cookies echo) must NOT include the TOKEN cookie set by request 1.
+        expect(observations[1].body.cookies).toEqual({});
+    },
+    60_000,
+);
diff --git a/website/sidebars.js b/website/sidebars.js
index 85e39e5a658d..f0f6ac01cdbc 100644
--- a/website/sidebars.js
+++ b/website/sidebars.js
@@ -38,6 +38,7 @@ module.exports = {
                 'guides/configuration',
                 'guides/cheerio-crawler-guide',
                 'guides/javascript-rendering',
+                'guides/remote-browser',
                 'guides/proxy-management',
                 'guides/session-management',
                 'guides/scaling-crawlers',