From a5df5b828543ab6aaa574a2059044eb5f45eb59b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Wed, 18 Mar 2026 14:46:32 +0100 Subject: [PATCH 01/45] feat: add `isRemote` flag and connect option types for remote browser support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Task 1: Type Definitions & LaunchContext `isRemote` Flag ## Goal Add the foundational types and the `isRemote` flag that all other remote browser tasks depend on. ## Dependencies None — this is the foundation task. ## Scope ### 1. Add `isRemote` to `LaunchContext` **File:** `packages/browser-pool/src/launch-context.ts` - Add `isRemote?: boolean` to the `LaunchContextOptions` interface (alongside `id`, `browserPlugin`, etc.) - Add a public readonly `isRemote: boolean` property to the `LaunchContext` class - Set it from constructor options, defaulting to `false` ### 2. Define connect option types on PlaywrightPlugin **File:** `packages/browser-pool/src/playwright/playwright-plugin.ts` Add the following types to the plugin file (or a co-located types file): ```typescript // Mirrors browserType.connectOverCDP(endpointURL, options) interface PlaywrightConnectOverCDPOptions { endpointURL: string; options?: Parameters<BrowserType['connectOverCDP']>[1]; } // Mirrors browserType.connect(wsEndpoint, options) interface PlaywrightConnectOptions { wsEndpoint: string; options?: Parameters<BrowserType['connect']>[1]; } ``` Use the existing `Parameters` utility type pattern (see how `SafeParameters` is used elsewhere in the codebase) — do NOT redefine Playwright's types manually. ### 3. Define connect option types on PuppeteerPlugin **File:** `packages/browser-pool/src/puppeteer/puppeteer-plugin.ts` ```typescript // Mirrors puppeteer.connect({ browserWSEndpoint, ...rest }) // Flat object matching Puppeteer's ConnectOptions type PuppeteerConnectOverCDPOptions = Parameters<(typeof Puppeteer)['connect']>[0]; ``` Use the `Parameters` pattern to extract the type from Puppeteer's `connect` method. ### 4. 
Add connect option fields to `BrowserPluginOptions` **File:** `packages/browser-pool/src/abstract-classes/browser-plugin.ts` This is a design choice — the PRD says connect options live on the plugin subclass, not on `LaunchContext`. Add the fields to the plugin options type so they flow through the constructor: - `PlaywrightPlugin` options should accept `connectOptions?` and `connectOverCDPOptions?` - `PuppeteerPlugin` options should accept `connectOverCDPOptions?` These can be added to subclass-specific option types rather than the base `BrowserPluginOptions`. ### 5. Add connect option fields to launcher-level interfaces **File:** `packages/playwright-crawler/src/internals/playwright-launcher.ts` Add to `PlaywrightLaunchContext`: ```typescript connectOptions?: PlaywrightConnectOptions; connectOverCDPOptions?: PlaywrightConnectOverCDPOptions; ``` **File:** `packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts` Add to `PuppeteerLaunchContext`: ```typescript connectOverCDPOptions?: PuppeteerConnectOverCDPOptions; ``` This enables IDE autocomplete when users configure `launchContext` on the crawler. ### 6. Export new types **File:** `packages/browser-pool/src/index.ts` Export the new connect option types so they're available to consumers. 
## Key Files | File | Change | |------|--------| | `packages/browser-pool/src/launch-context.ts` | Add `isRemote` option + property | | `packages/browser-pool/src/playwright/playwright-plugin.ts` | Add connect option types | | `packages/browser-pool/src/puppeteer/puppeteer-plugin.ts` | Add connect option type | | `packages/playwright-crawler/src/internals/playwright-launcher.ts` | Add connect options to `PlaywrightLaunchContext` | | `packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts` | Add connect options to `PuppeteerLaunchContext` | | `packages/browser-pool/src/index.ts` | Export new types | | `packages/browser-crawler/src/internals/browser-launcher.ts` | May need connect options on `BrowserLaunchContext` base | ## Acceptance Criteria - [x] `LaunchContext` has `isRemote` boolean property, defaults to `false` - [x] Connect option types are defined using library `Parameters` extraction (not manual redefinition) - [x] `PlaywrightLaunchContext` shows `connectOptions` and `connectOverCDPOptions` in IDE autocomplete - [x] `PuppeteerLaunchContext` shows `connectOverCDPOptions` in IDE autocomplete - [x] New types are exported from `@crawlee/browser-pool` - [x] TypeScript compiles with no errors Co-Authored-By: Claude Opus 4.6 --- packages/browser-pool/src/launch-context.ts | 9 ++++++ .../src/playwright/playwright-plugin.ts | 29 ++++++++++++++++++- .../src/puppeteer/puppeteer-plugin.ts | 20 ++++++++++++- .../src/internals/playwright-launcher.ts | 15 ++++++++++ .../src/internals/puppeteer-launcher.ts | 8 +++++ 5 files changed, 79 insertions(+), 2 deletions(-) diff --git a/packages/browser-pool/src/launch-context.ts b/packages/browser-pool/src/launch-context.ts index 9ae847634b51..47029904a8f9 100644 --- a/packages/browser-pool/src/launch-context.ts +++ b/packages/browser-pool/src/launch-context.ts @@ -57,6 +57,12 @@ export interface LaunchContextOptions< * This is useful when using HTTPS proxies with self-signed certificates. 
*/ ignoreProxyCertificate?: boolean; + /** + * Whether this launch context represents a connection to a remote browser + * rather than a locally launched one. + * @default false + */ + isRemote?: boolean; } export class LaunchContext< @@ -73,6 +79,7 @@ export class LaunchContext< browserPerProxy?: boolean; userDataDir: string; proxyTier?: number; + readonly isRemote: boolean; ignoreProxyCertificate?: boolean; private _proxyUrl?: string; @@ -92,6 +99,7 @@ export class LaunchContext< userDataDir = '', proxyTier, ignoreProxyCertificate, + isRemote, } = options; this.id = id; @@ -102,6 +110,7 @@ export class LaunchContext< this.userDataDir = userDataDir; this.proxyTier = proxyTier; this.ignoreProxyCertificate = ignoreProxyCertificate ?? false; + this.isRemote = isRemote ?? false; this._proxyUrl = proxyUrl; } diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index a48cf1fedfec..7cd4d33bea60 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -2,7 +2,7 @@ import fs from 'node:fs'; import type { Browser as PlaywrightBrowser, BrowserType } from 'playwright'; -import { BrowserPlugin } from '../abstract-classes/browser-plugin.js'; +import { BrowserPlugin, type BrowserPluginOptions } from '../abstract-classes/browser-plugin.js'; import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { createProxyServerForContainers } from '../container-proxy-server.js'; import type { LaunchContext } from '../launch-context.js'; @@ -11,6 +11,23 @@ import type { SafeParameters } from '../utils.js'; import { PlaywrightBrowser as PlaywrightBrowserWithPersistentContext } from './playwright-browser.js'; import { PlaywrightController } from './playwright-controller.js'; +/** + * Options for connecting to a remote browser via CDP. + * Mirrors `browserType.connectOverCDP(options)`. 
+ */ +export type PlaywrightConnectOverCDPOptions = Parameters[0]; + +/** + * Options for connecting to a remote browser via WebSocket. + * Mirrors `browserType.connect(options)`. + */ +export type PlaywrightConnectOptions = Parameters[0]; + +export interface PlaywrightPluginOptions extends BrowserPluginOptions[0]> { + connectOptions?: PlaywrightConnectOptions; + connectOverCDPOptions?: PlaywrightConnectOverCDPOptions; +} + export class PlaywrightPlugin extends BrowserPlugin< BrowserType, SafeParameters[0], @@ -19,6 +36,16 @@ export class PlaywrightPlugin extends BrowserPlugin< private _browserVersion?: string; _containerProxyServer?: Awaited>; + connectOptions?: PlaywrightConnectOptions; + connectOverCDPOptions?: PlaywrightConnectOverCDPOptions; + + constructor(library: BrowserType, options: PlaywrightPluginOptions = {}) { + const { connectOptions, connectOverCDPOptions, ...baseOptions } = options; + super(library, baseOptions); + this.connectOptions = connectOptions; + this.connectOverCDPOptions = connectOverCDPOptions; + } + protected async _launch(launchContext: LaunchContext): Promise { const { launchOptions, useIncognitoPages, userDataDir, proxyUrl } = launchContext; let browser: PlaywrightBrowser; diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 91ea817d03a9..b1143d7efaa7 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -4,7 +4,7 @@ import type { Dictionary } from '@crawlee/types'; import type Puppeteer from 'puppeteer'; import type * as PuppeteerTypes from 'puppeteer'; -import { BrowserPlugin } from '../abstract-classes/browser-plugin.js'; +import { BrowserPlugin, type BrowserPluginOptions } from '../abstract-classes/browser-plugin.js'; import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { LaunchContext } from '../launch-context.js'; import { noop } from '../utils.js'; 
@@ -13,12 +13,30 @@ import { PuppeteerController } from './puppeteer-controller.js'; const PROXY_SERVER_ARG = '--proxy-server='; +/** + * Options for connecting to a remote browser via Puppeteer. + * Flat object matching Puppeteer's `ConnectOptions`. + */ +export type PuppeteerConnectOverCDPOptions = Parameters<(typeof Puppeteer)['connect']>[0]; + +export interface PuppeteerPluginOptions extends BrowserPluginOptions { + connectOverCDPOptions?: PuppeteerConnectOverCDPOptions; +} + export class PuppeteerPlugin extends BrowserPlugin< typeof Puppeteer, PuppeteerTypes.LaunchOptions, PuppeteerTypes.Browser, PuppeteerNewPageOptions > { + connectOverCDPOptions?: PuppeteerConnectOverCDPOptions; + + constructor(library: typeof Puppeteer, options: PuppeteerPluginOptions = {}) { + const { connectOverCDPOptions, ...baseOptions } = options; + super(library, baseOptions); + this.connectOverCDPOptions = connectOverCDPOptions; + } + protected async _launch( launchContext: LaunchContext< typeof Puppeteer, diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index a8bf13c86c50..5c93468d63f3 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -1,6 +1,7 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PlaywrightPlugin } from '@crawlee/browser-pool'; +import type { PlaywrightConnectOptions, PlaywrightConnectOverCDPOptions } from '@crawlee/browser-pool'; import ow from 'ow'; import type { Browser, BrowserType, LaunchOptions } from 'playwright'; @@ -70,6 +71,18 @@ export interface PlaywrightLaunchContext extends BrowserLaunchContext { ...BrowserLauncher.optionsShape, launcher: ow.optional.object, launchContextOptions: ow.optional.object, + connectOptions: ow.optional.object, + connectOverCDPOptions: 
ow.optional.object, }; /** diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts index 3d46c30dd432..5e8333083e32 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts @@ -1,6 +1,7 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PuppeteerPlugin } from '@crawlee/browser-pool'; +import type { PuppeteerConnectOverCDPOptions } from '@crawlee/browser-pool'; import ow from 'ow'; import type { Browser } from 'puppeteer'; @@ -65,6 +66,12 @@ export interface PuppeteerLaunchContext extends BrowserLaunchContext protected static override optionsShape = { ...BrowserLauncher.optionsShape, launcher: ow.optional.object, + connectOverCDPOptions: ow.optional.object, }; /** From b012525db7e9d539bd14c65ecf925137385fa56e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Wed, 18 Mar 2026 15:18:31 +0100 Subject: [PATCH 02/45] feat: add PlaywrightPlugin remote connection routing via `connect()` and `connectOverCDP()` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Task 2: PlaywrightPlugin Remote Connection Routing ## Goal Make `PlaywrightPlugin._launch()` branch to `connect()` or `connectOverCDP()` when remote connection options are present, instead of calling `launch()`. ## Dependencies - Task 1 (types and `isRemote` flag) ## Scope ### 1. 
Store connect options on the plugin instance **File:** `packages/browser-pool/src/playwright/playwright-plugin.ts` - Accept `connectOptions` and `connectOverCDPOptions` in the constructor options - Store them as instance properties - **Validation:** If both `connectOptions` AND `connectOverCDPOptions` are provided, throw an error immediately in the constructor: ``` Cannot set both 'connectOptions' and 'connectOverCDPOptions' — pick one protocol. ``` ### 2. Branch in `_launch()` for remote connections **File:** `packages/browser-pool/src/playwright/playwright-plugin.ts` In the existing `_launch()` method (currently lines 22-102), add branching logic **before** the existing local launch code: ```typescript protected async _launch(launchContext: LaunchContext<...>): Promise<PlaywrightBrowser> { // Remote CDP connection if (this.connectOverCDPOptions) { const { endpointURL, options } = this.connectOverCDPOptions; const browser = await browserType.connectOverCDP(endpointURL, options); return browser; } // Remote Playwright WebSocket connection if (this.connectOptions) { const { wsEndpoint, options } = this.connectOptions; const browser = await browserType.connect(wsEndpoint, options); return browser; } // Existing local launch logic... } ``` **Reference:** See `StagehandPlugin._launch()` at `packages/stagehand-crawler/src/internals/stagehand-plugin.ts:102-107` for the CDP connection pattern: ```typescript const cdpUrl = await stagehand.connectURL(); const browser = await chromium.connectOverCDP(cdpUrl); ``` ### 3. Set `isRemote` on LaunchContext **File:** `packages/browser-pool/src/playwright/playwright-plugin.ts` In `createLaunchContext()` (or wherever the plugin creates the LaunchContext), pass `isRemote: true` when connect options are present. This can be done by overriding `createLaunchContext()` in the subclass, or by passing it through the options. 
Check how the base `BrowserPlugin.createLaunchContext()` works (at `packages/browser-pool/src/abstract-classes/browser-plugin.ts:149-174`) and determine the best insertion point. ## Key Design Decisions - **No new abstract method:** The routing happens inside `_launch()` via internal branching, not a new `_connect()` method. This keeps the abstract interface unchanged and doesn't affect custom plugins like StagehandPlugin. - **`browser.close()` for cleanup:** Remote browsers are closed the same way as local browsers — via `browser.close()`. No special disconnect handling. - **No proxy server setup for remote:** The remote branch skips the local proxy server setup that exists in the current `_launch()` code. ## Key Files | File | Change | |------|--------| | `packages/browser-pool/src/playwright/playwright-plugin.ts` | Constructor stores options, `_launch()` branches for remote | ## Acceptance Criteria - [x] `PlaywrightPlugin` accepts `connectOptions` in constructor and calls `browserType.connect()` with `wsEndpoint` and `options` - [x] `PlaywrightPlugin` accepts `connectOverCDPOptions` in constructor and calls `browserType.connectOverCDP()` with `endpointURL` and `options` - [x] Setting both `connectOptions` and `connectOverCDPOptions` throws an error - [x] `launchContext.isRemote` is `true` when connect options are present - [x] Remote branch skips local proxy server setup and persistent context logic - [x] TypeScript compiles with no errors Co-Authored-By: Claude Opus 4.6 --- .../src/playwright/playwright-plugin.ts | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index 7cd4d33bea60..c99f1680b2c2 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -2,7 +2,11 @@ import fs from 'node:fs'; import type { Browser as 
PlaywrightBrowser, BrowserType } from 'playwright'; -import { BrowserPlugin, type BrowserPluginOptions } from '../abstract-classes/browser-plugin.js'; +import { + BrowserPlugin, + type BrowserPluginOptions, + type CreateLaunchContextOptions, +} from '../abstract-classes/browser-plugin.js'; import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { createProxyServerForContainers } from '../container-proxy-server.js'; import type { LaunchContext } from '../launch-context.js'; @@ -41,12 +45,34 @@ export class PlaywrightPlugin extends BrowserPlugin< constructor(library: BrowserType, options: PlaywrightPluginOptions = {}) { const { connectOptions, connectOverCDPOptions, ...baseOptions } = options; + + if (connectOptions && connectOverCDPOptions) { + throw new Error("Cannot set both 'connectOptions' and 'connectOverCDPOptions' — pick one protocol."); + } + super(library, baseOptions); this.connectOptions = connectOptions; this.connectOverCDPOptions = connectOverCDPOptions; } + override createLaunchContext(options: CreateLaunchContextOptions = {}): LaunchContext { + return super.createLaunchContext({ + ...options, + isRemote: options.isRemote ?? 
!!(this.connectOptions || this.connectOverCDPOptions), + }); + } + protected async _launch(launchContext: LaunchContext): Promise { + // Remote CDP connection — skip all local launch/proxy logic + if (this.connectOverCDPOptions) { + return this.library.connectOverCDP(this.connectOverCDPOptions); + } + + // Remote Playwright WebSocket connection — skip all local launch/proxy logic + if (this.connectOptions) { + return this.library.connect(this.connectOptions); + } + const { launchOptions, useIncognitoPages, userDataDir, proxyUrl } = launchContext; let browser: PlaywrightBrowser; From 0e8381267e8a97a263899e326daca7754387bea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Wed, 18 Mar 2026 15:23:37 +0100 Subject: [PATCH 03/45] feat: add PuppeteerPlugin remote connection routing via `puppeteer.connect()` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Task 3: PuppeteerPlugin Remote Connection Routing ## Goal Make `PuppeteerPlugin._launch()` branch to `puppeteer.connect()` when remote connection options (CDP) are present, instead of calling `puppeteer.launch()`. ## Dependencies - Task 1 (types and `isRemote` flag) ## Scope ### 1. Store connect options on the plugin instance **File:** `packages/browser-pool/src/puppeteer/puppeteer-plugin.ts` - Accept `connectOverCDPOptions` in the constructor options - Store as an instance property - Puppeteer only supports CDP — there is no `connectOptions` field (Playwright-only) ### 2. 
Branch in `_launch()` for remote connections **File:** `packages/browser-pool/src/puppeteer/puppeteer-plugin.ts` In the existing `_launch()` method (currently lines 22-203), add branching logic **before** the existing local launch code: ```typescript protected async _launch(launchContext: LaunchContext<...>): Promise<PuppeteerTypes.Browser> { // Remote CDP connection if (this.connectOverCDPOptions) { const browser = await puppeteer.connect(this.connectOverCDPOptions); // Wrap with the same Proxy handler for newPage() interception // (see existing code at lines 138-200) return wrappedBrowser; } // Existing local launch logic... } ``` **Important:** Puppeteer's `connect()` takes a flat options object: `puppeteer.connect({ browserWSEndpoint, ...rest })`. This is different from Playwright's two-argument pattern. The type should match Puppeteer's `ConnectOptions`. ### 3. Handle the `newPage()` Proxy wrapper for remote The existing `_launch()` wraps the browser in a `Proxy` that intercepts `newPage()` calls to support `useIncognitoPages` (lines 138-200). This proxy wrapper should also be applied to remote browsers so that incognito context creation works correctly. ### 4. Set `isRemote` on LaunchContext Same pattern as Task 2 — pass `isRemote: true` when `connectOverCDPOptions` is present. ## Key Design Decisions - **Flat options object:** Puppeteer's `connect()` API takes a single options object (not `endpointURL, options` like Playwright). The `connectOverCDPOptions` type matches this flat shape directly. - **`browser.close()` for cleanup:** Same as Playwright — remote browsers are closed via `browser.close()`, not `browser.disconnect()`. - **`newPage()` proxy still needed:** The Proxy wrapper that intercepts `newPage()` to create incognito contexts must still wrap remote browsers. 
## Key Files | File | Change | |------|--------| | `packages/browser-pool/src/puppeteer/puppeteer-plugin.ts` | Constructor stores options, `_launch()` branches for remote | ## Acceptance Criteria - [x] `PuppeteerPlugin` accepts `connectOverCDPOptions` in constructor and calls `puppeteer.connect()` with the options object - [x] The `newPage()` Proxy wrapper is applied to remote browsers (for incognito support) - [x] `launchContext.isRemote` is `true` when connect options are present - [x] Remote branch skips user data directory setup, headless handling, and other local-only logic - [x] TypeScript compiles with no errors Co-Authored-By: Claude Opus 4.6 --- .../src/puppeteer/puppeteer-plugin.ts | 118 ++++++++++-------- 1 file changed, 68 insertions(+), 50 deletions(-) diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index b1143d7efaa7..a3ea69461662 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -4,7 +4,11 @@ import type { Dictionary } from '@crawlee/types'; import type Puppeteer from 'puppeteer'; import type * as PuppeteerTypes from 'puppeteer'; -import { BrowserPlugin, type BrowserPluginOptions } from '../abstract-classes/browser-plugin.js'; +import { + BrowserPlugin, + type BrowserPluginOptions, + type CreateLaunchContextOptions, +} from '../abstract-classes/browser-plugin.js'; import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { LaunchContext } from '../launch-context.js'; import { noop } from '../utils.js'; @@ -37,6 +41,20 @@ export class PuppeteerPlugin extends BrowserPlugin< this.connectOverCDPOptions = connectOverCDPOptions; } + override createLaunchContext( + options: CreateLaunchContextOptions< + typeof Puppeteer, + PuppeteerTypes.LaunchOptions, + PuppeteerTypes.Browser, + PuppeteerNewPageOptions + > = {}, + ): LaunchContext { + return super.createLaunchContext({ + 
...options, + isRemote: options.isRemote ?? !!this.connectOverCDPOptions, + }); + } + protected async _launch( launchContext: LaunchContext< typeof Puppeteer, @@ -56,67 +74,67 @@ export class PuppeteerPlugin extends BrowserPlugin< // ignore } - const { - launchOptions, - userDataDir, - useIncognitoPages, - experimentalContainers, - proxyUrl, - ignoreProxyCertificate, - } = launchContext; - - if (experimentalContainers) { - throw new Error('Experimental containers are only available with Playwright'); - } - - launchOptions!.userDataDir = launchOptions!.userDataDir ?? userDataDir; - - if (launchOptions!.headless === false) { - if (Array.isArray(launchOptions!.args)) { - launchOptions!.args.push('--disable-site-isolation-trials'); - } else { - launchOptions!.args = ['--disable-site-isolation-trials']; - } - } - - if (launchOptions!.headless === true && oldPuppeteerVersion) { - launchOptions!.headless = 'new' as any; - } + const { useIncognitoPages, proxyUrl, ignoreProxyCertificate } = launchContext; let browser: PuppeteerTypes.Browser; - { - const [anonymizedProxyUrl, close] = await anonymizeProxySugar(proxyUrl, undefined, undefined, { - ignoreProxyCertificate: launchContext.ignoreProxyCertificate, - }); + if (this.connectOverCDPOptions) { + // Remote CDP connection — skip local launch/proxy/headless logic + browser = await this.library.connect(this.connectOverCDPOptions); + } else { + const { launchOptions, userDataDir, experimentalContainers } = launchContext; + + if (experimentalContainers) { + throw new Error('Experimental containers are only available with Playwright'); + } - if (proxyUrl) { - const proxyArg = `${PROXY_SERVER_ARG}${anonymizedProxyUrl ?? proxyUrl}`; + launchOptions!.userDataDir = launchOptions!.userDataDir ?? 
userDataDir; + if (launchOptions!.headless === false) { if (Array.isArray(launchOptions!.args)) { - launchOptions!.args.push(proxyArg); + launchOptions!.args.push('--disable-site-isolation-trials'); } else { - launchOptions!.args = [proxyArg]; + launchOptions!.args = ['--disable-site-isolation-trials']; } } - try { - browser = await this.library.launch(launchOptions); + if (launchOptions!.headless === true && oldPuppeteerVersion) { + launchOptions!.headless = 'new' as any; + } - if (anonymizedProxyUrl) { - browser.on('disconnected', async () => { - await close(); - }); + { + const [anonymizedProxyUrl, close] = await anonymizeProxySugar(proxyUrl, undefined, undefined, { + ignoreProxyCertificate: launchContext.ignoreProxyCertificate, + }); + + if (proxyUrl) { + const proxyArg = `${PROXY_SERVER_ARG}${anonymizedProxyUrl ?? proxyUrl}`; + + if (Array.isArray(launchOptions!.args)) { + launchOptions!.args.push(proxyArg); + } else { + launchOptions!.args = [proxyArg]; + } + } + + try { + browser = await this.library.launch(launchOptions); + + if (anonymizedProxyUrl) { + browser.on('disconnected', async () => { + await close(); + }); + } + } catch (error: any) { + await close(); + + this._throwAugmentedLaunchError( + error, + launchContext.launchOptions?.executablePath, + '`apify/actor-node-puppeteer-chrome`', + "Try installing a browser, if it's missing, by running `npx @puppeteer/browsers install chromium --path [path]` and pointing `executablePath` to the downloaded executable (https://pptr.dev/browsers-api)", + ); } - } catch (error: any) { - await close(); - - this._throwAugmentedLaunchError( - error, - launchContext.launchOptions?.executablePath, - '`apify/actor-node-puppeteer-chrome`', - "Try installing a browser, if it's missing, by running `npx @puppeteer/browsers install chromium --path [path]` and pointing `executablePath` to the downloaded executable (https://pptr.dev/browsers-api)", - ); } } From 29e7aa407aeec7fb7605dd7ddb5faa04a5c99e38 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 10:42:10 +0100 Subject: [PATCH 04/45] feat: skip proxy/webdriver hiding for remote browsers, add remote connection logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Goal Make `BrowserPlugin.launch()` skip proxy injection and webdriver hiding when `launchContext.isRemote` is `true`, since these operations modify `launchOptions` which are not used for remote connections. ## Dependencies - Task 1 (`isRemote` flag on LaunchContext) ## Scope ### 1. Skip `_addProxyToLaunchOptions()` for remote **File:** `packages/browser-pool/src/abstract-classes/browser-plugin.ts` In the `launch()` method, the call to `_addProxyToLaunchOptions()` is now gated on `!isRemote`: ```typescript if (launchContext.proxyUrl && !launchContext.isRemote) { await this._addProxyToLaunchOptions(launchContext); } ``` ### 2. Skip `_mergeArgsToHideWebdriver()` for remote ```typescript if (!launchContext.isRemote && this._isChromiumBasedBrowser(launchContext)) { this._mergeArgsToHideWebdriver(launchContext); } ``` ### 3. No changes to `_addProxyToLaunchOptions()` or `_mergeArgsToHideWebdriver()` themselves The methods remain unchanged — the skip logic lives in the calling `launch()` method. 
## Key Design Decisions - **Skip at call site, not in the methods** - **`proxyUrl` + remote triggers a warning:** Handled in Task 6 (Warnings) - **Fingerprinting hooks are unchanged** ## Additional - Fixed `isRemote` not being passed through base class `createLaunchContext()` - Added info-level logs for remote connections in base class and both plugins Co-Authored-By: Claude Opus 4.6 --- .../src/abstract-classes/browser-plugin.ts | 10 ++++++++-- .../browser-pool/src/playwright/playwright-plugin.ts | 2 ++ .../browser-pool/src/puppeteer/puppeteer-plugin.ts | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index bc75ff2fde41..aede527bb847 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -158,6 +158,7 @@ export abstract class BrowserPlugin< browserPerProxy = this.browserPerProxy, ignoreProxyCertificate = this.ignoreProxyCertificate, proxyTier, + isRemote, } = options; return new LaunchContext({ @@ -170,6 +171,7 @@ export abstract class BrowserPlugin< browserPerProxy, ignoreProxyCertificate, proxyTier, + isRemote, }); } @@ -197,11 +199,11 @@ export abstract class BrowserPlugin< const { proxyUrl, launchOptions } = launchContext; - if (proxyUrl) { + if (proxyUrl && !launchContext.isRemote) { await this._addProxyToLaunchOptions(launchContext); } - if (this._isChromiumBasedBrowser(launchContext)) { + if (!launchContext.isRemote && this._isChromiumBasedBrowser(launchContext)) { // This will set the args for chromium based browsers to hide the webdriver. 
(launchOptions as Dictionary).args = this._mergeArgsToHideWebdriver(launchOptions!.args); // When User-Agent is not set, and we're using Chromium in headless mode, @@ -213,6 +215,10 @@ export abstract class BrowserPlugin< } } + if (launchContext.isRemote) { + this.log.info('Connecting to remote browser (skipping local proxy and webdriver stealth configuration).'); + } + return this._launch(launchContext); } diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index c99f1680b2c2..a4d5e41c5170 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -65,11 +65,13 @@ export class PlaywrightPlugin extends BrowserPlugin< protected async _launch(launchContext: LaunchContext): Promise { // Remote CDP connection — skip all local launch/proxy logic if (this.connectOverCDPOptions) { + this.log.info('Connecting to remote browser via connectOverCDP.'); return this.library.connectOverCDP(this.connectOverCDPOptions); } // Remote Playwright WebSocket connection — skip all local launch/proxy logic if (this.connectOptions) { + this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); return this.library.connect(this.connectOptions); } diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index a3ea69461662..5cc898bc307b 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -80,6 +80,7 @@ export class PuppeteerPlugin extends BrowserPlugin< if (this.connectOverCDPOptions) { // Remote CDP connection — skip local launch/proxy/headless logic + this.log.info('Connecting to remote browser via connect (CDP).'); browser = await this.library.connect(this.connectOverCDPOptions); } else { const { launchOptions, userDataDir, experimentalContainers } = 
launchContext; From ed86761bec75426c806c759414f71d11db4d86c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 10:49:44 +0100 Subject: [PATCH 05/45] fix: require endpoint in connect options, use non-deprecated Playwright overloads Playwright: change PlaywrightConnectOverCDPOptions and PlaywrightConnectOptions from type aliases (all-optional fields) to interfaces with required `wsEndpoint`. Use the non-deprecated two-argument overloads in _launch(). Puppeteer: add runtime guard that throws if neither `browserWSEndpoint` nor `browserURL` is provided in connectOverCDPOptions. Co-Authored-By: Claude Opus 4.6 --- .../src/playwright/playwright-plugin.ts | 22 +++++++++++++------ .../src/puppeteer/puppeteer-plugin.ts | 3 +++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index a4d5e41c5170..199e468cea8c 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -1,6 +1,6 @@ import fs from 'node:fs'; -import type { Browser as PlaywrightBrowser, BrowserType } from 'playwright'; +import type { Browser as PlaywrightBrowser, BrowserType, ConnectOverCDPOptions, ConnectOptions } from 'playwright'; import { BrowserPlugin, @@ -17,15 +17,21 @@ import { PlaywrightController } from './playwright-controller.js'; /** * Options for connecting to a remote browser via CDP. - * Mirrors `browserType.connectOverCDP(options)`. + * Mirrors `browserType.connectOverCDP(endpointURL, options?)`. */ -export type PlaywrightConnectOverCDPOptions = Parameters[0]; +export interface PlaywrightConnectOverCDPOptions extends ConnectOverCDPOptions { + /** The CDP endpoint URL to connect to (required). */ + wsEndpoint: string; +} /** * Options for connecting to a remote browser via WebSocket. - * Mirrors `browserType.connect(options)`. 
+ * Mirrors `browserType.connect(wsEndpoint, options?)`. */ -export type PlaywrightConnectOptions = Parameters[0]; +export interface PlaywrightConnectOptions extends ConnectOptions { + /** The WebSocket endpoint URL to connect to (required). */ + wsEndpoint: string; +} export interface PlaywrightPluginOptions extends BrowserPluginOptions[0]> { connectOptions?: PlaywrightConnectOptions; @@ -66,13 +72,15 @@ export class PlaywrightPlugin extends BrowserPlugin< // Remote CDP connection — skip all local launch/proxy logic if (this.connectOverCDPOptions) { this.log.info('Connecting to remote browser via connectOverCDP.'); - return this.library.connectOverCDP(this.connectOverCDPOptions); + const { wsEndpoint, ...options } = this.connectOverCDPOptions; + return this.library.connectOverCDP(wsEndpoint, options); } // Remote Playwright WebSocket connection — skip all local launch/proxy logic if (this.connectOptions) { this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); - return this.library.connect(this.connectOptions); + const { wsEndpoint, ...options } = this.connectOptions; + return this.library.connect(wsEndpoint, options); } const { launchOptions, useIncognitoPages, userDataDir, proxyUrl } = launchContext; diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 5cc898bc307b..b3fd07f224e6 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -80,6 +80,9 @@ export class PuppeteerPlugin extends BrowserPlugin< if (this.connectOverCDPOptions) { // Remote CDP connection — skip local launch/proxy/headless logic + if (!this.connectOverCDPOptions.browserWSEndpoint && !this.connectOverCDPOptions.browserURL) { + throw new Error("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'."); + } this.log.info('Connecting to remote browser via connect (CDP).'); browser = await 
this.library.connect(this.connectOverCDPOptions); } else { From bd19911cef73b00090932fcb70919a95d99e4d31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 10:55:00 +0100 Subject: [PATCH 06/45] feat: default `useIncognitoPages` to `true` for remote browser connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Task 5: `useIncognitoPages` Defaults to `true` for Remote ## Goal When remote connection options are present and `useIncognitoPages` was not explicitly set by the user, default it to `true` and log an info message. If the user explicitly sets `false`, log a warning. ## Dependencies - Task 2 (PlaywrightPlugin stores connect options) - Task 3 (PuppeteerPlugin stores connect options) ## Scope ### 1. Preserve `undefined` vs `false` in the subclass constructors The base `BrowserPlugin` constructor collapses an unset `useIncognitoPages` to `false` and is left unchanged. Each subclass checks `options.useIncognitoPages` directly (which preserves `undefined`) and overrides the value after `super()`. ### 2. Override default in PlaywrightPlugin constructor After the `super()` call, if connect options are present: - `undefined` → set to `true`, info log - `false` → warning log - `true` → no extra log ### 3. Override default in PuppeteerPlugin constructor Same logic, checking `connectOverCDPOptions`. ## Key Design Decisions - **Info vs warning:** Defaulting to `true` is an info message (expected behavior). Explicit `false` is a warning (user should understand implications). - **`useIncognitoPages: false` + `connect()` is not special-cased:** The warning covers this case — no additional error or fallback. - **Uses existing `this.log`:** All logging uses the inherited `BrowserPlugin.log` logger. 
## Acceptance Criteria - [x] When `connectOptions` or `connectOverCDPOptions` is set and `useIncognitoPages` is not provided → defaults to `true`, info message logged - [x] When `connectOptions` or `connectOverCDPOptions` is set and `useIncognitoPages: false` → stays `false`, warning logged - [x] When `connectOptions` or `connectOverCDPOptions` is set and `useIncognitoPages: true` → stays `true`, no extra log - [x] When no connect options are set → existing behavior unchanged - [x] Subclass constructors preserve the `undefined` vs `false` distinction by checking `options.useIncognitoPages` (the base constructor still collapses `undefined` to `false`) Co-Authored-By: Claude Opus 4.6 --- .../browser-pool/src/playwright/playwright-plugin.ts | 12 ++++++++++++ .../browser-pool/src/puppeteer/puppeteer-plugin.ts | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index 199e468cea8c..c745e7784537 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -59,6 +59,18 @@ export class PlaywrightPlugin extends BrowserPlugin< super(library, baseOptions); this.connectOptions = connectOptions; this.connectOverCDPOptions = connectOverCDPOptions; + + if (this.connectOptions || this.connectOverCDPOptions) { + if (options.useIncognitoPages === undefined) { + this.useIncognitoPages = true; + this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); + } else if (options.useIncognitoPages === false) { + this.log.warning( + 'useIncognitoPages is set to false with a remote browser connection. 
' + + 'Pages will share cookies and storage on the remote browser instance.', + ); + } + } } override createLaunchContext(options: CreateLaunchContextOptions = {}): LaunchContext { diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index b3fd07f224e6..0e93a93e9f83 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -39,6 +39,18 @@ export class PuppeteerPlugin extends BrowserPlugin< const { connectOverCDPOptions, ...baseOptions } = options; super(library, baseOptions); this.connectOverCDPOptions = connectOverCDPOptions; + + if (this.connectOverCDPOptions) { + if (options.useIncognitoPages === undefined) { + this.useIncognitoPages = true; + this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); + } else if (options.useIncognitoPages === false) { + this.log.warning( + 'useIncognitoPages is set to false with a remote browser connection. 
' + + 'Pages will share cookies and storage on the remote browser instance.', + ); + } + } } override createLaunchContext( From 373da361280860e7e151cd5e7e43efb38dbc57aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 11:06:35 +0100 Subject: [PATCH 07/45] fix: improve remote connection error handling and endpoint validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename PlaywrightConnectOverCDPOptions.wsEndpoint → endpointURL to match Playwright's own terminology and avoid field conflict with inherited ConnectOverCDPOptions.endpointURL - Wrap connectOverCDP() and connect() failures with BrowserLaunchError including sanitized endpoint URL (credentials stripped) and actionable guidance - Move endpoint validation to constructors (fail fast) — Playwright validates endpointURL and wsEndpoint are non-empty, Puppeteer validates browserWSEndpoint || browserURL - Add _sanitizeEndpointForLog() to both plugins to strip credentials from URLs before including them in error messages Co-Authored-By: Claude Opus 4.6 --- .../src/playwright/playwright-plugin.ts | 50 ++++++++++++++++--- .../src/puppeteer/puppeteer-plugin.ts | 34 +++++++++++-- 2 files changed, 74 insertions(+), 10 deletions(-) diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index c745e7784537..cabcc7352fd7 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -3,6 +3,7 @@ import fs from 'node:fs'; import type { Browser as PlaywrightBrowser, BrowserType, ConnectOverCDPOptions, ConnectOptions } from 'playwright'; import { + BrowserLaunchError, BrowserPlugin, type BrowserPluginOptions, type CreateLaunchContextOptions, @@ -20,8 +21,8 @@ import { PlaywrightController } from './playwright-controller.js'; * Mirrors `browserType.connectOverCDP(endpointURL, options?)`. 
*/ export interface PlaywrightConnectOverCDPOptions extends ConnectOverCDPOptions { - /** The CDP endpoint URL to connect to (required). */ - wsEndpoint: string; + /** The CDP endpoint URL to connect to (required). Overrides the deprecated optional `endpointURL` from Playwright. */ + endpointURL: string; } /** @@ -56,6 +57,14 @@ export class PlaywrightPlugin extends BrowserPlugin< throw new Error("Cannot set both 'connectOptions' and 'connectOverCDPOptions' — pick one protocol."); } + if (connectOverCDPOptions && !connectOverCDPOptions.endpointURL) { + throw new Error("'connectOverCDPOptions.endpointURL' must be a non-empty string."); + } + + if (connectOptions && !connectOptions.wsEndpoint) { + throw new Error("'connectOptions.wsEndpoint' must be a non-empty string."); + } + super(library, baseOptions); this.connectOptions = connectOptions; this.connectOverCDPOptions = connectOverCDPOptions; @@ -80,19 +89,48 @@ export class PlaywrightPlugin extends BrowserPlugin< }); } + private _sanitizeEndpointForLog(endpoint: string): string { + try { + const url = new URL(endpoint); + if (url.username || url.password) { + url.username = '***'; + url.password = '***'; + } + return url.toString(); + } catch { + return ''; + } + } + protected async _launch(launchContext: LaunchContext): Promise { // Remote CDP connection — skip all local launch/proxy logic if (this.connectOverCDPOptions) { + const { endpointURL, ...options } = this.connectOverCDPOptions; this.log.info('Connecting to remote browser via connectOverCDP.'); - const { wsEndpoint, ...options } = this.connectOverCDPOptions; - return this.library.connectOverCDP(wsEndpoint, options); + try { + return await this.library.connectOverCDP(endpointURL, options); + } catch (cause) { + throw new BrowserLaunchError( + `Failed to connect to remote browser via CDP at "${this._sanitizeEndpointForLog(endpointURL)}". 
` + + 'Check that the endpoint is reachable and the browser is accepting CDP connections.\n\u200b', + { cause }, + ); + } } // Remote Playwright WebSocket connection — skip all local launch/proxy logic if (this.connectOptions) { - this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); const { wsEndpoint, ...options } = this.connectOptions; - return this.library.connect(wsEndpoint, options); + this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); + try { + return await this.library.connect(wsEndpoint, options); + } catch (cause) { + throw new BrowserLaunchError( + `Failed to connect to remote browser via WebSocket at "${this._sanitizeEndpointForLog(wsEndpoint)}". ` + + 'Check that the endpoint is reachable and the Playwright server is running.\n\u200b', + { cause }, + ); + } } const { launchOptions, useIncognitoPages, userDataDir, proxyUrl } = launchContext; diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 0e93a93e9f83..b0cf8a1aa03d 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -5,6 +5,7 @@ import type Puppeteer from 'puppeteer'; import type * as PuppeteerTypes from 'puppeteer'; import { + BrowserLaunchError, BrowserPlugin, type BrowserPluginOptions, type CreateLaunchContextOptions, @@ -37,6 +38,11 @@ export class PuppeteerPlugin extends BrowserPlugin< constructor(library: typeof Puppeteer, options: PuppeteerPluginOptions = {}) { const { connectOverCDPOptions, ...baseOptions } = options; + + if (connectOverCDPOptions && !connectOverCDPOptions.browserWSEndpoint && !connectOverCDPOptions.browserURL) { + throw new Error("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'."); + } + super(library, baseOptions); this.connectOverCDPOptions = connectOverCDPOptions; @@ -67,6 +73,19 @@ export class PuppeteerPlugin extends 
BrowserPlugin< }); } + private _sanitizeEndpointForLog(endpoint: string): string { + try { + const url = new URL(endpoint); + if (url.username || url.password) { + url.username = '***'; + url.password = '***'; + } + return url.toString(); + } catch { + return ''; + } + } + protected async _launch( launchContext: LaunchContext< typeof Puppeteer, @@ -92,11 +111,18 @@ export class PuppeteerPlugin extends BrowserPlugin< if (this.connectOverCDPOptions) { // Remote CDP connection — skip local launch/proxy/headless logic - if (!this.connectOverCDPOptions.browserWSEndpoint && !this.connectOverCDPOptions.browserURL) { - throw new Error("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'."); - } + const endpoint = this.connectOverCDPOptions.browserWSEndpoint || this.connectOverCDPOptions.browserURL!; this.log.info('Connecting to remote browser via connect (CDP).'); - browser = await this.library.connect(this.connectOverCDPOptions); + try { + browser = await this.library.connect(this.connectOverCDPOptions); + } catch (cause) { + const safeEndpoint = this._sanitizeEndpointForLog(endpoint); + throw new BrowserLaunchError( + `Failed to connect to remote browser via CDP at "${safeEndpoint}". ` + + 'Check that the endpoint is reachable and the browser is accepting CDP connections.\n\u200b', + { cause }, + ); + } } else { const { launchOptions, userDataDir, experimentalContainers } = launchContext; From 76b7d20d8cba6941ab77ba1d86d2dbe2d9486662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 11:14:14 +0100 Subject: [PATCH 08/45] fix: prevent resource leaks in PuppeteerPlugin remote browser connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Close BrowserContext on page close when useIncognitoPages is true. Previously contexts were only cleaned up when an anonymized proxy was active, causing context accumulation on remote browsers without proxy. 
- Clean up targetcreated listener on remote browser disconnect via browser.once('disconnected') handler to prevent listener leaks. - Guard anonymizeProxySugar call with proxyUrl check — skip the async call entirely when no proxy is configured (common for remote browsers). - Conditionally omit proxyServer from context options when no proxy is set, instead of passing { proxyServer: undefined }. Co-Authored-By: Claude Opus 4.6 --- .../src/puppeteer/puppeteer-plugin.ts | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index b0cf8a1aa03d..6d117f707021 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -180,7 +180,7 @@ export class PuppeteerPlugin extends BrowserPlugin< } } - browser.on('targetcreated', async (target: PuppeteerTypes.Target) => { + const targetCreatedHandler = async (target: PuppeteerTypes.Target) => { try { const page = await target.page(); @@ -193,7 +193,16 @@ export class PuppeteerPlugin extends BrowserPlugin< } catch (error: any) { this.log.exception(error, 'Failed to retrieve page from target.'); } - }); + }; + + browser.on('targetcreated', targetCreatedHandler); + + // Clean up the listener when a remote browser disconnects to prevent leaks + if (this.connectOverCDPOptions) { + browser.once('disconnected', () => { + browser.off('targetcreated', targetCreatedHandler); + }); + } const boundMethods = ( [ @@ -220,25 +229,25 @@ export class PuppeteerPlugin extends BrowserPlugin< let page: PuppeteerTypes.Page; if (useIncognitoPages) { - const [anonymizedProxyUrl, close] = await anonymizeProxySugar( - proxyUrl, - undefined, - undefined, - { ignoreProxyCertificate }, - ); + const [anonymizedProxyUrl, close] = proxyUrl + ? 
await anonymizeProxySugar(proxyUrl, undefined, undefined, { ignoreProxyCertificate }) + : ([undefined, noop] as const); try { - const context = (await (browser as any)[method]({ - proxyServer: anonymizedProxyUrl ?? proxyUrl, - })) as PuppeteerTypes.BrowserContext; + const proxyServer = anonymizedProxyUrl ?? proxyUrl; + const contextOptions = proxyServer ? { proxyServer } : {}; + const context = (await (browser as any)[method]( + contextOptions, + )) as PuppeteerTypes.BrowserContext; page = await context.newPage(...args); - if (anonymizedProxyUrl) { - page.on('close', async () => { + page.once('close', async () => { + if (anonymizedProxyUrl) { await close(); - }); - } + } + await context.close().catch(noop); + }); } catch (error) { await close(); From 01ada420f41180703c52be3875b7bb1687198500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 11:25:05 +0100 Subject: [PATCH 09/45] chore: add clarifying comments for remote launch path in base class Co-Authored-By: Claude Opus 4.6 --- packages/browser-pool/src/abstract-classes/browser-plugin.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index aede527bb847..3db226002b72 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -195,10 +195,12 @@ export abstract class BrowserPlugin< NewPageResult > = this.createLaunchContext(), ): Promise { + // launchOptions is only used by the local launch path below — remote connections ignore it. launchContext.launchOptions ??= {} as LibraryOptions; const { proxyUrl, launchOptions } = launchContext; + // TODO(Task 6): warn when proxyUrl is set on a remote connection — proxy is silently ignored. 
if (proxyUrl && !launchContext.isRemote) { await this._addProxyToLaunchOptions(launchContext); } From f7dc7c6301973444e13a9a6daaf53f54fa021747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 11:55:01 +0100 Subject: [PATCH 10/45] fix: clarify useIncognitoPages pattern and improve warning for WebSocket connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add comments in both plugin constructors explaining why options.useIncognitoPages is checked instead of this.useIncognitoPages (super() collapses undefined to false, losing the "not set" signal). - Strengthen warning for Playwright connectOptions (WebSocket) + useIncognitoPages: false — connect() returns a browser with no default context, which is more severe than just sharing cookies. Co-Authored-By: Claude Opus 4.6 --- .../browser-pool/src/playwright/playwright-plugin.ts | 12 ++++++++---- .../browser-pool/src/puppeteer/puppeteer-plugin.ts | 2 ++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index cabcc7352fd7..d23e20f7a120 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -69,15 +69,19 @@ export class PlaywrightPlugin extends BrowserPlugin< this.connectOptions = connectOptions; this.connectOverCDPOptions = connectOverCDPOptions; + // We check options.useIncognitoPages (not this.useIncognitoPages) because super() collapses undefined to false. + // This preserves the distinction between "not set" (undefined → default to true) and "explicitly false". 
if (this.connectOptions || this.connectOverCDPOptions) { if (options.useIncognitoPages === undefined) { this.useIncognitoPages = true; this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); } else if (options.useIncognitoPages === false) { - this.log.warning( - 'useIncognitoPages is set to false with a remote browser connection. ' + - 'Pages will share cookies and storage on the remote browser instance.', - ); + const message = this.connectOptions + ? 'useIncognitoPages is set to false with a remote WebSocket connection. ' + + 'This may cause errors because browserType.connect() returns a browser with no default context.' + : 'useIncognitoPages is set to false with a remote browser connection. ' + + 'Pages will share cookies and storage on the remote browser instance.'; + this.log.warning(message); } } } diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 6d117f707021..33b2105e2cf1 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -46,6 +46,8 @@ export class PuppeteerPlugin extends BrowserPlugin< super(library, baseOptions); this.connectOverCDPOptions = connectOverCDPOptions; + // We check options.useIncognitoPages (not this.useIncognitoPages) because super() collapses undefined to false. + // This preserves the distinction between "not set" (undefined → default to true) and "explicitly false". 
if (this.connectOverCDPOptions) { if (options.useIncognitoPages === undefined) { this.useIncognitoPages = true; From 0e3218b6676b8586b744740722026bfba2818f78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 12:59:23 +0100 Subject: [PATCH 11/45] feat: add warnings for ignored options on remote browser connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove spurious launchOptions warning that always fired due to framework-injected defaults, and share log instances in launchers. PRD Task 6: Warnings for Ignored & Conflicting Options - proxyUrl + remote → warning in base BrowserPlugin.launch() - useChrome + remote → warning in launcher constructors - executablePath + remote → warning in launcher constructors - useIncognitoPages: false + remote → handled by Task 5 Co-Authored-By: Claude Opus 4.6 --- .../src/abstract-classes/browser-plugin.ts | 8 ++++++- .../src/internals/playwright-launcher.ts | 21 ++++++++++++++++++ .../src/internals/puppeteer-launcher.ts | 22 +++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 3db226002b72..b1889f010669 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -200,7 +200,13 @@ export abstract class BrowserPlugin< const { proxyUrl, launchOptions } = launchContext; - // TODO(Task 6): warn when proxyUrl is set on a remote connection — proxy is silently ignored. + if (proxyUrl && launchContext.isRemote) { + this.log.warning( + 'proxyUrl is set but will be ignored for remote browser connections. 
' + + 'Configure proxy settings on the remote browser service instead.', + ); + } + if (proxyUrl && !launchContext.isRemote) { await this._addProxyToLaunchOptions(launchContext); } diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index 5c93468d63f3..e3d2e7f44c8c 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -2,6 +2,7 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PlaywrightPlugin } from '@crawlee/browser-pool'; import type { PlaywrightConnectOptions, PlaywrightConnectOverCDPOptions } from '@crawlee/browser-pool'; +import { serviceLocator } from '@crawlee/core'; import ow from 'ow'; import type { Browser, BrowserType, LaunchOptions } from 'playwright'; @@ -129,6 +130,26 @@ export class PlaywrightLauncher extends BrowserLauncher { ); this.Plugin = PlaywrightPlugin; + + const connectOptionsPresent = !!(launchContext.connectOptions || launchContext.connectOverCDPOptions); + + if (connectOptionsPresent && (launchContext.useChrome || launchContext.launchOptions?.executablePath)) { + const log = serviceLocator.getLogger().child({ prefix: 'PlaywrightLauncher' }); + + if (launchContext.useChrome) { + log.warning( + 'useChrome is set but will be ignored for remote browser connections. ' + + 'The remote service controls which browser binary is used.', + ); + } + + if (launchContext.launchOptions?.executablePath) { + log.warning( + 'executablePath is set but will be ignored for remote browser connections. 
' + + 'The remote service controls which browser binary is used.', + ); + } + } } } diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts index 5e8333083e32..4113ea0d90bc 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts @@ -2,6 +2,7 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PuppeteerPlugin } from '@crawlee/browser-pool'; import type { PuppeteerConnectOverCDPOptions } from '@crawlee/browser-pool'; +import { serviceLocator } from '@crawlee/core'; import ow from 'ow'; import type { Browser } from 'puppeteer'; @@ -108,6 +109,27 @@ export class PuppeteerLauncher extends BrowserLauncher ); this.Plugin = PuppeteerPlugin; + + if ( + launchContext.connectOverCDPOptions && + (launchContext.useChrome || (launchContext.launchOptions as Record)?.executablePath) + ) { + const log = serviceLocator.getLogger().child({ prefix: 'PuppeteerLauncher' }); + + if (launchContext.useChrome) { + log.warning( + 'useChrome is set but will be ignored for remote browser connections. ' + + 'The remote service controls which browser binary is used.', + ); + } + + if ((launchContext.launchOptions as Record)?.executablePath) { + log.warning( + 'executablePath is set but will be ignored for remote browser connections. 
' + + 'The remote service controls which browser binary is used.', + ); + } + } } protected override _getDefaultHeadlessOption(): boolean { From a11370a495e34df673d380535a8c231e1eff7d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 13:27:37 +0100 Subject: [PATCH 12/45] test: add unit tests for remote browser connections PRD Task 7: Unit Tests - Connection routing (Playwright CDP/WS/local, Puppeteer CDP/local) - Validation (mutual exclusion, missing endpoints) - isRemote correctness for all plugin variants - Proxy/webdriver skipping for remote, applied for local - useIncognitoPages defaults (true for remote, false for local) - Warnings (proxyUrl, useIncognitoPages: false, CDP vs WS variants) - 40 tests, all mocked (no real browser instances) Co-Authored-By: Claude Opus 4.6 --- .../browser-pool/test/remote-browser.test.ts | 624 ++++++++++++++++++ 1 file changed, 624 insertions(+) create mode 100644 packages/browser-pool/test/remote-browser.test.ts diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts new file mode 100644 index 000000000000..a4e32212f9f2 --- /dev/null +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -0,0 +1,624 @@ +import { vi } from 'vitest'; + +import { serviceLocator } from '@crawlee/core'; +import type { CrawleeLogger } from '@crawlee/core'; + +import { PlaywrightPlugin } from '../src/playwright/playwright-plugin.js'; +import { PuppeteerPlugin } from '../src/puppeteer/puppeteer-plugin.js'; + +// --------------------------------------------------------------------------- +// Shared mock helpers +// --------------------------------------------------------------------------- + +function createMockBrowser() { + return { + newPage: vi.fn().mockResolvedValue({ close: vi.fn(), url: vi.fn(() => 'about:blank') }), + close: vi.fn().mockResolvedValue(undefined), + contexts: vi.fn(() => []), + on: vi.fn(), + off: vi.fn(), + once: vi.fn(), + version: 
vi.fn(() => '120.0.0'), + pages: vi.fn(() => []), + process: vi.fn(() => null), + userAgent: vi.fn().mockResolvedValue('mock-ua'), + createBrowserContext: vi.fn(), + createIncognitoBrowserContext: vi.fn(), + }; +} + +function createMockPlaywrightLibrary(browser = createMockBrowser()) { + const mockContext = { + ...browser, + once: vi.fn(), + on: vi.fn(), + }; + return { + launch: vi.fn().mockResolvedValue(browser), + connect: vi.fn().mockResolvedValue(browser), + connectOverCDP: vi.fn().mockResolvedValue(browser), + name: vi.fn(() => 'chromium'), + launchPersistentContext: vi.fn().mockResolvedValue(mockContext), + }; +} + +function createMockPuppeteerLibrary(browser = createMockBrowser()) { + return { + launch: vi.fn().mockResolvedValue(browser), + connect: vi.fn().mockResolvedValue(browser), + product: 'chrome', + }; +} + +function createMockLogger(): CrawleeLogger & { warning: ReturnType; info: ReturnType } { + const mockLogger: any = { + getOptions: vi.fn(() => ({})), + setOptions: vi.fn(), + child: vi.fn(() => mockLogger), + error: vi.fn(), + exception: vi.fn(), + softFail: vi.fn(), + warning: vi.fn(), + warningOnce: vi.fn(), + info: vi.fn(), + debug: vi.fn(), + perf: vi.fn(), + deprecated: vi.fn(), + log: vi.fn(), + setLevel: vi.fn(), + getLevel: vi.fn(), + }; + return mockLogger; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('Remote browser — PlaywrightPlugin', () => { + let mockLogger: ReturnType; + + beforeEach(() => { + mockLogger = createMockLogger(); + serviceLocator.setLogger(mockLogger); + }); + + // --- Connection routing --------------------------------------------------- + + describe('connection routing', () => { + test('connectOverCDPOptions → calls connectOverCDP, not launch', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + 
connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connectOverCDP).toHaveBeenCalledTimes(1); + expect(lib.connectOverCDP).toHaveBeenCalledWith('http://remote:9222', {}); + expect(lib.launch).not.toHaveBeenCalled(); + expect(lib.connect).not.toHaveBeenCalled(); + }); + + test('connectOptions → calls connect, not launch', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledTimes(1); + expect(lib.connect).toHaveBeenCalledWith('ws://remote:3000', {}); + expect(lib.launch).not.toHaveBeenCalled(); + expect(lib.connectOverCDP).not.toHaveBeenCalled(); + }); + + test('no connect options → calls launch', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.launch).toHaveBeenCalledTimes(1); + expect(lib.connect).not.toHaveBeenCalled(); + expect(lib.connectOverCDP).not.toHaveBeenCalled(); + }); + + test('passes extra options through to connectOverCDP', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { + endpointURL: 'http://remote:9222', + timeout: 5000, + headers: { 'x-token': 'abc' }, + }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connectOverCDP).toHaveBeenCalledWith('http://remote:9222', { + timeout: 5000, + headers: { 'x-token': 'abc' }, + }); + }); + + test('passes extra options through to connect', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOptions: { + wsEndpoint: 
'ws://remote:3000', + timeout: 3000, + headers: { Authorization: 'Bearer xyz' }, + }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledWith('ws://remote:3000', { + timeout: 3000, + headers: { Authorization: 'Bearer xyz' }, + }); + }); + }); + + // --- Validation ----------------------------------------------------------- + + describe('validation', () => { + test('throws when both connectOptions and connectOverCDPOptions are set', () => { + const lib = createMockPlaywrightLibrary(); + + expect( + () => + new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + }), + ).toThrow("Cannot set both 'connectOptions' and 'connectOverCDPOptions'"); + }); + + test('throws when connectOverCDPOptions has no endpointURL', () => { + const lib = createMockPlaywrightLibrary(); + + expect( + () => + new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: '' }, + }), + ).toThrow("'connectOverCDPOptions.endpointURL' must be a non-empty string"); + }); + + test('throws when connectOptions has no wsEndpoint', () => { + const lib = createMockPlaywrightLibrary(); + + expect( + () => + new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: '' }, + }), + ).toThrow("'connectOptions.wsEndpoint' must be a non-empty string"); + }); + }); + + // --- isRemote correctness ------------------------------------------------- + + describe('isRemote', () => { + test('true when connectOverCDPOptions is present', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + }); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); + + test('true when connectOptions is present', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new 
PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + }); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); + + test('false when no connect options', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(false); + }); + }); + + // --- Proxy/webdriver skipping --------------------------------------------- + + describe('proxy/webdriver skipping for remote', () => { + test('proxy is not applied for remote connections', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + // The browser was connected via CDP, not launched — proxy is not set on launchOptions + expect(lib.connectOverCDP).toHaveBeenCalledTimes(1); + expect(lib.launch).not.toHaveBeenCalled(); + }); + + test('webdriver hiding args are not added for remote connections', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + launchOptions: { args: ['--custom-flag'] }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + // The original args should be untouched — no webdriver stealth flag injected + expect(ctx.launchOptions?.args).toEqual(['--custom-flag']); + expect(ctx.launchOptions?.args).not.toContain('--disable-blink-features=AutomationControlled'); + }); + + test('webdriver hiding args ARE added for local chromium connections', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + launchOptions: { args: ['--custom-flag'] }, + }); + + const ctx = 
plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(ctx.launchOptions?.args).toContain('--disable-blink-features=AutomationControlled'); + expect(ctx.launchOptions?.args).toContain('--custom-flag'); + }); + + test('proxy is applied for local connections', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.launch).toHaveBeenCalledTimes(1); + // Launch options should have proxy configured + const launchOpts = lib.launch.mock.calls[0][0]; + expect(launchOpts.proxy).toBeDefined(); + expect(launchOpts.proxy.server).toBeDefined(); + }); + }); + + // --- useIncognitoPages default -------------------------------------------- + + describe('useIncognitoPages default', () => { + test('defaults to true for remote (connectOverCDP)', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('defaults to true for remote (connect)', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('explicit false preserved for remote', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + useIncognitoPages: false, + }); + + expect(plugin.useIncognitoPages).toBe(false); + }); + + test('explicit true preserved for remote', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + useIncognitoPages: true, + 
}); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('default false for local', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); + + expect(plugin.useIncognitoPages).toBe(false); + }); + }); + + // --- Warnings ------------------------------------------------------------- + + describe('warnings', () => { + test('proxyUrl + remote → warning logged', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('proxyUrl is set but will be ignored'), + ); + }); + + test('useIncognitoPages: false + remote CDP → warning about shared state', () => { + const lib = createMockPlaywrightLibrary(); + new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + useIncognitoPages: false, + }); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('Pages will share cookies and storage'), + ); + }); + + test('useIncognitoPages: false + remote WebSocket → warning about no default context', () => { + const lib = createMockPlaywrightLibrary(); + new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + useIncognitoPages: false, + }); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('browserType.connect() returns a browser with no default context'), + ); + }); + + test('no warnings for local browser usage', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(mockLogger.warning).not.toHaveBeenCalled(); + }); + }); +}); + +describe('Remote 
browser — PuppeteerPlugin', () => { + let mockLogger: ReturnType; + + beforeEach(() => { + mockLogger = createMockLogger(); + serviceLocator.setLogger(mockLogger); + }); + + // --- Connection routing --------------------------------------------------- + + describe('connection routing', () => { + test('connectOverCDPOptions → calls connect, not launch', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledTimes(1); + expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'ws://remote:9222' }); + expect(lib.launch).not.toHaveBeenCalled(); + }); + + test('no connect options → calls launch', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.launch).toHaveBeenCalledTimes(1); + expect(lib.connect).not.toHaveBeenCalled(); + }); + + test('passes all connect options through to connect', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { + browserWSEndpoint: 'ws://remote:9222', + defaultViewport: { width: 800, height: 600 }, + }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledWith({ + browserWSEndpoint: 'ws://remote:9222', + defaultViewport: { width: 800, height: 600 }, + }); + }); + }); + + // --- Validation ----------------------------------------------------------- + + describe('validation', () => { + test('throws when connectOverCDPOptions has no browserWSEndpoint or browserURL', () => { + const lib = createMockPuppeteerLibrary(); + + expect( + () => + new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: {} as any, + 
}), + ).toThrow("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'"); + }); + }); + + // --- isRemote correctness ------------------------------------------------- + + describe('isRemote', () => { + test('true when connectOverCDPOptions is present', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + }); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); + + test('false when no connect options', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(false); + }); + }); + + // --- Proxy/webdriver skipping --------------------------------------------- + + describe('proxy/webdriver skipping for remote', () => { + test('proxy is not applied for remote connections', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledTimes(1); + expect(lib.launch).not.toHaveBeenCalled(); + }); + + test('webdriver hiding args are not added for remote connections', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + launchOptions: { args: ['--custom-flag'] }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + // The original args should be untouched — no webdriver stealth flag injected + expect(ctx.launchOptions?.args).toEqual(['--custom-flag']); + expect(ctx.launchOptions?.args).not.toContain('--disable-blink-features=AutomationControlled'); + }); 
+ + test('webdriver hiding args ARE added for local connections', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + launchOptions: { args: ['--custom-flag'] }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(ctx.launchOptions?.args).toContain('--disable-blink-features=AutomationControlled'); + expect(ctx.launchOptions?.args).toContain('--custom-flag'); + }); + }); + + // --- useIncognitoPages default -------------------------------------------- + + describe('useIncognitoPages default', () => { + test('defaults to true for remote', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('explicit false preserved for remote', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + useIncognitoPages: false, + }); + + expect(plugin.useIncognitoPages).toBe(false); + }); + + test('explicit true preserved for remote', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + useIncognitoPages: true, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('default false for local', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any); + + expect(plugin.useIncognitoPages).toBe(false); + }); + }); + + // --- Warnings ------------------------------------------------------------- + + describe('warnings', () => { + test('proxyUrl + remote → warning logged', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { 
browserWSEndpoint: 'ws://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('proxyUrl is set but will be ignored'), + ); + }); + + test('useIncognitoPages: false + remote → warning logged', () => { + const lib = createMockPuppeteerLibrary(); + new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + useIncognitoPages: false, + }); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('useIncognitoPages is set to false'), + ); + }); + + test('no warnings for local browser usage', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(mockLogger.warning).not.toHaveBeenCalled(); + }); + }); +}); From 2740bf4cc288460e27684ad807be4b44f4f02829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 19 Mar 2026 14:08:02 +0100 Subject: [PATCH 13/45] fix: prevent proxy URL from leaking into remote Puppeteer browser contexts When useIncognitoPages is true (default for remote) and proxyUrl is set, the newPage handler was passing proxyServer to createBrowserContext even for remote connections. For credentialed proxies this also spun up a localhost tunnel unreachable by the remote browser. 
Co-Authored-By: Claude Opus 4.6 --- .../src/puppeteer/puppeteer-plugin.ts | 10 ++-- .../browser-pool/test/remote-browser.test.ts | 47 +++++++++++++++++-- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 33b2105e2cf1..f325f89cc234 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -231,12 +231,16 @@ export class PuppeteerPlugin extends BrowserPlugin< let page: PuppeteerTypes.Page; if (useIncognitoPages) { - const [anonymizedProxyUrl, close] = proxyUrl - ? await anonymizeProxySugar(proxyUrl, undefined, undefined, { ignoreProxyCertificate }) + // Skip proxy setup for remote connections — proxy is managed by the remote service. + const effectiveProxyUrl = this.connectOverCDPOptions ? undefined : proxyUrl; + const [anonymizedProxyUrl, close] = effectiveProxyUrl + ? await anonymizeProxySugar(effectiveProxyUrl, undefined, undefined, { + ignoreProxyCertificate, + }) : ([undefined, noop] as const); try { - const proxyServer = anonymizedProxyUrl ?? proxyUrl; + const proxyServer = anonymizedProxyUrl ?? effectiveProxyUrl; const contextOptions = proxyServer ? 
{ proxyServer } : {}; const context = (await (browser as any)[method]( contextOptions, diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index a4e32212f9f2..c9ce63ba8d2d 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -10,9 +10,30 @@ import { PuppeteerPlugin } from '../src/puppeteer/puppeteer-plugin.js'; // Shared mock helpers // --------------------------------------------------------------------------- +function createMockPage() { + return { + close: vi.fn().mockResolvedValue(undefined), + url: vi.fn(() => 'about:blank'), + on: vi.fn(), + once: vi.fn(), + }; +} + +function createMockBrowserContext() { + const page = createMockPage(); + return { + newPage: vi.fn().mockResolvedValue(page), + close: vi.fn().mockResolvedValue(undefined), + on: vi.fn(), + once: vi.fn(), + _mockPage: page, + }; +} + function createMockBrowser() { + const mockContext = createMockBrowserContext(); return { - newPage: vi.fn().mockResolvedValue({ close: vi.fn(), url: vi.fn(() => 'about:blank') }), + newPage: vi.fn().mockResolvedValue(createMockPage()), close: vi.fn().mockResolvedValue(undefined), contexts: vi.fn(() => []), on: vi.fn(), @@ -22,8 +43,9 @@ function createMockBrowser() { pages: vi.fn(() => []), process: vi.fn(() => null), userAgent: vi.fn().mockResolvedValue('mock-ua'), - createBrowserContext: vi.fn(), - createIncognitoBrowserContext: vi.fn(), + createBrowserContext: vi.fn().mockResolvedValue(mockContext), + createIncognitoBrowserContext: vi.fn().mockResolvedValue(mockContext), + _mockContext: mockContext, }; } @@ -512,6 +534,25 @@ describe('Remote browser — PuppeteerPlugin', () => { expect(lib.launch).not.toHaveBeenCalled(); }); + test('proxy is not leaked into createBrowserContext for remote newPage', async () => { + const browser = createMockBrowser(); + const lib = createMockPuppeteerLibrary(browser); + const plugin = new 
PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + const wrappedBrowser = await plugin.launch(ctx); + + // Call newPage on the wrapped browser — useIncognitoPages defaults to true for remote + await (wrappedBrowser as any).newPage(); + + // createBrowserContext should be called with empty options (no proxyServer) + expect(browser.createBrowserContext).toHaveBeenCalledTimes(1); + expect(browser.createBrowserContext).toHaveBeenCalledWith({}); + }); + test('webdriver hiding args are not added for remote connections', async () => { const lib = createMockPuppeteerLibrary(); const plugin = new PuppeteerPlugin(lib as any, { From fb73726b9070fa18f6f858cdeb392a9e0c3460d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 31 Mar 2026 13:41:33 +0200 Subject: [PATCH 14/45] docs(examples): add remote browser integration examples Examples for Browserbase, Browserless, Rebrowser, and Steel using Playwright and Puppeteer. 
--- temp-examples/.env.example | 9 +++ temp-examples/.gitignore | 1 + .../examples/browserbase-playwright-ws.ts | 55 +++++++++++++++++++ .../examples/browserbase-playwright.ts | 53 ++++++++++++++++++ .../examples/browserbase-puppeteer.ts | 53 ++++++++++++++++++ .../examples/browserless-playwright-ws.ts | 29 ++++++++++ .../examples/browserless-playwright.ts | 29 ++++++++++ .../examples/browserless-puppeteer.ts | 29 ++++++++++ .../examples/rebrowser-playwright-ws.ts | 50 +++++++++++++++++ .../examples/rebrowser-playwright.ts | 31 +++++++++++ temp-examples/examples/rebrowser-puppeteer.ts | 31 +++++++++++ temp-examples/examples/steel-playwright-ws.ts | 51 +++++++++++++++++ temp-examples/examples/steel-playwright.ts | 26 +++++++++ temp-examples/examples/steel-puppeteer.ts | 26 +++++++++ temp-examples/package.json | 38 +++++++++++++ temp-examples/readme.md | 12 ++++ temp-examples/tsconfig.json | 9 +++ 17 files changed, 532 insertions(+) create mode 100644 temp-examples/.env.example create mode 100644 temp-examples/.gitignore create mode 100644 temp-examples/examples/browserbase-playwright-ws.ts create mode 100644 temp-examples/examples/browserbase-playwright.ts create mode 100644 temp-examples/examples/browserbase-puppeteer.ts create mode 100644 temp-examples/examples/browserless-playwright-ws.ts create mode 100644 temp-examples/examples/browserless-playwright.ts create mode 100644 temp-examples/examples/browserless-puppeteer.ts create mode 100644 temp-examples/examples/rebrowser-playwright-ws.ts create mode 100644 temp-examples/examples/rebrowser-playwright.ts create mode 100644 temp-examples/examples/rebrowser-puppeteer.ts create mode 100644 temp-examples/examples/steel-playwright-ws.ts create mode 100644 temp-examples/examples/steel-playwright.ts create mode 100644 temp-examples/examples/steel-puppeteer.ts create mode 100644 temp-examples/package.json create mode 100644 temp-examples/readme.md create mode 100644 temp-examples/tsconfig.json diff --git 
a/temp-examples/.env.example b/temp-examples/.env.example new file mode 100644 index 000000000000..500f5da5f2ce --- /dev/null +++ b/temp-examples/.env.example @@ -0,0 +1,9 @@ +BROWSERBASE_API_KEY= +BROWSERBASE_PROJECT_ID= +# +BROWSERLESS_TOKEN= +# +REBROWSER_API_KEY= +REBROWSER_PROFILE_ID= +# +STEEL_API_KEY= diff --git a/temp-examples/.gitignore b/temp-examples/.gitignore new file mode 100644 index 000000000000..4c49bd78f1d0 --- /dev/null +++ b/temp-examples/.gitignore @@ -0,0 +1 @@ +.env diff --git a/temp-examples/examples/browserbase-playwright-ws.ts b/temp-examples/examples/browserbase-playwright-ws.ts new file mode 100644 index 000000000000..656d2cf3a0f0 --- /dev/null +++ b/temp-examples/examples/browserbase-playwright-ws.ts @@ -0,0 +1,55 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +// Browserbase requires two env variables: +// - BROWSERBASE_API_KEY: Your API key for authentication +// - BROWSERBASE_PROJECT_ID: The project to create sessions in +const apiKey = process.env.BROWSERBASE_API_KEY; +const projectId = process.env.BROWSERBASE_PROJECT_ID; + +if (!apiKey) { + throw new Error('BROWSERBASE_API_KEY env variable is required'); +} + +if (!projectId) { + throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); +} + +// Step 1: Create a Browserbase session via REST API. +// This returns a session ID that we use to construct the WebSocket URL. +// You have 5 minutes to connect before the session terminates. 
+const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { + 'x-bb-api-key': apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ projectId }), +}); + +if (!response.ok) { + throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); +} + +const session = await response.json(); +console.log(`Created Browserbase session: ${session.id}`); + +// Step 2: Connect to the session using Playwright's WebSocket connection. +// The WS URL is constructed from the API key and session ID. +const wsUrl = `wss://connect.browserbase.com?apiKey=${apiKey}&sessionId=${session.id}`; + +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOptions: { + wsEndpoint: wsUrl, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/browserbase-playwright.ts b/temp-examples/examples/browserbase-playwright.ts new file mode 100644 index 000000000000..78ce8ca5569e --- /dev/null +++ b/temp-examples/examples/browserbase-playwright.ts @@ -0,0 +1,53 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +// Browserbase requires two env variables: +// - BROWSERBASE_API_KEY: Your API key for authentication +// - BROWSERBASE_PROJECT_ID: The project to create sessions in +const apiKey = process.env.BROWSERBASE_API_KEY; +const projectId = process.env.BROWSERBASE_PROJECT_ID; + +if (!apiKey) { + throw new Error('BROWSERBASE_API_KEY env variable is required'); +} + +if (!projectId) { + throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); +} + +// Step 1: Create a Browserbase session via REST API. +// This returns a connectUrl that we can use with Playwright's connectOverCDP. +// You have 5 minutes to connect before the session terminates. 
+const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { + 'x-bb-api-key': apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ projectId }), +}); + +if (!response.ok) { + throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); +} + +const session = await response.json(); +console.log(`Created Browserbase session: ${session.id}`); + +// Step 2: Connect to the session using Playwright's CDP connection. +// The connectUrl from the session response is used as the CDP endpoint. +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOverCDPOptions: { + endpointURL: session.connectUrl, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/browserbase-puppeteer.ts b/temp-examples/examples/browserbase-puppeteer.ts new file mode 100644 index 000000000000..f6dcce121965 --- /dev/null +++ b/temp-examples/examples/browserbase-puppeteer.ts @@ -0,0 +1,53 @@ +import 'dotenv/config'; + +import { PuppeteerCrawler } from 'crawlee'; + +// Browserbase requires two env variables: +// - BROWSERBASE_API_KEY: Your API key for authentication +// - BROWSERBASE_PROJECT_ID: The project to create sessions in +const apiKey = process.env.BROWSERBASE_API_KEY; +const projectId = process.env.BROWSERBASE_PROJECT_ID; + +if (!apiKey) { + throw new Error('BROWSERBASE_API_KEY env variable is required'); +} + +if (!projectId) { + throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); +} + +// Step 1: Create a Browserbase session via REST API. +// This returns a connectUrl that we can use with Puppeteer's CDP connection. +// You have 5 minutes to connect before the session terminates. 
+const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { + 'x-bb-api-key': apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ projectId }), +}); + +if (!response.ok) { + throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); +} + +const session = await response.json(); +console.log(`Created Browserbase session: ${session.id}`); + +// Step 2: Connect to the session using Puppeteer's CDP connection. +// The connectUrl from the session response is used as the browserWSEndpoint. +const crawler = new PuppeteerCrawler({ + launchContext: { + connectOverCDPOptions: { + browserWSEndpoint: session.connectUrl, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/browserless-playwright-ws.ts b/temp-examples/examples/browserless-playwright-ws.ts new file mode 100644 index 000000000000..ec659b59f025 --- /dev/null +++ b/temp-examples/examples/browserless-playwright-ws.ts @@ -0,0 +1,29 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN; + +if (!token) { + throw new Error('BROWSERLESS_TOKEN env variable is required'); +} + +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOptions: { + wsEndpoint: `wss://production-sfo.browserless.io/chromium/playwright?token=${token}`, + }, + }, + async requestHandler({ page, request, enqueueLinks }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + + await enqueueLinks({ + globs: ['https://www.crawlee.dev/**'], + limit: 5, + }); + }, + maxRequestsPerCrawl: 10, +}); + +await crawler.run(['https://www.crawlee.dev']); diff --git a/temp-examples/examples/browserless-playwright.ts 
b/temp-examples/examples/browserless-playwright.ts new file mode 100644 index 000000000000..ca7712c62ed0 --- /dev/null +++ b/temp-examples/examples/browserless-playwright.ts @@ -0,0 +1,29 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN; + +if (!token) { + throw new Error('BROWSERLESS_TOKEN env variable is required'); +} + +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOverCDPOptions: { + endpointURL: `wss://production-sfo.browserless.io?token=${token}`, + }, + }, + async requestHandler({ page, request, enqueueLinks }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + + await enqueueLinks({ + globs: ['https://www.crawlee.dev/**'], + limit: 5, + }); + }, + maxRequestsPerCrawl: 10, +}); + +await crawler.run(['https://www.crawlee.dev']); diff --git a/temp-examples/examples/browserless-puppeteer.ts b/temp-examples/examples/browserless-puppeteer.ts new file mode 100644 index 000000000000..c47fbe214420 --- /dev/null +++ b/temp-examples/examples/browserless-puppeteer.ts @@ -0,0 +1,29 @@ +import 'dotenv/config'; + +import { PuppeteerCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN; + +if (!token) { + throw new Error('BROWSERLESS_TOKEN env variable is required'); +} + +const crawler = new PuppeteerCrawler({ + launchContext: { + connectOverCDPOptions: { + browserWSEndpoint: `wss://production-sfo.browserless.io?token=${token}`, + }, + }, + async requestHandler({ page, request, enqueueLinks }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + + await enqueueLinks({ + globs: ['https://www.crawlee.dev/**'], + limit: 5, + }); + }, + maxRequestsPerCrawl: 10, +}); + +await crawler.run(['https://www.crawlee.dev']); diff --git a/temp-examples/examples/rebrowser-playwright-ws.ts b/temp-examples/examples/rebrowser-playwright-ws.ts new file mode 100644 index 000000000000..31587ceca6bb --- 
/dev/null +++ b/temp-examples/examples/rebrowser-playwright-ws.ts @@ -0,0 +1,50 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.REBROWSER_API_KEY; +const profileId = process.env.REBROWSER_PROFILE_ID; + +if (!apiKey) { + throw new Error('REBROWSER_API_KEY env variable is required'); +} + +// Step 1: Start a Rebrowser run via REST API. +// This gives you a dedicated WebSocket endpoint for the session. +// You can optionally specify a profileId and proxyUrl for advanced control. +const startRunUrl = new URL(`https://rebrowser.net/api/startRun?apikey=${apiKey}`); + +if (profileId) { + startRunUrl.searchParams.set('profileId', profileId); + console.log(`Using Rebrowser profile: ${profileId}`); +} + +const response = await fetch(startRunUrl.toString()); + +if (!response.ok) { + throw new Error(`Failed to start Rebrowser run: ${response.status} ${response.statusText}`); +} + +const run = await response.json(); +console.log(`Started Rebrowser run with wsEndpoint: ${run.wsEndpoint}`); + +// Step 2: Connect to the run using Playwright's WebSocket connection. +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOptions: { + wsEndpoint: run.wsEndpoint, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); + +// Step 3 (not performed by this example): finish the run to stop billing. +// Rebrowser recommends an explicit finishRun call to avoid idle billing. +// The browser disconnects automatically after the crawl, but this script does not call +// finishRun itself; call it yourself so the run is cleanly terminated on Rebrowser's side.
diff --git a/temp-examples/examples/rebrowser-playwright.ts b/temp-examples/examples/rebrowser-playwright.ts new file mode 100644 index 000000000000..f88783238192 --- /dev/null +++ b/temp-examples/examples/rebrowser-playwright.ts @@ -0,0 +1,31 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.REBROWSER_API_KEY; + +if (!apiKey) { + throw new Error('REBROWSER_API_KEY env variable is required'); +} + +// Rebrowser simple connection: no profile or run creation needed. +// A random profile is auto-selected when you connect with just an API key. +// Proxies are managed via the Rebrowser dashboard or WS URL params. +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOverCDPOptions: { + endpointURL: `wss://api.rebrowser.net?apikey=${apiKey}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); + +// Note: Rebrowser recommends calling finishRun after you're done to avoid idle billing. +// With Crawlee, the browser disconnects automatically after the crawl finishes, +// which should end the run. For explicit control, use the REST API finishRun endpoint. diff --git a/temp-examples/examples/rebrowser-puppeteer.ts b/temp-examples/examples/rebrowser-puppeteer.ts new file mode 100644 index 000000000000..54d49065c712 --- /dev/null +++ b/temp-examples/examples/rebrowser-puppeteer.ts @@ -0,0 +1,31 @@ +import 'dotenv/config'; + +import { PuppeteerCrawler } from 'crawlee'; + +const apiKey = process.env.REBROWSER_API_KEY; + +if (!apiKey) { + throw new Error('REBROWSER_API_KEY env variable is required'); +} + +// Rebrowser simple connection: no profile or run creation needed. +// A random profile is auto-selected when you connect with just an API key. +// Proxies are managed via the Rebrowser dashboard or WS URL params. 
+const crawler = new PuppeteerCrawler({ + launchContext: { + connectOverCDPOptions: { + browserWSEndpoint: `wss://api.rebrowser.net?apikey=${apiKey}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); + +// Note: Rebrowser recommends calling finishRun after you're done to avoid idle billing. +// With Crawlee, the browser disconnects automatically after the crawl finishes, +// which should end the run. For explicit control, use the REST API finishRun endpoint. diff --git a/temp-examples/examples/steel-playwright-ws.ts b/temp-examples/examples/steel-playwright-ws.ts new file mode 100644 index 000000000000..55f4712a5315 --- /dev/null +++ b/temp-examples/examples/steel-playwright-ws.ts @@ -0,0 +1,51 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.STEEL_API_KEY; + +if (!apiKey) { + throw new Error('STEEL_API_KEY env variable is required'); +} + +// Step 1: Create a Steel session via REST API. +// Explicit session creation enables advanced features like proxy and CAPTCHA solving. +const response = await fetch('https://api.steel.dev/v1/sessions', { + method: 'POST', + headers: { + 'Steel-Api-Key': apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ useProxy: true, solveCaptcha: true }), +}); + +if (!response.ok) { + throw new Error(`Failed to create Steel session: ${response.status} ${response.statusText}`); +} + +const session = await response.json(); +console.log(`Created Steel session: ${session.id}`); + +// Step 2: Connect to the session using Playwright's WebSocket connection. +// The session ID is passed as a query parameter to the Steel WebSocket endpoint. 
+const crawler = new PlaywrightCrawler({ + launchContext: { + connectOptions: { + wsEndpoint: `wss://connect.steel.dev?apiKey=${apiKey}&sessionId=${session.id}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); + +// Step 3: Release the session (optional — Steel auto-releases on disconnect). +await fetch(`https://api.steel.dev/v1/sessions/${session.id}/release`, { + method: 'POST', + headers: { 'Steel-Api-Key': apiKey }, +}); +console.log(`Released Steel session: ${session.id}`); diff --git a/temp-examples/examples/steel-playwright.ts b/temp-examples/examples/steel-playwright.ts new file mode 100644 index 000000000000..7bf2913054e9 --- /dev/null +++ b/temp-examples/examples/steel-playwright.ts @@ -0,0 +1,26 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.STEEL_API_KEY; + +if (!apiKey) { + throw new Error('STEEL_API_KEY env variable is required'); +} + +// Steel direct connection: no session creation needed. +// A session is auto-created when you connect and auto-released on disconnect. 
+const crawler = new PlaywrightCrawler({ + launchContext: { + connectOverCDPOptions: { + endpointURL: `wss://connect.steel.dev?apiKey=${apiKey}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/steel-puppeteer.ts b/temp-examples/examples/steel-puppeteer.ts new file mode 100644 index 000000000000..68dc3cdb59a9 --- /dev/null +++ b/temp-examples/examples/steel-puppeteer.ts @@ -0,0 +1,26 @@ +import 'dotenv/config'; + +import { PuppeteerCrawler } from 'crawlee'; + +const apiKey = process.env.STEEL_API_KEY; + +if (!apiKey) { + throw new Error('STEEL_API_KEY env variable is required'); +} + +// Steel direct connection: no session creation needed. +// A session is auto-created when you connect and auto-released on disconnect. +const crawler = new PuppeteerCrawler({ + launchContext: { + connectOverCDPOptions: { + browserWSEndpoint: `wss://connect.steel.dev?apiKey=${apiKey}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/package.json b/temp-examples/package.json new file mode 100644 index 000000000000..cbcb71ed6c1b --- /dev/null +++ b/temp-examples/package.json @@ -0,0 +1,38 @@ +{ + "name": "temp-examples", + "version": "1.0.0", + "private": true, + "type": "module", + "scripts": { + "example:browserless-puppeteer": "node --experimental-strip-types examples/browserless-puppeteer.ts", + "example:browserless-playwright": "node --experimental-strip-types examples/browserless-playwright.ts", + "example:browserless-playwright-ws": "node --experimental-strip-types examples/browserless-playwright-ws.ts", + "example:browserbase-puppeteer": "node --experimental-strip-types 
examples/browserbase-puppeteer.ts", + "example:browserbase-playwright": "node --experimental-strip-types examples/browserbase-playwright.ts", + "example:browserbase-playwright-ws": "node --experimental-strip-types examples/browserbase-playwright-ws.ts", + "example:steel-puppeteer": "node --experimental-strip-types examples/steel-puppeteer.ts", + "example:steel-playwright": "node --experimental-strip-types examples/steel-playwright.ts", + "example:steel-playwright-ws": "node --experimental-strip-types examples/steel-playwright-ws.ts", + "example:rebrowser-puppeteer": "node --experimental-strip-types examples/rebrowser-puppeteer.ts", + "example:rebrowser-playwright": "node --experimental-strip-types examples/rebrowser-playwright.ts", + "example:rebrowser-playwright-ws": "node --experimental-strip-types examples/rebrowser-playwright-ws.ts" + }, + "dependencies": { + "@crawlee/basic": "file:../packages/basic-crawler/dist", + "@crawlee/browser": "file:../packages/browser-crawler/dist", + "@crawlee/browser-pool": "file:../packages/browser-pool/dist", + "@crawlee/cheerio": "file:../packages/cheerio-crawler/dist", + "@crawlee/cli": "file:../packages/cli/dist", + "@crawlee/core": "file:../packages/core/dist", + "@crawlee/http": "file:../packages/http-crawler/dist", + "@crawlee/jsdom": "file:../packages/jsdom-crawler/dist", + "@crawlee/linkedom": "file:../packages/linkedom-crawler/dist", + "@crawlee/playwright": "file:../packages/playwright-crawler/dist", + "@crawlee/puppeteer": "file:../packages/puppeteer-crawler/dist", + "@crawlee/types": "file:../packages/types/dist", + "@crawlee/utils": "file:../packages/utils/dist", + "@types/node": "^25.2.0", + "crawlee": "file:../packages/crawlee/dist", + "dotenv": "^17.3.1" + } +} diff --git a/temp-examples/readme.md b/temp-examples/readme.md new file mode 100644 index 000000000000..a570750b6774 --- /dev/null +++ b/temp-examples/readme.md @@ -0,0 +1,12 @@ +#how to start + +``` +##root +nr clean +nr build + +cd temp-examples +npm 
install +npm run example:browserless-puppeteer +... +``` diff --git a/temp-examples/tsconfig.json b/temp-examples/tsconfig.json new file mode 100644 index 000000000000..5fcc4b7bad3a --- /dev/null +++ b/temp-examples/tsconfig.json @@ -0,0 +1,9 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "Node16", + "moduleResolution": "Node16", + "esModuleInterop": true, + "sourceMap": false + } +} From 090ae785d60c3372b60e35f4055680ed4c97e5fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Wed, 29 Apr 2026 15:35:43 +0200 Subject: [PATCH 15/45] feat(browser-pool): add RemoteBrowserProvider abstract class and remoteBrowser config Add a unified API for connecting crawlers to remote browser services (Browserbase, Browserless, Steel, Rebrowser). Users can either pass a RemoteBrowserConfig object or extend RemoteBrowserProvider with typed connect()/release() lifecycle methods. - Add RemoteBrowserProvider abstract class with generic TContext - Add RemoteBrowserConfig interface (endpoint + release + type) - Wire remoteBrowser through BrowserPlugin, PlaywrightPlugin, PuppeteerPlugin - Auto-call release() on browser close/crash/pool destroy - Skip fingerprinting, proxy injection, and webdriver stealth for remote browsers - Skip session-based browser retirement for remote browsers (isRemote guard) - Default useIncognitoPages to true for remote connections - Add 30+ unit tests for both config and provider patterns - Update all temp-examples to use RemoteBrowserProvider --- .../src/internals/browser-crawler.ts | 7 + .../src/internals/browser-launcher.ts | 1 + .../abstract-classes/browser-controller.ts | 18 + .../src/abstract-classes/browser-plugin.ts | 113 +++++ .../browser-pool/src/fingerprinting/hooks.ts | 5 + packages/browser-pool/src/index.ts | 3 + .../src/playwright/playwright-controller.ts | 5 + .../src/playwright/playwright-plugin.ts | 63 ++- .../src/puppeteer/puppeteer-plugin.ts | 68 ++- .../src/remote-browser-provider.ts | 58 +++ 
.../browser-pool/test/remote-browser.test.ts | 462 ++++++++++++++++++ .../src/internals/playwright-launcher.ts | 37 +- .../src/internals/puppeteer-launcher.ts | 30 +- .../examples/browserbase-playwright-ws.ts | 73 ++- .../examples/browserbase-playwright.ts | 81 +-- .../examples/browserbase-puppeteer.ts | 80 +-- .../examples/browserless-playwright-ws.ts | 46 +- .../examples/browserless-playwright.ts | 41 +- .../examples/browserless-puppeteer.ts | 42 +- .../examples/rebrowser-playwright-ws.ts | 52 +- .../examples/rebrowser-playwright.ts | 16 +- temp-examples/examples/rebrowser-puppeteer.ts | 16 +- temp-examples/examples/steel-playwright-ws.ts | 85 ++-- temp-examples/examples/steel-playwright.ts | 59 ++- temp-examples/examples/steel-puppeteer.ts | 36 +- temp-examples/package.json | 73 +-- 26 files changed, 1263 insertions(+), 307 deletions(-) create mode 100644 packages/browser-pool/src/remote-browser-provider.ts diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index 3405ab3bdb91..3c1f1a544ed8 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -673,6 +673,13 @@ export abstract class BrowserCrawler< return; } + // Remote browsers are expensive — don't retire them when a session retires. + // Let retireBrowserAfterPageCount control the browser lifecycle instead. + // See also: PR #3605 which fixes the root cause (maxUsageCount: 1 in BasicCrawler). 
+ if (browserController.launchContext.isRemote) { + return; + } + let sessionIds = this.browserSessionIds.get(browserController); if (sessionIds) { diff --git a/packages/browser-crawler/src/internals/browser-launcher.ts b/packages/browser-crawler/src/internals/browser-launcher.ts index cdff657de995..f83b46f1fd27 100644 --- a/packages/browser-crawler/src/internals/browser-launcher.ts +++ b/packages/browser-crawler/src/internals/browser-launcher.ts @@ -113,6 +113,7 @@ export abstract class BrowserLauncher< userDataDir: ow.optional.string, launchOptions: ow.optional.object, userAgent: ow.optional.string, + remoteBrowser: ow.optional.object, }; static requireLauncherOrThrow(launcher: string, apifyImageName: string): T { diff --git a/packages/browser-pool/src/abstract-classes/browser-controller.ts b/packages/browser-pool/src/abstract-classes/browser-controller.ts index 078fcd889a99..adc6b97c74ac 100644 --- a/packages/browser-pool/src/abstract-classes/browser-controller.ts +++ b/packages/browser-pool/src/abstract-classes/browser-controller.ts @@ -141,6 +141,7 @@ export abstract class BrowserController< this.log.debug(`Could not close browser.\nCause: ${(error as Error).message}`, { id: this.id }); } + await this._releaseRemoteBrowser(); this.emit(BROWSER_CONTROLLER_EVENTS.BROWSER_CLOSED, this); setTimeout(() => { @@ -158,9 +159,26 @@ export abstract class BrowserController< async kill(): Promise { await this.hasBrowserPromise; await this._kill(); + await this._releaseRemoteBrowser(); this.emit(BROWSER_CONTROLLER_EVENTS.BROWSER_CLOSED, this); } + /** + * Calls `remoteBrowser.release()` if configured. Safe to call multiple times — + * clears the endpoint after the first call so release only fires once. 
+ */ + private async _releaseRemoteBrowser(): Promise { + const endpoint = this.launchContext?._resolvedRemoteEndpoint as string | undefined; + if (!endpoint) return; + + const context = this.launchContext._remoteContext as Record | undefined; + + // Clear so release only fires once (close() schedules kill() after timeout) + this.launchContext.extend({ _resolvedRemoteEndpoint: undefined, _remoteContext: undefined }); + + await this.browserPlugin._callRelease(endpoint, context); + } + /** * Opens new browser page. * @ignore diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 27b3895c5c74..ccc768e0ff29 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -4,6 +4,7 @@ import merge from 'lodash.merge'; import type { LaunchContextOptions } from '../launch-context.js'; import { LaunchContext } from '../launch-context.js'; +import { RemoteBrowserProvider } from '../remote-browser-provider.js'; import type { UnwrapPromise } from '../utils.js'; import type { BrowserController } from './browser-controller.js'; @@ -44,6 +45,61 @@ export interface CommonPage { url(): string | Promise; } +/** + * Return type for dynamic endpoint functions that need to pass session + * metadata to the `release()` callback. + */ +export interface RemoteBrowserEndpointResult { + /** The browser endpoint URL to connect to. */ + url: string; + /** Opaque metadata passed back to `release()` — e.g. session IDs, API tokens. */ + context?: Record; +} + +/** + * Configuration for connecting to a remote browser service. + * + * **Static endpoint (e.g. Browserless):** + * ```typescript + * { endpoint: 'wss://browserless.io?token=xxx' } + * ``` + * + * **Dynamic endpoint with lifecycle (e.g. 
Browserbase):** + * ```typescript + * { + * endpoint: async () => { + * const session = await createSession(); + * return { url: session.connectUrl, context: { id: session.id } }; + * }, + * release: async ({ context }) => { + * await releaseSession(context.id); + * }, + * } + * ``` + */ +export interface RemoteBrowserConfig { + /** + * The browser endpoint URL, or an async function that returns one. + * When a function is provided, it is called once per browser launch (not per page). + * + * Can return a plain string or an object with `url` and optional `context` + * that will be forwarded to `release()`. + */ + endpoint: string | (() => string | RemoteBrowserEndpointResult | Promise); + /** + * Optional cleanup function called when the browser closes, crashes, or the pool is destroyed. + * Receives the resolved endpoint URL and the `context` object returned by `endpoint()`. + * Errors are caught and logged as warnings — they never crash the crawler. + */ + release?: (info: { endpoint: string; context?: Record }) => void | Promise; + /** + * Connection type. Subclass interfaces narrow this further + * (e.g. Puppeteer only allows `'cdp'`). + * @default 'cdp' + */ + type?: 'cdp' | 'websocket'; +} + export interface BrowserPluginOptions { /** * Options that will be passed down to the automation library. E.g. @@ -81,6 +137,15 @@ export interface BrowserPluginOptions { * This is useful when using HTTPS proxies with self-signed certificates. */ ignoreProxyCertificate?: boolean; + /** + * Configuration for connecting to a remote browser service. + * When set, the plugin connects to a remote browser instead of launching a local one. + * + * Accepts either a {@link RemoteBrowserConfig} object or a {@link RemoteBrowserProvider} instance. + * + * Ignored (with a warning) when `connectOverCDPOptions` / `connectOptions` is also set. 
+ */ + remoteBrowser?: RemoteBrowserConfig | RemoteBrowserProvider; } export interface CreateLaunchContextOptions< @@ -116,6 +181,7 @@ export abstract class BrowserPlugin< browserPerProxy?: boolean; ignoreProxyCertificate?: boolean; + remoteBrowser?: RemoteBrowserConfig; constructor(library: Library, options: BrowserPluginOptions = {}) { const { @@ -125,6 +191,7 @@ export abstract class BrowserPlugin< useIncognitoPages = false, browserPerProxy = false, ignoreProxyCertificate = false, + remoteBrowser, } = options; this.log = serviceLocator.getLogger().child({ prefix: 'BrowserPool' }); @@ -135,6 +202,52 @@ export abstract class BrowserPlugin< this.useIncognitoPages = useIncognitoPages; this.browserPerProxy = browserPerProxy; this.ignoreProxyCertificate = ignoreProxyCertificate; + + // Normalize RemoteBrowserProvider instances into a plain RemoteBrowserConfig + // so all downstream code only deals with the config shape. + if (remoteBrowser instanceof RemoteBrowserProvider) { + const provider = remoteBrowser; + this.remoteBrowser = { + endpoint: () => provider.connect(), + release: ({ context }) => provider.release(context as any), + type: provider.type, + }; + } else { + this.remoteBrowser = remoteBrowser; + } + } + + /** Resolves the remote browser endpoint from a string or function. Returns { url, context }. */ + protected async _resolveRemoteEndpoint(): Promise { + const { endpoint } = this.remoteBrowser!; + const result = typeof endpoint === 'function' ? await endpoint() : endpoint; + if (typeof result === 'string') { + return { url: result }; + } + return result; + } + + /** @internal Called by BrowserController on browser close/kill, and by plugins when connecting to the resolved endpoint fails. 
*/ + async _callRelease(endpoint: string, context?: Record): Promise { + try { + await this.remoteBrowser?.release?.({ endpoint, context }); + } catch (err) { + this.log.warning('remoteBrowser.release() failed.', { error: (err as Error)?.message }); + } + } + + /** Masks the username/password part of a URL for logging. Query-string parameters (e.g. `?token=...`) are NOT redacted, so the result may still contain secrets. 
*/ + protected _sanitizeEndpointForLog(endpoint: string): string { + try { + const url = new URL(endpoint); + if (url.username || url.password) { + url.username = '***'; + url.password = '***'; + } + return url.toString(); + } catch { + return ''; + } } /** diff --git a/packages/browser-pool/src/fingerprinting/hooks.ts b/packages/browser-pool/src/fingerprinting/hooks.ts index 1e22b72de411..56b4efaf6a78 100644 --- a/packages/browser-pool/src/fingerprinting/hooks.ts +++ b/packages/browser-pool/src/fingerprinting/hooks.ts @@ -19,6 +19,9 @@ export function createFingerprintPreLaunchHook(browserPool: BrowserPool { + // Remote browsers may have their own fingerprinting — skip local fingerprint injection + if (launchContext.isRemote) return; + const { useIncognitoPages } = launchContext; const cacheKey = (launchContext.session as { id: string } | undefined)?.id ?? launchContext.proxyUrl; const { launchOptions }: { launchOptions: any } = launchContext; @@ -62,6 +65,7 @@ export function createFingerprintPreLaunchHook(browserPool: BrowserPool { const { launchContext, browserPlugin } = browserController; + if (launchContext.isRemote) return; const { fingerprint } = launchContext.fingerprint!; if (launchContext.useIncognitoPages && browserPlugin instanceof PlaywrightPlugin && pageOptions) { @@ -80,6 +84,7 @@ export function createPrePageCreateHook() { export function createPostPageCreateHook(fingerprintInjector: FingerprintInjector) { return async (page: any, browserController: BrowserController): Promise => { const { browserPlugin, launchContext } = browserController; + if (launchContext.isRemote) return; const fingerprint = launchContext.fingerprint!; // TODO this will require refactoring, we should use common API instead of branching based on plugin type, diff --git a/packages/browser-pool/src/index.ts b/packages/browser-pool/src/index.ts index 3b3ff37f60f8..484e05394aa4 100644 --- a/packages/browser-pool/src/index.ts +++ b/packages/browser-pool/src/index.ts @@ -43,9 +43,12 
@@ export type { CommonLibrary, BrowserPluginOptions, CreateLaunchContextOptions, + RemoteBrowserConfig, + RemoteBrowserEndpointResult, } from './abstract-classes/browser-plugin.js'; export { BrowserPlugin, BrowserLaunchError, DEFAULT_USER_AGENT } from './abstract-classes/browser-plugin.js'; export type { LaunchContextOptions } from './launch-context.js'; export { LaunchContext } from './launch-context.js'; +export { RemoteBrowserProvider } from './remote-browser-provider.js'; export type { InferBrowserPluginArray, UnwrapPromise } from './utils.js'; export { anonymizeProxySugar, type AnonymizeProxySugarOptions } from './anonymize-proxy.js'; diff --git a/packages/browser-pool/src/playwright/playwright-controller.ts b/packages/browser-pool/src/playwright/playwright-controller.ts index 0f7a4c1bf539..aeb927ff5f13 100644 --- a/packages/browser-pool/src/playwright/playwright-controller.ts +++ b/packages/browser-pool/src/playwright/playwright-controller.ts @@ -45,6 +45,11 @@ export class PlaywrightController extends BrowserController< ...contextOptions, }; + // Remote browsers handle their own proxy — don't inject local proxy settings into context + if (this.launchContext.isRemote) { + delete contextOptions?.proxy; + } + if (contextOptions?.proxy) { const [anonymizedProxyUrl, closeProxy] = await anonymizeProxySugar( contextOptions.proxy.server, diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index d23e20f7a120..6aa1787324e5 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -65,18 +65,32 @@ export class PlaywrightPlugin extends BrowserPlugin< throw new Error("'connectOptions.wsEndpoint' must be a non-empty string."); } + const remoteBrowserIgnored = !!(baseOptions.remoteBrowser && (connectOverCDPOptions || connectOptions)); + if (remoteBrowserIgnored) { + baseOptions.remoteBrowser = undefined; + } + 
super(library, baseOptions); this.connectOptions = connectOptions; this.connectOverCDPOptions = connectOverCDPOptions; + if (remoteBrowserIgnored) { + this.log.warning( + 'Both remoteBrowser and connectOverCDPOptions/connectOptions are set. ' + + 'remoteBrowser is ignored when explicit connect options are provided.', + ); + } + // We check options.useIncognitoPages (not this.useIncognitoPages) because super() collapses undefined to false. // This preserves the distinction between "not set" (undefined → default to true) and "explicitly false". - if (this.connectOptions || this.connectOverCDPOptions) { + const isRemoteConnection = this.remoteBrowser || this.connectOptions || this.connectOverCDPOptions; + if (isRemoteConnection) { if (options.useIncognitoPages === undefined) { this.useIncognitoPages = true; this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); } else if (options.useIncognitoPages === false) { - const message = this.connectOptions + const isWebSocket = this.connectOptions || this.remoteBrowser?.type === 'websocket'; + const message = isWebSocket ? 'useIncognitoPages is set to false with a remote WebSocket connection. ' + 'This may cause errors because browserType.connect() returns a browser with no default context.' : 'useIncognitoPages is set to false with a remote browser connection. ' + @@ -89,24 +103,45 @@ export class PlaywrightPlugin extends BrowserPlugin< override createLaunchContext(options: CreateLaunchContextOptions = {}): LaunchContext { return super.createLaunchContext({ ...options, - isRemote: options.isRemote ?? !!(this.connectOptions || this.connectOverCDPOptions), + isRemote: options.isRemote ?? 
!!(this.remoteBrowser || this.connectOptions || this.connectOverCDPOptions), }); } - private _sanitizeEndpointForLog(endpoint: string): string { - try { - const url = new URL(endpoint); - if (url.username || url.password) { - url.username = '***'; - url.password = '***'; + protected async _launch(launchContext: LaunchContext): Promise { + if (this.remoteBrowser) { + const type = this.remoteBrowser.type ?? 'cdp'; + let url: string; + let context: Record | undefined; + try { + const result = await this._resolveRemoteEndpoint(); + url = result.url; + context = result.context; + } catch (cause) { + throw new BrowserLaunchError( + 'Failed to resolve remote browser endpoint from remoteBrowser.endpoint() function.\n\u200b', + { cause }, + ); + } + + launchContext.extend({ _resolvedRemoteEndpoint: url, _remoteContext: context }); + + try { + if (type === 'websocket') { + this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); + return await this.library.connect(url, {}); + } + this.log.info('Connecting to remote browser via connectOverCDP.'); + return await this.library.connectOverCDP(url, {}); + } catch (cause) { + await this._callRelease(url, context); + throw new BrowserLaunchError( + `Failed to connect to remote browser at "${this._sanitizeEndpointForLog(url)}". ` + + `Connection type: ${type}. 
Check that the endpoint is reachable.\n\u200b`, + { cause }, + ); } - return url.toString(); - } catch { - return ''; } - } - protected async _launch(launchContext: LaunchContext): Promise { // Remote CDP connection — skip all local launch/proxy logic if (this.connectOverCDPOptions) { const { endpointURL, ...options } = this.connectOverCDPOptions; diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index f325f89cc234..5a853594e2ac 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -43,12 +43,29 @@ export class PuppeteerPlugin extends BrowserPlugin< throw new Error("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'."); } + if (baseOptions.remoteBrowser?.type === 'websocket') { + throw new Error("Puppeteer does not support 'websocket' connection type. Use 'cdp' (default) instead."); + } + + const remoteBrowserIgnored = !!(baseOptions.remoteBrowser && connectOverCDPOptions); + if (remoteBrowserIgnored) { + baseOptions.remoteBrowser = undefined; + } + super(library, baseOptions); this.connectOverCDPOptions = connectOverCDPOptions; + if (remoteBrowserIgnored) { + this.log.warning( + 'Both remoteBrowser and connectOverCDPOptions are set. ' + + 'remoteBrowser is ignored when explicit connect options are provided.', + ); + } + // We check options.useIncognitoPages (not this.useIncognitoPages) because super() collapses undefined to false. // This preserves the distinction between "not set" (undefined → default to true) and "explicitly false". 
- if (this.connectOverCDPOptions) { + const isRemoteConnection = this.remoteBrowser || this.connectOverCDPOptions; + if (isRemoteConnection) { if (options.useIncognitoPages === undefined) { this.useIncognitoPages = true; this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); @@ -71,23 +88,10 @@ export class PuppeteerPlugin extends BrowserPlugin< ): LaunchContext { return super.createLaunchContext({ ...options, - isRemote: options.isRemote ?? !!this.connectOverCDPOptions, + isRemote: options.isRemote ?? !!(this.remoteBrowser || this.connectOverCDPOptions), }); } - private _sanitizeEndpointForLog(endpoint: string): string { - try { - const url = new URL(endpoint); - if (url.username || url.password) { - url.username = '***'; - url.password = '***'; - } - return url.toString(); - } catch { - return ''; - } - } - protected async _launch( launchContext: LaunchContext< typeof Puppeteer, @@ -111,7 +115,34 @@ export class PuppeteerPlugin extends BrowserPlugin< let browser: PuppeteerTypes.Browser; - if (this.connectOverCDPOptions) { + if (this.remoteBrowser) { + let url: string; + let context: Record | undefined; + try { + const result = await this._resolveRemoteEndpoint(); + url = result.url; + context = result.context; + } catch (cause) { + throw new BrowserLaunchError( + 'Failed to resolve remote browser endpoint from remoteBrowser.endpoint() function.\n\u200b', + { cause }, + ); + } + + launchContext.extend({ _resolvedRemoteEndpoint: url, _remoteContext: context }); + + this.log.info('Connecting to remote browser via connect (CDP).'); + try { + browser = await this.library.connect({ browserWSEndpoint: url }); + } catch (cause) { + await this._callRelease(url, context); + throw new BrowserLaunchError( + `Failed to connect to remote browser at "${this._sanitizeEndpointForLog(url)}". 
` + + 'Check that the endpoint is reachable and the browser is accepting CDP connections.\n\u200b', + { cause }, + ); + } + } else if (this.connectOverCDPOptions) { // Remote CDP connection — skip local launch/proxy/headless logic const endpoint = this.connectOverCDPOptions.browserWSEndpoint || this.connectOverCDPOptions.browserURL!; this.log.info('Connecting to remote browser via connect (CDP).'); @@ -200,7 +231,7 @@ export class PuppeteerPlugin extends BrowserPlugin< browser.on('targetcreated', targetCreatedHandler); // Clean up the listener when a remote browser disconnects to prevent leaks - if (this.connectOverCDPOptions) { + if (this.remoteBrowser || this.connectOverCDPOptions) { browser.once('disconnected', () => { browser.off('targetcreated', targetCreatedHandler); }); @@ -232,7 +263,8 @@ export class PuppeteerPlugin extends BrowserPlugin< if (useIncognitoPages) { // Skip proxy setup for remote connections — proxy is managed by the remote service. - const effectiveProxyUrl = this.connectOverCDPOptions ? undefined : proxyUrl; + const effectiveProxyUrl = + this.remoteBrowser || this.connectOverCDPOptions ? undefined : proxyUrl; const [anonymizedProxyUrl, close] = effectiveProxyUrl ? await anonymizeProxySugar(effectiveProxyUrl, undefined, undefined, { ignoreProxyCertificate, diff --git a/packages/browser-pool/src/remote-browser-provider.ts b/packages/browser-pool/src/remote-browser-provider.ts new file mode 100644 index 000000000000..dc324d529c39 --- /dev/null +++ b/packages/browser-pool/src/remote-browser-provider.ts @@ -0,0 +1,58 @@ +/** + * Abstract base class for remote browser service providers. + * + * Implement this class to encapsulate the lifecycle of a remote browser session + * (creation, connection URL resolution, and cleanup). The framework calls + * {@link connect} once per browser launch and {@link release} when the browser + * closes or crashes. + * + * **Example — simple static endpoint (e.g. 
Browserless):** + * ```typescript + * class BrowserlessProvider extends RemoteBrowserProvider { + * constructor(private url: string) { super(); } + * async connect() { return { url: this.url }; } + * } + * ``` + * + * **Example — session lifecycle (e.g. Browserbase):** + * ```typescript + * class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { + * constructor(private apiKey: string, private projectId: string) { super(); } + * + * async connect() { + * const session = await createSession(this.apiKey, this.projectId); + * return { url: session.connectUrl, context: { id: session.id } }; + * } + * + * async release(context: { id: string }) { + * await releaseSession(this.apiKey, context.id); + * } + * } + * ``` + */ +export abstract class RemoteBrowserProvider = Record> { + /** + * Connection type. + * - `'cdp'` — Chrome DevTools Protocol, works with Puppeteer and Playwright. + * - `'websocket'` — Playwright-specific WebSocket protocol (not supported by Puppeteer). + * + * @default 'cdp' + */ + type: 'cdp' | 'websocket' = 'cdp'; + + /** + * Called once per browser launch. Return the WebSocket/CDP endpoint URL + * and an optional `context` object that will be passed back to {@link release}. + */ + abstract connect(): Promise<{ url: string; context?: TContext }> | { url: string; context?: TContext }; + + /** + * Called when the browser closes, crashes, or the pool is destroyed. + * Override this to clean up remote sessions, release API resources, etc. + * + * Errors thrown here are caught and logged as warnings — they never crash the crawler. + * + * @param _context The same `context` object returned by {@link connect}. 
+ */ + async release(_context: TContext): Promise {} +} diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index c9ce63ba8d2d..4a3e4a8b59ad 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -5,6 +5,7 @@ import type { CrawleeLogger } from '@crawlee/core'; import { PlaywrightPlugin } from '../src/playwright/playwright-plugin.js'; import { PuppeteerPlugin } from '../src/puppeteer/puppeteer-plugin.js'; +import { RemoteBrowserProvider } from '../src/remote-browser-provider.js'; // --------------------------------------------------------------------------- // Shared mock helpers @@ -663,3 +664,464 @@ describe('Remote browser — PuppeteerPlugin', () => { }); }); }); + +// --------------------------------------------------------------------------- +// remoteBrowser config tests +// --------------------------------------------------------------------------- + +describe('remoteBrowser config — PlaywrightPlugin', () => { + let mockLogger: ReturnType; + + beforeEach(() => { + mockLogger = createMockLogger(); + serviceLocator.setLogger(mockLogger); + }); + + test('static string endpoint → calls connectOverCDP by default', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://browserless.io?token=xxx' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connectOverCDP).toHaveBeenCalledWith('wss://browserless.io?token=xxx', {}); + expect(lib.launch).not.toHaveBeenCalled(); + expect(lib.connect).not.toHaveBeenCalled(); + }); + + test('static string endpoint with type websocket → calls connect', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://browserless.io/ws', type: 'websocket' }, + }); + + const ctx = 
plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledWith('wss://browserless.io/ws', {}); + expect(lib.connectOverCDP).not.toHaveBeenCalled(); + }); + + test('function endpoint → called per launch', async () => { + const lib = createMockPlaywrightLibrary(); + const endpointFn = vi.fn().mockResolvedValue('wss://dynamic-endpoint.io'); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: endpointFn }, + }); + + const ctx1 = plugin.createLaunchContext(); + await plugin.launch(ctx1); + + const ctx2 = plugin.createLaunchContext(); + await plugin.launch(ctx2); + + expect(endpointFn).toHaveBeenCalledTimes(2); + expect(lib.connectOverCDP).toHaveBeenCalledTimes(2); + }); + + test('resolved endpoint stored on launchContext', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://test.io' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect((ctx as any)._resolvedRemoteEndpoint).toBe('wss://test.io'); + }); + + test('isRemote is true when remoteBrowser is set', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://test.io' }, + }); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); + + test('useIncognitoPages defaults to true when remoteBrowser is set', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://test.io' }, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('release called on connection failure with context', async () => { + const lib = createMockPlaywrightLibrary(); + lib.connectOverCDP.mockRejectedValue(new Error('Connection refused')); + + const releaseFn = vi.fn(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { + 
endpoint: async () => ({ url: 'wss://fail.io', context: { id: 'sess-123' } }), + release: releaseFn, + }, + }); + + const ctx = plugin.createLaunchContext(); + await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); + + expect(releaseFn).toHaveBeenCalledWith({ endpoint: 'wss://fail.io', context: { id: 'sess-123' } }); + }); + + test('release receives context from endpoint function', async () => { + const lib = createMockPlaywrightLibrary(); + const releaseFn = vi.fn(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { + endpoint: async () => ({ url: 'wss://test.io', context: { sessionId: 'abc' } }), + release: releaseFn, + }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + // Context stored on launchContext for later release + expect((ctx as any)._remoteContext).toEqual({ sessionId: 'abc' }); + }); + + test('release failure is swallowed and logged as warning', async () => { + const lib = createMockPlaywrightLibrary(); + lib.connectOverCDP.mockRejectedValue(new Error('Connection refused')); + + const releaseFn = vi.fn().mockRejectedValue(new Error('Release failed')); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://fail.io', release: releaseFn }, + }); + + const ctx = plugin.createLaunchContext(); + await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); + + expect(releaseFn).toHaveBeenCalled(); + expect(mockLogger.warning).toHaveBeenCalledWith( + 'remoteBrowser.release() failed.', + expect.objectContaining({ error: 'Release failed' }), + ); + }); + + test('endpoint function rejection throws BrowserLaunchError', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: () => Promise.reject(new Error('API down')) }, + }); + + const ctx = plugin.createLaunchContext(); + await expect(plugin.launch(ctx)).rejects.toThrow('Failed to 
resolve remote browser endpoint'); + }); + + test('remoteBrowser ignored when connectOverCDPOptions also set', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://ignored.io' }, + connectOverCDPOptions: { endpointURL: 'wss://explicit.io' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connectOverCDP).toHaveBeenCalledWith('wss://explicit.io', {}); + expect(mockLogger.warning).toHaveBeenCalledWith(expect.stringContaining('remoteBrowser is ignored')); + }); +}); + +describe('remoteBrowser config — PuppeteerPlugin', () => { + let mockLogger: ReturnType; + + beforeEach(() => { + mockLogger = createMockLogger(); + serviceLocator.setLogger(mockLogger); + }); + + test('static string endpoint → calls connect with browserWSEndpoint', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://browserless.io?token=xxx' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://browserless.io?token=xxx' }); + expect(lib.launch).not.toHaveBeenCalled(); + }); + + test('function endpoint → called per launch', async () => { + const lib = createMockPuppeteerLibrary(); + const endpointFn = vi.fn().mockResolvedValue('wss://dynamic.io'); + const plugin = new PuppeteerPlugin(lib as any, { + remoteBrowser: { endpoint: endpointFn }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(endpointFn).toHaveBeenCalledTimes(1); + expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://dynamic.io' }); + }); + + test('type websocket throws in constructor', () => { + const lib = createMockPuppeteerLibrary(); + expect(() => { + new PuppeteerPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://test.io', type: 'websocket' } 
as any, + }); + }).toThrow("does not support 'websocket'"); + }); + + test('isRemote is true when remoteBrowser is set', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://test.io' }, + }); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); + + test('release called on connection failure with context', async () => { + const lib = createMockPuppeteerLibrary(); + lib.connect.mockRejectedValue(new Error('Connection refused')); + + const releaseFn = vi.fn(); + const plugin = new PuppeteerPlugin(lib as any, { + remoteBrowser: { + endpoint: async () => ({ url: 'wss://fail.io', context: { id: 'sess-456' } }), + release: releaseFn, + }, + }); + + const ctx = plugin.createLaunchContext(); + await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); + + expect(releaseFn).toHaveBeenCalledWith({ endpoint: 'wss://fail.io', context: { id: 'sess-456' } }); + }); + + test('remoteBrowser ignored when connectOverCDPOptions also set', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://ignored.io' }, + connectOverCDPOptions: { browserWSEndpoint: 'wss://explicit.io' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://explicit.io' }); + expect(mockLogger.warning).toHaveBeenCalledWith(expect.stringContaining('remoteBrowser is ignored')); + }); +}); + +// --------------------------------------------------------------------------- +// RemoteBrowserProvider tests +// --------------------------------------------------------------------------- + +describe('RemoteBrowserProvider — PlaywrightPlugin', () => { + let mockLogger: ReturnType; + + beforeEach(() => { + mockLogger = createMockLogger(); + serviceLocator.setLogger(mockLogger); + }); + + 
test('provider connect() → calls connectOverCDP by default', async () => { + const lib = createMockPlaywrightLibrary(); + + class SimpleProvider extends RemoteBrowserProvider { + async connect() { + return { url: 'wss://provider.io/cdp' }; + } + } + + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: new SimpleProvider(), + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connectOverCDP).toHaveBeenCalledWith('wss://provider.io/cdp', {}); + expect(lib.connect).not.toHaveBeenCalled(); + expect(lib.launch).not.toHaveBeenCalled(); + }); + + test('provider with type=websocket → calls connect', async () => { + const lib = createMockPlaywrightLibrary(); + + class WsProvider extends RemoteBrowserProvider { + override type = 'websocket' as const; + async connect() { + return { url: 'wss://provider.io/ws' }; + } + } + + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: new WsProvider(), + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledWith('wss://provider.io/ws', {}); + expect(lib.connectOverCDP).not.toHaveBeenCalled(); + }); + + test('provider context flows to release', async () => { + const lib = createMockPlaywrightLibrary(); + + interface Ctx { + sessionId: string; + } + + class SessionProvider extends RemoteBrowserProvider { + releasedContext?: Ctx; + async connect() { + return { url: 'wss://test.io', context: { sessionId: 'sess-42' } }; + } + async release(context: Ctx) { + this.releasedContext = context; + } + } + + const provider = new SessionProvider(); + const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: provider }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + // Context stored on launchContext + expect((ctx as any)._remoteContext).toEqual({ sessionId: 'sess-42' }); + }); + + test('provider release called on connection failure', async () => { + const lib = 
createMockPlaywrightLibrary(); + lib.connectOverCDP.mockRejectedValue(new Error('Connection refused')); + + const releaseSpy = vi.fn(); + + class FailProvider extends RemoteBrowserProvider<{ id: string }> { + async connect() { + return { url: 'wss://fail.io', context: { id: 'sess-fail' } }; + } + async release(context: { id: string }) { + releaseSpy(context); + } + } + + const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: new FailProvider() }); + const ctx = plugin.createLaunchContext(); + + await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); + expect(releaseSpy).toHaveBeenCalledWith({ id: 'sess-fail' }); + }); + + test('provider sets isRemote = true', () => { + const lib = createMockPlaywrightLibrary(); + + class P extends RemoteBrowserProvider { + async connect() { + return { url: 'wss://test.io' }; + } + } + + const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: new P() }); + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); + + test('provider sets useIncognitoPages default to true', () => { + const lib = createMockPlaywrightLibrary(); + + class P extends RemoteBrowserProvider { + async connect() { + return { url: 'wss://test.io' }; + } + } + + const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: new P() }); + expect(plugin.useIncognitoPages).toBe(true); + }); +}); + +describe('RemoteBrowserProvider — PuppeteerPlugin', () => { + let mockLogger: ReturnType; + + beforeEach(() => { + mockLogger = createMockLogger(); + serviceLocator.setLogger(mockLogger); + }); + + test('provider connect() → calls connect with browserWSEndpoint', async () => { + const lib = createMockPuppeteerLibrary(); + + class SimpleProvider extends RemoteBrowserProvider { + async connect() { + return { url: 'wss://provider.io/cdp' }; + } + } + + const plugin = new PuppeteerPlugin(lib as any, { remoteBrowser: new SimpleProvider() }); + const ctx = plugin.createLaunchContext(); + await 
plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://provider.io/cdp' }); + expect(lib.launch).not.toHaveBeenCalled(); + }); + + test('provider with type=websocket throws in Puppeteer', () => { + const lib = createMockPuppeteerLibrary(); + + class WsProvider extends RemoteBrowserProvider { + override type = 'websocket' as const; + async connect() { + return { url: 'wss://test.io' }; + } + } + + expect(() => { + new PuppeteerPlugin(lib as any, { remoteBrowser: new WsProvider() }); + }).toThrow("does not support 'websocket'"); + }); + + test('provider release called on connection failure', async () => { + const lib = createMockPuppeteerLibrary(); + lib.connect.mockRejectedValue(new Error('Connection refused')); + + const releaseSpy = vi.fn(); + + class FailProvider extends RemoteBrowserProvider<{ id: string }> { + async connect() { + return { url: 'wss://fail.io', context: { id: 'sess-pptr' } }; + } + async release(context: { id: string }) { + releaseSpy(context); + } + } + + const plugin = new PuppeteerPlugin(lib as any, { remoteBrowser: new FailProvider() }); + const ctx = plugin.createLaunchContext(); + + await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); + expect(releaseSpy).toHaveBeenCalledWith({ id: 'sess-pptr' }); + }); + + test('provider isRemote = true', () => { + const lib = createMockPuppeteerLibrary(); + + class P extends RemoteBrowserProvider { + async connect() { + return { url: 'wss://test.io' }; + } + } + + const plugin = new PuppeteerPlugin(lib as any, { remoteBrowser: new P() }); + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); +}); diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index e3d2e7f44c8c..e7f4bb2b7f1a 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ 
b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -1,7 +1,12 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PlaywrightPlugin } from '@crawlee/browser-pool'; -import type { PlaywrightConnectOptions, PlaywrightConnectOverCDPOptions } from '@crawlee/browser-pool'; +import type { + PlaywrightConnectOptions, + PlaywrightConnectOverCDPOptions, + RemoteBrowserConfig, + RemoteBrowserProvider, +} from '@crawlee/browser-pool'; import { serviceLocator } from '@crawlee/core'; import ow from 'ow'; import type { Browser, BrowserType, LaunchOptions } from 'playwright'; @@ -84,6 +89,35 @@ export interface PlaywrightLaunchContext extends BrowserLaunchContext; +} + +/** + * Remote browser configuration for Playwright crawlers. + * Supports both CDP and WebSocket connection types. + */ +export interface PlaywrightRemoteBrowserConfig extends RemoteBrowserConfig { + /** + * Connection type to use. `'cdp'` uses `browserType.connectOverCDP()`, + * `'websocket'` uses `browserType.connect()`. 
+ * @default 'cdp' + */ + type?: 'cdp' | 'websocket'; } /** @@ -97,6 +131,7 @@ export class PlaywrightLauncher extends BrowserLauncher { launchContextOptions: ow.optional.object, connectOptions: ow.optional.object, connectOverCDPOptions: ow.optional.object, + remoteBrowser: ow.optional.object, }; /** diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts index 4113ea0d90bc..0458de65efa5 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts @@ -1,7 +1,7 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PuppeteerPlugin } from '@crawlee/browser-pool'; -import type { PuppeteerConnectOverCDPOptions } from '@crawlee/browser-pool'; +import type { PuppeteerConnectOverCDPOptions, RemoteBrowserConfig, RemoteBrowserProvider } from '@crawlee/browser-pool'; import { serviceLocator } from '@crawlee/core'; import ow from 'ow'; import type { Browser } from 'puppeteer'; @@ -73,6 +73,33 @@ export interface PuppeteerLaunchContext extends BrowserLaunchContext; +} + +/** + * Remote browser configuration for Puppeteer crawlers. + * Only CDP connections are supported (Puppeteer does not have a WebSocket connection mode). + */ +export interface PuppeteerRemoteBrowserConfig extends RemoteBrowserConfig { + /** + * Connection type. Only `'cdp'` is supported for Puppeteer. 
+ * @default 'cdp' + */ + type?: 'cdp'; } /** @@ -84,6 +111,7 @@ export class PuppeteerLauncher extends BrowserLauncher ...BrowserLauncher.optionsShape, launcher: ow.optional.object, connectOverCDPOptions: ow.optional.object, + remoteBrowser: ow.optional.object, }; /** diff --git a/temp-examples/examples/browserbase-playwright-ws.ts b/temp-examples/examples/browserbase-playwright-ws.ts index 656d2cf3a0f0..f576a9ee9add 100644 --- a/temp-examples/examples/browserbase-playwright-ws.ts +++ b/temp-examples/examples/browserbase-playwright-ws.ts @@ -1,55 +1,54 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; -// Browserbase requires two env variables: -// - BROWSERBASE_API_KEY: Your API key for authentication -// - BROWSERBASE_PROJECT_ID: The project to create sessions in const apiKey = process.env.BROWSERBASE_API_KEY; const projectId = process.env.BROWSERBASE_PROJECT_ID; -if (!apiKey) { - throw new Error('BROWSERBASE_API_KEY env variable is required'); +if (!apiKey) throw new Error('BROWSERBASE_API_KEY env variable is required'); +if (!projectId) throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); + +class BrowserbaseWsProvider extends RemoteBrowserProvider<{ id: string }> { + override type = 'websocket' as const; + + async connect() { + const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ projectId }), + }); + + if (!response.ok) { + throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); + } + + const session = await response.json(); + console.log(`>>> Session created: ${session.id}`); + + const url = `wss://connect.browserbase.com?apiKey=${apiKey}&sessionId=${session.id}`; + return { url, context: { id: session.id } }; + } + + async release({ id }: { id: string }) { + await 
fetch(`https://api.browserbase.com/v1/sessions/${id}`, { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ status: 'REQUEST_RELEASE' }), + }).catch(() => {}); + console.log(`<<< Session released: ${id}`); + } } -if (!projectId) { - throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); -} - -// Step 1: Create a Browserbase session via REST API. -// This returns a session ID that we use to construct the WebSocket URL. -// You have 5 minutes to connect before the session terminates. -const response = await fetch('https://api.browserbase.com/v1/sessions', { - method: 'POST', - headers: { - 'x-bb-api-key': apiKey, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ projectId }), -}); - -if (!response.ok) { - throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); -} - -const session = await response.json(); -console.log(`Created Browserbase session: ${session.id}`); - -// Step 2: Connect to the session using Playwright's WebSocket connection. -// The WS URL is constructed from the API key and session ID. 
-const wsUrl = `wss://connect.browserbase.com?apiKey=${apiKey}&sessionId=${session.id}`; - const crawler = new PlaywrightCrawler({ launchContext: { - connectOptions: { - wsEndpoint: wsUrl, - }, + remoteBrowser: new BrowserbaseWsProvider(), }, + maxRequestsPerCrawl: 1, async requestHandler({ page, request }) { const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); }, - maxRequestsPerCrawl: 1, }); await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/browserbase-playwright.ts b/temp-examples/examples/browserbase-playwright.ts index 78ce8ca5569e..30686afa2924 100644 --- a/temp-examples/examples/browserbase-playwright.ts +++ b/temp-examples/examples/browserbase-playwright.ts @@ -1,53 +1,68 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; -// Browserbase requires two env variables: -// - BROWSERBASE_API_KEY: Your API key for authentication -// - BROWSERBASE_PROJECT_ID: The project to create sessions in const apiKey = process.env.BROWSERBASE_API_KEY; const projectId = process.env.BROWSERBASE_PROJECT_ID; -if (!apiKey) { - throw new Error('BROWSERBASE_API_KEY env variable is required'); -} +if (!apiKey) throw new Error('BROWSERBASE_API_KEY env variable is required'); +if (!projectId) throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); -if (!projectId) { - throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); -} +class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { + async connect() { + const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ projectId }), + }); -// Step 1: Create a Browserbase session via REST API. -// This returns a connectUrl that we can use with Playwright's connectOverCDP. 
-// You have 5 minutes to connect before the session terminates. -const response = await fetch('https://api.browserbase.com/v1/sessions', { - method: 'POST', - headers: { - 'x-bb-api-key': apiKey, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ projectId }), -}); + if (!response.ok) { + throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); + } -if (!response.ok) { - throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); -} + const session = await response.json(); + console.log(`>>> Session created: ${session.id}`); + + return { url: session.connectUrl, context: { id: session.id } }; + } -const session = await response.json(); -console.log(`Created Browserbase session: ${session.id}`); + async release({ id }: { id: string }) { + await fetch(`https://api.browserbase.com/v1/sessions/${id}`, { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ status: 'REQUEST_RELEASE' }), + }).catch(() => {}); + console.log(`<<< Session released: ${id}`); + } +} -// Step 2: Connect to the session using Playwright's CDP connection. -// The connectUrl from the session response is used as the CDP endpoint. 
const crawler = new PlaywrightCrawler({ launchContext: { - connectOverCDPOptions: { - endpointURL: session.connectUrl, - }, + remoteBrowser: new BrowserbaseProvider(), + }, + browserPoolOptions: { + retireBrowserAfterPageCount: 3, + maxOpenPagesPerBrowser: 1, }, + // Browserbase free tier: 3 concurrent sessions + maxConcurrency: 2, + maxRequestsPerCrawl: 10, async requestHandler({ page, request }) { const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); }, - maxRequestsPerCrawl: 1, }); -await crawler.run(['https://example.com']); +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); diff --git a/temp-examples/examples/browserbase-puppeteer.ts b/temp-examples/examples/browserbase-puppeteer.ts index f6dcce121965..14fff4c90034 100644 --- a/temp-examples/examples/browserbase-puppeteer.ts +++ b/temp-examples/examples/browserbase-puppeteer.ts @@ -1,53 +1,67 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PuppeteerCrawler } from 'crawlee'; -// Browserbase requires two env variables: -// - BROWSERBASE_API_KEY: Your API key for authentication -// - BROWSERBASE_PROJECT_ID: The project to create sessions in const apiKey = process.env.BROWSERBASE_API_KEY; const projectId = process.env.BROWSERBASE_PROJECT_ID; -if (!apiKey) { - throw new Error('BROWSERBASE_API_KEY env variable is required'); -} +if (!apiKey) throw new Error('BROWSERBASE_API_KEY env variable is required'); +if (!projectId) throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); -if (!projectId) { - throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); -} +class BrowserbaseProvider extends 
RemoteBrowserProvider<{ id: string }> { + async connect() { + const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ projectId }), + }); -// Step 1: Create a Browserbase session via REST API. -// This returns a connectUrl that we can use with Puppeteer's CDP connection. -// You have 5 minutes to connect before the session terminates. -const response = await fetch('https://api.browserbase.com/v1/sessions', { - method: 'POST', - headers: { - 'x-bb-api-key': apiKey, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ projectId }), -}); + if (!response.ok) { + throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); + } -if (!response.ok) { - throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); -} + const session = await response.json(); + console.log(`>>> Session created: ${session.id}`); + + return { url: session.connectUrl, context: { id: session.id } }; + } -const session = await response.json(); -console.log(`Created Browserbase session: ${session.id}`); + async release({ id }: { id: string }) { + await fetch(`https://api.browserbase.com/v1/sessions/${id}`, { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ status: 'REQUEST_RELEASE' }), + }).catch(() => {}); + console.log(`<<< Session released: ${id}`); + } +} -// Step 2: Connect to the session using Puppeteer's CDP connection. -// The connectUrl from the session response is used as the browserWSEndpoint. 
const crawler = new PuppeteerCrawler({ launchContext: { - connectOverCDPOptions: { - browserWSEndpoint: session.connectUrl, - }, + remoteBrowser: new BrowserbaseProvider(), + }, + browserPoolOptions: { + retireBrowserAfterPageCount: 3, + maxOpenPagesPerBrowser: 1, }, + maxConcurrency: 2, + maxRequestsPerCrawl: 10, async requestHandler({ page, request }) { const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); }, - maxRequestsPerCrawl: 1, }); -await crawler.run(['https://example.com']); +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); diff --git a/temp-examples/examples/browserless-playwright-ws.ts b/temp-examples/examples/browserless-playwright-ws.ts index ec659b59f025..3d7727d1db0e 100644 --- a/temp-examples/examples/browserless-playwright-ws.ts +++ b/temp-examples/examples/browserless-playwright-ws.ts @@ -1,29 +1,47 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; +// Local Docker (preferred): docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium +// Remote: set BROWSERLESS_TOKEN in .env const token = process.env.BROWSERLESS_TOKEN; +const base = token ? 'wss://production-sfo.browserless.io' : 'ws://localhost:3000'; +const endpointUrl = token ? 
`${base}/chromium/playwright?token=${token}` : `${base}/chromium/playwright`; -if (!token) { - throw new Error('BROWSERLESS_TOKEN env variable is required'); +class BrowserlessWsProvider extends RemoteBrowserProvider { + override type = 'websocket' as const; + + async connect() { + return { url: endpointUrl }; + } } const crawler = new PlaywrightCrawler({ launchContext: { - connectOptions: { - wsEndpoint: `wss://production-sfo.browserless.io/chromium/playwright?token=${token}`, - }, + remoteBrowser: new BrowserlessWsProvider(), }, - async requestHandler({ page, request, enqueueLinks }) { - const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); - - await enqueueLinks({ - globs: ['https://www.crawlee.dev/**'], - limit: 5, - }); + browserPoolOptions: { + retireBrowserAfterPageCount: 5, + maxOpenPagesPerBrowser: 1, }, + maxConcurrency: 4, maxRequestsPerCrawl: 10, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); + }, }); -await crawler.run(['https://www.crawlee.dev']); +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); diff --git a/temp-examples/examples/browserless-playwright.ts b/temp-examples/examples/browserless-playwright.ts index ca7712c62ed0..bc4c72abfbc2 100644 --- a/temp-examples/examples/browserless-playwright.ts +++ b/temp-examples/examples/browserless-playwright.ts @@ -1,29 +1,42 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; +// Local Docker (preferred): docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium +// Remote: set BROWSERLESS_TOKEN in .env const token = 
process.env.BROWSERLESS_TOKEN; +const endpointUrl = token ? `wss://production-sfo.browserless.io?token=${token}` : 'ws://localhost:3000'; -if (!token) { - throw new Error('BROWSERLESS_TOKEN env variable is required'); +class BrowserlessProvider extends RemoteBrowserProvider { + async connect() { + return { url: endpointUrl }; + } } const crawler = new PlaywrightCrawler({ launchContext: { - connectOverCDPOptions: { - endpointURL: `wss://production-sfo.browserless.io?token=${token}`, - }, + remoteBrowser: new BrowserlessProvider(), }, - async requestHandler({ page, request, enqueueLinks }) { + browserPoolOptions: { + retireBrowserAfterPageCount: 5, + }, + maxConcurrency: 1, + maxRequestsPerCrawl: 9, + async requestHandler({ page, request }) { const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); - - await enqueueLinks({ - globs: ['https://www.crawlee.dev/**'], - limit: 5, - }); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); }, - maxRequestsPerCrawl: 10, }); -await crawler.run(['https://www.crawlee.dev']); +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', +]); diff --git a/temp-examples/examples/browserless-puppeteer.ts b/temp-examples/examples/browserless-puppeteer.ts index c47fbe214420..d49d3e2c6f05 100644 --- a/temp-examples/examples/browserless-puppeteer.ts +++ b/temp-examples/examples/browserless-puppeteer.ts @@ -1,29 +1,43 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PuppeteerCrawler } from 'crawlee'; +// Local Docker (preferred): docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium +// Remote: set BROWSERLESS_TOKEN in .env const token = process.env.BROWSERLESS_TOKEN; +const endpointUrl = token ? 
`wss://production-sfo.browserless.io?token=${token}` : 'ws://localhost:3000'; -if (!token) { - throw new Error('BROWSERLESS_TOKEN env variable is required'); +class BrowserlessProvider extends RemoteBrowserProvider { + async connect() { + return { url: endpointUrl }; + } } const crawler = new PuppeteerCrawler({ launchContext: { - connectOverCDPOptions: { - browserWSEndpoint: `wss://production-sfo.browserless.io?token=${token}`, - }, + remoteBrowser: new BrowserlessProvider(), }, - async requestHandler({ page, request, enqueueLinks }) { + browserPoolOptions: { + retireBrowserAfterPageCount: 5, + maxOpenPagesPerBrowser: 1, + }, + maxConcurrency: 4, + maxRequestsPerCrawl: 9, + async requestHandler({ page, request }) { const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); - - await enqueueLinks({ - globs: ['https://www.crawlee.dev/**'], - limit: 5, - }); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); }, - maxRequestsPerCrawl: 10, }); -await crawler.run(['https://www.crawlee.dev']); +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', +]); diff --git a/temp-examples/examples/rebrowser-playwright-ws.ts b/temp-examples/examples/rebrowser-playwright-ws.ts index 31587ceca6bb..7c783150e45d 100644 --- a/temp-examples/examples/rebrowser-playwright-ws.ts +++ b/temp-examples/examples/rebrowser-playwright-ws.ts @@ -1,39 +1,42 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; const apiKey = process.env.REBROWSER_API_KEY; const profileId = process.env.REBROWSER_PROFILE_ID; -if (!apiKey) { - throw new Error('REBROWSER_API_KEY env variable is required'); -} +if (!apiKey) throw new Error('REBROWSER_API_KEY env variable is required'); 
-// Step 1: Start a Rebrowser run via REST API. -// This gives you a dedicated WebSocket endpoint for the session. -// You can optionally specify a profileId and proxyUrl for advanced control. -const startRunUrl = new URL(`https://rebrowser.net/api/startRun?apikey=${apiKey}`); +// Rebrowser WS connection: starts a dedicated run via REST API, +// which gives you a WebSocket endpoint for Playwright's native protocol. +class RebrowserWsProvider extends RemoteBrowserProvider { + override type = 'websocket' as const; -if (profileId) { - startRunUrl.searchParams.set('profileId', profileId); - console.log(`Using Rebrowser profile: ${profileId}`); -} + async connect() { + const url = new URL(`https://rebrowser.net/api/startRun?apikey=${apiKey}`); -const response = await fetch(startRunUrl.toString()); + if (profileId) { + url.searchParams.set('profileId', profileId); + console.log(`Using Rebrowser profile: ${profileId}`); + } -if (!response.ok) { - throw new Error(`Failed to start Rebrowser run: ${response.status} ${response.statusText}`); -} + const response = await fetch(url.toString()); -const run = await response.json(); -console.log(`Started Rebrowser run with wsEndpoint: ${run.wsEndpoint}`); + if (!response.ok) { + throw new Error(`Failed to start Rebrowser run: ${response.status} ${response.statusText}`); + } + + const run = await response.json(); + console.log(`Started Rebrowser run with wsEndpoint: ${run.wsEndpoint}`); + + return { url: run.wsEndpoint }; + } +} -// Step 2: Connect to the run using Playwright's WebSocket connection. const crawler = new PlaywrightCrawler({ launchContext: { - connectOptions: { - wsEndpoint: run.wsEndpoint, - }, + remoteBrowser: new RebrowserWsProvider(), }, async requestHandler({ page, request }) { const title = await page.title(); @@ -44,7 +47,6 @@ const crawler = new PlaywrightCrawler({ await crawler.run(['https://example.com']); -// Step 3: Finish the run to stop billing. 
-// Rebrowser recommends explicit finishRun to avoid idle billing. -// The browser disconnects automatically after the crawl, but calling finishRun -// ensures the run is cleanly terminated on Rebrowser's side. +// Note: Rebrowser recommends calling finishRun after you're done to avoid idle billing. +// With Crawlee, the browser disconnects automatically after the crawl finishes, +// which should end the run. For explicit control, use the REST API finishRun endpoint. diff --git a/temp-examples/examples/rebrowser-playwright.ts b/temp-examples/examples/rebrowser-playwright.ts index f88783238192..45a8804dae57 100644 --- a/temp-examples/examples/rebrowser-playwright.ts +++ b/temp-examples/examples/rebrowser-playwright.ts @@ -1,21 +1,23 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; const apiKey = process.env.REBROWSER_API_KEY; - -if (!apiKey) { - throw new Error('REBROWSER_API_KEY env variable is required'); -} +if (!apiKey) throw new Error('REBROWSER_API_KEY env variable is required'); // Rebrowser simple connection: no profile or run creation needed. // A random profile is auto-selected when you connect with just an API key. // Proxies are managed via the Rebrowser dashboard or WS URL params. 
+class RebrowserProvider extends RemoteBrowserProvider { + async connect() { + return { url: `wss://api.rebrowser.net?apikey=${apiKey}` }; + } +} + const crawler = new PlaywrightCrawler({ launchContext: { - connectOverCDPOptions: { - endpointURL: `wss://api.rebrowser.net?apikey=${apiKey}`, - }, + remoteBrowser: new RebrowserProvider(), }, async requestHandler({ page, request }) { const title = await page.title(); diff --git a/temp-examples/examples/rebrowser-puppeteer.ts b/temp-examples/examples/rebrowser-puppeteer.ts index 54d49065c712..51691a8f85f4 100644 --- a/temp-examples/examples/rebrowser-puppeteer.ts +++ b/temp-examples/examples/rebrowser-puppeteer.ts @@ -1,21 +1,23 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PuppeteerCrawler } from 'crawlee'; const apiKey = process.env.REBROWSER_API_KEY; - -if (!apiKey) { - throw new Error('REBROWSER_API_KEY env variable is required'); -} +if (!apiKey) throw new Error('REBROWSER_API_KEY env variable is required'); // Rebrowser simple connection: no profile or run creation needed. // A random profile is auto-selected when you connect with just an API key. // Proxies are managed via the Rebrowser dashboard or WS URL params. 
+class RebrowserProvider extends RemoteBrowserProvider { + async connect() { + return { url: `wss://api.rebrowser.net?apikey=${apiKey}` }; + } +} + const crawler = new PuppeteerCrawler({ launchContext: { - connectOverCDPOptions: { - browserWSEndpoint: `wss://api.rebrowser.net?apikey=${apiKey}`, - }, + remoteBrowser: new RebrowserProvider(), }, async requestHandler({ page, request }) { const title = await page.title(); diff --git a/temp-examples/examples/steel-playwright-ws.ts b/temp-examples/examples/steel-playwright-ws.ts index 55f4712a5315..e560972891d9 100644 --- a/temp-examples/examples/steel-playwright-ws.ts +++ b/temp-examples/examples/steel-playwright-ws.ts @@ -1,51 +1,70 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; const apiKey = process.env.STEEL_API_KEY; +if (!apiKey) throw new Error('STEEL_API_KEY env variable is required'); -if (!apiKey) { - throw new Error('STEEL_API_KEY env variable is required'); -} +class SteelWsProvider extends RemoteBrowserProvider<{ id: string }> { + override type = 'websocket' as const; -// Step 1: Create a Steel session via REST API. -// Explicit session creation enables advanced features like proxy and CAPTCHA solving. 
-const response = await fetch('https://api.steel.dev/v1/sessions', { - method: 'POST', - headers: { - 'Steel-Api-Key': apiKey, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ useProxy: true, solveCaptcha: true }), -}); + async connect() { + const response = await fetch('https://api.steel.dev/v1/sessions', { + method: 'POST', + headers: { 'Steel-Api-Key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({}), + }); -if (!response.ok) { - throw new Error(`Failed to create Steel session: ${response.status} ${response.statusText}`); -} + if (!response.ok) { + throw new Error(`Failed to create Steel session: ${response.status} ${response.statusText}`); + } + + const session = await response.json(); + console.log(`>>> Session created: ${session.id}`); -const session = await response.json(); -console.log(`Created Steel session: ${session.id}`); + return { + url: `wss://connect.steel.dev?apiKey=${apiKey}&sessionId=${session.id}`, + context: { id: session.id }, + }; + } -// Step 2: Connect to the session using Playwright's WebSocket connection. -// The session ID is passed as a query parameter to the Steel WebSocket endpoint. + async release({ id }: { id: string }) { + await fetch(`https://api.steel.dev/v1/sessions/${id}/release`, { + method: 'POST', + headers: { 'Steel-Api-Key': apiKey }, + }).catch(() => {}); + console.log(`<<< Session released: ${id}`); + } +} + +// Note: Steel may not support the Playwright WebSocket protocol. +// If this hangs on connect, use the CDP variant (steel-playwright.ts) instead. 
const crawler = new PlaywrightCrawler({ launchContext: { - connectOptions: { - wsEndpoint: `wss://connect.steel.dev?apiKey=${apiKey}&sessionId=${session.id}`, - }, + remoteBrowser: new SteelWsProvider(), + }, + browserPoolOptions: { + retireBrowserAfterPageCount: 3, + maxOpenPagesPerBrowser: 1, }, + maxConcurrency: 4, + maxRequestsPerCrawl: 10, async requestHandler({ page, request }) { const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); }, - maxRequestsPerCrawl: 1, }); -await crawler.run(['https://example.com']); - -// Step 3: Release the session (optional — Steel auto-releases on disconnect). -await fetch(`https://api.steel.dev/v1/sessions/${session.id}/release`, { - method: 'POST', - headers: { 'Steel-Api-Key': apiKey }, -}); -console.log(`Released Steel session: ${session.id}`); +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); diff --git a/temp-examples/examples/steel-playwright.ts b/temp-examples/examples/steel-playwright.ts index 7bf2913054e9..9e9354412985 100644 --- a/temp-examples/examples/steel-playwright.ts +++ b/temp-examples/examples/steel-playwright.ts @@ -1,26 +1,65 @@ import 'dotenv/config'; +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; const apiKey = process.env.STEEL_API_KEY; +if (!apiKey) throw new Error('STEEL_API_KEY env variable is required'); -if (!apiKey) { - throw new Error('STEEL_API_KEY env variable is required'); +class SteelProvider extends RemoteBrowserProvider<{ id: string }> { + async connect() { + const response = await fetch('https://api.steel.dev/v1/sessions', { + method: 'POST', + headers: { 'Steel-Api-Key': 
apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({}), + }); + + if (!response.ok) { + throw new Error(`Failed to create Steel session: ${response.status} ${response.statusText}`); + } + + const session = await response.json(); + console.log(`>>> Session created: ${session.id}`); + + return { + url: `wss://connect.steel.dev?apiKey=${apiKey}&sessionId=${session.id}`, + context: { id: session.id }, + }; + } + + async release({ id }: { id: string }) { + await fetch(`https://api.steel.dev/v1/sessions/${id}/release`, { + method: 'POST', + headers: { 'Steel-Api-Key': apiKey }, + }).catch(() => {}); + console.log(`<<< Session released: ${id}`); + } } -// Steel direct connection: no session creation needed. -// A session is auto-created when you connect and auto-released on disconnect. const crawler = new PlaywrightCrawler({ launchContext: { - connectOverCDPOptions: { - endpointURL: `wss://connect.steel.dev?apiKey=${apiKey}`, - }, + remoteBrowser: new SteelProvider(), + }, + browserPoolOptions: { + retireBrowserAfterPageCount: 5, }, + maxConcurrency: 1, + maxRequestsPerCrawl: 10, async requestHandler({ page, request }) { const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); }, - maxRequestsPerCrawl: 1, }); -await crawler.run(['https://example.com']); +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); diff --git a/temp-examples/examples/steel-puppeteer.ts b/temp-examples/examples/steel-puppeteer.ts index 68dc3cdb59a9..f2985f8158a7 100644 --- a/temp-examples/examples/steel-puppeteer.ts +++ b/temp-examples/examples/steel-puppeteer.ts @@ -1,26 +1,42 @@ import 'dotenv/config'; +import { 
RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PuppeteerCrawler } from 'crawlee'; const apiKey = process.env.STEEL_API_KEY; +if (!apiKey) throw new Error('STEEL_API_KEY env variable is required'); -if (!apiKey) { - throw new Error('STEEL_API_KEY env variable is required'); +class SteelProvider extends RemoteBrowserProvider { + async connect() { + return { url: `wss://connect.steel.dev?apiKey=${apiKey}` }; + } } -// Steel direct connection: no session creation needed. -// A session is auto-created when you connect and auto-released on disconnect. const crawler = new PuppeteerCrawler({ launchContext: { - connectOverCDPOptions: { - browserWSEndpoint: `wss://connect.steel.dev?apiKey=${apiKey}`, - }, + remoteBrowser: new SteelProvider(), }, + browserPoolOptions: { + retireBrowserAfterPageCount: 3, + maxOpenPagesPerBrowser: 1, + }, + maxConcurrency: 4, + maxRequestsPerCrawl: 10, async requestHandler({ page, request }) { const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); }, - maxRequestsPerCrawl: 1, }); -await crawler.run(['https://example.com']); +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); diff --git a/temp-examples/package.json b/temp-examples/package.json index cbcb71ed6c1b..32e7681d0202 100644 --- a/temp-examples/package.json +++ b/temp-examples/package.json @@ -1,38 +1,39 @@ { - "name": "temp-examples", - "version": "1.0.0", - "private": true, - "type": "module", - "scripts": { - "example:browserless-puppeteer": "node --experimental-strip-types examples/browserless-puppeteer.ts", - "example:browserless-playwright": "node --experimental-strip-types examples/browserless-playwright.ts", - 
"example:browserless-playwright-ws": "node --experimental-strip-types examples/browserless-playwright-ws.ts", - "example:browserbase-puppeteer": "node --experimental-strip-types examples/browserbase-puppeteer.ts", - "example:browserbase-playwright": "node --experimental-strip-types examples/browserbase-playwright.ts", - "example:browserbase-playwright-ws": "node --experimental-strip-types examples/browserbase-playwright-ws.ts", - "example:steel-puppeteer": "node --experimental-strip-types examples/steel-puppeteer.ts", - "example:steel-playwright": "node --experimental-strip-types examples/steel-playwright.ts", - "example:steel-playwright-ws": "node --experimental-strip-types examples/steel-playwright-ws.ts", - "example:rebrowser-puppeteer": "node --experimental-strip-types examples/rebrowser-puppeteer.ts", - "example:rebrowser-playwright": "node --experimental-strip-types examples/rebrowser-playwright.ts", - "example:rebrowser-playwright-ws": "node --experimental-strip-types examples/rebrowser-playwright-ws.ts" - }, - "dependencies": { - "@crawlee/basic": "file:../packages/basic-crawler/dist", - "@crawlee/browser": "file:../packages/browser-crawler/dist", - "@crawlee/browser-pool": "file:../packages/browser-pool/dist", - "@crawlee/cheerio": "file:../packages/cheerio-crawler/dist", - "@crawlee/cli": "file:../packages/cli/dist", - "@crawlee/core": "file:../packages/core/dist", - "@crawlee/http": "file:../packages/http-crawler/dist", - "@crawlee/jsdom": "file:../packages/jsdom-crawler/dist", - "@crawlee/linkedom": "file:../packages/linkedom-crawler/dist", - "@crawlee/playwright": "file:../packages/playwright-crawler/dist", - "@crawlee/puppeteer": "file:../packages/puppeteer-crawler/dist", - "@crawlee/types": "file:../packages/types/dist", - "@crawlee/utils": "file:../packages/utils/dist", - "@types/node": "^25.2.0", - "crawlee": "file:../packages/crawlee/dist", - "dotenv": "^17.3.1" - } + "name": "temp-examples", + "version": "1.0.0", + "private": true, + "type": 
"module", + "scripts": { + "docker:browserless": "docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium", + "example:browserless-puppeteer": "node --experimental-strip-types examples/browserless-puppeteer.ts", + "example:browserless-playwright": "node --experimental-strip-types examples/browserless-playwright.ts", + "example:browserless-playwright-ws": "node --experimental-strip-types examples/browserless-playwright-ws.ts", + "example:browserbase-puppeteer": "node --experimental-strip-types examples/browserbase-puppeteer.ts", + "example:browserbase-playwright": "node --experimental-strip-types examples/browserbase-playwright.ts", + "example:browserbase-playwright-ws": "node --experimental-strip-types examples/browserbase-playwright-ws.ts", + "example:steel-puppeteer": "node --experimental-strip-types examples/steel-puppeteer.ts", + "example:steel-playwright": "node --experimental-strip-types examples/steel-playwright.ts", + "example:steel-playwright-ws": "node --experimental-strip-types examples/steel-playwright-ws.ts", + "example:rebrowser-puppeteer": "node --experimental-strip-types examples/rebrowser-puppeteer.ts", + "example:rebrowser-playwright": "node --experimental-strip-types examples/rebrowser-playwright.ts", + "example:rebrowser-playwright-ws": "node --experimental-strip-types examples/rebrowser-playwright-ws.ts" + }, + "dependencies": { + "@crawlee/basic": "file:../packages/basic-crawler/dist", + "@crawlee/browser": "file:../packages/browser-crawler/dist", + "@crawlee/browser-pool": "file:../packages/browser-pool/dist", + "@crawlee/cheerio": "file:../packages/cheerio-crawler/dist", + "@crawlee/cli": "file:../packages/cli/dist", + "@crawlee/core": "file:../packages/core/dist", + "@crawlee/http": "file:../packages/http-crawler/dist", + "@crawlee/jsdom": "file:../packages/jsdom-crawler/dist", + "@crawlee/linkedom": "file:../packages/linkedom-crawler/dist", + "@crawlee/playwright": "file:../packages/playwright-crawler/dist", + 
"@crawlee/puppeteer": "file:../packages/puppeteer-crawler/dist", + "@crawlee/types": "file:../packages/types/dist", + "@crawlee/utils": "file:../packages/utils/dist", + "@types/node": "^25.2.0", + "crawlee": "file:../packages/crawlee/dist", + "dotenv": "^17.3.1" + } } From 945ef9261bdc7d1e27b0ed2ae962f33b9ccdf12d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Wed, 29 Apr 2026 21:27:06 +0200 Subject: [PATCH 16/45] feat(browser-pool): add maxOpenBrowsers to prevent concurrent session overflow Remote browser services enforce concurrent session limits. During browser retirement transitions, the pool could briefly exceed the limit by launching a new browser before the retired one fully closed. - Add maxOpenBrowsers to RemoteBrowserConfig and RemoteBrowserProvider - BrowserCrawler reads it from the plugin and applies it to the pool - Gate new tasks via _isTaskReadyFunction (same pattern as maxConcurrency) - Add hasFreeBrowserSlot() and hasActiveBrowserWithFreeCapacity() to BrowserPool - Only activates when maxOpenBrowsers is set (remote browsers); local browsers unaffected --- .../src/internals/browser-crawler.ts | 16 ++++ .../src/abstract-classes/browser-plugin.ts | 7 ++ packages/browser-pool/src/browser-pool.ts | 24 ++++++ .../src/playwright/playwright-plugin.ts | 2 +- .../src/puppeteer/puppeteer-plugin.ts | 2 +- .../src/remote-browser-provider.ts | 6 ++ .../examples/browserless-overlap-test.ts | 80 +++++++++++++++++++ 7 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 temp-examples/examples/browserless-overlap-test.ts diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index 3c1f1a544ed8..ba5b0e00d747 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -393,6 +393,12 @@ export abstract class BrowserCrawler< this.browserPool = new BrowserPool({ 
...(browserPoolOptions as any), }); + + // Read maxOpenBrowsers from the remote browser config and apply it to the pool. + const remoteMaxBrowsers = this.browserPool.browserPlugins[0]?.remoteBrowser?.maxOpenBrowsers; + if (remoteMaxBrowsers) { + this.browserPool.maxOpenBrowsers = remoteMaxBrowsers; + } } protected override buildContextPipeline(): ContextPipeline< @@ -710,6 +716,16 @@ export abstract class BrowserCrawler< * Function for cleaning up after all requests are processed. * @ignore */ + protected override async _isTaskReadyFunction(): Promise<boolean> { + // Don't start new tasks if browser pool is at its limit and no active browser has capacity. + // AutoscaledPool will retry automatically when a browser closes and frees a slot. + if (!this.browserPool.hasFreeBrowserSlot() && !this.browserPool.hasActiveBrowserWithFreeCapacity()) { + return false; + } + + return super._isTaskReadyFunction(); + } + override async teardown(): Promise<void> { await this.browserPool.destroy(); await super.teardown(); diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index ccc768e0ff29..2b5b8372696b 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -98,6 +98,12 @@ export interface RemoteBrowserConfig { * @default 'cdp' */ type?: 'cdp' | 'websocket'; + /** + * Maximum number of browsers that can be open at the same time. + * When the limit is reached, the crawler waits for a browser to close before launching a new one. + * Set this to your remote service's concurrent session limit to avoid 429 errors.
+ */ + maxOpenBrowsers?: number; } export interface BrowserPluginOptions { @@ -211,6 +217,7 @@ export abstract class BrowserPlugin< endpoint: () => provider.connect(), release: ({ context }) => provider.release(context as any), type: provider.type, + maxOpenBrowsers: provider.maxOpenBrowsers, }; } else { this.remoteBrowser = remoteBrowser; diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index 20c56ff70f95..11b2fd4f2b6c 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ b/packages/browser-pool/src/browser-pool.ts @@ -303,6 +303,7 @@ export class BrowserPool< > extends TypedEmitter> { browserPlugins: BrowserPlugins; maxOpenPagesPerBrowser: number; + maxOpenBrowsers: number; retireBrowserAfterPageCount: number; operationTimeoutMillis: number; closeInactiveBrowserAfterMillis: number; @@ -395,6 +396,7 @@ export class BrowserPool< this.browserPlugins = browserPlugins as unknown as BrowserPlugins; this.maxOpenPagesPerBrowser = maxOpenPagesPerBrowser; + this.maxOpenBrowsers = Infinity; this.retireBrowserAfterPageCount = retireBrowserAfterPageCount; this.operationTimeoutMillis = operationTimeoutSecs * 1000; this.closeInactiveBrowserAfterMillis = closeInactiveBrowserAfterSecs * 1000; @@ -860,6 +862,28 @@ export class BrowserPool< } } + /** + * Returns `true` if the pool can accept a new browser launch without exceeding + * {@link BrowserPool.maxOpenBrowsers}. Counts starting, active, and retired browsers. + */ + hasFreeBrowserSlot(): boolean { + const total = + this.startingBrowserControllers.size + + this.activeBrowserControllers.size + + this.retiredBrowserControllers.size; + return total < this.maxOpenBrowsers; + } + + /** + * Returns `true` if any active browser has room for another page.
+ */ + hasActiveBrowserWithFreeCapacity(): boolean { + for (const controller of this.activeBrowserControllers) { + if (controller.activePages < this.maxOpenPagesPerBrowser) return true; + } + return false; + } + private _initializeFingerprinting(): void { const { useFingerprintCache = true, fingerprintCacheSize = 10_000 } = this.fingerprintOptions; this.fingerprintGenerator = new FingerprintGenerator(this.fingerprintOptions.fingerprintGeneratorOptions); diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index 6aa1787324e5..b59a7de6b19c 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -88,7 +88,7 @@ export class PlaywrightPlugin extends BrowserPlugin< if (options.useIncognitoPages === undefined) { this.useIncognitoPages = true; this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); - } else if (options.useIncognitoPages === false) { + } else if (!options.useIncognitoPages) { const isWebSocket = this.connectOptions || this.remoteBrowser?.type === 'websocket'; const message = isWebSocket ? 'useIncognitoPages is set to false with a remote WebSocket connection. 
' + diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 5a853594e2ac..df24096b7f49 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -69,7 +69,7 @@ export class PuppeteerPlugin extends BrowserPlugin< if (options.useIncognitoPages === undefined) { this.useIncognitoPages = true; this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); - } else if (options.useIncognitoPages === false) { + } else if (!options.useIncognitoPages) { this.log.warning( 'useIncognitoPages is set to false with a remote browser connection. ' + 'Pages will share cookies and storage on the remote browser instance.', diff --git a/packages/browser-pool/src/remote-browser-provider.ts b/packages/browser-pool/src/remote-browser-provider.ts index dc324d529c39..46fbb4568744 100644 --- a/packages/browser-pool/src/remote-browser-provider.ts +++ b/packages/browser-pool/src/remote-browser-provider.ts @@ -40,6 +40,12 @@ export abstract class RemoteBrowserProvider>> CONNECT active=${activeConnections} (peak=${peakConnections})`); + const token = process.env.BROWSERLESS_TOKEN; + const url = token ? 
`wss://production-sfo.browserless.io?token=${token}` : 'ws://localhost:3000'; + return { url }; + } + + async release() { + activeConnections--; + console.log(`<<< RELEASE active=${activeConnections}`); + } +} + +const crawler = new PlaywrightCrawler({ + launchContext: { + remoteBrowser: new BrowserlessProvider(), + }, + browserPoolOptions: { + // Retire after just 2 pages — forces frequent retirement + retireBrowserAfterPageCount: 2, + maxOpenPagesPerBrowser: 1, + }, + // 2 concurrent pages = 2 browsers needed, matching the docker CONCURRENT=2 + maxConcurrency: 2, + maxRequestsPerCrawl: 10, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); + +console.log(`\nPeak concurrent connections: ${peakConnections}`); +console.log(`Expected max: 2 (matching maxConcurrency)`); +if (peakConnections > 2) { + console.log(`⚠ OVERLAP DETECTED: ${peakConnections} browsers were open simultaneously`); +} From 2f6e9eb49a63400294a1f9a406423c88c4967b64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 30 Apr 2026 10:59:35 +0200 Subject: [PATCH 17/45] update remote browser --- .../src/remote-browser-provider.ts | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/packages/browser-pool/src/remote-browser-provider.ts b/packages/browser-pool/src/remote-browser-provider.ts index 46fbb4568744..8b714691a0be 100644 --- a/packages/browser-pool/src/remote-browser-provider.ts +++ b/packages/browser-pool/src/remote-browser-provider.ts @@ -4,28 +4,42 @@ * Implement this class to encapsulate the lifecycle of a remote browser 
session * (creation, connection URL resolution, and cleanup). The framework calls * {@link connect} once per browser launch and {@link release} when the browser - * closes or crashes. + * closes, crashes, the pool is destroyed, or the connection fails during launch. + * + * Pass the provider instance as the `remoteBrowser` option on the crawler's + * `launchContext` or directly on the plugin constructor: + * + * ```typescript + * const crawler = new PlaywrightCrawler({ + * launchContext: { + * remoteBrowser: new MyProvider(), + * }, + * }); + * ``` * * **Example — simple static endpoint (e.g. Browserless):** * ```typescript * class BrowserlessProvider extends RemoteBrowserProvider { - * constructor(private url: string) { super(); } - * async connect() { return { url: this.url }; } + * maxOpenBrowsers = 2; // respect the service's concurrent session limit + * + * async connect() { + * return { url: `wss://production-sfo.browserless.io?token=${token}` }; + * } * } * ``` * - * **Example — session lifecycle (e.g. Browserbase):** + * **Example — session lifecycle with concurrency limit (e.g. Browserbase):** * ```typescript * class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { - * constructor(private apiKey: string, private projectId: string) { super(); } + * maxOpenBrowsers = 2; // respect the service's concurrent session limit * * async connect() { - * const session = await createSession(this.apiKey, this.projectId); + * const session = await createSession(apiKey, projectId); * return { url: session.connectUrl, context: { id: session.id } }; * } * * async release(context: { id: string }) { - * await releaseSession(this.apiKey, context.id); + * await releaseSession(apiKey, context.id); * } * } * ``` @@ -53,10 +67,12 @@ export abstract class RemoteBrowserProvider | { url: string; context?: TContext }; /** - * Called when the browser closes, crashes, or the pool is destroyed. 
+ * Called when the browser closes, crashes, the pool is destroyed, or the + * connection fails right after {@link connect} succeeds. * Override this to clean up remote sessions, release API resources, etc. * * Errors thrown here are caught and logged as warnings — they never crash the crawler. + * Safe to assume this is called at most once per {@link connect} call. * * @param _context The same `context` object returned by {@link connect}. */ From 604346ad20b27dc91ae70df3529470325f238d3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 30 Apr 2026 15:43:16 +0200 Subject: [PATCH 18/45] update examples --- .../examples/browserbase-playwright.ts | 4 +- .../browserless-local-playwright-ws.ts | 48 +++++++ .../examples/browserless-local-playwright.ts | 42 +++++++ .../examples/browserless-local-puppeteer.ts | 43 +++++++ .../examples/browserless-overlap-test.ts | 2 +- .../examples/browserless-playwright-ws.ts | 8 +- .../examples/browserless-playwright.ts | 41 ++++-- .../examples/browserless-puppeteer.ts | 7 +- temp-examples/examples/steel-playwright-ws.ts | 70 ----------- temp-examples/examples/steel-playwright.ts | 3 +- temp-examples/examples/steel-puppeteer.ts | 4 +- temp-examples/package.json | 3 + temp-examples/readme.md | 117 ++++++++++++++++-- 13 files changed, 297 insertions(+), 95 deletions(-) create mode 100644 temp-examples/examples/browserless-local-playwright-ws.ts create mode 100644 temp-examples/examples/browserless-local-playwright.ts create mode 100644 temp-examples/examples/browserless-local-puppeteer.ts delete mode 100644 temp-examples/examples/steel-playwright-ws.ts diff --git a/temp-examples/examples/browserbase-playwright.ts b/temp-examples/examples/browserbase-playwright.ts index 30686afa2924..6a6c48b0e77f 100644 --- a/temp-examples/examples/browserbase-playwright.ts +++ b/temp-examples/examples/browserbase-playwright.ts @@ -10,6 +10,8 @@ if (!apiKey) throw new Error('BROWSERBASE_API_KEY env variable is required'); if 
(!projectId) throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { + maxOpenBrowsers = 1; + async connect() { const response = await fetch('https://api.browserbase.com/v1/sessions', { method: 'POST', @@ -38,6 +40,7 @@ class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { } const crawler = new PlaywrightCrawler({ + launchContext: { remoteBrowser: new BrowserbaseProvider(), }, @@ -45,7 +48,6 @@ const crawler = new PlaywrightCrawler({ retireBrowserAfterPageCount: 3, maxOpenPagesPerBrowser: 1, }, - // Browserbase free tier: 3 concurrent sessions maxConcurrency: 2, maxRequestsPerCrawl: 10, async requestHandler({ page, request }) { diff --git a/temp-examples/examples/browserless-local-playwright-ws.ts b/temp-examples/examples/browserless-local-playwright-ws.ts new file mode 100644 index 000000000000..5ec8933abbd3 --- /dev/null +++ b/temp-examples/examples/browserless-local-playwright-ws.ts @@ -0,0 +1,48 @@ +/** + * Browserless local — Playwright WebSocket protocol + * Docker: docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium + * + * Uses browserType.connect() (Playwright native WS), not connectOverCDP(). + * Browserless supports both protocols — the /chromium/playwright endpoint + * speaks the Playwright WebSocket protocol. 
+ */ +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; +import { PlaywrightCrawler } from 'crawlee'; + +class BrowserlessLocalWsProvider extends RemoteBrowserProvider { + override type = 'websocket' as const; + maxOpenBrowsers = 4; // match CONCURRENT=4 in docker + + async connect() { + return { url: 'ws://localhost:3000/chromium/playwright' }; + } +} + +const crawler = new PlaywrightCrawler({ + launchContext: { + remoteBrowser: new BrowserlessLocalWsProvider(), + }, + browserPoolOptions: { + retireBrowserAfterPageCount: 5, + maxOpenPagesPerBrowser: 1, + }, + maxConcurrency: 4, + maxRequestsPerCrawl: 10, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); diff --git a/temp-examples/examples/browserless-local-playwright.ts b/temp-examples/examples/browserless-local-playwright.ts new file mode 100644 index 000000000000..806938c6b98b --- /dev/null +++ b/temp-examples/examples/browserless-local-playwright.ts @@ -0,0 +1,42 @@ +/** + * Browserless local — Playwright CDP + * Docker: docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium + */ +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; +import { PlaywrightCrawler } from 'crawlee'; + +class BrowserlessLocalProvider extends RemoteBrowserProvider { + maxOpenBrowsers = 4; // match CONCURRENT=4 in docker + + async connect() { + return { url: 'ws://localhost:3000' }; + } +} + +const crawler = new PlaywrightCrawler({ + launchContext: { + remoteBrowser: new BrowserlessLocalProvider(), + }, + browserPoolOptions: { + retireBrowserAfterPageCount: 5, + }, + maxConcurrency: 4, 
+ maxRequestsPerCrawl: 10, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); diff --git a/temp-examples/examples/browserless-local-puppeteer.ts b/temp-examples/examples/browserless-local-puppeteer.ts new file mode 100644 index 000000000000..5eb8b751bd73 --- /dev/null +++ b/temp-examples/examples/browserless-local-puppeteer.ts @@ -0,0 +1,43 @@ +/** + * Browserless local — Puppeteer CDP + * Docker: docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium + */ +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; +import { PuppeteerCrawler } from 'crawlee'; + +class BrowserlessLocalProvider extends RemoteBrowserProvider { + maxOpenBrowsers = 4; // match CONCURRENT=4 in docker + + async connect() { + return { url: 'ws://localhost:3000' }; + } +} + +const crawler = new PuppeteerCrawler({ + launchContext: { + remoteBrowser: new BrowserlessLocalProvider(), + }, + browserPoolOptions: { + retireBrowserAfterPageCount: 5, + maxOpenPagesPerBrowser: 1, + }, + maxConcurrency: 4, + maxRequestsPerCrawl: 10, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[Page] ${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run([ + 'https://example.com', + 'https://crawlee.dev', + 'https://www.google.com', + 'https://github.com', + 'https://wikipedia.org', + 'https://httpbin.org', + 'https://jsonplaceholder.typicode.com', + 'https://news.ycombinator.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', +]); diff --git a/temp-examples/examples/browserless-overlap-test.ts 
b/temp-examples/examples/browserless-overlap-test.ts index 5783b798f399..5219ed81cc5e 100644 --- a/temp-examples/examples/browserless-overlap-test.ts +++ b/temp-examples/examples/browserless-overlap-test.ts @@ -25,7 +25,7 @@ let peakConnections = 0; class BrowserlessProvider extends RemoteBrowserProvider { // Cap to match the service limit — prevents overlap during retirement - override maxOpenBrowsers = 2; + maxOpenBrowsers = 2; async connect() { activeConnections++; diff --git a/temp-examples/examples/browserless-playwright-ws.ts b/temp-examples/examples/browserless-playwright-ws.ts index 3d7727d1db0e..0f6d027388f3 100644 --- a/temp-examples/examples/browserless-playwright-ws.ts +++ b/temp-examples/examples/browserless-playwright-ws.ts @@ -3,11 +3,11 @@ import 'dotenv/config'; import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; -// Local Docker (preferred): docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium -// Remote: set BROWSERLESS_TOKEN in .env +// Set BROWSERLESS_TOKEN in .env +// For local Docker, see browserless-local-playwright-ws.ts const token = process.env.BROWSERLESS_TOKEN; -const base = token ? 'wss://production-sfo.browserless.io' : 'ws://localhost:3000'; -const endpointUrl = token ? 
`${base}/chromium/playwright?token=${token}` : `${base}/chromium/playwright`; +if (!token) throw new Error('BROWSERLESS_TOKEN env variable is required'); +const endpointUrl = `wss://production-sfo.browserless.io/chromium/playwright?token=${token}`; class BrowserlessWsProvider extends RemoteBrowserProvider { override type = 'websocket' as const; diff --git a/temp-examples/examples/browserless-playwright.ts b/temp-examples/examples/browserless-playwright.ts index bc4c72abfbc2..0d0869372323 100644 --- a/temp-examples/examples/browserless-playwright.ts +++ b/temp-examples/examples/browserless-playwright.ts @@ -1,16 +1,42 @@ +/** + * Browserless remote — Playwright CDP with API-managed sessions + * Set BROWSERLESS_TOKEN in .env + * For local Docker, see browserless-local-playwright.ts + */ import 'dotenv/config'; import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; -// Local Docker (preferred): docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium -// Remote: set BROWSERLESS_TOKEN in .env const token = process.env.BROWSERLESS_TOKEN; -const endpointUrl = token ? 
`wss://production-sfo.browserless.io?token=${token}` : 'ws://localhost:3000'; +if (!token) throw new Error('BROWSERLESS_TOKEN env variable is required'); -class BrowserlessProvider extends RemoteBrowserProvider { +const baseUrl = 'https://production-sfo.browserless.io'; + +class BrowserlessProvider extends RemoteBrowserProvider<{ stopUrl: string }> { async connect() { - return { url: endpointUrl }; + const response = await fetch(`${baseUrl}/session?token=${token}`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ ttl: 60_000 }), + }); + + if (!response.ok) { + throw new Error(`Failed to create session: ${response.status} ${response.statusText}`); + } + + const session = await response.json(); + console.log(`>>> Session created: ${session.id}`); + + return { + url: session.connect, + context: { stopUrl: session.stop }, + }; + } + + async release({ stopUrl }: { stopUrl: string }) { + await fetch(`${stopUrl}&force=true`, { method: 'DELETE' }).catch(() => {}); + console.log(`<<< Session released`); } } @@ -21,8 +47,8 @@ const crawler = new PlaywrightCrawler({ browserPoolOptions: { retireBrowserAfterPageCount: 5, }, - maxConcurrency: 1, - maxRequestsPerCrawl: 9, + maxConcurrency: 4, + maxRequestsPerCrawl: 10, async requestHandler({ page, request }) { const title = await page.title(); console.log(`[Page] ${request.loadedUrl} — "${title}"`); @@ -39,4 +65,5 @@ await crawler.run([ 'https://jsonplaceholder.typicode.com', 'https://news.ycombinator.com', 'https://books.toscrape.com', + 'https://quotes.toscrape.com', ]); diff --git a/temp-examples/examples/browserless-puppeteer.ts b/temp-examples/examples/browserless-puppeteer.ts index d49d3e2c6f05..36b973807401 100644 --- a/temp-examples/examples/browserless-puppeteer.ts +++ b/temp-examples/examples/browserless-puppeteer.ts @@ -3,10 +3,11 @@ import 'dotenv/config'; import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PuppeteerCrawler } from 'crawlee'; -// Local 
Docker (preferred): docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium -// Remote: set BROWSERLESS_TOKEN in .env +// Set BROWSERLESS_TOKEN in .env +// For local Docker, see browserless-local-puppeteer.ts const token = process.env.BROWSERLESS_TOKEN; -const endpointUrl = token ? `wss://production-sfo.browserless.io?token=${token}` : 'ws://localhost:3000'; +if (!token) throw new Error('BROWSERLESS_TOKEN env variable is required'); +const endpointUrl = `wss://production-sfo.browserless.io?token=${token}`; class BrowserlessProvider extends RemoteBrowserProvider { async connect() { diff --git a/temp-examples/examples/steel-playwright-ws.ts b/temp-examples/examples/steel-playwright-ws.ts deleted file mode 100644 index e560972891d9..000000000000 --- a/temp-examples/examples/steel-playwright-ws.ts +++ /dev/null @@ -1,70 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -const apiKey = process.env.STEEL_API_KEY; -if (!apiKey) throw new Error('STEEL_API_KEY env variable is required'); - -class SteelWsProvider extends RemoteBrowserProvider<{ id: string }> { - override type = 'websocket' as const; - - async connect() { - const response = await fetch('https://api.steel.dev/v1/sessions', { - method: 'POST', - headers: { 'Steel-Api-Key': apiKey, 'Content-Type': 'application/json' }, - body: JSON.stringify({}), - }); - - if (!response.ok) { - throw new Error(`Failed to create Steel session: ${response.status} ${response.statusText}`); - } - - const session = await response.json(); - console.log(`>>> Session created: ${session.id}`); - - return { - url: `wss://connect.steel.dev?apiKey=${apiKey}&sessionId=${session.id}`, - context: { id: session.id }, - }; - } - - async release({ id }: { id: string }) { - await fetch(`https://api.steel.dev/v1/sessions/${id}/release`, { - method: 'POST', - headers: { 'Steel-Api-Key': apiKey }, - }).catch(() => {}); - console.log(`<<< 
Session released: ${id}`); - } -} - -// Note: Steel may not support the Playwright WebSocket protocol. -// If this hangs on connect, use the CDP variant (steel-playwright.ts) instead. -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new SteelWsProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 3, - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 4, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git a/temp-examples/examples/steel-playwright.ts b/temp-examples/examples/steel-playwright.ts index 9e9354412985..e14bf60bf31b 100644 --- a/temp-examples/examples/steel-playwright.ts +++ b/temp-examples/examples/steel-playwright.ts @@ -42,8 +42,9 @@ const crawler = new PlaywrightCrawler({ }, browserPoolOptions: { retireBrowserAfterPageCount: 5, + maxOpenPagesPerBrowser: 1, }, - maxConcurrency: 1, + maxConcurrency: 5, maxRequestsPerCrawl: 10, async requestHandler({ page, request }) { const title = await page.title(); diff --git a/temp-examples/examples/steel-puppeteer.ts b/temp-examples/examples/steel-puppeteer.ts index f2985f8158a7..80cfc4dea97d 100644 --- a/temp-examples/examples/steel-puppeteer.ts +++ b/temp-examples/examples/steel-puppeteer.ts @@ -7,6 +7,8 @@ const apiKey = process.env.STEEL_API_KEY; if (!apiKey) throw new Error('STEEL_API_KEY env variable is required'); class SteelProvider extends RemoteBrowserProvider { + maxOpenBrowsers = 4; // Steel Hobby tier effective concurrent limit + async connect() { return { url: `wss://connect.steel.dev?apiKey=${apiKey}` }; } 
@@ -17,7 +19,7 @@ const crawler = new PuppeteerCrawler({ remoteBrowser: new SteelProvider(), }, browserPoolOptions: { - retireBrowserAfterPageCount: 3, + retireBrowserAfterPageCount: 5, maxOpenPagesPerBrowser: 1, }, maxConcurrency: 4, diff --git a/temp-examples/package.json b/temp-examples/package.json index 32e7681d0202..221ecd659a37 100644 --- a/temp-examples/package.json +++ b/temp-examples/package.json @@ -8,6 +8,9 @@ "example:browserless-puppeteer": "node --experimental-strip-types examples/browserless-puppeteer.ts", "example:browserless-playwright": "node --experimental-strip-types examples/browserless-playwright.ts", "example:browserless-playwright-ws": "node --experimental-strip-types examples/browserless-playwright-ws.ts", + "example:browserless-local-puppeteer": "node --experimental-strip-types examples/browserless-local-puppeteer.ts", + "example:browserless-local-playwright": "node --experimental-strip-types examples/browserless-local-playwright.ts", + "example:browserless-local-playwright-ws": "node --experimental-strip-types examples/browserless-local-playwright-ws.ts", "example:browserbase-puppeteer": "node --experimental-strip-types examples/browserbase-puppeteer.ts", "example:browserbase-playwright": "node --experimental-strip-types examples/browserbase-playwright.ts", "example:browserbase-playwright-ws": "node --experimental-strip-types examples/browserbase-playwright-ws.ts", diff --git a/temp-examples/readme.md b/temp-examples/readme.md index a570750b6774..5eeede14c6f0 100644 --- a/temp-examples/readme.md +++ b/temp-examples/readme.md @@ -1,12 +1,115 @@ -#how to start +# Remote Browser Service Examples -``` -##root -nr clean -nr build +Examples for connecting Crawlee crawlers to remote browser services using `RemoteBrowserProvider`. + +## How to run + +```bash +# from repo root +npm run clean +npm run build cd temp-examples npm install -npm run example:browserless-puppeteer -... 
+npm run example:steel-puppeteer +``` + +## Steel + +**Website:** https://steel.dev +**Docs:** https://docs.steel.dev +**Protocol:** CDP only (no Playwright WebSocket protocol) + +### Connection modes + +Steel supports two ways to connect: + +1. **Auto-managed sessions** — connect directly to `wss://connect.steel.dev?apiKey=...`. Steel creates and cleans up the session automatically. Simplest approach. + +2. **API-managed sessions** — create a session via `POST /v1/sessions`, connect with the returned `sessionId`, release via `POST /v1/sessions/{id}/release`. Gives control over session options (proxy, geolocation, etc.) and explicit cleanup. + +### Concurrent session limits (Hobby/free tier) + +- Docs say 5 concurrent sessions +- In practice, only 4 connections succeed simultaneously +- Excess connections **hang silently** — no 429 error, no timeout, `puppeteer.connect()` / `connectOverCDP()` just never resolves +- Set `maxOpenBrowsers = 4` to stay safe + +### Playwright + +Steel exposes a CDP endpoint. 
Use `connectOverCDP()`, not `connect()`: + +```typescript +// Works — CDP +const browser = await chromium.connectOverCDP('wss://connect.steel.dev?apiKey=...'); + +// Hangs forever — Steel doesn't speak Playwright's WebSocket protocol +const browser = await chromium.connect('wss://connect.steel.dev?apiKey=...'); +``` + +### Examples + +| Example | Connection | Session management | +|---------|-----------|-------------------| +| `steel-puppeteer.ts` | Puppeteer CDP | Auto-managed | +| `steel-playwright.ts` | Playwright CDP | API-managed (create/release) | + +--- + +## Browserbase + +TODO + +## Browserless + +**Website:** https://browserless.io +**Docker:** `ghcr.io/browserless/chromium` +**Protocol:** CDP and Playwright WebSocket + +### Local setup (Docker) + +```bash +docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium ``` + +Or use the npm script: + +```bash +npm run docker:browserless +``` + +This starts a Browserless instance on `ws://localhost:3000` with a 4 concurrent session limit. + +### Connection modes + +Browserless supports both CDP and Playwright's native WebSocket protocol: + +- **CDP** — `ws://localhost:3000` (default endpoint) +- **Playwright WebSocket** — `ws://localhost:3000/chromium/playwright` (use `type: 'websocket'` on the provider) + +Unlike Steel, Browserless actually speaks the Playwright WebSocket protocol, so `browserType.connect()` works. + +### Session management + +The cloud version has a `/session` API for explicit session lifecycle: + +- **Create:** `POST /session?token=...` with `{ ttl: 60000 }` — returns `{ id, connect, stop }` +- **Connect:** Use the `connect` URL from the response +- **Release:** `DELETE {stop}&force=true` + +The local Docker image (open-source) does not have the `/session` API — sessions are auto-managed on connect/disconnect. 
+ +### Examples + +| Example | Connection | Session management | Target | +|---------|-----------|-------------------|--------| +| `browserless-local-puppeteer.ts` | Puppeteer CDP | Auto-managed | Docker | +| `browserless-local-playwright.ts` | Playwright CDP | Auto-managed | Docker | +| `browserless-local-playwright-ws.ts` | Playwright WebSocket | Auto-managed | Docker | +| `browserless-puppeteer.ts` | Puppeteer CDP | Auto-managed | Remote | +| `browserless-playwright.ts` | Playwright CDP | API-managed (create/release) | Remote | +| `browserless-playwright-ws.ts` | Playwright WebSocket | Auto-managed | Remote | + +## Rebrowser + +TODO From cf594b11d9a2999309c9fe29af842f94678068cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 12 May 2026 10:33:38 +0200 Subject: [PATCH 19/45] revert temp if statement --- packages/browser-crawler/src/internals/browser-crawler.ts | 7 ------- 1 file changed, 7 deletions(-) diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index 073ae4ac6e9b..c638e378defa 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -676,13 +676,6 @@ export abstract class BrowserCrawler< return; } - // Remote browsers are expensive — don't retire them when a session retires. - // Let retireBrowserAfterPageCount control the browser lifecycle instead. - // See also: PR #3605 which fixes the root cause (maxUsageCount: 1 in BasicCrawler). 
- if (browserController.launchContext.isRemote) { - return; - } - let sessionIds = this.browserSessionIds.get(browserController); if (sessionIds) { From 74d402ebba02ab793510c6e1fd96dea10a57cb13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 12 May 2026 12:06:24 +0200 Subject: [PATCH 20/45] feat: align useIncognitoPages defaults for remote browsers and enable cookie sharing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remote CDP browsers (both Puppeteer and Playwright) now default useIncognitoPages to false, matching local behavior. For Playwright CDP, the browser's default context is wrapped in PlaywrightBrowserWithPersistentContext so pages share cookies — the same mechanism used locally with launchPersistentContext(). Playwright WebSocket still defaults to true since connect() returns a browser with no default context to wrap. The wrapper passes the real Browser as parentBrowser so close() also closes the WebSocket transport and disconnected events are forwarded to the pool. 
--- .../src/playwright/playwright-browser.ts | 20 ++- .../src/playwright/playwright-plugin.ts | 66 +++++++--- .../src/puppeteer/puppeteer-plugin.ts | 17 +-- .../browser-pool/test/remote-browser.test.ts | 59 +++++---- ...okie-sharing-playwright-local-vs-remote.ts | 109 ++++++++++++++++ .../cookie-sharing-playwright-test.ts | 118 ++++++++++++++++++ .../cookie-sharing-session-across-browsers.ts | 82 ++++++++++++ temp-examples/examples/cookie-sharing-test.ts | 118 ++++++++++++++++++ temp-examples/package.json | 6 +- 9 files changed, 538 insertions(+), 57 deletions(-) create mode 100644 temp-examples/examples/cookie-sharing-playwright-local-vs-remote.ts create mode 100644 temp-examples/examples/cookie-sharing-playwright-test.ts create mode 100644 temp-examples/examples/cookie-sharing-session-across-browsers.ts create mode 100644 temp-examples/examples/cookie-sharing-test.ts diff --git a/packages/browser-pool/src/playwright/playwright-browser.ts b/packages/browser-pool/src/playwright/playwright-browser.ts index 2b94fce5421e..f944795d416b 100644 --- a/packages/browser-pool/src/playwright/playwright-browser.ts +++ b/packages/browser-pool/src/playwright/playwright-browser.ts @@ -1,10 +1,12 @@ import { EventEmitter } from 'node:events'; -import type { BrowserContext, BrowserType } from 'playwright'; +import type { Browser, BrowserContext, BrowserType } from 'playwright'; export interface BrowserOptions { browserContext: BrowserContext; version: string; + /** When wrapping a remote CDP browser's default context, pass the real Browser so it can be closed properly. 
*/ + parentBrowser?: Browser; } /** @@ -15,19 +17,28 @@ export class PlaywrightBrowser extends EventEmitter { private _version: string; private _isConnected = true; private _browserType?: BrowserType; + private _parentBrowser?: Browser; constructor(options: BrowserOptions) { super(); - const { browserContext, version } = options; + const { browserContext, version, parentBrowser } = options; this._browserContext = browserContext; - this._version = version; + this._parentBrowser = parentBrowser; this._browserContext.once('close', () => { this._isConnected = false; this.emit('disconnected'); }); + + // Forward real browser disconnection so the pool detects remote crashes. + if (parentBrowser) { + parentBrowser.once('disconnected', () => { + this._isConnected = false; + this.emit('disconnected'); + }); + } } async [Symbol.asyncDispose](): Promise { @@ -36,6 +47,9 @@ export class PlaywrightBrowser extends EventEmitter { async close(): Promise { await this._browserContext.close(); + if (this._parentBrowser) { + await this._parentBrowser.close().catch(() => {}); + } } contexts(): BrowserContext[] { diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index b59a7de6b19c..91de6032a836 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -81,21 +81,19 @@ export class PlaywrightPlugin extends BrowserPlugin< ); } - // We check options.useIncognitoPages (not this.useIncognitoPages) because super() collapses undefined to false. - // This preserves the distinction between "not set" (undefined → default to true) and "explicitly false". 
const isRemoteConnection = this.remoteBrowser || this.connectOptions || this.connectOverCDPOptions; - if (isRemoteConnection) { - if (options.useIncognitoPages === undefined) { + if (isRemoteConnection && options.useIncognitoPages === undefined) { + const isWebSocket = this.connectOptions || this.remoteBrowser?.type === 'websocket'; + if (isWebSocket) { this.useIncognitoPages = true; - this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); - } else if (!options.useIncognitoPages) { - const isWebSocket = this.connectOptions || this.remoteBrowser?.type === 'websocket'; - const message = isWebSocket - ? 'useIncognitoPages is set to false with a remote WebSocket connection. ' + - 'This may cause errors because browserType.connect() returns a browser with no default context.' - : 'useIncognitoPages is set to false with a remote browser connection. ' + - 'Pages will share cookies and storage on the remote browser instance.'; - this.log.warning(message); + this.log.info( + 'Remote Playwright WebSocket connection detected — defaulting useIncognitoPages to true.', + ); + } else { + this.log.info( + 'Remote Playwright CDP connection detected — pages will share cookies and storage ' + + 'via the default browser context (useIncognitoPages defaults to false).', + ); } } } @@ -131,7 +129,8 @@ export class PlaywrightPlugin extends BrowserPlugin< return await this.library.connect(url, {}); } this.log.info('Connecting to remote browser via connectOverCDP.'); - return await this.library.connectOverCDP(url, {}); + const browser = await this.library.connectOverCDP(url, {}); + return this._maybeWrapWithSharedContext(browser, launchContext); } catch (cause) { await this._callRelease(url, context); throw new BrowserLaunchError( @@ -147,7 +146,8 @@ export class PlaywrightPlugin extends BrowserPlugin< const { endpointURL, ...options } = this.connectOverCDPOptions; this.log.info('Connecting to remote browser via connectOverCDP.'); try { - 
return await this.library.connectOverCDP(endpointURL, options); + const browser = await this.library.connectOverCDP(endpointURL, options); + return this._maybeWrapWithSharedContext(browser, launchContext); } catch (cause) { throw new BrowserLaunchError( `Failed to connect to remote browser via CDP at "${this._sanitizeEndpointForLog(endpointURL)}". ` + @@ -253,6 +253,42 @@ export class PlaywrightPlugin extends BrowserPlugin< return browser; } + /** + * When useIncognitoPages is false and we have a CDP-connected browser, + * wrap its default context in PlaywrightBrowser so that all pages share + * a single context (matching local persistent-context behavior). + * + * Playwright's browser.newPage() always creates a new context, so without + * this wrapper, pages would never share cookies even with useIncognitoPages: false. + */ + private _maybeWrapWithSharedContext( + browser: PlaywrightBrowser, + launchContext: LaunchContext, + ): PlaywrightBrowser { + if (launchContext.useIncognitoPages) { + return browser; + } + + const contexts = browser.contexts(); + const defaultContext = contexts[0]; + + if (!defaultContext) { + this.log.warning( + 'Remote CDP browser has no default context — cannot share cookies between pages. 
' + + 'Falling back to standard behavior (new context per page).', + ); + return browser; + } + + this.log.info('Wrapping remote CDP browser default context for cookie sharing between pages.'); + + return new PlaywrightBrowserWithPersistentContext({ + browserContext: defaultContext, + version: browser.version(), + parentBrowser: browser, + }) as unknown as PlaywrightBrowser; + } + private _throwOnFailedLaunch(launchContext: LaunchContext, cause: unknown): never { this._throwAugmentedLaunchError( cause, diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index df24096b7f49..ea526643009e 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -62,19 +62,12 @@ export class PuppeteerPlugin extends BrowserPlugin< ); } - // We check options.useIncognitoPages (not this.useIncognitoPages) because super() collapses undefined to false. - // This preserves the distinction between "not set" (undefined → default to true) and "explicitly false". const isRemoteConnection = this.remoteBrowser || this.connectOverCDPOptions; - if (isRemoteConnection) { - if (options.useIncognitoPages === undefined) { - this.useIncognitoPages = true; - this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); - } else if (!options.useIncognitoPages) { - this.log.warning( - 'useIncognitoPages is set to false with a remote browser connection. 
' + - 'Pages will share cookies and storage on the remote browser instance.', - ); - } + if (isRemoteConnection && options.useIncognitoPages === undefined) { + this.log.info( + 'Remote browser detected — pages will share cookies and storage ' + + 'on the remote browser instance (useIncognitoPages defaults to false).', + ); } } diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index 4a3e4a8b59ad..827c6228194a 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -36,7 +36,7 @@ function createMockBrowser() { return { newPage: vi.fn().mockResolvedValue(createMockPage()), close: vi.fn().mockResolvedValue(undefined), - contexts: vi.fn(() => []), + contexts: vi.fn(() => [mockContext]), on: vi.fn(), off: vi.fn(), once: vi.fn(), @@ -326,16 +326,16 @@ describe('Remote browser — PlaywrightPlugin', () => { // --- useIncognitoPages default -------------------------------------------- describe('useIncognitoPages default', () => { - test('defaults to true for remote (connectOverCDP)', () => { + test('defaults to false for remote (connectOverCDP)', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, }); - expect(plugin.useIncognitoPages).toBe(true); + expect(plugin.useIncognitoPages).toBe(false); }); - test('defaults to true for remote (connect)', () => { + test('defaults to true for remote (connect / WebSocket)', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { connectOptions: { wsEndpoint: 'ws://remote:3000' }, @@ -372,9 +372,9 @@ describe('Remote browser — PlaywrightPlugin', () => { }); }); - // --- Warnings ------------------------------------------------------------- + // --- Info/Warnings -------------------------------------------------------- - describe('warnings', () => { + 
describe('info and warnings', () => { test('proxyUrl + remote → warning logged', async () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { @@ -390,27 +390,25 @@ describe('Remote browser — PlaywrightPlugin', () => { ); }); - test('useIncognitoPages: false + remote CDP → warning about shared state', () => { + test('remote CDP default → info about shared cookies', () => { const lib = createMockPlaywrightLibrary(); new PlaywrightPlugin(lib as any, { connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - useIncognitoPages: false, }); - expect(mockLogger.warning).toHaveBeenCalledWith( - expect.stringContaining('Pages will share cookies and storage'), + expect(mockLogger.info).toHaveBeenCalledWith( + expect.stringContaining('pages will share cookies and storage'), ); }); - test('useIncognitoPages: false + remote WebSocket → warning about no default context', () => { + test('remote WebSocket default → info about incognito true', () => { const lib = createMockPlaywrightLibrary(); new PlaywrightPlugin(lib as any, { connectOptions: { wsEndpoint: 'ws://remote:3000' }, - useIncognitoPages: false, }); - expect(mockLogger.warning).toHaveBeenCalledWith( - expect.stringContaining('browserType.connect() returns a browser with no default context'), + expect(mockLogger.info).toHaveBeenCalledWith( + expect.stringContaining('defaulting useIncognitoPages to true'), ); }); @@ -541,12 +539,13 @@ describe('Remote browser — PuppeteerPlugin', () => { const plugin = new PuppeteerPlugin(lib as any, { connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, proxyUrl: 'http://user:pass@proxy:8080', + useIncognitoPages: true, }); const ctx = plugin.createLaunchContext(); const wrappedBrowser = await plugin.launch(ctx); - // Call newPage on the wrapped browser — useIncognitoPages defaults to true for remote + // Call newPage on the wrapped browser — useIncognitoPages: true creates new context await (wrappedBrowser as any).newPage(); 
// createBrowserContext should be called with empty options (no proxyServer) @@ -586,13 +585,13 @@ describe('Remote browser — PuppeteerPlugin', () => { // --- useIncognitoPages default -------------------------------------------- describe('useIncognitoPages default', () => { - test('defaults to true for remote', () => { + test('defaults to false for remote', () => { const lib = createMockPuppeteerLibrary(); const plugin = new PuppeteerPlugin(lib as any, { connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, }); - expect(plugin.useIncognitoPages).toBe(true); + expect(plugin.useIncognitoPages).toBe(false); }); test('explicit false preserved for remote', () => { @@ -623,9 +622,9 @@ describe('Remote browser — PuppeteerPlugin', () => { }); }); - // --- Warnings ------------------------------------------------------------- + // --- Info/Warnings -------------------------------------------------------- - describe('warnings', () => { + describe('info and warnings', () => { test('proxyUrl + remote → warning logged', async () => { const lib = createMockPuppeteerLibrary(); const plugin = new PuppeteerPlugin(lib as any, { @@ -641,15 +640,14 @@ describe('Remote browser — PuppeteerPlugin', () => { ); }); - test('useIncognitoPages: false + remote → warning logged', () => { + test('remote default → info about shared cookies', () => { const lib = createMockPuppeteerLibrary(); new PuppeteerPlugin(lib as any, { connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - useIncognitoPages: false, }); - expect(mockLogger.warning).toHaveBeenCalledWith( - expect.stringContaining('useIncognitoPages is set to false'), + expect(mockLogger.info).toHaveBeenCalledWith( + expect.stringContaining('pages will share cookies and storage'), ); }); @@ -743,12 +741,21 @@ describe('remoteBrowser config — PlaywrightPlugin', () => { expect(ctx.isRemote).toBe(true); }); - test('useIncognitoPages defaults to true when remoteBrowser is set', () => { + test('useIncognitoPages defaults to 
false when remoteBrowser is set (CDP)', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: { endpoint: 'wss://test.io' }, }); + expect(plugin.useIncognitoPages).toBe(false); + }); + + test('useIncognitoPages defaults to true when remoteBrowser is set (WebSocket)', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://test.io', type: 'websocket' }, + }); + expect(plugin.useIncognitoPages).toBe(true); }); @@ -1035,7 +1042,7 @@ describe('RemoteBrowserProvider — PlaywrightPlugin', () => { expect(ctx.isRemote).toBe(true); }); - test('provider sets useIncognitoPages default to true', () => { + test('provider sets useIncognitoPages default to false (CDP)', () => { const lib = createMockPlaywrightLibrary(); class P extends RemoteBrowserProvider { @@ -1045,7 +1052,7 @@ describe('RemoteBrowserProvider — PlaywrightPlugin', () => { } const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: new P() }); - expect(plugin.useIncognitoPages).toBe(true); + expect(plugin.useIncognitoPages).toBe(false); }); }); diff --git a/temp-examples/examples/cookie-sharing-playwright-local-vs-remote.ts b/temp-examples/examples/cookie-sharing-playwright-local-vs-remote.ts new file mode 100644 index 000000000000..b064bf41fbce --- /dev/null +++ b/temp-examples/examples/cookie-sharing-playwright-local-vs-remote.ts @@ -0,0 +1,109 @@ +/** + * Cookie sharing: Playwright local vs remote + * + * Compares whether cookies set on page A are visible on page B within the + * same browser for local (launchPersistentContext) vs remote (connect/CDP). 
+ * + * Run local Browserless first: + * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium + * + * Then: + * npm run example:cookie-sharing-playwright-local-vs-remote + */ +import { PlaywrightPlugin, BrowserPool } from '@crawlee/browser-pool'; +import playwright from 'playwright'; + +const BROWSERLESS_CDP = 'ws://localhost:3000'; + +// --------------------------------------------------------------------------- +// Helper +// --------------------------------------------------------------------------- +async function testCookieSharing(label: string, plugin: PlaywrightPlugin) { + console.log(`\n--- ${label} ---`); + + const pool = new BrowserPool({ + browserPlugins: [plugin], + maxOpenPagesPerBrowser: 2, + }); + + try { + // Page A — set a cookie + const pageA = await pool.newPage(); + await pageA.goto('https://example.com', { waitUntil: 'domcontentloaded' }); + + const controllerA = pool.getBrowserControllerByPage(pageA)!; + await controllerA.setCookies(pageA, [ + { name: 'SHARED_TEST', value: 'from-page-a', domain: '.example.com', path: '/' }, + ]); + + const cookiesA = await controllerA.getCookies(pageA); + console.log(`Page A cookies: ${JSON.stringify(cookiesA.map((c) => ({ name: c.name, value: c.value })))}`); + + // Page B — same browser, check if cookie is visible + const pageB = await pool.newPage(); + await pageB.goto('https://example.com', { waitUntil: 'domcontentloaded' }); + + const controllerB = pool.getBrowserControllerByPage(pageB)!; + console.log(`Same browser controller: ${controllerA === controllerB}`); + + const cookiesB = await controllerB.getCookies(pageB); + console.log(`Page B cookies: ${JSON.stringify(cookiesB.map((c) => ({ name: c.name, value: c.value })))}`); + + const found = cookiesB.find((c) => c.name === 'SHARED_TEST'); + if (found) { + console.log(`✅ PASS — Cookie shared between pages (value: "${found.value}")`); + } else { + console.log(`❌ FAIL — Cookie NOT visible on page B`); + } + + await pageA.close(); + await 
pageB.close(); + } finally { + await pool.destroy(); + } +} + +// --------------------------------------------------------------------------- +// 1. Local — useIncognitoPages: false (launchPersistentContext → shared context) +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Local Playwright — useIncognitoPages: false (persistent context)', + new PlaywrightPlugin(playwright.chromium, { useIncognitoPages: false }), +); + +// --------------------------------------------------------------------------- +// 2. Local — useIncognitoPages: true (new context per page) +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Local Playwright — useIncognitoPages: true', + new PlaywrightPlugin(playwright.chromium, { useIncognitoPages: true }), +); + +// --------------------------------------------------------------------------- +// 3. Remote CDP — useIncognitoPages: false (default CDP context wrapped → shared context) +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Remote CDP Playwright — useIncognitoPages: false', + new PlaywrightPlugin(playwright.chromium, { + useIncognitoPages: false, + connectOverCDPOptions: { endpointURL: BROWSERLESS_CDP }, + }), +); + +// --------------------------------------------------------------------------- +// 4.
Remote CDP — useIncognitoPages: true +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Remote CDP Playwright — useIncognitoPages: true', + new PlaywrightPlugin(playwright.chromium, { + useIncognitoPages: true, + connectOverCDPOptions: { endpointURL: BROWSERLESS_CDP }, + }), +); + +console.log('\n--- Summary ---'); +console.log('Local incognito:false → shared (launchPersistentContext)'); +console.log('Local incognito:true → isolated'); +console.log('Remote incognito:false → shared (wrapped default context from CDP)'); +console.log('Remote incognito:true → isolated'); +console.log('\nDone.'); diff --git a/temp-examples/examples/cookie-sharing-playwright-test.ts b/temp-examples/examples/cookie-sharing-playwright-test.ts new file mode 100644 index 000000000000..36f6d1f0c8c8 --- /dev/null +++ b/temp-examples/examples/cookie-sharing-playwright-test.ts @@ -0,0 +1,118 @@ +/** + * Cookie sharing test — Playwright: CDP vs WebSocket, incognito true vs false + * + * Tests whether cookies set on page A are visible on page B within the same + * browser, across all four combinations: + * 1. Playwright CDP + useIncognitoPages: false → should share + * 2. Playwright CDP + useIncognitoPages: true → should NOT share + * 3. Playwright WebSocket + useIncognitoPages: false → ??? (connect() has no default context) + * 4. 
Playwright WebSocket + useIncognitoPages: true → should NOT share + * + * Run local Browserless first: + * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium + * + * Then: + * npm run example:cookie-sharing-playwright-test + */ +import { PlaywrightPlugin, BrowserPool, RemoteBrowserProvider } from '@crawlee/browser-pool'; +import playwright from 'playwright'; + +const BROWSERLESS_CDP = 'ws://localhost:3000'; +const BROWSERLESS_WS = 'ws://localhost:3000/chromium/playwright'; + +// --------------------------------------------------------------------------- +// Helper +// --------------------------------------------------------------------------- +async function testCookieSharing(label: string, plugin: PlaywrightPlugin) { + console.log(`\n--- ${label} ---`); + + const pool = new BrowserPool({ + browserPlugins: [plugin], + maxOpenPagesPerBrowser: 2, + }); + + try { + // Page A — set a cookie + const pageA = await pool.newPage(); + await pageA.goto('https://example.com', { waitUntil: 'domcontentloaded' }); + + const controllerA = pool.getBrowserControllerByPage(pageA)!; + await controllerA.setCookies(pageA, [ + { name: 'SHARED_TEST', value: 'from-page-a', domain: '.example.com', path: '/' }, + ]); + + const cookiesA = await controllerA.getCookies(pageA); + console.log(`Page A cookies: ${JSON.stringify(cookiesA.map((c) => ({ name: c.name, value: c.value })))}`); + + // Page B — same browser, check if cookie is visible + const pageB = await pool.newPage(); + await pageB.goto('https://example.com', { waitUntil: 'domcontentloaded' }); + + const controllerB = pool.getBrowserControllerByPage(pageB)!; + const sameBrowser = controllerA === controllerB; + console.log(`Same browser controller: ${sameBrowser}`); + + const cookiesB = await controllerB.getCookies(pageB); + console.log(`Page B cookies: ${JSON.stringify(cookiesB.map((c) => ({ name: c.name, value: c.value })))}`); + + const found = cookiesB.find((c) => c.name === 'SHARED_TEST'); + if (found) { + 
console.log(`✅ PASS — Cookie shared between pages (value: "${found.value}")`); + } else { + console.log(`❌ FAIL — Cookie NOT visible on page B`); + } + + await pageA.close(); + await pageB.close(); + } finally { + await pool.destroy(); + } +} + +// --------------------------------------------------------------------------- +// 1. Playwright CDP — useIncognitoPages: false (should share) +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Playwright CDP — useIncognitoPages: false', + new PlaywrightPlugin(playwright.chromium, { + useIncognitoPages: false, + connectOverCDPOptions: { endpointURL: BROWSERLESS_CDP }, + }), +); + +// --------------------------------------------------------------------------- +// 2. Playwright CDP — useIncognitoPages: true (should NOT share) +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Playwright CDP — useIncognitoPages: true', + new PlaywrightPlugin(playwright.chromium, { + useIncognitoPages: true, + connectOverCDPOptions: { endpointURL: BROWSERLESS_CDP }, + }), +); + +// --------------------------------------------------------------------------- +// 3. Playwright WebSocket — useIncognitoPages: false (the question mark) +// connect() returns a browser with no default context — does newPage() +// create an implicit shared context, or a new one each time? +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Playwright WebSocket — useIncognitoPages: false', + new PlaywrightPlugin(playwright.chromium, { + useIncognitoPages: false, + connectOptions: { wsEndpoint: BROWSERLESS_WS }, + }), +); + +// --------------------------------------------------------------------------- +// 4. 
Playwright WebSocket — useIncognitoPages: true (should NOT share) +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Playwright WebSocket — useIncognitoPages: true', + new PlaywrightPlugin(playwright.chromium, { + useIncognitoPages: true, + connectOptions: { wsEndpoint: BROWSERLESS_WS }, + }), +); + +console.log('\nDone.'); diff --git a/temp-examples/examples/cookie-sharing-session-across-browsers.ts b/temp-examples/examples/cookie-sharing-session-across-browsers.ts new file mode 100644 index 000000000000..2f22e1a0d4a2 --- /dev/null +++ b/temp-examples/examples/cookie-sharing-session-across-browsers.ts @@ -0,0 +1,82 @@ +/** + * Session-based cookie sharing across remote browsers (Puppeteer) + * + * Demonstrates that the Session object transfers cookies between sequential + * requests even when they land on different browser instances. + * + * Setup: + * - retireBrowserAfterPageCount: 1 → forces a new browser per request + * - Single session pool → same session reused for all requests + * - saveResponseCookies: true (default) + * + * Run local Browserless first: + * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium + * + * Then: + * npm run example:cookie-sharing-session-across-browsers + */ +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; +import { PuppeteerCrawler, SessionPool } from 'crawlee'; + +// --------------------------------------------------------------------------- +// Remote browser provider +// --------------------------------------------------------------------------- +class BrowserlessProvider extends RemoteBrowserProvider { + maxOpenBrowsers = 4; + async connect() { + return { url: 'ws://localhost:3000' }; + } +} + +// Single session so both requests share cookies +const sessionPool = new SessionPool({ maxPoolSize: 1 }); + +// --------------------------------------------------------------------------- +// Crawler — forces new browser per request to prove 
cross-browser sharing +// --------------------------------------------------------------------------- +const crawler = new PuppeteerCrawler({ + launchContext: { + remoteBrowser: new BrowserlessProvider(), + }, + browserPoolOptions: { + retireBrowserAfterPageCount: 1, // force new browser for each request + maxOpenPagesPerBrowser: 1, + }, + sessionPool, + maxConcurrency: 1, // sequential — so request 1 finishes before request 2 + async requestHandler({ page, request, session, browserController }) { + const controllerId = browserController.id; + + // Set a cookie manually on the first request and save it to the session + if (request.url.includes('/login')) { + await page.setCookie({ + name: 'AUTH_TOKEN', + value: 'secret-jwt-123', + domain: 'books.toscrape.com', + path: '/', + }); + // Save page cookies to the session (normally saveResponseCookies does this + // during navigation, but our cookie was set after navigation) + const cookies = await browserController.getCookies(page); + session?.setCookies(cookies, request.loadedUrl!); + } + + const pageCookies = await page.cookies(); + const sessionCookies = session?.getCookies(request.loadedUrl!) ?? 
[]; + + console.log(`\n[${new URL(request.url).pathname}]`); + console.log(` Browser controller: ${controllerId}`); + console.log(` Session ID: ${session?.id}`); + console.log(` Page cookies: ${JSON.stringify(pageCookies.map((c) => ({ name: c.name, value: c.value })))}`); + console.log(` Session cookies: ${JSON.stringify(sessionCookies.map((c) => ({ name: c.name, value: c.value })))}`); + }, +}); + +await crawler.run([ + 'https://books.toscrape.com/login', // Request 1: browser A — sets AUTH_TOKEN cookie + 'https://books.toscrape.com/', // Request 2: browser B — should have cookie via Session +]); + +console.log('\nDone.'); +console.log('If request 2 shows AUTH_TOKEN in session cookies → session transferred cookies across browsers.'); +console.log('Check that Browser controller IDs are different → proves different browsers.'); diff --git a/temp-examples/examples/cookie-sharing-test.ts b/temp-examples/examples/cookie-sharing-test.ts new file mode 100644 index 000000000000..d9e42991bbd3 --- /dev/null +++ b/temp-examples/examples/cookie-sharing-test.ts @@ -0,0 +1,118 @@ +/** + * Cookie sharing test — useIncognitoPages: false with remote CDP (Puppeteer) + * + * Tests that cookies set on one page are visible on another page within the + * same browser session, comparing local vs remote behavior. + * + * Run local Browserless first: + * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium + * + * Then: + * npm run example:cookie-sharing-test + */ +import { PuppeteerPlugin, BrowserPool, RemoteBrowserProvider } from '@crawlee/browser-pool'; +import puppeteer from 'puppeteer'; + +const BROWSERLESS_URL = 'ws://localhost:3000'; + +// --------------------------------------------------------------------------- +// Helper: open two pages in the same browser, set cookie on page A, +// check if page B can see it without explicit transfer. 
+// --------------------------------------------------------------------------- +async function testCookieSharing(label: string, plugin: PuppeteerPlugin) { + console.log(`\n--- ${label} ---`); + + const pool = new BrowserPool({ + browserPlugins: [plugin], + maxOpenPagesPerBrowser: 2, + }); + + try { + // Page A — set a cookie + const pageA = await pool.newPage(); + await pageA.goto('https://example.com', { waitUntil: 'domcontentloaded' }); + + const controllerA = pool.getBrowserControllerByPage(pageA)!; + await controllerA.setCookies(pageA, [ + { name: 'SHARED_TEST', value: 'from-page-a', domain: '.example.com' }, + ]); + + const cookiesA = await controllerA.getCookies(pageA); + console.log(`Page A cookies: ${JSON.stringify(cookiesA.map((c) => ({ name: c.name, value: c.value })))}`); + + // Page B — same browser, check if cookie is visible + const pageB = await pool.newPage(); + await pageB.goto('https://example.com', { waitUntil: 'domcontentloaded' }); + + const controllerB = pool.getBrowserControllerByPage(pageB)!; + + // Verify both pages are in the same browser + const sameBrowser = controllerA === controllerB; + console.log(`Same browser controller: ${sameBrowser}`); + + const cookiesB = await controllerB.getCookies(pageB); + console.log(`Page B cookies: ${JSON.stringify(cookiesB.map((c) => ({ name: c.name, value: c.value })))}`); + + const found = cookiesB.find((c) => c.name === 'SHARED_TEST'); + if (found) { + console.log(`✅ PASS — Cookie shared between pages (value: "${found.value}")`); + } else { + console.log(`❌ FAIL — Cookie NOT visible on page B`); + } + + await pageA.close(); + await pageB.close(); + } finally { + await pool.destroy(); + } +} + +// --------------------------------------------------------------------------- +// Test 1: Local browser, useIncognitoPages: false (baseline) +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Local Puppeteer — useIncognitoPages: false', + new 
PuppeteerPlugin(puppeteer, { useIncognitoPages: false }), +); + +// --------------------------------------------------------------------------- +// Test 2: Remote CDP (Browserless), useIncognitoPages: false +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Remote CDP (Browserless) — useIncognitoPages: false', + new PuppeteerPlugin(puppeteer, { + useIncognitoPages: false, + connectOverCDPOptions: { browserWSEndpoint: BROWSERLESS_URL }, + }), +); + +// --------------------------------------------------------------------------- +// Test 3: Remote CDP (Browserless), useIncognitoPages: true (should NOT share) +// --------------------------------------------------------------------------- +await testCookieSharing( + 'Remote CDP (Browserless) — useIncognitoPages: true', + new PuppeteerPlugin(puppeteer, { + useIncognitoPages: true, + connectOverCDPOptions: { browserWSEndpoint: BROWSERLESS_URL }, + }), +); + +// --------------------------------------------------------------------------- +// Test 4: Remote via RemoteBrowserProvider, useIncognitoPages: false +// --------------------------------------------------------------------------- +class BrowserlessProvider extends RemoteBrowserProvider { + maxOpenBrowsers = 2; + async connect() { + return { url: BROWSERLESS_URL }; + } +} + +await testCookieSharing( + 'RemoteBrowserProvider (Browserless) — useIncognitoPages: false', + new PuppeteerPlugin(puppeteer, { + useIncognitoPages: false, + remoteBrowser: new BrowserlessProvider(), + }), +); + +console.log('\nDone.'); diff --git a/temp-examples/package.json b/temp-examples/package.json index 221ecd659a37..99c4edcde0c7 100644 --- a/temp-examples/package.json +++ b/temp-examples/package.json @@ -19,7 +19,11 @@ "example:steel-playwright-ws": "node --experimental-strip-types examples/steel-playwright-ws.ts", "example:rebrowser-puppeteer": "node --experimental-strip-types examples/rebrowser-puppeteer.ts", 
"example:rebrowser-playwright": "node --experimental-strip-types examples/rebrowser-playwright.ts", - "example:rebrowser-playwright-ws": "node --experimental-strip-types examples/rebrowser-playwright-ws.ts" + "example:rebrowser-playwright-ws": "node --experimental-strip-types examples/rebrowser-playwright-ws.ts", + "example:cookie-sharing-test": "node --experimental-strip-types examples/cookie-sharing-test.ts", + "example:cookie-sharing-playwright-test": "node --experimental-strip-types examples/cookie-sharing-playwright-test.ts", + "example:cookie-sharing-playwright-local-vs-remote": "node --experimental-strip-types examples/cookie-sharing-playwright-local-vs-remote.ts", + "example:cookie-sharing-session-across-browsers": "node --experimental-strip-types examples/cookie-sharing-session-across-browsers.ts" }, "dependencies": { "@crawlee/basic": "file:../packages/basic-crawler/dist", From 3139e13a33317b68917cd6cbfd14bbf49873179f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 12 May 2026 12:49:26 +0200 Subject: [PATCH 21/45] Add userDataDir warning --- .../browser-pool/src/abstract-classes/browser-plugin.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 2cc8aea24b40..aaa6d6af9768 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -322,6 +322,13 @@ export abstract class BrowserPlugin< ); } + if (launchContext.userDataDir && launchContext.isRemote) { + this.log.warning( + 'userDataDir is set but will be ignored for remote browser connections. ' + + "Use your remote browser service's persistence API instead (e.g. 
Browserbase Contexts, Steel Profiles).", + ); + } + if (proxyUrl && !launchContext.isRemote) { await this._addProxyToLaunchOptions(launchContext); } From c8f0b2b413f2c15344d5c98858c62c7e93afca32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 12 May 2026 13:52:25 +0200 Subject: [PATCH 22/45] feat: forward proxyUrl to RemoteBrowserProvider.connect() The proxyUrl from Crawlee's ProxyConfiguration is now passed to RemoteBrowserProvider.connect({ proxyUrl }) and RemoteBrowserConfig.endpoint({ proxyUrl }), letting provider implementations forward it to the remote service's proxy API (e.g. Browserless externalProxyServer, Browserbase external proxies). Also adds a userDataDir warning for remote connections, matching the existing proxyUrl warning pattern. --- .../src/abstract-classes/browser-plugin.ts | 19 +++-- .../src/playwright/playwright-plugin.ts | 2 +- .../src/puppeteer/puppeteer-plugin.ts | 2 +- .../src/remote-browser-provider.ts | 13 ++- temp-examples/examples/remote-proxy-test.ts | 84 +++++++++++++++++++ temp-examples/package.json | 3 +- 6 files changed, 110 insertions(+), 13 deletions(-) create mode 100644 temp-examples/examples/remote-proxy-test.ts diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index aaa6d6af9768..6cf816d84af4 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -85,7 +85,11 @@ export interface RemoteBrowserConfig { * Can return a plain string or an object with `url` and optional `context` * that will be forwarded to `release()`. */ - endpoint: string | (() => string | RemoteBrowserEndpointResult | Promise); + endpoint: + | string + | ((options?: { + proxyUrl?: string; + }) => string | RemoteBrowserEndpointResult | Promise); /** * Optional cleanup function called when the browser closes, crashes, or the pool is destroyed. 
* Receives the resolved endpoint URL and the `context` object returned by `endpoint()`. @@ -214,7 +218,7 @@ export abstract class BrowserPlugin< if (remoteBrowser instanceof RemoteBrowserProvider) { const provider = remoteBrowser; this.remoteBrowser = { - endpoint: () => provider.connect(), + endpoint: (options) => provider.connect(options), release: ({ context }) => provider.release(context as any), type: provider.type, maxOpenBrowsers: provider.maxOpenBrowsers, @@ -225,9 +229,9 @@ export abstract class BrowserPlugin< } /** Resolves the remote browser endpoint from a string or function. Returns { url, context }. */ - protected async _resolveRemoteEndpoint(): Promise { + protected async _resolveRemoteEndpoint(options?: { proxyUrl?: string }): Promise { const { endpoint } = this.remoteBrowser!; - const result = typeof endpoint === 'function' ? await endpoint() : endpoint; + const result = typeof endpoint === 'function' ? await endpoint(options) : endpoint; if (typeof result === 'string') { return { url: result }; } @@ -316,9 +320,10 @@ export abstract class BrowserPlugin< const { proxyUrl, launchOptions } = launchContext; if (proxyUrl && launchContext.isRemote) { - this.log.warning( - 'proxyUrl is set but will be ignored for remote browser connections. ' + - 'Configure proxy settings on the remote browser service instead.', + this.log.info( + 'proxyUrl is set for a remote browser connection. ' + + "It will be forwarded to the remote browser provider's connect() method. " + + "Make sure your provider handles it (e.g. 
passes it to the service's proxy API).", ); } diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index 91de6032a836..0deeb21425a7 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -111,7 +111,7 @@ export class PlaywrightPlugin extends BrowserPlugin< let url: string; let context: Record | undefined; try { - const result = await this._resolveRemoteEndpoint(); + const result = await this._resolveRemoteEndpoint({ proxyUrl: launchContext.proxyUrl }); url = result.url; context = result.context; } catch (cause) { diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index ea526643009e..d24f4bc5f4d1 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -112,7 +112,7 @@ export class PuppeteerPlugin extends BrowserPlugin< let url: string; let context: Record | undefined; try { - const result = await this._resolveRemoteEndpoint(); + const result = await this._resolveRemoteEndpoint({ proxyUrl: launchContext.proxyUrl }); url = result.url; context = result.context; } catch (cause) { diff --git a/packages/browser-pool/src/remote-browser-provider.ts b/packages/browser-pool/src/remote-browser-provider.ts index 8b714691a0be..3fe43b643431 100644 --- a/packages/browser-pool/src/remote-browser-provider.ts +++ b/packages/browser-pool/src/remote-browser-provider.ts @@ -33,8 +33,10 @@ * class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { * maxOpenBrowsers = 2; // respect the service's concurrent session limit * - * async connect() { - * const session = await createSession(apiKey, projectId); + * async connect({ proxyUrl } = {}) { + * const session = await createSession(apiKey, projectId, { + * proxies: proxyUrl ? 
[{ type: 'external', server: proxyUrl }] : undefined, + * }); * return { url: session.connectUrl, context: { id: session.id } }; * } * @@ -63,8 +65,13 @@ export abstract class RemoteBrowserProvider | { url: string; context?: TContext }; + abstract connect(options?: { + proxyUrl?: string; + }): Promise<{ url: string; context?: TContext }> | { url: string; context?: TContext }; /** * Called when the browser closes, crashes, the pool is destroyed, or the diff --git a/temp-examples/examples/remote-proxy-test.ts b/temp-examples/examples/remote-proxy-test.ts new file mode 100644 index 000000000000..cf72058c9be6 --- /dev/null +++ b/temp-examples/examples/remote-proxy-test.ts @@ -0,0 +1,84 @@ +/** + * Remote browser with custom proxy — demonstrates proxyUrl forwarding + * + * Shows how proxyUrl from Crawlee's ProxyConfiguration is forwarded to + * the RemoteBrowserProvider.connect() method, letting the provider pass + * it to the remote service's proxy API. + * + * Run local Browserless first: + * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium + * + * Then: + * npm run example:remote-proxy-test + * + * Note: externalProxyServer is a paid Browserless feature. On the free/local + * Docker image the proxy is accepted but may not route traffic. The example + * proves the forwarding plumbing works regardless. 
+ */ +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; +import { PuppeteerCrawler, ProxyConfiguration } from 'crawlee'; + +// --------------------------------------------------------------------------- +// Provider that forwards proxyUrl to Browserless via externalProxyServer param +// --------------------------------------------------------------------------- +class BrowserlessWithProxyProvider extends RemoteBrowserProvider { + maxOpenBrowsers = 4; + + async connect({ proxyUrl } = {} as { proxyUrl?: string }) { + let url = 'ws://localhost:3000'; + + if (proxyUrl) { + // Browserless accepts custom proxy via externalProxyServer query param + // For other services, forward differently: + // Browserbase: proxies: [{ type: 'external', server: proxyUrl }] + // Steel: not supported (built-in only) + // Rebrowser: set on profile + url += `?externalProxyServer=${encodeURIComponent(proxyUrl)}`; + console.log(` [Provider] Forwarding proxy to Browserless: ${proxyUrl}`); + } else { + console.log(' [Provider] No proxy provided'); + } + + return { url }; + } +} + +// --------------------------------------------------------------------------- +// Proxy configuration — Crawlee rotates these per browser session +// --------------------------------------------------------------------------- +const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: [ + 'http://34.135.166.24:80', + 'http://8.219.97.248:80', + ], +}); + +// --------------------------------------------------------------------------- +// Crawler +// --------------------------------------------------------------------------- +const crawler = new PuppeteerCrawler({ + launchContext: { + remoteBrowser: new BrowserlessWithProxyProvider(), + }, + proxyConfiguration, + browserPoolOptions: { + retireBrowserAfterPageCount: 2, + maxOpenPagesPerBrowser: 1, + }, + maxConcurrency: 1, + maxRequestsPerCrawl: 4, + async requestHandler({ page, request, proxyInfo }) { + const title = await page.title(); + 
console.log(`[Page] ${request.loadedUrl} — "${title}" (proxy: ${proxyInfo?.url ?? 'none'})`); + }, +}); + +await crawler.run([ + 'https://example.com', + 'https://books.toscrape.com', + 'https://quotes.toscrape.com', + 'https://httpbin.org/ip', +]); + +console.log('\nDone.'); +console.log('Check that [Provider] logs show the proxy URL being forwarded from ProxyConfiguration.'); diff --git a/temp-examples/package.json b/temp-examples/package.json index 99c4edcde0c7..2a106652fa97 100644 --- a/temp-examples/package.json +++ b/temp-examples/package.json @@ -23,7 +23,8 @@ "example:cookie-sharing-test": "node --experimental-strip-types examples/cookie-sharing-test.ts", "example:cookie-sharing-playwright-test": "node --experimental-strip-types examples/cookie-sharing-playwright-test.ts", "example:cookie-sharing-playwright-local-vs-remote": "node --experimental-strip-types examples/cookie-sharing-playwright-local-vs-remote.ts", - "example:cookie-sharing-session-across-browsers": "node --experimental-strip-types examples/cookie-sharing-session-across-browsers.ts" + "example:cookie-sharing-session-across-browsers": "node --experimental-strip-types examples/cookie-sharing-session-across-browsers.ts", + "example:remote-proxy-test": "node --experimental-strip-types examples/remote-proxy-test.ts" }, "dependencies": { "@crawlee/basic": "file:../packages/basic-crawler/dist", From 779105a014a8977276dc7a999b155e2acdde8b45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Thu, 21 May 2026 15:02:00 +0200 Subject: [PATCH 23/45] examples --- ...e-sharing-session-across-local-browsers.ts | 66 +++++++++ .../examples/remote-proxy-via-chrome-args.ts | 129 ++++++++++++++++++ temp-examples/package.json | 4 +- 3 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 temp-examples/examples/cookie-sharing-session-across-local-browsers.ts create mode 100644 temp-examples/examples/remote-proxy-via-chrome-args.ts diff --git 
a/temp-examples/examples/cookie-sharing-session-across-local-browsers.ts b/temp-examples/examples/cookie-sharing-session-across-local-browsers.ts new file mode 100644 index 000000000000..0222cd9fa357 --- /dev/null +++ b/temp-examples/examples/cookie-sharing-session-across-local-browsers.ts @@ -0,0 +1,66 @@ +/** + * Session-based cookie sharing across local browsers (Puppeteer) + * + * Demonstrates that the Session object transfers cookies between sequential + * requests even when they land on different local browser instances. + * No remote service needed. + * + * Setup: + * - retireBrowserAfterPageCount: 1 → forces a new browser per request + * - Single session pool → same session reused for all requests + * - saveResponseCookies: true (default) + * + * Run: + * npm run example:cookie-sharing-session-across-local-browsers + */ +import { PuppeteerCrawler, SessionPool } from 'crawlee'; + +// Single session so both requests share cookies +const sessionPool = new SessionPool({ maxPoolSize: 1 }); + +// --------------------------------------------------------------------------- +// Crawler — forces new browser per request to prove cross-browser sharing +// --------------------------------------------------------------------------- +const crawler = new PuppeteerCrawler({ + browserPoolOptions: { + retireBrowserAfterPageCount: 1, // force new browser for each request + maxOpenPagesPerBrowser: 1, + }, + sessionPool, + maxConcurrency: 1, // sequential — so request 1 finishes before request 2 + async requestHandler({ page, request, session, browserController }) { + const controllerId = browserController.id; + + // Set a cookie manually on the first request and save it to the session + if (request.url.includes('/login')) { + await page.setCookie({ + name: 'AUTH_TOKEN', + value: 'secret-jwt-123', + domain: 'books.toscrape.com', + path: '/', + }); + // Save page cookies to the session (normally saveResponseCookies does this + // during navigation, but our cookie was set after 
navigation) + const cookies = await browserController.getCookies(page); + session?.setCookies(cookies, request.loadedUrl!); + } + + const pageCookies = await page.cookies(); + const sessionCookies = session?.getCookies(request.loadedUrl!) ?? []; + + console.log(`\n[${new URL(request.url).pathname}]`); + console.log(` Browser controller: ${controllerId}`); + console.log(` Session ID: ${session?.id}`); + console.log(` Page cookies: ${JSON.stringify(pageCookies.map((c) => ({ name: c.name, value: c.value })))}`); + console.log(` Session cookies: ${JSON.stringify(sessionCookies.map((c) => ({ name: c.name, value: c.value })))}`); + }, +}); + +await crawler.run([ + 'https://books.toscrape.com/login', // Request 1: browser A — sets AUTH_TOKEN cookie + 'https://books.toscrape.com/', // Request 2: browser B — should have cookie via Session +]); + +console.log('\nDone.'); +console.log('If request 2 shows AUTH_TOKEN in session cookies → session transferred cookies across browsers.'); +console.log('Check that Browser controller IDs are different → proves different browsers.'); diff --git a/temp-examples/examples/remote-proxy-via-chrome-args.ts b/temp-examples/examples/remote-proxy-via-chrome-args.ts new file mode 100644 index 000000000000..16604420558d --- /dev/null +++ b/temp-examples/examples/remote-proxy-via-chrome-args.ts @@ -0,0 +1,129 @@ +/** + * Remote browser with custom proxy via Chrome launch args + * + * Uses Browserless's `launch` query param to pass --proxy-server directly + * to Chrome. Works on the free/local Docker image (no paid features needed). + * + * ────────────────────────────────────────────────────────────────────────── + * SETUP STEPS + * ────────────────────────────────────────────────────────────────────────── + * + * 1. Get your Apify Proxy password: + * - Go to https://console.apify.com/account/integrations + * - Copy "Proxy password" from the Proxy section + * + * 2. 
Add it to temp-examples/.env: + * APIFY_PROXY_PASSWORD=your_password_here + * + * 3. Start local Browserless Docker (free image): + * docker run -p 3000:3000 ghcr.io/browserless/chromium + * + * 4. Run the example: + * npm run example:remote-proxy-via-chrome-args + * + * ────────────────────────────────────────────────────────────────────────── + * WHAT TO LOOK FOR + * ────────────────────────────────────────────────────────────────────────── + * + * Test target is httpbin.org/ip — it returns the IP making the request. + * - If the proxy routes correctly, the "Response" line shows the proxy's IP + * (NOT your own home/office IP). + * - With residential proxies, you should see different IPs on different + * requests if rotation is working. + * + * Without APIFY_PROXY_PASSWORD, the example falls back to a free public proxy + * (unreliable, may fail) so you can still see the forwarding mechanism work. + * + * ────────────────────────────────────────────────────────────────────────── + * APIFY PROXY URL FORMAT + * ────────────────────────────────────────────────────────────────────────── + * + * Base: http://USERNAME:PASSWORD@proxy.apify.com:8000 + * + * USERNAME options (combine with commas): + * - groups-RESIDENTIAL → residential proxies + * - groups-AUTO → auto-rotated datacenter (default) + * - groups-GOOGLE_SERP → Google SERP-specific + * - country-US → restrict to country (US, GB, DE, etc.) 
+ * - session-myid123 → sticky session (same IP for same session) + * + * Examples: + * http://auto:PASSWORD@proxy.apify.com:8000 + * http://groups-RESIDENTIAL,country-US:PASSWORD@proxy.apify.com:8000 + * http://groups-RESIDENTIAL,session-abc:PASSWORD@proxy.apify.com:8000 + */ +import 'dotenv/config'; + +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; +import { PuppeteerCrawler, ProxyConfiguration } from 'crawlee'; + +// --------------------------------------------------------------------------- +// Provider that forwards proxyUrl to Chrome via --proxy-server launch arg +// --------------------------------------------------------------------------- +class BrowserlessChromeArgsProvider extends RemoteBrowserProvider { + maxOpenBrowsers = 4; + + async connect({ proxyUrl } = {} as { proxyUrl?: string }) { + let url = 'ws://localhost:3000'; + + if (proxyUrl) { + // Pass proxy to Chrome via launch args (works on free Browserless) + const launchOpts = JSON.stringify({ + args: [`--proxy-server=${proxyUrl}`], + }); + url += `?launch=${encodeURIComponent(launchOpts)}`; + console.log(` [Provider] Forwarding proxy via Chrome args: ${proxyUrl}`); + } else { + console.log(' [Provider] No proxy provided'); + } + + return { url }; + } +} + +// --------------------------------------------------------------------------- +// Proxy configuration — use Apify Proxy or any HTTP proxy +// --------------------------------------------------------------------------- +const apifyPassword = process.env.APIFY_PROXY_PASSWORD; + +const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: apifyPassword + ? 
[ + // Apify residential proxy (replace 'RESIDENTIAL' with your group if different) + `http://groups-RESIDENTIAL:${apifyPassword}@proxy.apify.com:8000`, + ] + : [ + // Fallback: free public proxies (unreliable, may not work) + 'http://34.135.166.24:80', + ], +}); + +// --------------------------------------------------------------------------- +// Crawler +// --------------------------------------------------------------------------- +const crawler = new PuppeteerCrawler({ + launchContext: { + remoteBrowser: new BrowserlessChromeArgsProvider(), + }, + proxyConfiguration, + browserPoolOptions: { + retireBrowserAfterPageCount: 1, // new browser per request to test rotation + maxOpenPagesPerBrowser: 1, + }, + maxConcurrency: 1, + maxRequestsPerCrawl: 2, + async requestHandler({ page, request, proxyInfo }) { + const body = await page.evaluate(() => document.body.textContent?.trim()); + console.log(`\n[${request.loadedUrl}]`); + console.log(` Configured proxy: ${proxyInfo?.url ?? 'none'}`); + console.log(` Response: ${body}`); + }, +}); + +await crawler.run([ + 'https://httpbin.org/ip', + 'https://httpbin.org/ip', +]); + +console.log('\nDone.'); +console.log('Compare "Response" IP with your own IP to verify proxy routing.'); diff --git a/temp-examples/package.json b/temp-examples/package.json index 2a106652fa97..39e0955ca0ec 100644 --- a/temp-examples/package.json +++ b/temp-examples/package.json @@ -24,7 +24,9 @@ "example:cookie-sharing-playwright-test": "node --experimental-strip-types examples/cookie-sharing-playwright-test.ts", "example:cookie-sharing-playwright-local-vs-remote": "node --experimental-strip-types examples/cookie-sharing-playwright-local-vs-remote.ts", "example:cookie-sharing-session-across-browsers": "node --experimental-strip-types examples/cookie-sharing-session-across-browsers.ts", - "example:remote-proxy-test": "node --experimental-strip-types examples/remote-proxy-test.ts" + "example:remote-proxy-test": "node --experimental-strip-types 
examples/remote-proxy-test.ts", + "example:cookie-sharing-session-across-local-browsers": "node --experimental-strip-types examples/cookie-sharing-session-across-local-browsers.ts", + "example:remote-proxy-via-chrome-args": "node --experimental-strip-types examples/remote-proxy-via-chrome-args.ts" }, "dependencies": { "@crawlee/basic": "file:../packages/basic-crawler/dist", From e6ed866c44e912fa3fd2da7099a19d5313df91eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Wed, 27 May 2026 12:50:58 +0000 Subject: [PATCH 24/45] Force useIncognitoPages: true for remote Playwright connections Drop the _maybeWrapWithSharedContext workaround that emulated launchPersistentContext semantics over connectOverCDP. Remote connections now always run with useIncognitoPages: true; explicit false is overridden with a warning pointing users at SessionPool for cross-request state sharing. Also remove the now-unused parentBrowser plumbing from PlaywrightBrowser, which only existed to keep the underlying CDP Browser alive while the wrapper was active. --- .../src/playwright/playwright-browser.ts | 19 +----- .../src/playwright/playwright-plugin.ts | 61 ++++--------------- .../browser-pool/test/remote-browser.test.ts | 40 +++++------- 3 files changed, 29 insertions(+), 91 deletions(-) diff --git a/packages/browser-pool/src/playwright/playwright-browser.ts b/packages/browser-pool/src/playwright/playwright-browser.ts index f944795d416b..c1e2c65b8ef8 100644 --- a/packages/browser-pool/src/playwright/playwright-browser.ts +++ b/packages/browser-pool/src/playwright/playwright-browser.ts @@ -1,12 +1,10 @@ import { EventEmitter } from 'node:events'; -import type { Browser, BrowserContext, BrowserType } from 'playwright'; +import type { BrowserContext, BrowserType } from 'playwright'; export interface BrowserOptions { browserContext: BrowserContext; version: string; - /** When wrapping a remote CDP browser's default context, pass the real Browser so it can be closed properly. 
*/ - parentBrowser?: Browser; } /** @@ -17,28 +15,18 @@ export class PlaywrightBrowser extends EventEmitter { private _version: string; private _isConnected = true; private _browserType?: BrowserType; - private _parentBrowser?: Browser; constructor(options: BrowserOptions) { super(); - const { browserContext, version, parentBrowser } = options; + const { browserContext, version } = options; this._browserContext = browserContext; this._version = version; - this._parentBrowser = parentBrowser; this._browserContext.once('close', () => { this._isConnected = false; this.emit('disconnected'); }); - - // Forward real browser disconnection so the pool detects remote crashes. - if (parentBrowser) { - parentBrowser.once('disconnected', () => { - this._isConnected = false; - this.emit('disconnected'); - }); - } } async [Symbol.asyncDispose](): Promise { @@ -47,9 +35,6 @@ export class PlaywrightBrowser extends EventEmitter { async close(): Promise { await this._browserContext.close(); - if (this._parentBrowser) { - await this._parentBrowser.close().catch(() => {}); - } } contexts(): BrowserContext[] { diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index 0deeb21425a7..ac0ea243fef2 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -82,19 +82,20 @@ export class PlaywrightPlugin extends BrowserPlugin< } const isRemoteConnection = this.remoteBrowser || this.connectOptions || this.connectOverCDPOptions; - if (isRemoteConnection && options.useIncognitoPages === undefined) { - const isWebSocket = this.connectOptions || this.remoteBrowser?.type === 'websocket'; - if (isWebSocket) { - this.useIncognitoPages = true; - this.log.info( - 'Remote Playwright WebSocket connection detected — defaulting useIncognitoPages to true.', + if (isRemoteConnection) { + if (options.useIncognitoPages === false) { + this.log.warning( + 
'Remote Playwright connections only support useIncognitoPages: true. ' + + 'The setting has been overridden — pages will not share cookies/storage. ' + + 'For state sharing across requests, use the SessionPool.', ); - } else { + } else if (options.useIncognitoPages === undefined) { this.log.info( - 'Remote Playwright CDP connection detected — pages will share cookies and storage ' + - 'via the default browser context (useIncognitoPages defaults to false).', + 'Remote Playwright connection detected — useIncognitoPages forced to true. ' + + 'Pages will not share cookies/storage between each other.', ); } + this.useIncognitoPages = true; } } @@ -129,8 +130,7 @@ export class PlaywrightPlugin extends BrowserPlugin< return await this.library.connect(url, {}); } this.log.info('Connecting to remote browser via connectOverCDP.'); - const browser = await this.library.connectOverCDP(url, {}); - return this._maybeWrapWithSharedContext(browser, launchContext); + return await this.library.connectOverCDP(url, {}); } catch (cause) { await this._callRelease(url, context); throw new BrowserLaunchError( @@ -146,8 +146,7 @@ export class PlaywrightPlugin extends BrowserPlugin< const { endpointURL, ...options } = this.connectOverCDPOptions; this.log.info('Connecting to remote browser via connectOverCDP.'); try { - const browser = await this.library.connectOverCDP(endpointURL, options); - return this._maybeWrapWithSharedContext(browser, launchContext); + return await this.library.connectOverCDP(endpointURL, options); } catch (cause) { throw new BrowserLaunchError( `Failed to connect to remote browser via CDP at "${this._sanitizeEndpointForLog(endpointURL)}". ` + @@ -253,42 +252,6 @@ export class PlaywrightPlugin extends BrowserPlugin< return browser; } - /** - * When useIncognitoPages is false and we have a CDP-connected browser, - * wrap its default context in PlaywrightBrowser so that all pages share - * a single context (matching local persistent-context behavior). 
- * - * Playwright's browser.newPage() always creates a new context, so without - * this wrapper, pages would never share cookies even with useIncognitoPages: false. - */ - private _maybeWrapWithSharedContext( - browser: PlaywrightBrowser, - launchContext: LaunchContext, - ): PlaywrightBrowser { - if (launchContext.useIncognitoPages) { - return browser; - } - - const contexts = browser.contexts(); - const defaultContext = contexts[0]; - - if (!defaultContext) { - this.log.warning( - 'Remote CDP browser has no default context — cannot share cookies between pages. ' + - 'Falling back to standard behavior (new context per page).', - ); - return browser; - } - - this.log.info('Wrapping remote CDP browser default context for cookie sharing between pages.'); - - return new PlaywrightBrowserWithPersistentContext({ - browserContext: defaultContext, - version: browser.version(), - parentBrowser: browser, - }) as unknown as PlaywrightBrowser; - } - private _throwOnFailedLaunch(launchContext: LaunchContext, cause: unknown): never { this._throwAugmentedLaunchError( cause, diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index 827c6228194a..5b970d849bec 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -326,16 +326,16 @@ describe('Remote browser — PlaywrightPlugin', () => { // --- useIncognitoPages default -------------------------------------------- describe('useIncognitoPages default', () => { - test('defaults to false for remote (connectOverCDP)', () => { + test('forced to true for remote (connectOverCDP)', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, }); - expect(plugin.useIncognitoPages).toBe(false); + expect(plugin.useIncognitoPages).toBe(true); }); - test('defaults to true for remote (connect / WebSocket)', () => { + 
test('forced to true for remote (connect / WebSocket)', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { connectOptions: { wsEndpoint: 'ws://remote:3000' }, @@ -344,14 +344,17 @@ describe('Remote browser — PlaywrightPlugin', () => { expect(plugin.useIncognitoPages).toBe(true); }); - test('explicit false preserved for remote', () => { + test('explicit false is overridden to true for remote (with warning)', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, useIncognitoPages: false, }); - expect(plugin.useIncognitoPages).toBe(false); + expect(plugin.useIncognitoPages).toBe(true); + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('only support useIncognitoPages: true'), + ); }); test('explicit true preserved for remote', () => { @@ -390,26 +393,13 @@ describe('Remote browser — PlaywrightPlugin', () => { ); }); - test('remote CDP default → info about shared cookies', () => { + test('remote default → info about incognito-only', () => { const lib = createMockPlaywrightLibrary(); new PlaywrightPlugin(lib as any, { connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, }); - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining('pages will share cookies and storage'), - ); - }); - - test('remote WebSocket default → info about incognito true', () => { - const lib = createMockPlaywrightLibrary(); - new PlaywrightPlugin(lib as any, { - connectOptions: { wsEndpoint: 'ws://remote:3000' }, - }); - - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining('defaulting useIncognitoPages to true'), - ); + expect(mockLogger.info).toHaveBeenCalledWith(expect.stringContaining('useIncognitoPages forced to true')); }); test('no warnings for local browser usage', async () => { @@ -741,16 +731,16 @@ describe('remoteBrowser config — PlaywrightPlugin', () => { 
expect(ctx.isRemote).toBe(true); }); - test('useIncognitoPages defaults to false when remoteBrowser is set (CDP)', () => { + test('useIncognitoPages forced to true when remoteBrowser is set (CDP)', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: { endpoint: 'wss://test.io' }, }); - expect(plugin.useIncognitoPages).toBe(false); + expect(plugin.useIncognitoPages).toBe(true); }); - test('useIncognitoPages defaults to true when remoteBrowser is set (WebSocket)', () => { + test('useIncognitoPages forced to true when remoteBrowser is set (WebSocket)', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: { endpoint: 'wss://test.io', type: 'websocket' }, @@ -1042,7 +1032,7 @@ describe('RemoteBrowserProvider — PlaywrightPlugin', () => { expect(ctx.isRemote).toBe(true); }); - test('provider sets useIncognitoPages default to false (CDP)', () => { + test('provider forces useIncognitoPages to true (CDP)', () => { const lib = createMockPlaywrightLibrary(); class P extends RemoteBrowserProvider { @@ -1052,7 +1042,7 @@ describe('RemoteBrowserProvider — PlaywrightPlugin', () => { } const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: new P() }); - expect(plugin.useIncognitoPages).toBe(false); + expect(plugin.useIncognitoPages).toBe(true); }); }); From f99b98a10626739ff3f1d95344d5c8c7d01ffafd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Wed, 27 May 2026 13:50:23 +0000 Subject: [PATCH 25/45] Add remote-browser integration test suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a vitest-based integration suite under test/integration that exercises Crawlee end-to-end against a real Browserless instance. 
The first test verifies the force-incognito behavior for remote Playwright CDP connections: two requests landing on the same browser do not share cookies even when retireBrowserAfterPageCount is high and saveResponseCookies is disabled. Gated on CRAWLEE_DIFFICULT_TESTS so `pnpm test` skips the suite by default — `pnpm test:integration` and `pnpm test:full` set the flag. The suite expects Browserless and httpbin running on a shared Docker network; `pnpm test:integration:services:up` spins them up locally, and a new GitHub Actions workflow provides them as service containers. Also sets core-js-pure: false in pnpm-workspace.yaml allowBuilds to match prior skip-by-default behavior under pnpm 11. --- .github/workflows/test-integration.yml | 70 +++++++++++++++++++ package.json | 3 + pnpm-workspace.yaml | 65 ++++++++--------- test/integration/helpers.ts | 28 ++++++++ .../remote-browser-incognito.test.ts | 65 +++++++++++++++++ 5 files changed, 199 insertions(+), 32 deletions(-) create mode 100644 .github/workflows/test-integration.yml create mode 100644 test/integration/helpers.ts create mode 100644 test/integration/remote-browser-incognito.test.ts diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml new file mode 100644 index 000000000000..7fdded9d6a84 --- /dev/null +++ b/.github/workflows/test-integration.yml @@ -0,0 +1,70 @@ +name: Integration tests + +on: + pull_request: + branches: [ master, v4 ] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + remote-browser: + name: Remote browser integration + runs-on: ubuntu-22.04 + + # Side-services provide the remote browser and a deterministic HTTP target. 
+ services: + browserless: + image: ghcr.io/browserless/chromium:latest + ports: + - 3000:3000 + env: + CONCURRENT: 4 + options: >- + --health-cmd "wget -qO- http://localhost:3000/json/version || exit 1" + --health-interval 5s + --health-timeout 5s + --health-retries 12 + httpbin: + # kennethreitz/httpbin is python:3.6-slim and ships without wget/curl, + # so no Docker HEALTHCHECK — httpbin starts in <1s and the first test + # request will surface any real failure. + image: kennethreitz/httpbin:latest + ports: + - 8080:80 + + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Use Node.js 24 + uses: actions/setup-node@v6 + with: + node-version: 24 + package-manager-cache: false + + - name: Turbo cache + uses: actions/cache@v5 + with: + path: .turbo + key: turbo-${{ github.job }}-${{ github.ref_name }}-${{ github.sha }} + restore-keys: | + turbo-${{ github.job }}-${{ github.ref_name }}- + + - uses: apify/workflows/pnpm-install@main + + # No `playwright install` — these tests connect to remote Browserless + # over CDP and never launch a local browser binary. 
+ + - name: Build + run: pnpm ci:build + + - name: Run integration tests + run: pnpm test:integration + env: + BROWSERLESS_URL: http://localhost:3000 + HTTPBIN_URL: http://httpbin + CRAWLEE_DIFFICULT_TESTS: 1 + RETRY_TESTS: 1 diff --git a/package.json b/package.json index 75cc9bcedea7..64d0211c673c 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,9 @@ "ci:build": "turbo run build --filter=./packages/* --cache-dir=\".turbo\" && node ./scripts/typescript_fixes.mjs", "test": "vitest run --silent", "test:e2e": "node test/e2e/run.mjs", + "test:integration": "cross-env CRAWLEE_DIFFICULT_TESTS=1 vitest run --silent=true test/integration", + "test:integration:services:up": "docker network create crawlee-it 2>/dev/null; docker run -d --rm --name crawlee-it-browserless --network crawlee-it -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium && docker run -d --rm --name crawlee-it-httpbin --network crawlee-it --network-alias httpbin -p 8080:80 kennethreitz/httpbin", + "test:integration:services:down": "docker stop crawlee-it-browserless crawlee-it-httpbin; docker network rm crawlee-it 2>/dev/null; true", "test:full": "cross-env CRAWLEE_DIFFICULT_TESTS=1 vitest run --silent", "tsc-check-tests": "tsc --noEmit --project test/tsconfig.json", "coverage": "vitest --coverage", diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index d01f8dc0713e..113037afd2d1 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -1,26 +1,26 @@ packages: - - packages/* - - docs - - website + - packages/* + - docs + - website minimumReleaseAge: 1440 minimumReleaseAgeExclude: - - "@apify/*" - - "@crawlee/*" - - "apify-client" - - "apify" - - "crawlee" - - "got-scraping" + - "@apify/*" + - "@crawlee/*" + - "apify-client" + - "apify" + - "crawlee" + - "got-scraping" overrides: - playwright-core: 1.58.2 - "@browserbasehq/stagehand": 3.0.7 - # Dedup minimatch to v9 everywhere except inside lerna — lerna 9.x's bundled - # code (`__toESM(require('minimatch')).default(...)`) only 
works with v3, - # whose CJS export *is* the function. Pinning v9 there caused the publish - # step to silently report 0 changed packages. - minimatch: "^9.0.0" - "lerna>minimatch": "^3.1.4" + playwright-core: 1.58.2 + "@browserbasehq/stagehand": 3.0.7 + # Dedup minimatch to v9 everywhere except inside lerna — lerna 9.x's bundled + # code (`__toESM(require('minimatch')).default(...)`) only works with v3, + # whose CJS export *is* the function. Pinning v9 there caused the publish + # step to silently report 0 changed packages. + minimatch: "^9.0.0" + "lerna>minimatch": "^3.1.4" # pnpm 11 replaces `onlyBuiltDependencies` with an explicit `allowBuilds` map. # Each entry must be true (build allowed) or false (build skipped) — pnpm 11 @@ -28,19 +28,20 @@ overrides: # (combined with `strictDepBuilds: false` below so the install can still # proceed when new build-requesting deps appear without a manual entry). allowBuilds: - "@apify/ui-icons": true - "@playwright/browser-chromium": true - "@playwright/browser-firefox": true - "@playwright/browser-webkit": true - "@swc/core": true - better-sqlite3: true - bufferutil: true - core-js: true - esbuild: true - nx: true - protobufjs: true - puppeteer: true - unrs-resolver: true + "@apify/ui-icons": true + "@playwright/browser-chromium": true + "@playwright/browser-firefox": true + "@playwright/browser-webkit": true + "@swc/core": true + better-sqlite3: true + bufferutil: true + core-js: true + core-js-pure: false + esbuild: true + nx: true + protobufjs: true + puppeteer: true + unrs-resolver: true strictDepBuilds: false @@ -59,7 +60,7 @@ nodeLinker: hoisted linkWorkspacePackages: true preferWorkspacePackages: true publicHoistPattern: - - "*" + - "*" patchedDependencies: - "@docusaurus/core@3.9.2": patches/@docusaurus__core@3.9.2.patch + "@docusaurus/core@3.9.2": patches/@docusaurus__core@3.9.2.patch diff --git a/test/integration/helpers.ts b/test/integration/helpers.ts new file mode 100644 index 000000000000..c680a62c5a75 --- 
/dev/null +++ b/test/integration/helpers.ts @@ -0,0 +1,28 @@ +/** + * Helpers for remote-browser integration tests. + * + * These tests require a running Browserless instance and a deterministic HTTP + * target (httpbin). In CI both are provided as GitHub Actions service + * containers on a shared network. Locally, start them via + * `pnpm test:integration:services:up`. + * + * Network model: HTTPBIN_URL is consumed by the REMOTE browser (not the test + * runner). The browser lives in the Browserless container, so the URL must + * resolve inside that container's Docker network — typically `http://httpbin` + * via service name/alias. + * + * Env vars: + * BROWSERLESS_URL default: http://localhost:3000 (host-side; how the test + * runner reaches CDP) + * HTTPBIN_URL default: http://httpbin (browser-side; how the + * remote browser reaches + * httpbin via Docker DNS) + */ + +export const BROWSERLESS_URL = process.env.BROWSERLESS_URL ?? 'http://localhost:3000'; +export const HTTPBIN_URL = process.env.HTTPBIN_URL ?? 'http://httpbin'; + +/** Build a URL on the httpbin service from a path (e.g. '/cookies'). */ +export function httpbin(path: string): string { + return `${HTTPBIN_URL}${path.startsWith('/') ? path : `/${path}`}`; +} diff --git a/test/integration/remote-browser-incognito.test.ts b/test/integration/remote-browser-incognito.test.ts new file mode 100644 index 000000000000..d84bd488f739 --- /dev/null +++ b/test/integration/remote-browser-incognito.test.ts @@ -0,0 +1,65 @@ +/** + * Integration test: PlaywrightCrawler against a remote Browserless CDP endpoint + * forces useIncognitoPages: true, so two pages on the same remote browser do + * NOT share cookies. 
+ * + * Mirrors temp-examples/examples/cookie-sharing-pages-same-remote-browser.ts: + * - retireBrowserAfterPageCount: 10 → both requests stay on the same browser + * - saveResponseCookies: false → Session cannot carry cookies across requests + * - Request 1 → /cookies/set?TOKEN=… (httpbin Set-Cookie) + * - Request 2 → /cookies (httpbin echoes received cookies in body) + * + * With the wrapper removed, request 2's body should report no cookies. + */ +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; +import { PlaywrightCrawler } from 'crawlee'; +import { expect, test } from 'vitest'; + +import { BROWSERLESS_URL, httpbin } from './helpers.js'; + +class BrowserlessCDPProvider extends RemoteBrowserProvider { + maxOpenBrowsers = 1; + async connect() { + return { url: BROWSERLESS_URL }; + } +} + +// Gate on CRAWLEE_DIFFICULT_TESTS so plain `pnpm test` skips integration tests +// (no Docker required); `pnpm test:integration` and `pnpm test:full` set the flag. +test.skipIf(!process.env.CRAWLEE_DIFFICULT_TESTS)( + 'remote Playwright CDP: pages on the same browser do not share cookies', + async () => { + const observations: { controllerId: string; body: { cookies: Record } }[] = []; + + const crawler = new PlaywrightCrawler({ + launchContext: { + remoteBrowser: new BrowserlessCDPProvider(), + }, + browserPoolOptions: { + retireBrowserAfterPageCount: 10, // keep the same browser across both requests + maxOpenPagesPerBrowser: 2, + }, + saveResponseCookies: false, // remove Session-based propagation + maxConcurrency: 1, + maxRequestsPerCrawl: 2, + async requestHandler({ page, browserController }) { + const body = await page.evaluate(() => document.body.textContent?.trim()); + observations.push({ + controllerId: browserController.id, + body: body ? 
JSON.parse(body) : null, + }); + }, + }); + + await crawler.run([httpbin('/cookies/set?TOKEN=integration-test'), httpbin('/cookies')]); + + expect(observations).toHaveLength(2); + // Same browser handled both requests — otherwise the assertion below proves nothing. + expect(observations[0].controllerId).toBe(observations[1].controllerId); + // Request 1 actually got the cookie (else request 2's emptiness proves nothing). + expect(observations[0].body.cookies).toEqual({ TOKEN: 'integration-test' }); + // Request 2 (the /cookies echo) must NOT include the TOKEN cookie set by request 1. + expect(observations[1].body.cookies).toEqual({}); + }, + 60_000, +); From f7b7f030304036f529c418e6ef09865bffb3cbfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Wed, 27 May 2026 13:55:39 +0000 Subject: [PATCH 26/45] Add override modifier to maxOpenBrowsers in integration test --- test/integration/remote-browser-incognito.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/remote-browser-incognito.test.ts b/test/integration/remote-browser-incognito.test.ts index d84bd488f739..bb5d1d38125b 100644 --- a/test/integration/remote-browser-incognito.test.ts +++ b/test/integration/remote-browser-incognito.test.ts @@ -18,7 +18,7 @@ import { expect, test } from 'vitest'; import { BROWSERLESS_URL, httpbin } from './helpers.js'; class BrowserlessCDPProvider extends RemoteBrowserProvider { - maxOpenBrowsers = 1; + override maxOpenBrowsers = 1; async connect() { return { url: BROWSERLESS_URL }; } From 5abf3bc59e5604b69a39a9e3c4ddeca1bfcad036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Wed, 27 May 2026 14:06:37 +0000 Subject: [PATCH 27/45] Update proxyUrl + remote test assertions to match forwarded-info behavior --- packages/browser-pool/test/remote-browser.test.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/browser-pool/test/remote-browser.test.ts 
b/packages/browser-pool/test/remote-browser.test.ts index 5b970d849bec..041f2069b8f0 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -378,7 +378,7 @@ describe('Remote browser — PlaywrightPlugin', () => { // --- Info/Warnings -------------------------------------------------------- describe('info and warnings', () => { - test('proxyUrl + remote → warning logged', async () => { + test('proxyUrl + remote → info about forwarding to provider', async () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, @@ -388,8 +388,8 @@ describe('Remote browser — PlaywrightPlugin', () => { const ctx = plugin.createLaunchContext(); await plugin.launch(ctx); - expect(mockLogger.warning).toHaveBeenCalledWith( - expect.stringContaining('proxyUrl is set but will be ignored'), + expect(mockLogger.info).toHaveBeenCalledWith( + expect.stringContaining("forwarded to the remote browser provider's connect()"), ); }); @@ -615,7 +615,7 @@ describe('Remote browser — PuppeteerPlugin', () => { // --- Info/Warnings -------------------------------------------------------- describe('info and warnings', () => { - test('proxyUrl + remote → warning logged', async () => { + test('proxyUrl + remote → info about forwarding to provider', async () => { const lib = createMockPuppeteerLibrary(); const plugin = new PuppeteerPlugin(lib as any, { connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, @@ -625,8 +625,8 @@ describe('Remote browser — PuppeteerPlugin', () => { const ctx = plugin.createLaunchContext(); await plugin.launch(ctx); - expect(mockLogger.warning).toHaveBeenCalledWith( - expect.stringContaining('proxyUrl is set but will be ignored'), + expect(mockLogger.info).toHaveBeenCalledWith( + expect.stringContaining("forwarded to the remote browser provider's connect()"), ); }); From 
e8f5eda07ae90299d22300c9caab5bcd748eec82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Sat, 30 May 2026 14:15:58 +0000 Subject: [PATCH 28/45] Throw on mutually exclusive remoteBrowser / connect options Replace the warn-and-silently-drop path with a constructor throw in both PlaywrightPlugin and PuppeteerPlugin when more than one of remoteBrowser, connectOptions, or connectOverCDPOptions is set. Fixes the doc/impl mismatch where JSDoc claimed remoteBrowser "Takes precedence" but the implementation actually dropped it. --- .../src/abstract-classes/browser-plugin.ts | 2 +- .../src/playwright/playwright-plugin.ts | 22 +++------ .../src/puppeteer/puppeteer-plugin.ts | 19 +++---- .../browser-pool/test/remote-browser.test.ts | 49 ++++++++++--------- .../src/internals/playwright-launcher.ts | 2 +- .../src/internals/puppeteer-launcher.ts | 2 +- 6 files changed, 45 insertions(+), 51 deletions(-) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 6cf816d84af4..288f063b1a21 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -153,7 +153,7 @@ export interface BrowserPluginOptions { * * Accepts either a {@link RemoteBrowserConfig} object or a {@link RemoteBrowserProvider} instance. * - * Takes precedence over `connectOverCDPOptions` / `connectOptions` if both are set. + * Mutually exclusive with `connectOverCDPOptions` / `connectOptions` — setting more than one throws. 
*/ remoteBrowser?: RemoteBrowserConfig | RemoteBrowserProvider; } diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index ac0ea243fef2..59f2c8274691 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -53,8 +53,14 @@ export class PlaywrightPlugin extends BrowserPlugin< constructor(library: BrowserType, options: PlaywrightPluginOptions = {}) { const { connectOptions, connectOverCDPOptions, ...baseOptions } = options; - if (connectOptions && connectOverCDPOptions) { - throw new Error("Cannot set both 'connectOptions' and 'connectOverCDPOptions' — pick one protocol."); + const remoteSourceCount = [baseOptions.remoteBrowser, connectOptions, connectOverCDPOptions].filter( + (v) => v != null, + ).length; + if (remoteSourceCount > 1) { + throw new Error( + "Set at most one of 'remoteBrowser', 'connectOptions', 'connectOverCDPOptions' — " + + 'these options are mutually exclusive.', + ); } if (connectOverCDPOptions && !connectOverCDPOptions.endpointURL) { @@ -65,22 +71,10 @@ export class PlaywrightPlugin extends BrowserPlugin< throw new Error("'connectOptions.wsEndpoint' must be a non-empty string."); } - const remoteBrowserIgnored = !!(baseOptions.remoteBrowser && (connectOverCDPOptions || connectOptions)); - if (remoteBrowserIgnored) { - baseOptions.remoteBrowser = undefined; - } - super(library, baseOptions); this.connectOptions = connectOptions; this.connectOverCDPOptions = connectOverCDPOptions; - if (remoteBrowserIgnored) { - this.log.warning( - 'Both remoteBrowser and connectOverCDPOptions/connectOptions are set. 
' + - 'remoteBrowser is ignored when explicit connect options are provided.', - ); - } - const isRemoteConnection = this.remoteBrowser || this.connectOptions || this.connectOverCDPOptions; if (isRemoteConnection) { if (options.useIncognitoPages === false) { diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index d24f4bc5f4d1..b91fc578c327 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -39,6 +39,13 @@ export class PuppeteerPlugin extends BrowserPlugin< constructor(library: typeof Puppeteer, options: PuppeteerPluginOptions = {}) { const { connectOverCDPOptions, ...baseOptions } = options; + if (baseOptions.remoteBrowser && connectOverCDPOptions) { + throw new Error( + "Set at most one of 'remoteBrowser', 'connectOverCDPOptions' — these options are mutually exclusive. " + + 'Pick a single remote connection source.', + ); + } + if (connectOverCDPOptions && !connectOverCDPOptions.browserWSEndpoint && !connectOverCDPOptions.browserURL) { throw new Error("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'."); } @@ -47,21 +54,9 @@ export class PuppeteerPlugin extends BrowserPlugin< throw new Error("Puppeteer does not support 'websocket' connection type. Use 'cdp' (default) instead."); } - const remoteBrowserIgnored = !!(baseOptions.remoteBrowser && connectOverCDPOptions); - if (remoteBrowserIgnored) { - baseOptions.remoteBrowser = undefined; - } - super(library, baseOptions); this.connectOverCDPOptions = connectOverCDPOptions; - if (remoteBrowserIgnored) { - this.log.warning( - 'Both remoteBrowser and connectOverCDPOptions are set. 
' + - 'remoteBrowser is ignored when explicit connect options are provided.', - ); - } - const isRemoteConnection = this.remoteBrowser || this.connectOverCDPOptions; if (isRemoteConnection && options.useIncognitoPages === undefined) { this.log.info( diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index 041f2069b8f0..39da21728db9 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -202,7 +202,7 @@ describe('Remote browser — PlaywrightPlugin', () => { connectOptions: { wsEndpoint: 'ws://remote:3000' }, connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, }), - ).toThrow("Cannot set both 'connectOptions' and 'connectOverCDPOptions'"); + ).toThrow('mutually exclusive'); }); test('throws when connectOverCDPOptions has no endpointURL', () => { @@ -813,18 +813,26 @@ describe('remoteBrowser config — PlaywrightPlugin', () => { await expect(plugin.launch(ctx)).rejects.toThrow('Failed to resolve remote browser endpoint'); }); - test('remoteBrowser ignored when connectOverCDPOptions also set', async () => { + test('throws when both remoteBrowser and connectOverCDPOptions are set', () => { const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://ignored.io' }, - connectOverCDPOptions: { endpointURL: 'wss://explicit.io' }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); + expect( + () => + new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://a.io' }, + connectOverCDPOptions: { endpointURL: 'wss://b.io' }, + }), + ).toThrow('mutually exclusive'); + }); - expect(lib.connectOverCDP).toHaveBeenCalledWith('wss://explicit.io', {}); - expect(mockLogger.warning).toHaveBeenCalledWith(expect.stringContaining('remoteBrowser is ignored')); + test('throws when both remoteBrowser and connectOptions are set', () => { + const lib 
= createMockPlaywrightLibrary(); + expect( + () => + new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://a.io' }, + connectOptions: { wsEndpoint: 'wss://b.io' }, + }), + ).toThrow('mutually exclusive'); }); }); @@ -900,18 +908,15 @@ describe('remoteBrowser config — PuppeteerPlugin', () => { expect(releaseFn).toHaveBeenCalledWith({ endpoint: 'wss://fail.io', context: { id: 'sess-456' } }); }); - test('remoteBrowser ignored when connectOverCDPOptions also set', async () => { + test('throws when both remoteBrowser and connectOverCDPOptions are set', () => { const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://ignored.io' }, - connectOverCDPOptions: { browserWSEndpoint: 'wss://explicit.io' }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://explicit.io' }); - expect(mockLogger.warning).toHaveBeenCalledWith(expect.stringContaining('remoteBrowser is ignored')); + expect( + () => + new PuppeteerPlugin(lib as any, { + remoteBrowser: { endpoint: 'wss://a.io' }, + connectOverCDPOptions: { browserWSEndpoint: 'wss://b.io' }, + }), + ).toThrow('mutually exclusive'); }); }); diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index d05e60572d40..d941b43f52ee 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -94,7 +94,7 @@ export interface PlaywrightLaunchContext extends BrowserLaunchContext Date: Sat, 30 May 2026 18:23:22 +0000 Subject: [PATCH 29/45] Throw when launchOptions is combined with a remote browser option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PlaywrightLauncher and PuppeteerLauncher now throw if launchOptions is set alongside 
connectOptions, connectOverCDPOptions, or remoteBrowser. The launcher is the right layer for this check — at the plugin level the launcher always injects defaults (executablePath) into launchOptions, so the plugin cannot distinguish user-set from framework-default. Removes the now-unreachable executablePath warning and consolidates the useChrome warning behind the unified hasRemote flag. --- .../src/internals/playwright-launcher.ts | 34 +++++++++---------- .../src/internals/puppeteer-launcher.ts | 31 +++++++---------- .../playwright_launcher.test.ts | 29 ++++++++++++++++ .../puppeteer_launcher.test.ts | 20 +++++++++++ 4 files changed, 79 insertions(+), 35 deletions(-) diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index d941b43f52ee..5100856dc9ab 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -143,6 +143,18 @@ export class PlaywrightLauncher extends BrowserLauncher { ) { ow(launchContext, 'PlaywrightLauncherOptions', ow.object.exactShape(PlaywrightLauncher.optionsShape)); + const hasRemote = !!( + launchContext.connectOptions || + launchContext.connectOverCDPOptions || + launchContext.remoteBrowser + ); + if (hasRemote && launchContext.launchOptions !== undefined) { + throw new Error( + "'launchOptions' is ignored when using a remote browser. 
Set at most one of " + + "'launchOptions', 'connectOptions', 'connectOverCDPOptions', 'remoteBrowser'.", + ); + } + const { launcher = BrowserLauncher.requireLauncherOrThrow( 'playwright', @@ -166,24 +178,12 @@ export class PlaywrightLauncher extends BrowserLauncher { this.Plugin = PlaywrightPlugin; - const connectOptionsPresent = !!(launchContext.connectOptions || launchContext.connectOverCDPOptions); - - if (connectOptionsPresent && (launchContext.useChrome || launchContext.launchOptions?.executablePath)) { + if (hasRemote && launchContext.useChrome) { const log = serviceLocator.getLogger().child({ prefix: 'PlaywrightLauncher' }); - - if (launchContext.useChrome) { - log.warning( - 'useChrome is set but will be ignored for remote browser connections. ' + - 'The remote service controls which browser binary is used.', - ); - } - - if (launchContext.launchOptions?.executablePath) { - log.warning( - 'executablePath is set but will be ignored for remote browser connections. ' + - 'The remote service controls which browser binary is used.', - ); - } + log.warning( + 'useChrome is set but will be ignored for remote browser connections. ' + + 'The remote service controls which browser binary is used.', + ); } } } diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts index 1b2d450494f1..d81fa77dc5c6 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts @@ -123,6 +123,14 @@ export class PuppeteerLauncher extends BrowserLauncher ) { ow(launchContext, 'PuppeteerLauncher', ow.object.exactShape(PuppeteerLauncher.optionsShape)); + const hasRemote = !!(launchContext.connectOverCDPOptions || launchContext.remoteBrowser); + if (hasRemote && launchContext.launchOptions !== undefined) { + throw new Error( + "'launchOptions' is ignored when using a remote browser. 
Set at most one of " + + "'launchOptions', 'connectOverCDPOptions', 'remoteBrowser'.", + ); + } + const { launcher = BrowserLauncher.requireLauncherOrThrow('puppeteer', 'apify/actor-node-puppeteer-chrome'), ...browserLauncherOptions @@ -138,25 +146,12 @@ export class PuppeteerLauncher extends BrowserLauncher this.Plugin = PuppeteerPlugin; - if ( - launchContext.connectOverCDPOptions && - (launchContext.useChrome || (launchContext.launchOptions as Record)?.executablePath) - ) { + if (hasRemote && launchContext.useChrome) { const log = serviceLocator.getLogger().child({ prefix: 'PuppeteerLauncher' }); - - if (launchContext.useChrome) { - log.warning( - 'useChrome is set but will be ignored for remote browser connections. ' + - 'The remote service controls which browser binary is used.', - ); - } - - if ((launchContext.launchOptions as Record)?.executablePath) { - log.warning( - 'executablePath is set but will be ignored for remote browser connections. ' + - 'The remote service controls which browser binary is used.', - ); - } + log.warning( + 'useChrome is set but will be ignored for remote browser connections. 
' + + 'The remote service controls which browser binary is used.', + ); } } diff --git a/test/core/browser_launchers/playwright_launcher.test.ts b/test/core/browser_launchers/playwright_launcher.test.ts index e2008b3458e0..fa2cc35ce351 100644 --- a/test/core/browser_launchers/playwright_launcher.test.ts +++ b/test/core/browser_launchers/playwright_launcher.test.ts @@ -288,4 +288,33 @@ describe('launchPlaywright()', () => { recursive: true, }); }); + + describe('launchOptions + remote mutual exclusion', () => { + test('throws when launchOptions combined with connectOptions', async () => { + await expect( + launchPlaywright({ + launchOptions: { headless: true }, + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + }), + ).rejects.toThrow("'launchOptions' is ignored when using a remote browser"); + }); + + test('throws when launchOptions combined with connectOverCDPOptions', async () => { + await expect( + launchPlaywright({ + launchOptions: { headless: true }, + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + }), + ).rejects.toThrow("'launchOptions' is ignored when using a remote browser"); + }); + + test('throws when launchOptions combined with remoteBrowser', async () => { + await expect( + launchPlaywright({ + launchOptions: { headless: true }, + remoteBrowser: { endpoint: 'wss://remote.io' }, + }), + ).rejects.toThrow("'launchOptions' is ignored when using a remote browser"); + }); + }); }); diff --git a/test/core/browser_launchers/puppeteer_launcher.test.ts b/test/core/browser_launchers/puppeteer_launcher.test.ts index 0963cd62de6b..735f832b715e 100644 --- a/test/core/browser_launchers/puppeteer_launcher.test.ts +++ b/test/core/browser_launchers/puppeteer_launcher.test.ts @@ -308,4 +308,24 @@ describe('launchPuppeteer()', () => { recursive: true, }); }); + + describe('launchOptions + remote mutual exclusion', () => { + test('throws when launchOptions combined with connectOverCDPOptions', async () => { + await expect( + launchPuppeteer({ + 
launchOptions: { headless: true }, + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:3000' }, + }), + ).rejects.toThrow("'launchOptions' is ignored when using a remote browser"); + }); + + test('throws when launchOptions combined with remoteBrowser', async () => { + await expect( + launchPuppeteer({ + launchOptions: { headless: true }, + remoteBrowser: { endpoint: 'wss://remote.io' }, + }), + ).rejects.toThrow("'launchOptions' is ignored when using a remote browser"); + }); + }); }); From 90f1e266841c529c88d884bb9376689314a64d54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Sat, 30 May 2026 18:29:31 +0000 Subject: [PATCH 30/45] Rename remoteBrowser.type 'websocket' to 'playwright' CDP is also a WebSocket protocol, so 'websocket' was a misleading label. Rename to 'playwright', which names the actual transport (Playwright's client-server protocol exposed via browserType.connect()). Updated: RemoteBrowserConfig.type, PlaywrightRemoteBrowserConfig.type, RemoteBrowserProvider.type, the playwright-plugin branch, the puppeteer "not supported" error message, the connect log line, and all tests. --- .../src/abstract-classes/browser-plugin.ts | 6 +++- .../src/playwright/playwright-plugin.ts | 4 +-- .../src/puppeteer/puppeteer-plugin.ts | 4 +-- .../src/remote-browser-provider.ts | 6 ++-- .../browser-pool/test/remote-browser.test.ts | 32 +++++++++---------- .../src/internals/playwright-launcher.ts | 12 ++++--- 6 files changed, 35 insertions(+), 29 deletions(-) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 288f063b1a21..7b911e3e30f4 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -99,9 +99,13 @@ export interface RemoteBrowserConfig { /** * Connection type. Subclass interfaces narrow this further * (e.g. Puppeteer only allows `'cdp'`). 
+ * + * - `'cdp'` — Chrome DevTools Protocol (Puppeteer + Playwright). + * - `'playwright'` — Playwright's own client-server protocol (Playwright only). + * * @default 'cdp' */ - type?: 'cdp' | 'websocket'; + type?: 'cdp' | 'playwright'; /** * Maximum number of browsers that can be open at the same time. * When the limit is reached, the crawler waits for a browser to close before launching a new one. diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index 59f2c8274691..0d7971ad1319 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -119,8 +119,8 @@ export class PlaywrightPlugin extends BrowserPlugin< launchContext.extend({ _resolvedRemoteEndpoint: url, _remoteContext: context }); try { - if (type === 'websocket') { - this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); + if (type === 'playwright') { + this.log.info('Connecting to remote browser via connect (Playwright protocol).'); return await this.library.connect(url, {}); } this.log.info('Connecting to remote browser via connectOverCDP.'); diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index b91fc578c327..7003797f613c 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -50,8 +50,8 @@ export class PuppeteerPlugin extends BrowserPlugin< throw new Error("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'."); } - if (baseOptions.remoteBrowser?.type === 'websocket') { - throw new Error("Puppeteer does not support 'websocket' connection type. Use 'cdp' (default) instead."); + if (baseOptions.remoteBrowser?.type === 'playwright') { + throw new Error("Puppeteer does not support 'playwright' connection type. 
Use 'cdp' (default) instead."); } super(library, baseOptions); diff --git a/packages/browser-pool/src/remote-browser-provider.ts b/packages/browser-pool/src/remote-browser-provider.ts index 3fe43b643431..fe4d901a0239 100644 --- a/packages/browser-pool/src/remote-browser-provider.ts +++ b/packages/browser-pool/src/remote-browser-provider.ts @@ -49,12 +49,12 @@ export abstract class RemoteBrowserProvider = Record> { /** * Connection type. - * - `'cdp'` — Chrome DevTools Protocol, works with Puppeteer and Playwright. - * - `'websocket'` — Playwright-specific WebSocket protocol (not supported by Puppeteer). + * - `'cdp'` — Chrome DevTools Protocol (Puppeteer + Playwright). + * - `'playwright'` — Playwright's own client-server protocol (Playwright only). * * @default 'cdp' */ - type: 'cdp' | 'websocket' = 'cdp'; + type: 'cdp' | 'playwright' = 'cdp'; /** * Maximum number of browsers that can be open at the same time. diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index 39da21728db9..2f54ce23ec2e 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -679,10 +679,10 @@ describe('remoteBrowser config — PlaywrightPlugin', () => { expect(lib.connect).not.toHaveBeenCalled(); }); - test('static string endpoint with type websocket → calls connect', async () => { + test('static string endpoint with type playwright → calls connect', async () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://browserless.io/ws', type: 'websocket' }, + remoteBrowser: { endpoint: 'wss://browserless.io/ws', type: 'playwright' }, }); const ctx = plugin.createLaunchContext(); @@ -740,10 +740,10 @@ describe('remoteBrowser config — PlaywrightPlugin', () => { expect(plugin.useIncognitoPages).toBe(true); }); - test('useIncognitoPages forced to true when remoteBrowser is set (WebSocket)', () 
=> { + test('useIncognitoPages forced to true when remoteBrowser is set (Playwright)', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://test.io', type: 'websocket' }, + remoteBrowser: { endpoint: 'wss://test.io', type: 'playwright' }, }); expect(plugin.useIncognitoPages).toBe(true); @@ -871,13 +871,13 @@ describe('remoteBrowser config — PuppeteerPlugin', () => { expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://dynamic.io' }); }); - test('type websocket throws in constructor', () => { + test('type playwright throws in Puppeteer constructor', () => { const lib = createMockPuppeteerLibrary(); expect(() => { new PuppeteerPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://test.io', type: 'websocket' } as any, + remoteBrowser: { endpoint: 'wss://test.io', type: 'playwright' } as any, }); - }).toThrow("does not support 'websocket'"); + }).toThrow("does not support 'playwright'"); }); test('isRemote is true when remoteBrowser is set', () => { @@ -953,18 +953,18 @@ describe('RemoteBrowserProvider — PlaywrightPlugin', () => { expect(lib.launch).not.toHaveBeenCalled(); }); - test('provider with type=websocket → calls connect', async () => { + test('provider with type=playwright → calls connect', async () => { const lib = createMockPlaywrightLibrary(); - class WsProvider extends RemoteBrowserProvider { - override type = 'websocket' as const; + class PwProvider extends RemoteBrowserProvider { + override type = 'playwright' as const; async connect() { return { url: 'wss://provider.io/ws' }; } } const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: new WsProvider(), + remoteBrowser: new PwProvider(), }); const ctx = plugin.createLaunchContext(); @@ -1076,19 +1076,19 @@ describe('RemoteBrowserProvider — PuppeteerPlugin', () => { expect(lib.launch).not.toHaveBeenCalled(); }); - test('provider with type=websocket throws in Puppeteer', () => { + 
test('provider with type=playwright throws in Puppeteer', () => { const lib = createMockPuppeteerLibrary(); - class WsProvider extends RemoteBrowserProvider { - override type = 'websocket' as const; + class PwProvider extends RemoteBrowserProvider { + override type = 'playwright' as const; async connect() { return { url: 'wss://test.io' }; } } expect(() => { - new PuppeteerPlugin(lib as any, { remoteBrowser: new WsProvider() }); - }).toThrow("does not support 'websocket'"); + new PuppeteerPlugin(lib as any, { remoteBrowser: new PwProvider() }); + }).toThrow("does not support 'playwright'"); }); test('provider release called on connection failure', async () => { diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index 5100856dc9ab..e9fb75c7e371 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -100,7 +100,7 @@ export interface PlaywrightLaunchContext extends BrowserLaunchContext Date: Sat, 30 May 2026 18:34:52 +0000 Subject: [PATCH 31/45] =?UTF-8?q?Drop=20remoteBrowser.type=20=E2=80=94=20C?= =?UTF-8?q?DP-only=20remote=20browser=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RemoteBrowserConfig / RemoteBrowserProvider abstraction is built for remote browser services (Browserless, Browserbase, Steel), which all speak CDP. The 'websocket'/'playwright' branch (browserType.connect()) had no real provider behind it, and naming it 'websocket' was misleading (CDP also rides WebSocket). Rather than commit to a name that BiDi will make obsolete anyway, drop the field entirely. Callers who genuinely need Playwright's connect() can still use connectOptions directly. 
Removes: - RemoteBrowserConfig.type and RemoteBrowserProvider.type - PlaywrightRemoteBrowserConfig and PuppeteerRemoteBrowserConfig (now-empty interface extensions) - The 'playwright' branch in PlaywrightPlugin._launch - The "Puppeteer does not support 'playwright'" throw + tests - 5 type-related test cases --- .../src/abstract-classes/browser-plugin.ts | 11 --- .../src/playwright/playwright-plugin.ts | 9 +-- .../src/puppeteer/puppeteer-plugin.ts | 4 -- .../src/remote-browser-provider.ts | 9 --- .../browser-pool/test/remote-browser.test.ts | 69 +------------------ .../src/internals/playwright-launcher.ts | 18 +---- .../src/internals/puppeteer-launcher.ts | 14 +--- 7 files changed, 5 insertions(+), 129 deletions(-) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 7b911e3e30f4..607394b12674 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -96,16 +96,6 @@ export interface RemoteBrowserConfig { * Errors are caught and logged as warnings — they never crash the crawler. */ release?: (info: { endpoint: string; context?: Record }) => void | Promise; - /** - * Connection type. Subclass interfaces narrow this further - * (e.g. Puppeteer only allows `'cdp'`). - * - * - `'cdp'` — Chrome DevTools Protocol (Puppeteer + Playwright). - * - `'playwright'` — Playwright's own client-server protocol (Playwright only). - * - * @default 'cdp' - */ - type?: 'cdp' | 'playwright'; /** * Maximum number of browsers that can be open at the same time. * When the limit is reached, the crawler waits for a browser to close before launching a new one. 
@@ -224,7 +214,6 @@ export abstract class BrowserPlugin< this.remoteBrowser = { endpoint: (options) => provider.connect(options), release: ({ context }) => provider.release(context as any), - type: provider.type, maxOpenBrowsers: provider.maxOpenBrowsers, }; } else { diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index 0d7971ad1319..6c63722a118e 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -102,7 +102,6 @@ export class PlaywrightPlugin extends BrowserPlugin< protected async _launch(launchContext: LaunchContext): Promise { if (this.remoteBrowser) { - const type = this.remoteBrowser.type ?? 'cdp'; let url: string; let context: Record | undefined; try { @@ -119,17 +118,13 @@ export class PlaywrightPlugin extends BrowserPlugin< launchContext.extend({ _resolvedRemoteEndpoint: url, _remoteContext: context }); try { - if (type === 'playwright') { - this.log.info('Connecting to remote browser via connect (Playwright protocol).'); - return await this.library.connect(url, {}); - } this.log.info('Connecting to remote browser via connectOverCDP.'); return await this.library.connectOverCDP(url, {}); } catch (cause) { await this._callRelease(url, context); throw new BrowserLaunchError( - `Failed to connect to remote browser at "${this._sanitizeEndpointForLog(url)}". ` + - `Connection type: ${type}. Check that the endpoint is reachable.\n\u200b`, + `Failed to connect to remote browser at "${this._sanitizeEndpointForLog(url)}" via CDP. 
` + + 'Check that the endpoint is reachable.\n\u200b', { cause }, ); } diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 7003797f613c..b4e82a3c6c26 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -50,10 +50,6 @@ export class PuppeteerPlugin extends BrowserPlugin< throw new Error("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'."); } - if (baseOptions.remoteBrowser?.type === 'playwright') { - throw new Error("Puppeteer does not support 'playwright' connection type. Use 'cdp' (default) instead."); - } - super(library, baseOptions); this.connectOverCDPOptions = connectOverCDPOptions; diff --git a/packages/browser-pool/src/remote-browser-provider.ts b/packages/browser-pool/src/remote-browser-provider.ts index fe4d901a0239..85dd4b4cf851 100644 --- a/packages/browser-pool/src/remote-browser-provider.ts +++ b/packages/browser-pool/src/remote-browser-provider.ts @@ -47,15 +47,6 @@ * ``` */ export abstract class RemoteBrowserProvider = Record> { - /** - * Connection type. - * - `'cdp'` — Chrome DevTools Protocol (Puppeteer + Playwright). - * - `'playwright'` — Playwright's own client-server protocol (Playwright only). - * - * @default 'cdp' - */ - type: 'cdp' | 'playwright' = 'cdp'; - /** * Maximum number of browsers that can be open at the same time. * Set this to your remote service's concurrent session limit to avoid 429 errors. 
diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index 2f54ce23ec2e..a534c23ea831 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -679,19 +679,6 @@ describe('remoteBrowser config — PlaywrightPlugin', () => { expect(lib.connect).not.toHaveBeenCalled(); }); - test('static string endpoint with type playwright → calls connect', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://browserless.io/ws', type: 'playwright' }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connect).toHaveBeenCalledWith('wss://browserless.io/ws', {}); - expect(lib.connectOverCDP).not.toHaveBeenCalled(); - }); - test('function endpoint → called per launch', async () => { const lib = createMockPlaywrightLibrary(); const endpointFn = vi.fn().mockResolvedValue('wss://dynamic-endpoint.io'); @@ -731,7 +718,7 @@ describe('remoteBrowser config — PlaywrightPlugin', () => { expect(ctx.isRemote).toBe(true); }); - test('useIncognitoPages forced to true when remoteBrowser is set (CDP)', () => { + test('useIncognitoPages forced to true when remoteBrowser is set', () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: { endpoint: 'wss://test.io' }, @@ -740,15 +727,6 @@ describe('remoteBrowser config — PlaywrightPlugin', () => { expect(plugin.useIncognitoPages).toBe(true); }); - test('useIncognitoPages forced to true when remoteBrowser is set (Playwright)', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://test.io', type: 'playwright' }, - }); - - expect(plugin.useIncognitoPages).toBe(true); - }); - test('release called on connection failure with context', async () => { const lib = 
createMockPlaywrightLibrary(); lib.connectOverCDP.mockRejectedValue(new Error('Connection refused')); @@ -871,15 +849,6 @@ describe('remoteBrowser config — PuppeteerPlugin', () => { expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://dynamic.io' }); }); - test('type playwright throws in Puppeteer constructor', () => { - const lib = createMockPuppeteerLibrary(); - expect(() => { - new PuppeteerPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://test.io', type: 'playwright' } as any, - }); - }).toThrow("does not support 'playwright'"); - }); - test('isRemote is true when remoteBrowser is set', () => { const lib = createMockPuppeteerLibrary(); const plugin = new PuppeteerPlugin(lib as any, { @@ -953,27 +922,6 @@ describe('RemoteBrowserProvider — PlaywrightPlugin', () => { expect(lib.launch).not.toHaveBeenCalled(); }); - test('provider with type=playwright → calls connect', async () => { - const lib = createMockPlaywrightLibrary(); - - class PwProvider extends RemoteBrowserProvider { - override type = 'playwright' as const; - async connect() { - return { url: 'wss://provider.io/ws' }; - } - } - - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: new PwProvider(), - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connect).toHaveBeenCalledWith('wss://provider.io/ws', {}); - expect(lib.connectOverCDP).not.toHaveBeenCalled(); - }); - test('provider context flows to release', async () => { const lib = createMockPlaywrightLibrary(); @@ -1076,21 +1024,6 @@ describe('RemoteBrowserProvider — PuppeteerPlugin', () => { expect(lib.launch).not.toHaveBeenCalled(); }); - test('provider with type=playwright throws in Puppeteer', () => { - const lib = createMockPuppeteerLibrary(); - - class PwProvider extends RemoteBrowserProvider { - override type = 'playwright' as const; - async connect() { - return { url: 'wss://test.io' }; - } - } - - expect(() => { - new PuppeteerPlugin(lib as any, { remoteBrowser: 
new PwProvider() }); - }).toThrow("does not support 'playwright'"); - }); - test('provider release called on connection failure', async () => { const lib = createMockPuppeteerLibrary(); lib.connect.mockRejectedValue(new Error('Connection refused')); diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index e9fb75c7e371..9268b2937acc 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -100,26 +100,10 @@ export interface PlaywrightLaunchContext extends BrowserLaunchContext; -} - -/** - * Remote browser configuration for Playwright crawlers. - * Supports both CDP and Playwright's own client-server protocol. - */ -export interface PlaywrightRemoteBrowserConfig extends RemoteBrowserConfig { - /** - * Connection type to use. - * - `'cdp'` uses `browserType.connectOverCDP()` (Chrome DevTools Protocol). - * - `'playwright'` uses `browserType.connect()` (Playwright's client-server protocol). - * - * @default 'cdp' - */ - type?: 'cdp' | 'playwright'; + remoteBrowser?: RemoteBrowserConfig | RemoteBrowserProvider; } /** diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts index d81fa77dc5c6..32b41d1ea7cf 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts @@ -87,19 +87,7 @@ export interface PuppeteerLaunchContext extends BrowserLaunchContext; -} - -/** - * Remote browser configuration for Puppeteer crawlers. - * Only CDP connections are supported (Puppeteer does not have a WebSocket connection mode). - */ -export interface PuppeteerRemoteBrowserConfig extends RemoteBrowserConfig { - /** - * Connection type. Only `'cdp'` is supported for Puppeteer. 
- * @default 'cdp' - */ - type?: 'cdp'; + remoteBrowser?: RemoteBrowserConfig | RemoteBrowserProvider; } /** From 741c1a67835770b29a026242549eaffd79beda55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Sat, 30 May 2026 18:42:38 +0000 Subject: [PATCH 32/45] cleanup --- packages/browser-pool/src/playwright/playwright-plugin.ts | 8 ++++---- packages/browser-pool/src/puppeteer/puppeteer-plugin.ts | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index 6c63722a118e..6aad20a0a681 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -110,7 +110,7 @@ export class PlaywrightPlugin extends BrowserPlugin< context = result.context; } catch (cause) { throw new BrowserLaunchError( - 'Failed to resolve remote browser endpoint from remoteBrowser.endpoint() function.\n\u200b', + 'Failed to resolve remote browser endpoint from remoteBrowser.endpoint() function.\u200b', { cause }, ); } @@ -124,7 +124,7 @@ export class PlaywrightPlugin extends BrowserPlugin< await this._callRelease(url, context); throw new BrowserLaunchError( `Failed to connect to remote browser at "${this._sanitizeEndpointForLog(url)}" via CDP. ` + - 'Check that the endpoint is reachable.\n\u200b', + 'Check that the endpoint is reachable.\u200b', { cause }, ); } @@ -139,7 +139,7 @@ export class PlaywrightPlugin extends BrowserPlugin< } catch (cause) { throw new BrowserLaunchError( `Failed to connect to remote browser via CDP at "${this._sanitizeEndpointForLog(endpointURL)}". 
` + - 'Check that the endpoint is reachable and the browser is accepting CDP connections.\n\u200b', + 'Check that the endpoint is reachable and the browser is accepting CDP connections.\u200b', { cause }, ); } @@ -154,7 +154,7 @@ export class PlaywrightPlugin extends BrowserPlugin< } catch (cause) { throw new BrowserLaunchError( `Failed to connect to remote browser via WebSocket at "${this._sanitizeEndpointForLog(wsEndpoint)}". ` + - 'Check that the endpoint is reachable and the Playwright server is running.\n\u200b', + 'Check that the endpoint is reachable and the Playwright server is running.\u200b', { cause }, ); } diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index b4e82a3c6c26..54de0c12c7c7 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -108,7 +108,7 @@ export class PuppeteerPlugin extends BrowserPlugin< context = result.context; } catch (cause) { throw new BrowserLaunchError( - 'Failed to resolve remote browser endpoint from remoteBrowser.endpoint() function.\n\u200b', + 'Failed to resolve remote browser endpoint from remoteBrowser.endpoint() function.\u200b', { cause }, ); } @@ -122,7 +122,7 @@ export class PuppeteerPlugin extends BrowserPlugin< await this._callRelease(url, context); throw new BrowserLaunchError( `Failed to connect to remote browser at "${this._sanitizeEndpointForLog(url)}". ` + - 'Check that the endpoint is reachable and the browser is accepting CDP connections.\n\u200b', + 'Check that the endpoint is reachable and the browser is accepting CDP connections.\u200b', { cause }, ); } @@ -136,7 +136,7 @@ export class PuppeteerPlugin extends BrowserPlugin< const safeEndpoint = this._sanitizeEndpointForLog(endpoint); throw new BrowserLaunchError( `Failed to connect to remote browser via CDP at "${safeEndpoint}". 
` + - 'Check that the endpoint is reachable and the browser is accepting CDP connections.\n\u200b', + 'Check that the endpoint is reachable and the browser is accepting CDP connections.\u200b', { cause }, ); } From 9eccb11db278989248c9122367b0aec488361342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Sat, 30 May 2026 19:25:23 +0000 Subject: [PATCH 33/45] Warn and drop top-level headless when remoteBrowser is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The crawler-level 'headless' shortcut synthesized a launchContext.launchOptions object, which then tripped the launcher's mutual-exclusion check against remoteBrowser. Warn and skip the mutation instead — remote services control headless mode anyway. Mirrors the existing useChrome warning in the launcher. --- .../src/internals/playwright-crawler.ts | 12 ++++++++++-- .../src/internals/puppeteer-crawler.ts | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/packages/playwright-crawler/src/internals/playwright-crawler.ts b/packages/playwright-crawler/src/internals/playwright-crawler.ts index 753f2273be67..2332935b6ef5 100644 --- a/packages/playwright-crawler/src/internals/playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/playwright-crawler.ts @@ -221,8 +221,16 @@ export class PlaywrightCrawler< } if (headless != null) { - launchContext.launchOptions ??= {} as LaunchOptions; - launchContext.launchOptions.headless = headless as boolean; + if (launchContext.remoteBrowser) { + const log = serviceLocator.getLogger().child({ prefix: 'PlaywrightCrawler' }); + log.warning( + "'headless' is ignored when using a remote browser. 
" + + 'The remote service controls headless mode.', + ); + } else { + launchContext.launchOptions ??= {} as LaunchOptions; + launchContext.launchOptions.headless = headless as boolean; + } } const playwrightLauncher = new PlaywrightLauncher(launchContext, options.configuration); diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts index 1ab8385f88c6..ac0c40b53856 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts @@ -198,8 +198,16 @@ export class PuppeteerCrawler< } if (headless != null) { - launchContext.launchOptions ??= {} as LaunchOptions; - launchContext.launchOptions.headless = headless as boolean; + if (launchContext.remoteBrowser) { + const log = serviceLocator.getLogger().child({ prefix: 'PuppeteerCrawler' }); + log.warning( + "'headless' is ignored when using a remote browser. " + + 'The remote service controls headless mode.', + ); + } else { + launchContext.launchOptions ??= {} as LaunchOptions; + launchContext.launchOptions.headless = headless as boolean; + } } const puppeteerLauncher = new PuppeteerLauncher(launchContext, options.configuration); From ca529b67df210b942d65d11a205fe9937512f6b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Sat, 30 May 2026 23:13:34 +0000 Subject: [PATCH 34/45] ci: inline pnpm install in Build & Test for debugging Replace apify/workflows/pnpm-install@main with a direct pnpm install call without --loglevel error and without the pnpm store cache, to surface the actual error behind the 8-min silent hang on Node 24. Revert once root cause is identified. 
--- .github/workflows/test-ci.yml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-ci.yml b/.github/workflows/test-ci.yml index a7cecadc44a7..bb0e2eb80d17 100644 --- a/.github/workflows/test-ci.yml +++ b/.github/workflows/test-ci.yml @@ -46,7 +46,23 @@ jobs: restore-keys: | turbo-${{ github.job }}-${{ matrix.node-version }}-${{ github.ref_name }}- - - uses: apify/workflows/pnpm-install@main + # DEBUG: inline install, no pnpm store cache, no --loglevel error. + # Remove once #3545 install hang is understood. + - name: Install pnpm + uses: pnpm/action-setup@v6.0.8 + with: + run_install: false + + - name: Install dependencies (debug) + run: | + set -x + node --version + pnpm --version + df -h / + free -h + pnpm install --frozen-lockfile --reporter=default + env: + HUSKY: "0" - name: Install Playwright browsers run: pnpm exec playwright install --with-deps From 8e4a98c8cb72e929cb932b96b3b59c6c7164d55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Sun, 31 May 2026 01:14:12 +0200 Subject: [PATCH 35/45] Revert "ci: inline pnpm install in Build & Test for debugging" This reverts commit ca529b67df210b942d65d11a205fe9937512f6b0. --- .github/workflows/test-ci.yml | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/.github/workflows/test-ci.yml b/.github/workflows/test-ci.yml index bb0e2eb80d17..a7cecadc44a7 100644 --- a/.github/workflows/test-ci.yml +++ b/.github/workflows/test-ci.yml @@ -46,23 +46,7 @@ jobs: restore-keys: | turbo-${{ github.job }}-${{ matrix.node-version }}-${{ github.ref_name }}- - # DEBUG: inline install, no pnpm store cache, no --loglevel error. - # Remove once #3545 install hang is understood. 
- - name: Install pnpm - uses: pnpm/action-setup@v6.0.8 - with: - run_install: false - - - name: Install dependencies (debug) - run: | - set -x - node --version - pnpm --version - df -h / - free -h - pnpm install --frozen-lockfile --reporter=default - env: - HUSKY: "0" + - uses: apify/workflows/pnpm-install@main - name: Install Playwright browsers run: pnpm exec playwright install --with-deps From 8c1e9772f8a309bea97048ea6b336d2bfec12edf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Sun, 31 May 2026 11:40:32 +0200 Subject: [PATCH 36/45] Warn and drop top-level headless for connect/connectOverCDP paths too --- .../playwright-crawler/src/internals/playwright-crawler.ts | 4 ++-- packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/playwright-crawler/src/internals/playwright-crawler.ts b/packages/playwright-crawler/src/internals/playwright-crawler.ts index 2332935b6ef5..408b303821c3 100644 --- a/packages/playwright-crawler/src/internals/playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/playwright-crawler.ts @@ -221,10 +221,10 @@ export class PlaywrightCrawler< } if (headless != null) { - if (launchContext.remoteBrowser) { + if (launchContext.remoteBrowser || launchContext.connectOptions || launchContext.connectOverCDPOptions) { const log = serviceLocator.getLogger().child({ prefix: 'PlaywrightCrawler' }); log.warning( - "'headless' is ignored when using a remote browser. " + + "'headless' is ignored when connecting to a remote browser. 
" + 'The remote service controls headless mode.', ); } else { diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts index ac0c40b53856..ff780adf0eb0 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts @@ -198,10 +198,10 @@ export class PuppeteerCrawler< } if (headless != null) { - if (launchContext.remoteBrowser) { + if (launchContext.remoteBrowser || launchContext.connectOverCDPOptions) { const log = serviceLocator.getLogger().child({ prefix: 'PuppeteerCrawler' }); log.warning( - "'headless' is ignored when using a remote browser. " + + "'headless' is ignored when connecting to a remote browser. " + 'The remote service controls headless mode.', ); } else { From 78dd61e5582030233d82ff32aa5a516cc10203e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 2 Jun 2026 13:48:39 +0200 Subject: [PATCH 37/45] Add remote browser services guide and drop temp-examples scratchpad --- docs/guides/remote_browser.mdx | 68 +++++++++ docs/guides/remote_browser_config.ts | 19 +++ docs/guides/remote_browser_provider.ts | 45 ++++++ docs/guides/remote_browser_puppeteer.ts | 18 +++ temp-examples/.env.example | 9 -- temp-examples/.gitignore | 1 - .../examples/browserbase-playwright-ws.ts | 54 -------- .../examples/browserbase-playwright.ts | 70 ---------- .../examples/browserbase-puppeteer.ts | 67 --------- .../browserless-local-playwright-ws.ts | 48 ------- .../examples/browserless-local-playwright.ts | 42 ------ .../examples/browserless-local-puppeteer.ts | 43 ------ .../examples/browserless-overlap-test.ts | 80 ----------- .../examples/browserless-playwright-ws.ts | 47 ------- .../examples/browserless-playwright.ts | 69 ---------- .../examples/browserless-puppeteer.ts | 44 ------ ...okie-sharing-playwright-local-vs-remote.ts | 109 --------------- .../cookie-sharing-playwright-test.ts | 118 
---------------- .../cookie-sharing-session-across-browsers.ts | 82 ----------- ...e-sharing-session-across-local-browsers.ts | 66 --------- temp-examples/examples/cookie-sharing-test.ts | 118 ---------------- .../examples/rebrowser-playwright-ws.ts | 52 ------- .../examples/rebrowser-playwright.ts | 33 ----- temp-examples/examples/rebrowser-puppeteer.ts | 33 ----- temp-examples/examples/remote-proxy-test.ts | 84 ------------ .../examples/remote-proxy-via-chrome-args.ts | 129 ------------------ temp-examples/examples/steel-playwright.ts | 66 --------- temp-examples/examples/steel-puppeteer.ts | 44 ------ temp-examples/package.json | 49 ------- temp-examples/readme.md | 115 ---------------- temp-examples/tsconfig.json | 9 -- website/sidebars.js | 1 + 32 files changed, 151 insertions(+), 1681 deletions(-) create mode 100644 docs/guides/remote_browser.mdx create mode 100644 docs/guides/remote_browser_config.ts create mode 100644 docs/guides/remote_browser_provider.ts create mode 100644 docs/guides/remote_browser_puppeteer.ts delete mode 100644 temp-examples/.env.example delete mode 100644 temp-examples/.gitignore delete mode 100644 temp-examples/examples/browserbase-playwright-ws.ts delete mode 100644 temp-examples/examples/browserbase-playwright.ts delete mode 100644 temp-examples/examples/browserbase-puppeteer.ts delete mode 100644 temp-examples/examples/browserless-local-playwright-ws.ts delete mode 100644 temp-examples/examples/browserless-local-playwright.ts delete mode 100644 temp-examples/examples/browserless-local-puppeteer.ts delete mode 100644 temp-examples/examples/browserless-overlap-test.ts delete mode 100644 temp-examples/examples/browserless-playwright-ws.ts delete mode 100644 temp-examples/examples/browserless-playwright.ts delete mode 100644 temp-examples/examples/browserless-puppeteer.ts delete mode 100644 temp-examples/examples/cookie-sharing-playwright-local-vs-remote.ts delete mode 100644 temp-examples/examples/cookie-sharing-playwright-test.ts 
delete mode 100644 temp-examples/examples/cookie-sharing-session-across-browsers.ts delete mode 100644 temp-examples/examples/cookie-sharing-session-across-local-browsers.ts delete mode 100644 temp-examples/examples/cookie-sharing-test.ts delete mode 100644 temp-examples/examples/rebrowser-playwright-ws.ts delete mode 100644 temp-examples/examples/rebrowser-playwright.ts delete mode 100644 temp-examples/examples/rebrowser-puppeteer.ts delete mode 100644 temp-examples/examples/remote-proxy-test.ts delete mode 100644 temp-examples/examples/remote-proxy-via-chrome-args.ts delete mode 100644 temp-examples/examples/steel-playwright.ts delete mode 100644 temp-examples/examples/steel-puppeteer.ts delete mode 100644 temp-examples/package.json delete mode 100644 temp-examples/readme.md delete mode 100644 temp-examples/tsconfig.json diff --git a/docs/guides/remote_browser.mdx b/docs/guides/remote_browser.mdx new file mode 100644 index 000000000000..40a0d8b80c12 --- /dev/null +++ b/docs/guides/remote_browser.mdx @@ -0,0 +1,68 @@ +--- +id: remote-browser +title: "Remote browser services" +sidebar_label: "Remote browsers" +description: Connect Crawlee crawlers to remote browser services like Browserbase, Browserless, or Steel. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import RemoteBrowserConfigSource from '!!raw-loader!./remote_browser_config.ts'; +import RemoteBrowserProviderSource from '!!raw-loader!./remote_browser_provider.ts'; +import RemoteBrowserPuppeteerSource from '!!raw-loader!./remote_browser_puppeteer.ts'; + +Instead of launching a local browser, Crawlee can connect to a remote browser service like [Browserbase](https://browserbase.com/), [Browserless](https://browserless.io/), [Steel](https://steel.dev/), or any service that exposes a WebSocket/CDP endpoint. The crawler manages the browser pool, session rotation, and proxy logic the same way it does locally — only the browser itself runs elsewhere. 
+ +Use this when you need IPs in specific regions, want to offload CPU/memory from your runner, or need stealth features the service provides. + +## Inline config + +The simplest form passes a connection URL on `launchContext.remoteBrowser`. Use this when the service exposes a single endpoint and doesn't need per-session setup. + +{RemoteBrowserConfigSource} + +`endpoint` can also be a function returning `{ url, context }`, called once per browser launch. The optional `release({ endpoint, context })` callback runs when the browser closes, crashes, or the pool is destroyed — use it to clean up sessions on the service side. + +### Self-hosted + +Some services ship a Docker image you can run locally or on your own infrastructure. For example, [Browserless](https://www.browserless.io/) has an open-source Chromium image: + +```bash +docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium +``` + +Point the crawler at the local endpoint: + +```ts +remoteBrowser: { endpoint: 'ws://localhost:3000' } +``` + +## Custom provider + +For services with a session-create / session-release lifecycle, extend `RemoteBrowserProvider`. `connect()` runs once per browser launch and returns the connection URL plus an optional `context` object passed back to `release()`. + +{RemoteBrowserProviderSource} + +`maxOpenBrowsers` caps the number of concurrent browsers — set it to the service's concurrent-session limit to avoid 429 errors. + +## Puppeteer + +`PuppeteerCrawler` supports the same `remoteBrowser` option. For services that only expose a raw CDP endpoint without per-session setup, you can also use `connectOverCDPOptions` directly: + +{RemoteBrowserPuppeteerSource} + +For Playwright, the analogous low-level options are `connectOptions` (Playwright's own client-server protocol, used by `browserType.connect()`) and `connectOverCDPOptions` (CDP). `remoteBrowser`, `connectOptions`, and `connectOverCDPOptions` are mutually exclusive — set only one. 
+ +## Limitations + +- **`headless` is ignored.** The remote service controls headless mode; a `headless` value set on the crawler or in `launchOptions` is dropped with a warning. +- **`launchOptions` cannot be combined with a remote browser.** Setting both throws — browser flags, executable path, viewport, and similar must be configured on the service side. +- **`useIncognitoPages` is forced to `true`** for Playwright remote connections. Playwright's `connect()` / `connectOverCDP()` don't accept persistent contexts. +- **`userDataDir` has no effect** — there's no local profile when the browser runs remotely. +- **`maxOpenBrowsers` enforcement only gates new task starts.** Direct `BrowserPool.newPage` calls can exceed it; the limit is honored when the crawler drives the pool. + +## Further reading + +- `RemoteBrowserProvider` API reference +- `RemoteBrowserConfig` API reference diff --git a/docs/guides/remote_browser_config.ts b/docs/guides/remote_browser_config.ts new file mode 100644 index 000000000000..cab24954009a --- /dev/null +++ b/docs/guides/remote_browser_config.ts @@ -0,0 +1,19 @@ +import { PlaywrightCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN!; + +const crawler = new PlaywrightCrawler({ + launchContext: { + remoteBrowser: { + endpoint: `wss://production-sfo.browserless.io?token=${token}`, + // Optional — respect the service's concurrent session limit. 
+ maxOpenBrowsers: 5, + }, + }, + async requestHandler({ page, request, log }) { + const title = await page.title(); + log.info(`${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run(['https://crawlee.dev']); diff --git a/docs/guides/remote_browser_provider.ts b/docs/guides/remote_browser_provider.ts new file mode 100644 index 000000000000..6799909e217e --- /dev/null +++ b/docs/guides/remote_browser_provider.ts @@ -0,0 +1,45 @@ +import { RemoteBrowserProvider } from '@crawlee/browser-pool'; +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.BROWSERBASE_API_KEY!; +const projectId = process.env.BROWSERBASE_PROJECT_ID!; + +class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { + // Respect the service's concurrent session limit to avoid 429s. + maxOpenBrowsers = 5; + + async connect() { + const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ projectId }), + }); + + if (!response.ok) { + throw new Error(`Failed to create session: ${response.status} ${response.statusText}`); + } + + const session = (await response.json()) as { id: string; connectUrl: string }; + return { url: session.connectUrl, context: { id: session.id } }; + } + + async release({ id }: { id: string }) { + await fetch(`https://api.browserbase.com/v1/sessions/${id}`, { + method: 'POST', + headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, + body: JSON.stringify({ status: 'REQUEST_RELEASE' }), + }); + } +} + +const crawler = new PlaywrightCrawler({ + launchContext: { + remoteBrowser: new BrowserbaseProvider(), + }, + async requestHandler({ page, request, log }) { + const title = await page.title(); + log.info(`${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run(['https://crawlee.dev']); diff --git a/docs/guides/remote_browser_puppeteer.ts 
b/docs/guides/remote_browser_puppeteer.ts new file mode 100644 index 000000000000..61952843cb24 --- /dev/null +++ b/docs/guides/remote_browser_puppeteer.ts @@ -0,0 +1,18 @@ +import { PuppeteerCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN!; + +const crawler = new PuppeteerCrawler({ + launchContext: { + // Puppeteer connects to remote browsers via CDP. + connectOverCDPOptions: { + browserWSEndpoint: `wss://production-sfo.browserless.io?token=${token}`, + }, + }, + async requestHandler({ page, request, log }) { + const title = await page.title(); + log.info(`${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run(['https://crawlee.dev']); diff --git a/temp-examples/.env.example b/temp-examples/.env.example deleted file mode 100644 index 500f5da5f2ce..000000000000 --- a/temp-examples/.env.example +++ /dev/null @@ -1,9 +0,0 @@ -BROWSERBASE_API_KEY= -BROWSERBASE_PROJECT_ID= -# -BROWSERLESS_TOKEN= -# -REBROWSER_API_KEY= -REBROWSER_PROFILE_ID= -# -STEEL_API_KEY= diff --git a/temp-examples/.gitignore b/temp-examples/.gitignore deleted file mode 100644 index 4c49bd78f1d0..000000000000 --- a/temp-examples/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.env diff --git a/temp-examples/examples/browserbase-playwright-ws.ts b/temp-examples/examples/browserbase-playwright-ws.ts deleted file mode 100644 index f576a9ee9add..000000000000 --- a/temp-examples/examples/browserbase-playwright-ws.ts +++ /dev/null @@ -1,54 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -const apiKey = process.env.BROWSERBASE_API_KEY; -const projectId = process.env.BROWSERBASE_PROJECT_ID; - -if (!apiKey) throw new Error('BROWSERBASE_API_KEY env variable is required'); -if (!projectId) throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); - -class BrowserbaseWsProvider extends RemoteBrowserProvider<{ id: string }> { - override type = 'websocket' as const; - - async 
connect() { - const response = await fetch('https://api.browserbase.com/v1/sessions', { - method: 'POST', - headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, - body: JSON.stringify({ projectId }), - }); - - if (!response.ok) { - throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); - } - - const session = await response.json(); - console.log(`>>> Session created: ${session.id}`); - - const url = `wss://connect.browserbase.com?apiKey=${apiKey}&sessionId=${session.id}`; - return { url, context: { id: session.id } }; - } - - async release({ id }: { id: string }) { - await fetch(`https://api.browserbase.com/v1/sessions/${id}`, { - method: 'POST', - headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, - body: JSON.stringify({ status: 'REQUEST_RELEASE' }), - }).catch(() => {}); - console.log(`<<< Session released: ${id}`); - } -} - -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new BrowserbaseWsProvider(), - }, - maxRequestsPerCrawl: 1, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/browserbase-playwright.ts b/temp-examples/examples/browserbase-playwright.ts deleted file mode 100644 index 6a6c48b0e77f..000000000000 --- a/temp-examples/examples/browserbase-playwright.ts +++ /dev/null @@ -1,70 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -const apiKey = process.env.BROWSERBASE_API_KEY; -const projectId = process.env.BROWSERBASE_PROJECT_ID; - -if (!apiKey) throw new Error('BROWSERBASE_API_KEY env variable is required'); -if (!projectId) throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); - -class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string 
}> { - maxOpenBrowsers = 1; - - async connect() { - const response = await fetch('https://api.browserbase.com/v1/sessions', { - method: 'POST', - headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, - body: JSON.stringify({ projectId }), - }); - - if (!response.ok) { - throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); - } - - const session = await response.json(); - console.log(`>>> Session created: ${session.id}`); - - return { url: session.connectUrl, context: { id: session.id } }; - } - - async release({ id }: { id: string }) { - await fetch(`https://api.browserbase.com/v1/sessions/${id}`, { - method: 'POST', - headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, - body: JSON.stringify({ status: 'REQUEST_RELEASE' }), - }).catch(() => {}); - console.log(`<<< Session released: ${id}`); - } -} - -const crawler = new PlaywrightCrawler({ - - launchContext: { - remoteBrowser: new BrowserbaseProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 3, - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 2, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git a/temp-examples/examples/browserbase-puppeteer.ts b/temp-examples/examples/browserbase-puppeteer.ts deleted file mode 100644 index 14fff4c90034..000000000000 --- a/temp-examples/examples/browserbase-puppeteer.ts +++ /dev/null @@ -1,67 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PuppeteerCrawler } from 
'crawlee'; - -const apiKey = process.env.BROWSERBASE_API_KEY; -const projectId = process.env.BROWSERBASE_PROJECT_ID; - -if (!apiKey) throw new Error('BROWSERBASE_API_KEY env variable is required'); -if (!projectId) throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); - -class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { - async connect() { - const response = await fetch('https://api.browserbase.com/v1/sessions', { - method: 'POST', - headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, - body: JSON.stringify({ projectId }), - }); - - if (!response.ok) { - throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); - } - - const session = await response.json(); - console.log(`>>> Session created: ${session.id}`); - - return { url: session.connectUrl, context: { id: session.id } }; - } - - async release({ id }: { id: string }) { - await fetch(`https://api.browserbase.com/v1/sessions/${id}`, { - method: 'POST', - headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, - body: JSON.stringify({ status: 'REQUEST_RELEASE' }), - }).catch(() => {}); - console.log(`<<< Session released: ${id}`); - } -} - -const crawler = new PuppeteerCrawler({ - launchContext: { - remoteBrowser: new BrowserbaseProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 3, - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 2, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git 
a/temp-examples/examples/browserless-local-playwright-ws.ts b/temp-examples/examples/browserless-local-playwright-ws.ts deleted file mode 100644 index 5ec8933abbd3..000000000000 --- a/temp-examples/examples/browserless-local-playwright-ws.ts +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Browserless local — Playwright WebSocket protocol - * Docker: docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium - * - * Uses browserType.connect() (Playwright native WS), not connectOverCDP(). - * Browserless supports both protocols — the /chromium/playwright endpoint - * speaks the Playwright WebSocket protocol. - */ -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -class BrowserlessLocalWsProvider extends RemoteBrowserProvider { - override type = 'websocket' as const; - maxOpenBrowsers = 4; // match CONCURRENT=4 in docker - - async connect() { - return { url: 'ws://localhost:3000/chromium/playwright' }; - } -} - -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new BrowserlessLocalWsProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 5, - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 4, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git a/temp-examples/examples/browserless-local-playwright.ts b/temp-examples/examples/browserless-local-playwright.ts deleted file mode 100644 index 806938c6b98b..000000000000 --- a/temp-examples/examples/browserless-local-playwright.ts +++ /dev/null @@ -1,42 +0,0 @@ -/** - * 
Browserless local — Playwright CDP - * Docker: docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium - */ -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -class BrowserlessLocalProvider extends RemoteBrowserProvider { - maxOpenBrowsers = 4; // match CONCURRENT=4 in docker - - async connect() { - return { url: 'ws://localhost:3000' }; - } -} - -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new BrowserlessLocalProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 5, - }, - maxConcurrency: 4, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git a/temp-examples/examples/browserless-local-puppeteer.ts b/temp-examples/examples/browserless-local-puppeteer.ts deleted file mode 100644 index 5eb8b751bd73..000000000000 --- a/temp-examples/examples/browserless-local-puppeteer.ts +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Browserless local — Puppeteer CDP - * Docker: docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium - */ -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PuppeteerCrawler } from 'crawlee'; - -class BrowserlessLocalProvider extends RemoteBrowserProvider { - maxOpenBrowsers = 4; // match CONCURRENT=4 in docker - - async connect() { - return { url: 'ws://localhost:3000' }; - } -} - -const crawler = new PuppeteerCrawler({ - launchContext: { - remoteBrowser: new BrowserlessLocalProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 5, - 
maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 4, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git a/temp-examples/examples/browserless-overlap-test.ts b/temp-examples/examples/browserless-overlap-test.ts deleted file mode 100644 index 5219ed81cc5e..000000000000 --- a/temp-examples/examples/browserless-overlap-test.ts +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Test to reproduce the browser overlap problem during retirement. - * - * Local: docker run -p 3000:3000 -e CONCURRENT=2 ghcr.io/browserless/chromium - * Cloud: set BROWSERLESS_TOKEN in .env (free tier has 2 concurrent limit) - * - * Run: node --experimental-strip-types examples/browserless-overlap-test.ts - * - * With maxConcurrency:2 and a service limit of 2, you'd expect at most 2 browsers. - * But during retirement transitions, the pool briefly opens a 3rd connection → 429. - * - * The overlap: - * 1. Browser A hits retireBrowserAfterPageCount while its last page is still running - * 2. A moves to retiredBrowserControllers (still connected, page not yet closed) - * 3. Next page request → A is retired, no active browser → pool launches Browser C - * 4. 
A hasn't closed yet (1s timeout) → A + B + C = 3 concurrent connections → 429 - */ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -let activeConnections = 0; -let peakConnections = 0; - -class BrowserlessProvider extends RemoteBrowserProvider { - // Cap to match the service limit — prevents overlap during retirement - maxOpenBrowsers = 2; - - async connect() { - activeConnections++; - peakConnections = Math.max(peakConnections, activeConnections); - console.log(`>>> CONNECT active=${activeConnections} (peak=${peakConnections})`); - const token = process.env.BROWSERLESS_TOKEN; - const url = token ? `wss://production-sfo.browserless.io?token=${token}` : 'ws://localhost:3000'; - return { url }; - } - - async release() { - activeConnections--; - console.log(`<<< RELEASE active=${activeConnections}`); - } -} - -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new BrowserlessProvider(), - }, - browserPoolOptions: { - // Retire after just 2 pages — forces frequent retirement - retireBrowserAfterPageCount: 2, - maxOpenPagesPerBrowser: 1, - }, - // 2 concurrent pages = 2 browsers needed, matching the docker CONCURRENT=2 - maxConcurrency: 2, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); - -console.log(`\nPeak concurrent connections: ${peakConnections}`); -console.log(`Expected max: 2 (matching maxConcurrency)`); -if (peakConnections > 2) { - console.log(`⚠ OVERLAP DETECTED: ${peakConnections} browsers were open 
simultaneously`); -} diff --git a/temp-examples/examples/browserless-playwright-ws.ts b/temp-examples/examples/browserless-playwright-ws.ts deleted file mode 100644 index 0f6d027388f3..000000000000 --- a/temp-examples/examples/browserless-playwright-ws.ts +++ /dev/null @@ -1,47 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -// Set BROWSERLESS_TOKEN in .env -// For local Docker, see browserless-local-playwright-ws.ts -const token = process.env.BROWSERLESS_TOKEN; -if (!token) throw new Error('BROWSERLESS_TOKEN env variable is required'); -const endpointUrl = `wss://production-sfo.browserless.io/chromium/playwright?token=${token}`; - -class BrowserlessWsProvider extends RemoteBrowserProvider { - override type = 'websocket' as const; - - async connect() { - return { url: endpointUrl }; - } -} - -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new BrowserlessWsProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 5, - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 4, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git a/temp-examples/examples/browserless-playwright.ts b/temp-examples/examples/browserless-playwright.ts deleted file mode 100644 index 0d0869372323..000000000000 --- a/temp-examples/examples/browserless-playwright.ts +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Browserless remote — Playwright CDP with API-managed sessions - * Set BROWSERLESS_TOKEN in .env - * For 
local Docker, see browserless-local-playwright.ts - */ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -const token = process.env.BROWSERLESS_TOKEN; -if (!token) throw new Error('BROWSERLESS_TOKEN env variable is required'); - -const baseUrl = 'https://production-sfo.browserless.io'; - -class BrowserlessProvider extends RemoteBrowserProvider<{ stopUrl: string }> { - async connect() { - const response = await fetch(`${baseUrl}/session?token=${token}`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ ttl: 60_000 }), - }); - - if (!response.ok) { - throw new Error(`Failed to create session: ${response.status} ${response.statusText}`); - } - - const session = await response.json(); - console.log(`>>> Session created: ${session.id}`); - - return { - url: session.connect, - context: { stopUrl: session.stop }, - }; - } - - async release({ stopUrl }: { stopUrl: string }) { - await fetch(`${stopUrl}&force=true`, { method: 'DELETE' }).catch(() => {}); - console.log(`<<< Session released`); - } -} - -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new BrowserlessProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 5, - }, - maxConcurrency: 4, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git a/temp-examples/examples/browserless-puppeteer.ts b/temp-examples/examples/browserless-puppeteer.ts deleted file mode 100644 index 
36b973807401..000000000000 --- a/temp-examples/examples/browserless-puppeteer.ts +++ /dev/null @@ -1,44 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PuppeteerCrawler } from 'crawlee'; - -// Set BROWSERLESS_TOKEN in .env -// For local Docker, see browserless-local-puppeteer.ts -const token = process.env.BROWSERLESS_TOKEN; -if (!token) throw new Error('BROWSERLESS_TOKEN env variable is required'); -const endpointUrl = `wss://production-sfo.browserless.io?token=${token}`; - -class BrowserlessProvider extends RemoteBrowserProvider { - async connect() { - return { url: endpointUrl }; - } -} - -const crawler = new PuppeteerCrawler({ - launchContext: { - remoteBrowser: new BrowserlessProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 5, - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 4, - maxRequestsPerCrawl: 9, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', -]); diff --git a/temp-examples/examples/cookie-sharing-playwright-local-vs-remote.ts b/temp-examples/examples/cookie-sharing-playwright-local-vs-remote.ts deleted file mode 100644 index b064bf41fbce..000000000000 --- a/temp-examples/examples/cookie-sharing-playwright-local-vs-remote.ts +++ /dev/null @@ -1,109 +0,0 @@ -/** - * Cookie sharing: Playwright local vs remote - * - * Compares whether cookies set on page A are visible on page B within the - * same browser for local (launchPersistentContext) vs remote (connect/CDP). 
- * - * Run local Browserless first: - * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium - * - * Then: - * npm run example:cookie-sharing-playwright-local-vs-remote - */ -import { PlaywrightPlugin, BrowserPool } from '@crawlee/browser-pool'; -import playwright from 'playwright'; - -const BROWSERLESS_CDP = 'ws://localhost:3000'; - -// --------------------------------------------------------------------------- -// Helper -// --------------------------------------------------------------------------- -async function testCookieSharing(label: string, plugin: PlaywrightPlugin) { - console.log(`\n--- ${label} ---`); - - const pool = new BrowserPool({ - browserPlugins: [plugin], - maxOpenPagesPerBrowser: 2, - }); - - try { - // Page A — set a cookie - const pageA = await pool.newPage(); - await pageA.goto('https://example.com', { waitUntil: 'domcontentloaded' }); - - const controllerA = pool.getBrowserControllerByPage(pageA)!; - await controllerA.setCookies(pageA, [ - { name: 'SHARED_TEST', value: 'from-page-a', domain: '.example.com', path: '/' }, - ]); - - const cookiesA = await controllerA.getCookies(pageA); - console.log(`Page A cookies: ${JSON.stringify(cookiesA.map((c) => ({ name: c.name, value: c.value })))}`); - - // Page B — same browser, check if cookie is visible - const pageB = await pool.newPage(); - await pageB.goto('https://example.com', { waitUntil: 'domcontentloaded' }); - - const controllerB = pool.getBrowserControllerByPage(pageB)!; - console.log(`Same browser controller: ${controllerA === controllerB}`); - - const cookiesB = await controllerB.getCookies(pageB); - console.log(`Page B cookies: ${JSON.stringify(cookiesB.map((c) => ({ name: c.name, value: c.value })))}`); - - const found = cookiesB.find((c) => c.name === 'SHARED_TEST'); - if (found) { - console.log(`✅ PASS — Cookie shared between pages (value: "${found.value}")`); - } else { - console.log(`❌ FAIL — Cookie NOT visible on page B`); - } - - await pageA.close(); - await 
pageB.close(); - } finally { - await pool.destroy(); - } -} - -// --------------------------------------------------------------------------- -// 1. Local — useIncognitoPages: false (launchPersistentContext → shared context) -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Local Playwright — useIncognitoPages: false (persistent context)', - new PlaywrightPlugin(playwright.chromium, { useIncognitoPages: false }), -); - -// --------------------------------------------------------------------------- -// 2. Local — useIncognitoPages: true (new context per page) -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Local Playwright — useIncognitoPages: true', - new PlaywrightPlugin(playwright.chromium, { useIncognitoPages: true }), -); - -// --------------------------------------------------------------------------- -// 3. Remote CDP — useIncognitoPages: false (browser.newPage() = new context anyway) -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Remote CDP Playwright — useIncognitoPages: false', - new PlaywrightPlugin(playwright.chromium, { - useIncognitoPages: false, - connectOverCDPOptions: { endpointURL: BROWSERLESS_CDP }, - }), -); - -// --------------------------------------------------------------------------- -// 4. 
Remote CDP — useIncognitoPages: true -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Remote CDP Playwright — useIncognitoPages: true', - new PlaywrightPlugin(playwright.chromium, { - useIncognitoPages: true, - connectOverCDPOptions: { endpointURL: BROWSERLESS_CDP }, - }), -); - -console.log('\n--- Summary ---'); -console.log('Local incognito:false → shared (launchPersistentContext)'); -console.log('Local incognito:true → isolated'); -console.log('Remote incognito:false → shared (wrapped default context from CDP)'); -console.log('Remote incognito:true → isolated'); -console.log('\nDone.'); diff --git a/temp-examples/examples/cookie-sharing-playwright-test.ts b/temp-examples/examples/cookie-sharing-playwright-test.ts deleted file mode 100644 index 36f6d1f0c8c8..000000000000 --- a/temp-examples/examples/cookie-sharing-playwright-test.ts +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Cookie sharing test — Playwright: CDP vs WebSocket, incognito true vs false - * - * Tests whether cookies set on page A are visible on page B within the same - * browser, across all four combinations: - * 1. Playwright CDP + useIncognitoPages: false → should share - * 2. Playwright CDP + useIncognitoPages: true → should NOT share - * 3. Playwright WebSocket + useIncognitoPages: false → ??? (connect() has no default context) - * 4. 
Playwright WebSocket + useIncognitoPages: true → should NOT share - * - * Run local Browserless first: - * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium - * - * Then: - * npm run example:cookie-sharing-playwright-test - */ -import { PlaywrightPlugin, BrowserPool, RemoteBrowserProvider } from '@crawlee/browser-pool'; -import playwright from 'playwright'; - -const BROWSERLESS_CDP = 'ws://localhost:3000'; -const BROWSERLESS_WS = 'ws://localhost:3000/chromium/playwright'; - -// --------------------------------------------------------------------------- -// Helper -// --------------------------------------------------------------------------- -async function testCookieSharing(label: string, plugin: PlaywrightPlugin) { - console.log(`\n--- ${label} ---`); - - const pool = new BrowserPool({ - browserPlugins: [plugin], - maxOpenPagesPerBrowser: 2, - }); - - try { - // Page A — set a cookie - const pageA = await pool.newPage(); - await pageA.goto('https://example.com', { waitUntil: 'domcontentloaded' }); - - const controllerA = pool.getBrowserControllerByPage(pageA)!; - await controllerA.setCookies(pageA, [ - { name: 'SHARED_TEST', value: 'from-page-a', domain: '.example.com', path: '/' }, - ]); - - const cookiesA = await controllerA.getCookies(pageA); - console.log(`Page A cookies: ${JSON.stringify(cookiesA.map((c) => ({ name: c.name, value: c.value })))}`); - - // Page B — same browser, check if cookie is visible - const pageB = await pool.newPage(); - await pageB.goto('https://example.com', { waitUntil: 'domcontentloaded' }); - - const controllerB = pool.getBrowserControllerByPage(pageB)!; - const sameBrowser = controllerA === controllerB; - console.log(`Same browser controller: ${sameBrowser}`); - - const cookiesB = await controllerB.getCookies(pageB); - console.log(`Page B cookies: ${JSON.stringify(cookiesB.map((c) => ({ name: c.name, value: c.value })))}`); - - const found = cookiesB.find((c) => c.name === 'SHARED_TEST'); - if (found) { - 
console.log(`✅ PASS — Cookie shared between pages (value: "${found.value}")`); - } else { - console.log(`❌ FAIL — Cookie NOT visible on page B`); - } - - await pageA.close(); - await pageB.close(); - } finally { - await pool.destroy(); - } -} - -// --------------------------------------------------------------------------- -// 1. Playwright CDP — useIncognitoPages: false (should share) -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Playwright CDP — useIncognitoPages: false', - new PlaywrightPlugin(playwright.chromium, { - useIncognitoPages: false, - connectOverCDPOptions: { endpointURL: BROWSERLESS_CDP }, - }), -); - -// --------------------------------------------------------------------------- -// 2. Playwright CDP — useIncognitoPages: true (should NOT share) -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Playwright CDP — useIncognitoPages: true', - new PlaywrightPlugin(playwright.chromium, { - useIncognitoPages: true, - connectOverCDPOptions: { endpointURL: BROWSERLESS_CDP }, - }), -); - -// --------------------------------------------------------------------------- -// 3. Playwright WebSocket — useIncognitoPages: false (the question mark) -// connect() returns a browser with no default context — does newPage() -// create an implicit shared context, or a new one each time? -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Playwright WebSocket — useIncognitoPages: false', - new PlaywrightPlugin(playwright.chromium, { - useIncognitoPages: false, - connectOptions: { wsEndpoint: BROWSERLESS_WS }, - }), -); - -// --------------------------------------------------------------------------- -// 4. 
Playwright WebSocket — useIncognitoPages: true (should NOT share) -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Playwright WebSocket — useIncognitoPages: true', - new PlaywrightPlugin(playwright.chromium, { - useIncognitoPages: true, - connectOptions: { wsEndpoint: BROWSERLESS_WS }, - }), -); - -console.log('\nDone.'); diff --git a/temp-examples/examples/cookie-sharing-session-across-browsers.ts b/temp-examples/examples/cookie-sharing-session-across-browsers.ts deleted file mode 100644 index 2f22e1a0d4a2..000000000000 --- a/temp-examples/examples/cookie-sharing-session-across-browsers.ts +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Session-based cookie sharing across remote browsers (Puppeteer) - * - * Demonstrates that the Session object transfers cookies between sequential - * requests even when they land on different browser instances. - * - * Setup: - * - retireBrowserAfterPageCount: 1 → forces a new browser per request - * - Single session pool → same session reused for all requests - * - saveResponseCookies: true (default) - * - * Run local Browserless first: - * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium - * - * Then: - * npm run example:cookie-sharing-session-across-browsers - */ -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PuppeteerCrawler, SessionPool } from 'crawlee'; - -// --------------------------------------------------------------------------- -// Remote browser provider -// --------------------------------------------------------------------------- -class BrowserlessProvider extends RemoteBrowserProvider { - maxOpenBrowsers = 4; - async connect() { - return { url: 'ws://localhost:3000' }; - } -} - -// Single session so both requests share cookies -const sessionPool = new SessionPool({ maxPoolSize: 1 }); - -// --------------------------------------------------------------------------- -// Crawler — forces new browser per request to prove 
cross-browser sharing -// --------------------------------------------------------------------------- -const crawler = new PuppeteerCrawler({ - launchContext: { - remoteBrowser: new BrowserlessProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 1, // force new browser for each request - maxOpenPagesPerBrowser: 1, - }, - sessionPool, - maxConcurrency: 1, // sequential — so request 1 finishes before request 2 - async requestHandler({ page, request, session, browserController }) { - const controllerId = browserController.id; - - // Set a cookie manually on the first request and save it to the session - if (request.url.includes('/login')) { - await page.setCookie({ - name: 'AUTH_TOKEN', - value: 'secret-jwt-123', - domain: 'books.toscrape.com', - path: '/', - }); - // Save page cookies to the session (normally saveResponseCookies does this - // during navigation, but our cookie was set after navigation) - const cookies = await browserController.getCookies(page); - session?.setCookies(cookies, request.loadedUrl!); - } - - const pageCookies = await page.cookies(); - const sessionCookies = session?.getCookies(request.loadedUrl!) ?? 
[]; - - console.log(`\n[${new URL(request.url).pathname}]`); - console.log(` Browser controller: ${controllerId}`); - console.log(` Session ID: ${session?.id}`); - console.log(` Page cookies: ${JSON.stringify(pageCookies.map((c) => ({ name: c.name, value: c.value })))}`); - console.log(` Session cookies: ${JSON.stringify(sessionCookies.map((c) => ({ name: c.name, value: c.value })))}`); - }, -}); - -await crawler.run([ - 'https://books.toscrape.com/login', // Request 1: browser A — sets AUTH_TOKEN cookie - 'https://books.toscrape.com/', // Request 2: browser B — should have cookie via Session -]); - -console.log('\nDone.'); -console.log('If request 2 shows AUTH_TOKEN in session cookies → session transferred cookies across browsers.'); -console.log('Check that Browser controller IDs are different → proves different browsers.'); diff --git a/temp-examples/examples/cookie-sharing-session-across-local-browsers.ts b/temp-examples/examples/cookie-sharing-session-across-local-browsers.ts deleted file mode 100644 index 0222cd9fa357..000000000000 --- a/temp-examples/examples/cookie-sharing-session-across-local-browsers.ts +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Session-based cookie sharing across local browsers (Puppeteer) - * - * Demonstrates that the Session object transfers cookies between sequential - * requests even when they land on different local browser instances. - * No remote service needed. 
- * - * Setup: - * - retireBrowserAfterPageCount: 1 → forces a new browser per request - * - Single session pool → same session reused for all requests - * - saveResponseCookies: true (default) - * - * Run: - * npm run example:cookie-sharing-session-across-local-browsers - */ -import { PuppeteerCrawler, SessionPool } from 'crawlee'; - -// Single session so both requests share cookies -const sessionPool = new SessionPool({ maxPoolSize: 1 }); - -// --------------------------------------------------------------------------- -// Crawler — forces new browser per request to prove cross-browser sharing -// --------------------------------------------------------------------------- -const crawler = new PuppeteerCrawler({ - browserPoolOptions: { - retireBrowserAfterPageCount: 1, // force new browser for each request - maxOpenPagesPerBrowser: 1, - }, - sessionPool, - maxConcurrency: 1, // sequential — so request 1 finishes before request 2 - async requestHandler({ page, request, session, browserController }) { - const controllerId = browserController.id; - - // Set a cookie manually on the first request and save it to the session - if (request.url.includes('/login')) { - await page.setCookie({ - name: 'AUTH_TOKEN', - value: 'secret-jwt-123', - domain: 'books.toscrape.com', - path: '/', - }); - // Save page cookies to the session (normally saveResponseCookies does this - // during navigation, but our cookie was set after navigation) - const cookies = await browserController.getCookies(page); - session?.setCookies(cookies, request.loadedUrl!); - } - - const pageCookies = await page.cookies(); - const sessionCookies = session?.getCookies(request.loadedUrl!) ?? 
[]; - - console.log(`\n[${new URL(request.url).pathname}]`); - console.log(` Browser controller: ${controllerId}`); - console.log(` Session ID: ${session?.id}`); - console.log(` Page cookies: ${JSON.stringify(pageCookies.map((c) => ({ name: c.name, value: c.value })))}`); - console.log(` Session cookies: ${JSON.stringify(sessionCookies.map((c) => ({ name: c.name, value: c.value })))}`); - }, -}); - -await crawler.run([ - 'https://books.toscrape.com/login', // Request 1: browser A — sets AUTH_TOKEN cookie - 'https://books.toscrape.com/', // Request 2: browser B — should have cookie via Session -]); - -console.log('\nDone.'); -console.log('If request 2 shows AUTH_TOKEN in session cookies → session transferred cookies across browsers.'); -console.log('Check that Browser controller IDs are different → proves different browsers.'); diff --git a/temp-examples/examples/cookie-sharing-test.ts b/temp-examples/examples/cookie-sharing-test.ts deleted file mode 100644 index d9e42991bbd3..000000000000 --- a/temp-examples/examples/cookie-sharing-test.ts +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Cookie sharing test — useIncognitoPages: false with remote CDP (Puppeteer) - * - * Tests that cookies set on one page are visible on another page within the - * same browser session, comparing local vs remote behavior. - * - * Run local Browserless first: - * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium - * - * Then: - * npm run example:cookie-sharing-test - */ -import { PuppeteerPlugin, BrowserPool, RemoteBrowserProvider } from '@crawlee/browser-pool'; -import puppeteer from 'puppeteer'; - -const BROWSERLESS_URL = 'ws://localhost:3000'; - -// --------------------------------------------------------------------------- -// Helper: open two pages in the same browser, set cookie on page A, -// check if page B can see it without explicit transfer. 
-// --------------------------------------------------------------------------- -async function testCookieSharing(label: string, plugin: PuppeteerPlugin) { - console.log(`\n--- ${label} ---`); - - const pool = new BrowserPool({ - browserPlugins: [plugin], - maxOpenPagesPerBrowser: 2, - }); - - try { - // Page A — set a cookie - const pageA = await pool.newPage(); - await pageA.goto('https://example.com', { waitUntil: 'domcontentloaded' }); - - const controllerA = pool.getBrowserControllerByPage(pageA)!; - await controllerA.setCookies(pageA, [ - { name: 'SHARED_TEST', value: 'from-page-a', domain: '.example.com' }, - ]); - - const cookiesA = await controllerA.getCookies(pageA); - console.log(`Page A cookies: ${JSON.stringify(cookiesA.map((c) => ({ name: c.name, value: c.value })))}`); - - // Page B — same browser, check if cookie is visible - const pageB = await pool.newPage(); - await pageB.goto('https://example.com', { waitUntil: 'domcontentloaded' }); - - const controllerB = pool.getBrowserControllerByPage(pageB)!; - - // Verify both pages are in the same browser - const sameBrowser = controllerA === controllerB; - console.log(`Same browser controller: ${sameBrowser}`); - - const cookiesB = await controllerB.getCookies(pageB); - console.log(`Page B cookies: ${JSON.stringify(cookiesB.map((c) => ({ name: c.name, value: c.value })))}`); - - const found = cookiesB.find((c) => c.name === 'SHARED_TEST'); - if (found) { - console.log(`✅ PASS — Cookie shared between pages (value: "${found.value}")`); - } else { - console.log(`❌ FAIL — Cookie NOT visible on page B`); - } - - await pageA.close(); - await pageB.close(); - } finally { - await pool.destroy(); - } -} - -// --------------------------------------------------------------------------- -// Test 1: Local browser, useIncognitoPages: false (baseline) -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Local Puppeteer — useIncognitoPages: false', - new 
PuppeteerPlugin(puppeteer, { useIncognitoPages: false }), -); - -// --------------------------------------------------------------------------- -// Test 2: Remote CDP (Browserless), useIncognitoPages: false -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Remote CDP (Browserless) — useIncognitoPages: false', - new PuppeteerPlugin(puppeteer, { - useIncognitoPages: false, - connectOverCDPOptions: { browserWSEndpoint: BROWSERLESS_URL }, - }), -); - -// --------------------------------------------------------------------------- -// Test 3: Remote CDP (Browserless), useIncognitoPages: true (should NOT share) -// --------------------------------------------------------------------------- -await testCookieSharing( - 'Remote CDP (Browserless) — useIncognitoPages: true', - new PuppeteerPlugin(puppeteer, { - useIncognitoPages: true, - connectOverCDPOptions: { browserWSEndpoint: BROWSERLESS_URL }, - }), -); - -// --------------------------------------------------------------------------- -// Test 4: Remote via RemoteBrowserProvider, useIncognitoPages: false -// --------------------------------------------------------------------------- -class BrowserlessProvider extends RemoteBrowserProvider { - maxOpenBrowsers = 2; - async connect() { - return { url: BROWSERLESS_URL }; - } -} - -await testCookieSharing( - 'RemoteBrowserProvider (Browserless) — useIncognitoPages: false', - new PuppeteerPlugin(puppeteer, { - useIncognitoPages: false, - remoteBrowser: new BrowserlessProvider(), - }), -); - -console.log('\nDone.'); diff --git a/temp-examples/examples/rebrowser-playwright-ws.ts b/temp-examples/examples/rebrowser-playwright-ws.ts deleted file mode 100644 index 7c783150e45d..000000000000 --- a/temp-examples/examples/rebrowser-playwright-ws.ts +++ /dev/null @@ -1,52 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -const apiKey 
= process.env.REBROWSER_API_KEY; -const profileId = process.env.REBROWSER_PROFILE_ID; - -if (!apiKey) throw new Error('REBROWSER_API_KEY env variable is required'); - -// Rebrowser WS connection: starts a dedicated run via REST API, -// which gives you a WebSocket endpoint for Playwright's native protocol. -class RebrowserWsProvider extends RemoteBrowserProvider { - override type = 'websocket' as const; - - async connect() { - const url = new URL(`https://rebrowser.net/api/startRun?apikey=${apiKey}`); - - if (profileId) { - url.searchParams.set('profileId', profileId); - console.log(`Using Rebrowser profile: ${profileId}`); - } - - const response = await fetch(url.toString()); - - if (!response.ok) { - throw new Error(`Failed to start Rebrowser run: ${response.status} ${response.statusText}`); - } - - const run = await response.json(); - console.log(`Started Rebrowser run with wsEndpoint: ${run.wsEndpoint}`); - - return { url: run.wsEndpoint }; - } -} - -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new RebrowserWsProvider(), - }, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); - }, - maxRequestsPerCrawl: 1, -}); - -await crawler.run(['https://example.com']); - -// Note: Rebrowser recommends calling finishRun after you're done to avoid idle billing. -// With Crawlee, the browser disconnects automatically after the crawl finishes, -// which should end the run. For explicit control, use the REST API finishRun endpoint. 
diff --git a/temp-examples/examples/rebrowser-playwright.ts b/temp-examples/examples/rebrowser-playwright.ts deleted file mode 100644 index 45a8804dae57..000000000000 --- a/temp-examples/examples/rebrowser-playwright.ts +++ /dev/null @@ -1,33 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -const apiKey = process.env.REBROWSER_API_KEY; -if (!apiKey) throw new Error('REBROWSER_API_KEY env variable is required'); - -// Rebrowser simple connection: no profile or run creation needed. -// A random profile is auto-selected when you connect with just an API key. -// Proxies are managed via the Rebrowser dashboard or WS URL params. -class RebrowserProvider extends RemoteBrowserProvider { - async connect() { - return { url: `wss://api.rebrowser.net?apikey=${apiKey}` }; - } -} - -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new RebrowserProvider(), - }, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); - }, - maxRequestsPerCrawl: 1, -}); - -await crawler.run(['https://example.com']); - -// Note: Rebrowser recommends calling finishRun after you're done to avoid idle billing. -// With Crawlee, the browser disconnects automatically after the crawl finishes, -// which should end the run. For explicit control, use the REST API finishRun endpoint. 
diff --git a/temp-examples/examples/rebrowser-puppeteer.ts b/temp-examples/examples/rebrowser-puppeteer.ts deleted file mode 100644 index 51691a8f85f4..000000000000 --- a/temp-examples/examples/rebrowser-puppeteer.ts +++ /dev/null @@ -1,33 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PuppeteerCrawler } from 'crawlee'; - -const apiKey = process.env.REBROWSER_API_KEY; -if (!apiKey) throw new Error('REBROWSER_API_KEY env variable is required'); - -// Rebrowser simple connection: no profile or run creation needed. -// A random profile is auto-selected when you connect with just an API key. -// Proxies are managed via the Rebrowser dashboard or WS URL params. -class RebrowserProvider extends RemoteBrowserProvider { - async connect() { - return { url: `wss://api.rebrowser.net?apikey=${apiKey}` }; - } -} - -const crawler = new PuppeteerCrawler({ - launchContext: { - remoteBrowser: new RebrowserProvider(), - }, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[${request.loadedUrl}] ${title}`); - }, - maxRequestsPerCrawl: 1, -}); - -await crawler.run(['https://example.com']); - -// Note: Rebrowser recommends calling finishRun after you're done to avoid idle billing. -// With Crawlee, the browser disconnects automatically after the crawl finishes, -// which should end the run. For explicit control, use the REST API finishRun endpoint. diff --git a/temp-examples/examples/remote-proxy-test.ts b/temp-examples/examples/remote-proxy-test.ts deleted file mode 100644 index cf72058c9be6..000000000000 --- a/temp-examples/examples/remote-proxy-test.ts +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Remote browser with custom proxy — demonstrates proxyUrl forwarding - * - * Shows how proxyUrl from Crawlee's ProxyConfiguration is forwarded to - * the RemoteBrowserProvider.connect() method, letting the provider pass - * it to the remote service's proxy API. 
- * - * Run local Browserless first: - * docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium - * - * Then: - * npm run example:remote-proxy-test - * - * Note: externalProxyServer is a paid Browserless feature. On the free/local - * Docker image the proxy is accepted but may not route traffic. The example - * proves the forwarding plumbing works regardless. - */ -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PuppeteerCrawler, ProxyConfiguration } from 'crawlee'; - -// --------------------------------------------------------------------------- -// Provider that forwards proxyUrl to Browserless via externalProxyServer param -// --------------------------------------------------------------------------- -class BrowserlessWithProxyProvider extends RemoteBrowserProvider { - maxOpenBrowsers = 4; - - async connect({ proxyUrl } = {} as { proxyUrl?: string }) { - let url = 'ws://localhost:3000'; - - if (proxyUrl) { - // Browserless accepts custom proxy via externalProxyServer query param - // For other services, forward differently: - // Browserbase: proxies: [{ type: 'external', server: proxyUrl }] - // Steel: not supported (built-in only) - // Rebrowser: set on profile - url += `?externalProxyServer=${encodeURIComponent(proxyUrl)}`; - console.log(` [Provider] Forwarding proxy to Browserless: ${proxyUrl}`); - } else { - console.log(' [Provider] No proxy provided'); - } - - return { url }; - } -} - -// --------------------------------------------------------------------------- -// Proxy configuration — Crawlee rotates these per browser session -// --------------------------------------------------------------------------- -const proxyConfiguration = new ProxyConfiguration({ - proxyUrls: [ - 'http://34.135.166.24:80', - 'http://8.219.97.248:80', - ], -}); - -// --------------------------------------------------------------------------- -// Crawler -// --------------------------------------------------------------------------- -const 
crawler = new PuppeteerCrawler({ - launchContext: { - remoteBrowser: new BrowserlessWithProxyProvider(), - }, - proxyConfiguration, - browserPoolOptions: { - retireBrowserAfterPageCount: 2, - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 1, - maxRequestsPerCrawl: 4, - async requestHandler({ page, request, proxyInfo }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}" (proxy: ${proxyInfo?.url ?? 'none'})`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', - 'https://httpbin.org/ip', -]); - -console.log('\nDone.'); -console.log('Check that [Provider] logs show the proxy URL being forwarded from ProxyConfiguration.'); diff --git a/temp-examples/examples/remote-proxy-via-chrome-args.ts b/temp-examples/examples/remote-proxy-via-chrome-args.ts deleted file mode 100644 index 16604420558d..000000000000 --- a/temp-examples/examples/remote-proxy-via-chrome-args.ts +++ /dev/null @@ -1,129 +0,0 @@ -/** - * Remote browser with custom proxy via Chrome launch args - * - * Uses Browserless's `launch` query param to pass --proxy-server directly - * to Chrome. Works on the free/local Docker image (no paid features needed). - * - * ────────────────────────────────────────────────────────────────────────── - * SETUP STEPS - * ────────────────────────────────────────────────────────────────────────── - * - * 1. Get your Apify Proxy password: - * - Go to https://console.apify.com/account/integrations - * - Copy "Proxy password" from the Proxy section - * - * 2. Add it to temp-examples/.env: - * APIFY_PROXY_PASSWORD=your_password_here - * - * 3. Start local Browserless Docker (free image): - * docker run -p 3000:3000 ghcr.io/browserless/chromium - * - * 4. 
Run the example: - * npm run example:remote-proxy-via-chrome-args - * - * ────────────────────────────────────────────────────────────────────────── - * WHAT TO LOOK FOR - * ────────────────────────────────────────────────────────────────────────── - * - * Test target is httpbin.org/ip — it returns the IP making the request. - * - If the proxy routes correctly, the "Response" line shows the proxy's IP - * (NOT your own home/office IP). - * - With residential proxies, you should see different IPs on different - * requests if rotation is working. - * - * Without APIFY_PROXY_PASSWORD, the example falls back to a free public proxy - * (unreliable, may fail) so you can still see the forwarding mechanism work. - * - * ────────────────────────────────────────────────────────────────────────── - * APIFY PROXY URL FORMAT - * ────────────────────────────────────────────────────────────────────────── - * - * Base: http://USERNAME:PASSWORD@proxy.apify.com:8000 - * - * USERNAME options (combine with commas): - * - groups-RESIDENTIAL → residential proxies - * - groups-AUTO → auto-rotated datacenter (default) - * - groups-GOOGLE_SERP → Google SERP-specific - * - country-US → restrict to country (US, GB, DE, etc.) 
- * - session-myid123 → sticky session (same IP for same session) - * - * Examples: - * http://auto:PASSWORD@proxy.apify.com:8000 - * http://groups-RESIDENTIAL,country-US:PASSWORD@proxy.apify.com:8000 - * http://groups-RESIDENTIAL,session-abc:PASSWORD@proxy.apify.com:8000 - */ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PuppeteerCrawler, ProxyConfiguration } from 'crawlee'; - -// --------------------------------------------------------------------------- -// Provider that forwards proxyUrl to Chrome via --proxy-server launch arg -// --------------------------------------------------------------------------- -class BrowserlessChromeArgsProvider extends RemoteBrowserProvider { - maxOpenBrowsers = 4; - - async connect({ proxyUrl } = {} as { proxyUrl?: string }) { - let url = 'ws://localhost:3000'; - - if (proxyUrl) { - // Pass proxy to Chrome via launch args (works on free Browserless) - const launchOpts = JSON.stringify({ - args: [`--proxy-server=${proxyUrl}`], - }); - url += `?launch=${encodeURIComponent(launchOpts)}`; - console.log(` [Provider] Forwarding proxy via Chrome args: ${proxyUrl}`); - } else { - console.log(' [Provider] No proxy provided'); - } - - return { url }; - } -} - -// --------------------------------------------------------------------------- -// Proxy configuration — use Apify Proxy or any HTTP proxy -// --------------------------------------------------------------------------- -const apifyPassword = process.env.APIFY_PROXY_PASSWORD; - -const proxyConfiguration = new ProxyConfiguration({ - proxyUrls: apifyPassword - ? 
[ - // Apify residential proxy (replace 'RESIDENTIAL' with your group if different) - `http://groups-RESIDENTIAL:${apifyPassword}@proxy.apify.com:8000`, - ] - : [ - // Fallback: free public proxies (unreliable, may not work) - 'http://34.135.166.24:80', - ], -}); - -// --------------------------------------------------------------------------- -// Crawler -// --------------------------------------------------------------------------- -const crawler = new PuppeteerCrawler({ - launchContext: { - remoteBrowser: new BrowserlessChromeArgsProvider(), - }, - proxyConfiguration, - browserPoolOptions: { - retireBrowserAfterPageCount: 1, // new browser per request to test rotation - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 1, - maxRequestsPerCrawl: 2, - async requestHandler({ page, request, proxyInfo }) { - const body = await page.evaluate(() => document.body.textContent?.trim()); - console.log(`\n[${request.loadedUrl}]`); - console.log(` Configured proxy: ${proxyInfo?.url ?? 'none'}`); - console.log(` Response: ${body}`); - }, -}); - -await crawler.run([ - 'https://httpbin.org/ip', - 'https://httpbin.org/ip', -]); - -console.log('\nDone.'); -console.log('Compare "Response" IP with your own IP to verify proxy routing.'); diff --git a/temp-examples/examples/steel-playwright.ts b/temp-examples/examples/steel-playwright.ts deleted file mode 100644 index e14bf60bf31b..000000000000 --- a/temp-examples/examples/steel-playwright.ts +++ /dev/null @@ -1,66 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; - -const apiKey = process.env.STEEL_API_KEY; -if (!apiKey) throw new Error('STEEL_API_KEY env variable is required'); - -class SteelProvider extends RemoteBrowserProvider<{ id: string }> { - async connect() { - const response = await fetch('https://api.steel.dev/v1/sessions', { - method: 'POST', - headers: { 'Steel-Api-Key': apiKey, 'Content-Type': 'application/json' }, - body: 
JSON.stringify({}), - }); - - if (!response.ok) { - throw new Error(`Failed to create Steel session: ${response.status} ${response.statusText}`); - } - - const session = await response.json(); - console.log(`>>> Session created: ${session.id}`); - - return { - url: `wss://connect.steel.dev?apiKey=${apiKey}&sessionId=${session.id}`, - context: { id: session.id }, - }; - } - - async release({ id }: { id: string }) { - await fetch(`https://api.steel.dev/v1/sessions/${id}/release`, { - method: 'POST', - headers: { 'Steel-Api-Key': apiKey }, - }).catch(() => {}); - console.log(`<<< Session released: ${id}`); - } -} - -const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new SteelProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 5, - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 5, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git a/temp-examples/examples/steel-puppeteer.ts b/temp-examples/examples/steel-puppeteer.ts deleted file mode 100644 index 80cfc4dea97d..000000000000 --- a/temp-examples/examples/steel-puppeteer.ts +++ /dev/null @@ -1,44 +0,0 @@ -import 'dotenv/config'; - -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { PuppeteerCrawler } from 'crawlee'; - -const apiKey = process.env.STEEL_API_KEY; -if (!apiKey) throw new Error('STEEL_API_KEY env variable is required'); - -class SteelProvider extends RemoteBrowserProvider { - maxOpenBrowsers = 4; // Steel Hobby tier effective concurrent limit - - async connect() { - return { url: 
`wss://connect.steel.dev?apiKey=${apiKey}` }; - } -} - -const crawler = new PuppeteerCrawler({ - launchContext: { - remoteBrowser: new SteelProvider(), - }, - browserPoolOptions: { - retireBrowserAfterPageCount: 5, - maxOpenPagesPerBrowser: 1, - }, - maxConcurrency: 4, - maxRequestsPerCrawl: 10, - async requestHandler({ page, request }) { - const title = await page.title(); - console.log(`[Page] ${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run([ - 'https://example.com', - 'https://crawlee.dev', - 'https://www.google.com', - 'https://github.com', - 'https://wikipedia.org', - 'https://httpbin.org', - 'https://jsonplaceholder.typicode.com', - 'https://news.ycombinator.com', - 'https://books.toscrape.com', - 'https://quotes.toscrape.com', -]); diff --git a/temp-examples/package.json b/temp-examples/package.json deleted file mode 100644 index 39e0955ca0ec..000000000000 --- a/temp-examples/package.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "name": "temp-examples", - "version": "1.0.0", - "private": true, - "type": "module", - "scripts": { - "docker:browserless": "docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium", - "example:browserless-puppeteer": "node --experimental-strip-types examples/browserless-puppeteer.ts", - "example:browserless-playwright": "node --experimental-strip-types examples/browserless-playwright.ts", - "example:browserless-playwright-ws": "node --experimental-strip-types examples/browserless-playwright-ws.ts", - "example:browserless-local-puppeteer": "node --experimental-strip-types examples/browserless-local-puppeteer.ts", - "example:browserless-local-playwright": "node --experimental-strip-types examples/browserless-local-playwright.ts", - "example:browserless-local-playwright-ws": "node --experimental-strip-types examples/browserless-local-playwright-ws.ts", - "example:browserbase-puppeteer": "node --experimental-strip-types examples/browserbase-puppeteer.ts", - "example:browserbase-playwright": "node 
--experimental-strip-types examples/browserbase-playwright.ts", - "example:browserbase-playwright-ws": "node --experimental-strip-types examples/browserbase-playwright-ws.ts", - "example:steel-puppeteer": "node --experimental-strip-types examples/steel-puppeteer.ts", - "example:steel-playwright": "node --experimental-strip-types examples/steel-playwright.ts", - "example:steel-playwright-ws": "node --experimental-strip-types examples/steel-playwright-ws.ts", - "example:rebrowser-puppeteer": "node --experimental-strip-types examples/rebrowser-puppeteer.ts", - "example:rebrowser-playwright": "node --experimental-strip-types examples/rebrowser-playwright.ts", - "example:rebrowser-playwright-ws": "node --experimental-strip-types examples/rebrowser-playwright-ws.ts", - "example:cookie-sharing-test": "node --experimental-strip-types examples/cookie-sharing-test.ts", - "example:cookie-sharing-playwright-test": "node --experimental-strip-types examples/cookie-sharing-playwright-test.ts", - "example:cookie-sharing-playwright-local-vs-remote": "node --experimental-strip-types examples/cookie-sharing-playwright-local-vs-remote.ts", - "example:cookie-sharing-session-across-browsers": "node --experimental-strip-types examples/cookie-sharing-session-across-browsers.ts", - "example:remote-proxy-test": "node --experimental-strip-types examples/remote-proxy-test.ts", - "example:cookie-sharing-session-across-local-browsers": "node --experimental-strip-types examples/cookie-sharing-session-across-local-browsers.ts", - "example:remote-proxy-via-chrome-args": "node --experimental-strip-types examples/remote-proxy-via-chrome-args.ts" - }, - "dependencies": { - "@crawlee/basic": "file:../packages/basic-crawler/dist", - "@crawlee/browser": "file:../packages/browser-crawler/dist", - "@crawlee/browser-pool": "file:../packages/browser-pool/dist", - "@crawlee/cheerio": "file:../packages/cheerio-crawler/dist", - "@crawlee/cli": "file:../packages/cli/dist", - "@crawlee/core": 
"file:../packages/core/dist", - "@crawlee/http": "file:../packages/http-crawler/dist", - "@crawlee/jsdom": "file:../packages/jsdom-crawler/dist", - "@crawlee/linkedom": "file:../packages/linkedom-crawler/dist", - "@crawlee/playwright": "file:../packages/playwright-crawler/dist", - "@crawlee/puppeteer": "file:../packages/puppeteer-crawler/dist", - "@crawlee/types": "file:../packages/types/dist", - "@crawlee/utils": "file:../packages/utils/dist", - "@types/node": "^25.2.0", - "crawlee": "file:../packages/crawlee/dist", - "dotenv": "^17.3.1" - } -} diff --git a/temp-examples/readme.md b/temp-examples/readme.md deleted file mode 100644 index 5eeede14c6f0..000000000000 --- a/temp-examples/readme.md +++ /dev/null @@ -1,115 +0,0 @@ -# Remote Browser Service Examples - -Examples for connecting Crawlee crawlers to remote browser services using `RemoteBrowserProvider`. - -## How to run - -```bash -# from repo root -npm run clean -npm run build - -cd temp-examples -npm install -npm run example:steel-puppeteer -``` - -## Steel - -**Website:** https://steel.dev -**Docs:** https://docs.steel.dev -**Protocol:** CDP only (no Playwright WebSocket protocol) - -### Connection modes - -Steel supports two ways to connect: - -1. **Auto-managed sessions** — connect directly to `wss://connect.steel.dev?apiKey=...`. Steel creates and cleans up the session automatically. Simplest approach. - -2. **API-managed sessions** — create a session via `POST /v1/sessions`, connect with the returned `sessionId`, release via `POST /v1/sessions/{id}/release`. Gives control over session options (proxy, geolocation, etc.) and explicit cleanup. - -### Concurrent session limits (Hobby/free tier) - -- Docs say 5 concurrent sessions -- In practice, only 4 connections succeed simultaneously -- Excess connections **hang silently** — no 429 error, no timeout, `puppeteer.connect()` / `connectOverCDP()` just never resolves -- Set `maxOpenBrowsers = 4` to stay safe - -### Playwright - -Steel exposes a CDP endpoint. 
Use `connectOverCDP()`, not `connect()`: - -```typescript -// Works — CDP -const browser = await chromium.connectOverCDP('wss://connect.steel.dev?apiKey=...'); - -// Hangs forever — Steel doesn't speak Playwright's WebSocket protocol -const browser = await chromium.connect('wss://connect.steel.dev?apiKey=...'); -``` - -### Examples - -| Example | Connection | Session management | -|---------|-----------|-------------------| -| `steel-puppeteer.ts` | Puppeteer CDP | Auto-managed | -| `steel-playwright.ts` | Playwright CDP | API-managed (create/release) | - ---- - -## Browserbase - -TODO - -## Browserless - -**Website:** https://browserless.io -**Docker:** `ghcr.io/browserless/chromium` -**Protocol:** CDP and Playwright WebSocket - -### Local setup (Docker) - -```bash -docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium -``` - -Or use the npm script: - -```bash -npm run docker:browserless -``` - -This starts a Browserless instance on `ws://localhost:3000` with a 4 concurrent session limit. - -### Connection modes - -Browserless supports both CDP and Playwright's native WebSocket protocol: - -- **CDP** — `ws://localhost:3000` (default endpoint) -- **Playwright WebSocket** — `ws://localhost:3000/chromium/playwright` (use `type: 'websocket'` on the provider) - -Unlike Steel, Browserless actually speaks the Playwright WebSocket protocol, so `browserType.connect()` works. - -### Session management - -The cloud version has a `/session` API for explicit session lifecycle: - -- **Create:** `POST /session?token=...` with `{ ttl: 60000 }` — returns `{ id, connect, stop }` -- **Connect:** Use the `connect` URL from the response -- **Release:** `DELETE {stop}&force=true` - -The local Docker image (open-source) does not have the `/session` API — sessions are auto-managed on connect/disconnect. 
- -### Examples - -| Example | Connection | Session management | Target | -|---------|-----------|-------------------|--------| -| `browserless-local-puppeteer.ts` | Puppeteer CDP | Auto-managed | Docker | -| `browserless-local-playwright.ts` | Playwright CDP | Auto-managed | Docker | -| `browserless-local-playwright-ws.ts` | Playwright WebSocket | Auto-managed | Docker | -| `browserless-puppeteer.ts` | Puppeteer CDP | Auto-managed | Remote | -| `browserless-playwright.ts` | Playwright CDP | API-managed (create/release) | Remote | -| `browserless-playwright-ws.ts` | Playwright WebSocket | Auto-managed | Remote | - -## Rebrowser - -TODO diff --git a/temp-examples/tsconfig.json b/temp-examples/tsconfig.json deleted file mode 100644 index 5fcc4b7bad3a..000000000000 --- a/temp-examples/tsconfig.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "compilerOptions": { - "target": "ES2022", - "module": "Node16", - "moduleResolution": "Node16", - "esModuleInterop": true, - "sourceMap": false - } -} diff --git a/website/sidebars.js b/website/sidebars.js index c3672747a736..146b3e20dee8 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -37,6 +37,7 @@ module.exports = { 'guides/configuration', 'guides/cheerio-crawler-guide', 'guides/javascript-rendering', + 'guides/remote-browser', 'guides/proxy-management', 'guides/session-management', 'guides/scaling-crawlers', From 54f95f3cf0251767a1384d2e172ee935c6e60f94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 16 Jun 2026 11:20:43 +0000 Subject: [PATCH 38/45] Fix context leak, endpoint validation, and proxyUrl warning --- .../src/abstract-classes/browser-plugin.ts | 20 ++++++--- .../src/puppeteer/puppeteer-plugin.ts | 27 ++++++------ .../browser-pool/test/remote-browser.test.ts | 42 +++++++++++++++++-- 3 files changed, 67 insertions(+), 22 deletions(-) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 
607394b12674..bdc385a7e01a 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -226,8 +226,12 @@ export abstract class BrowserPlugin< const { endpoint } = this.remoteBrowser!; const result = typeof endpoint === 'function' ? await endpoint(options) : endpoint; if (typeof result === 'string') { + if (!result) throw new Error('remoteBrowser.endpoint resolved to an empty string.'); return { url: result }; } + if (!result?.url) { + throw new Error("remoteBrowser.endpoint() must return a URL string or an object with a non-empty 'url'."); + } return result; } @@ -313,11 +317,17 @@ export abstract class BrowserPlugin< const { proxyUrl, launchOptions } = launchContext; if (proxyUrl && launchContext.isRemote) { - this.log.info( - 'proxyUrl is set for a remote browser connection. ' + - "It will be forwarded to the remote browser provider's connect() method. " + - "Make sure your provider handles it (e.g. passes it to the service's proxy API).", - ); + if (this.remoteBrowser) { + this.log.info( + 'proxyUrl is set and will be passed to the remoteBrowser.endpoint() function. ' + + "Make sure your endpoint() handles it (e.g. passes it to the service's proxy API).", + ); + } else { + this.log.warning( + 'proxyUrl is set but will be ignored when using connectOptions/connectOverCDPOptions. 
' + + 'Configure the proxy through the remote service, or switch to `remoteBrowser` with an endpoint() that handles proxyUrl.', + ); + } } if (launchContext.userDataDir && launchContext.isRemote) { diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 54de0c12c7c7..308dab48a728 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -255,26 +255,27 @@ export class PuppeteerPlugin extends BrowserPlugin< }) : ([undefined, noop] as const); - try { - const proxyServer = anonymizedProxyUrl ?? effectiveProxyUrl; - const contextOptions = proxyServer ? { proxyServer } : {}; - const context = (await (browser as any)[method]( - contextOptions, - )) as PuppeteerTypes.BrowserContext; + const proxyServer = anonymizedProxyUrl ?? effectiveProxyUrl; + const contextOptions = proxyServer ? { proxyServer } : {}; + const context = (await (browser as any)[method]( + contextOptions, + )) as PuppeteerTypes.BrowserContext; + try { page = await context.newPage(...args); - - page.once('close', async () => { - if (anonymizedProxyUrl) { - await close(); - } - await context.close().catch(noop); - }); } catch (error) { + await context.close().catch(noop); await close(); throw error; } + + page.once('close', async () => { + if (anonymizedProxyUrl) { + await close(); + } + await context.close().catch(noop); + }); } else { page = await boundMethods.newPage(...args); } diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index a534c23ea831..74dc1fe197d3 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -378,7 +378,7 @@ describe('Remote browser — PlaywrightPlugin', () => { // --- Info/Warnings -------------------------------------------------------- describe('info and warnings', () => { - test('proxyUrl + remote → 
info about forwarding to provider', async () => { + test('proxyUrl + connectOverCDPOptions → warning that proxyUrl is ignored', async () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, @@ -388,8 +388,25 @@ describe('Remote browser — PlaywrightPlugin', () => { const ctx = plugin.createLaunchContext(); await plugin.launch(ctx); + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining( + 'proxyUrl is set but will be ignored when using connectOptions/connectOverCDPOptions', + ), + ); + }); + + test('proxyUrl + remoteBrowser → info about forwarding to endpoint()', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + remoteBrowser: { endpoint: 'http://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining("forwarded to the remote browser provider's connect()"), + expect.stringContaining('passed to the remoteBrowser.endpoint() function'), ); }); @@ -615,7 +632,7 @@ describe('Remote browser — PuppeteerPlugin', () => { // --- Info/Warnings -------------------------------------------------------- describe('info and warnings', () => { - test('proxyUrl + remote → info about forwarding to provider', async () => { + test('proxyUrl + connectOverCDPOptions → warning that proxyUrl is ignored', async () => { const lib = createMockPuppeteerLibrary(); const plugin = new PuppeteerPlugin(lib as any, { connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, @@ -625,8 +642,25 @@ describe('Remote browser — PuppeteerPlugin', () => { const ctx = plugin.createLaunchContext(); await plugin.launch(ctx); + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining( + 'proxyUrl is set but will be ignored when using 
connectOptions/connectOverCDPOptions', + ), + ); + }); + + test('proxyUrl + remoteBrowser → info about forwarding to endpoint()', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + remoteBrowser: { endpoint: 'ws://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining("forwarded to the remote browser provider's connect()"), + expect.stringContaining('passed to the remoteBrowser.endpoint() function'), ); }); From ef362cd5b12311542a26ea7e9216cca23b48f44d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 16 Jun 2026 11:28:54 +0000 Subject: [PATCH 39/45] fix: warn when proxyUrl is dropped for string remoteBrowser.endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A static-string endpoint can't receive proxyUrl, but the old log claimed it would be passed to endpoint() — masking that proxy traffic was being silently ignored. Branch on typeof endpoint and warn in the string case. --- .../src/abstract-classes/browser-plugin.ts | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index bdc385a7e01a..6fd42f36e6cd 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -318,10 +318,17 @@ export abstract class BrowserPlugin< if (proxyUrl && launchContext.isRemote) { if (this.remoteBrowser) { - this.log.info( - 'proxyUrl is set and will be passed to the remoteBrowser.endpoint() function. ' + - "Make sure your endpoint() handles it (e.g. 
passes it to the service's proxy API).", - ); + if (typeof this.remoteBrowser.endpoint === 'function') { + this.log.info( + 'proxyUrl is set and will be passed to the remoteBrowser.endpoint() function. ' + + "Make sure your endpoint() handles it (e.g. passes it to the service's proxy API).", + ); + } else { + this.log.warning( + 'proxyUrl is set but will be ignored because remoteBrowser.endpoint is a static string. ' + + 'Switch endpoint to a function `(opts) => …` to receive proxyUrl, or configure the proxy through the remote service.', + ); + } } else { this.log.warning( 'proxyUrl is set but will be ignored when using connectOptions/connectOverCDPOptions. ' + From d8330b6f989121a0529e3aeccdeb7c28b42c9261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 16 Jun 2026 11:53:13 +0000 Subject: [PATCH 40/45] fix(docs): add override modifier to RemoteBrowserProvider example members The remote_browser_provider.ts guide overrides maxOpenBrowsers and release() from the base class without the override modifier, failing the docs typecheck (TS4114). Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/guides/remote_browser_provider.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guides/remote_browser_provider.ts b/docs/guides/remote_browser_provider.ts index 6799909e217e..b950093bcf33 100644 --- a/docs/guides/remote_browser_provider.ts +++ b/docs/guides/remote_browser_provider.ts @@ -6,7 +6,7 @@ const projectId = process.env.BROWSERBASE_PROJECT_ID!; class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { // Respect the service's concurrent session limit to avoid 429s. 
- maxOpenBrowsers = 5; + override maxOpenBrowsers = 5; async connect() { const response = await fetch('https://api.browserbase.com/v1/sessions', { @@ -23,7 +23,7 @@ class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { return { url: session.connectUrl, context: { id: session.id } }; } - async release({ id }: { id: string }) { + override async release({ id }: { id: string }) { await fetch(`https://api.browserbase.com/v1/sessions/${id}`, { method: 'POST', headers: { 'x-bb-api-key': apiKey, 'Content-Type': 'application/json' }, From 868337a2bcddf49c4a0142f3586aab5be3a4943e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 16 Jun 2026 12:08:06 +0000 Subject: [PATCH 41/45] test: fix stale proxyUrl+remoteBrowser forwarding assertions These two tests configured a static-string endpoint but asserted the info log that only fires for a function endpoint. Since the string-endpoint path now warns instead (commit ef362cd5b), the assertions never matched. Use a function endpoint so the forwarding info log fires, matching the test's stated intent. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/browser-pool/test/remote-browser.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index 74dc1fe197d3..8b0343ea2fa7 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -398,7 +398,7 @@ describe('Remote browser — PlaywrightPlugin', () => { test('proxyUrl + remoteBrowser → info about forwarding to endpoint()', async () => { const lib = createMockPlaywrightLibrary(); const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'http://remote:9222' }, + remoteBrowser: { endpoint: () => 'http://remote:9222' }, proxyUrl: 'http://user:pass@proxy:8080', }); @@ -652,7 +652,7 @@ describe('Remote browser — PuppeteerPlugin', () => { test('proxyUrl + remoteBrowser → info about forwarding to endpoint()', async () => { const lib = createMockPuppeteerLibrary(); const plugin = new PuppeteerPlugin(lib as any, { - remoteBrowser: { endpoint: 'ws://remote:9222' }, + remoteBrowser: { endpoint: () => 'ws://remote:9222' }, proxyUrl: 'http://user:pass@proxy:8080', }); From 98f45b94b4087a24c71d9700c9abafc44525dba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 16 Jun 2026 12:08:38 +0000 Subject: [PATCH 42/45] feat(browser-pool): add RemoteBrowserPool implementing IBrowserPool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce RemoteBrowserPool, an IBrowserPool implementation for remote browser services that wraps a BrowserPool (composition). It owns the one thing the plain pool cannot enforce on its own: a maxOpenBrowsers limit on concurrent remote browsers. newPage() waits for a free slot (event-driven, with a poll fallback) instead of letting the crawler overshoot the remote service's session quota. 
Pass an instance via the crawler's browserPool option (added in #3669) to plug remote support in as a first-class pool rather than threading it through launchContext. A pool supplied this way is not owned by the crawler, so its lifecycle (and reuse across crawlers) is the caller's. The remote-session lifecycle (connect/release) remains owned by the wrapped pool's plugin and its remoteBrowser config — this class only governs when new pages may open. Adds a docs guide section + typechecked example. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/guides/remote_browser.mdx | 11 +- docs/guides/remote_browser_pool.ts | 32 ++++ packages/browser-pool/src/index.ts | 2 + .../browser-pool/src/remote-browser-pool.ts | 153 ++++++++++++++++++ .../test/remote-browser-pool.test.ts | 144 +++++++++++++++++ 5 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 docs/guides/remote_browser_pool.ts create mode 100644 packages/browser-pool/src/remote-browser-pool.ts create mode 100644 packages/browser-pool/test/remote-browser-pool.test.ts diff --git a/docs/guides/remote_browser.mdx b/docs/guides/remote_browser.mdx index 40a0d8b80c12..7d465142c364 100644 --- a/docs/guides/remote_browser.mdx +++ b/docs/guides/remote_browser.mdx @@ -11,6 +11,7 @@ import CodeBlock from '@theme/CodeBlock'; import RemoteBrowserConfigSource from '!!raw-loader!./remote_browser_config.ts'; import RemoteBrowserProviderSource from '!!raw-loader!./remote_browser_provider.ts'; import RemoteBrowserPuppeteerSource from '!!raw-loader!./remote_browser_puppeteer.ts'; +import RemoteBrowserPoolSource from '!!raw-loader!./remote_browser_pool.ts'; Instead of launching a local browser, Crawlee can connect to a remote browser service like [Browserbase](https://browserbase.com/), [Browserless](https://browserless.io/), [Steel](https://steel.dev/), or any service that exposes a WebSocket/CDP endpoint. 
The crawler manages the browser pool, session rotation, and proxy logic the same way it does locally — only the browser itself runs elsewhere. @@ -54,13 +55,21 @@ For services with a session-create / session-release lifecycle, extend `RemoteBrowserPool` and pass it as the crawler's `browserPool` option. It implements `IBrowserPool` by wrapping a `BrowserPool` and enforcing `maxOpenBrowsers` at the pool level: `newPage()` waits for a free slot rather than relying on the crawler to gate new tasks. + +{RemoteBrowserPoolSource} + +A pool passed this way is **not owned** by the crawler — the crawler never calls `destroy()` on it, so you control its lifecycle (and can reuse it across multiple crawlers). + ## Limitations - **`headless` is ignored.** The remote service controls headless mode; setting `headless` on the crawler or in `launchOptions` is dropped with a warning. - **`launchOptions` cannot be combined with a remote browser.** Setting both throws — browser flags, executable path, viewport, and similar must be configured on the service side. - **`useIncognitoPages` is forced to `true`** for Playwright remote connections. Playwright's `connect()` / `connectOverCDP()` don't accept persistent contexts. - **`userDataDir` has no effect** — there's no local profile when the browser runs remotely. -- **`maxOpenBrowsers` enforcement only gates new task starts.** Direct `BrowserPool.newPage` calls can exceed it; the limit is honored when the crawler drives the pool. +- **`maxOpenBrowsers` enforcement only gates new task starts** when configured via `launchContext`. Direct `BrowserPool.newPage` calls can exceed it. Use the [pluggable pool](#pluggable-pool-advanced) to enforce the limit inside `newPage()` itself. 
## Further reading diff --git a/docs/guides/remote_browser_pool.ts b/docs/guides/remote_browser_pool.ts new file mode 100644 index 000000000000..03605a55d380 --- /dev/null +++ b/docs/guides/remote_browser_pool.ts @@ -0,0 +1,32 @@ +import { BrowserPool, PlaywrightPlugin, RemoteBrowserPool } from '@crawlee/browser-pool'; +import { PlaywrightCrawler } from 'crawlee'; +import playwright, { type Page } from 'playwright'; + +const token = process.env.BROWSERLESS_TOKEN!; + +// Build a BrowserPool whose single plugin connects to the remote service… +// The generic is the page type the crawler works with (Playwright's `Page`). +const remoteBrowserPool = new RemoteBrowserPool({ + browserPool: new BrowserPool({ + browserPlugins: [ + new PlaywrightPlugin(playwright.chromium, { + remoteBrowser: { endpoint: `wss://production-sfo.browserless.io?token=${token}` }, + }), + ], + }), + // …and cap concurrent remote browsers. newPage() waits for a free slot instead + // of overshooting the service's session quota. + maxOpenBrowsers: 5, +}); + +// Pass the pool in directly. The crawler uses it instead of building its own and, +// because the pool is not owned by the crawler, never tears it down. 
+const crawler = new PlaywrightCrawler({ + browserPool: remoteBrowserPool, + async requestHandler({ page, request, log }) { + const title = await page.title(); + log.info(`${request.loadedUrl} — "${title}"`); + }, +}); + +await crawler.run(['https://crawlee.dev']); diff --git a/packages/browser-pool/src/index.ts b/packages/browser-pool/src/index.ts index 960407ea58c6..087d717a45e5 100644 --- a/packages/browser-pool/src/index.ts +++ b/packages/browser-pool/src/index.ts @@ -54,6 +54,8 @@ export { BrowserPlugin, BrowserLaunchError, DEFAULT_USER_AGENT } from './abstrac export type { LaunchContextOptions } from './launch-context.js'; export { LaunchContext } from './launch-context.js'; export { RemoteBrowserProvider } from './remote-browser-provider.js'; +export { RemoteBrowserPool } from './remote-browser-pool.js'; +export type { RemoteBrowserPoolOptions } from './remote-browser-pool.js'; export type { InferBrowserPluginArray, UnwrapPromise } from './utils.js'; export { anonymizeProxySugar, type AnonymizeProxySugarOptions } from './anonymize-proxy.js'; export type { IBrowserPool, NewPageOptions } from '@crawlee/types'; diff --git a/packages/browser-pool/src/remote-browser-pool.ts b/packages/browser-pool/src/remote-browser-pool.ts new file mode 100644 index 000000000000..1223672e77eb --- /dev/null +++ b/packages/browser-pool/src/remote-browser-pool.ts @@ -0,0 +1,153 @@ +import type { IBrowserPool, NewPageOptions, PageState } from '@crawlee/types'; + +import type { BrowserPool } from './browser-pool.js'; +import { BROWSER_POOL_EVENTS } from './events.js'; + +export interface RemoteBrowserPoolOptions { + /** + * The underlying {@apilink BrowserPool} that performs the actual remote connections. Configure it + * with a single plugin set up for a remote connection (`remoteBrowser`, `connectOptions`, or + * `connectOverCDPOptions`). + */ + browserPool: BrowserPool; + /** + * Maximum number of remote browsers that may be open at the same time. 
When the limit is reached, + * {@apilink RemoteBrowserPool.newPage|newPage} waits until a browser closes (or an existing one frees + * a page slot) before opening a new page. Set this to your remote service's concurrent-session limit + * to avoid `429` errors. + * + * When omitted, the wrapped pool's own `maxOpenBrowsers` is used (defaults to `Infinity`, i.e. no limit). + */ + maxOpenBrowsers?: number; + /** + * Fallback poll interval, in milliseconds, used while waiting for a free browser slot. The wait is + * primarily event-driven (it wakes on browser/page close), so this only bounds how long it can sleep + * if no event fires. + * + * @default 500 + */ + slotPollIntervalMillis?: number; +} + +/** + * An {@apilink IBrowserPool} implementation for remote browser services. + * + * It wraps a {@apilink BrowserPool} configured for a remote connection and adds the one piece the plain + * pool cannot enforce on its own: a {@apilink RemoteBrowserPoolOptions.maxOpenBrowsers|concurrency limit} + * on open remote browsers. {@apilink RemoteBrowserPool.newPage|newPage} blocks until a slot is free instead + * of letting the crawler overshoot the remote service's session quota. + * + * The remote-session lifecycle (connecting via `endpoint()` and calling `release()` on close) is owned by + * the wrapped pool's plugin and its `remoteBrowser` configuration — this class only governs *when* new + * pages may open. 
+ * + * Pass an instance as the `browserPool` option of a browser crawler: + * + * ```typescript + * import { BrowserPool, PlaywrightPlugin, RemoteBrowserPool } from '@crawlee/browser-pool'; + * import playwright from 'playwright'; + * + * const browserPool = new RemoteBrowserPool({ + * browserPool: new BrowserPool({ + * browserPlugins: [ + * new PlaywrightPlugin(playwright.chromium, { + * remoteBrowser: { endpoint: 'wss://production-sfo.browserless.io?token=xxx' }, + * }), + * ], + * }), + * maxOpenBrowsers: 2, + * }); + * + * const crawler = new PlaywrightCrawler({ browserPool }); + * ``` + * + * @category Browser management + */ +export class RemoteBrowserPool implements IBrowserPool { + /** The wrapped pool that performs the remote connections and serves pages. */ + readonly browserPool: BrowserPool; + + /** + * The wrapped pool viewed through the {@apilink IBrowserPool} contract it implements. Used for + * page delegation because the bare `BrowserPool` type widens its page type to `never`. + */ + private readonly pool: IBrowserPool; + + private readonly slotPollIntervalMillis: number; + + constructor(options: RemoteBrowserPoolOptions) { + const { browserPool, maxOpenBrowsers, slotPollIntervalMillis = 500 } = options; + + this.browserPool = browserPool; + this.pool = browserPool as unknown as IBrowserPool; + this.slotPollIntervalMillis = slotPollIntervalMillis; + + if (maxOpenBrowsers !== undefined) { + this.browserPool.maxOpenBrowsers = maxOpenBrowsers; + } + } + + /** Maximum number of remote browsers that may be open at the same time. */ + get maxOpenBrowsers(): number { + return this.browserPool.maxOpenBrowsers; + } + + set maxOpenBrowsers(value: number) { + this.browserPool.maxOpenBrowsers = value; + } + + /** + * Opens a new page, waiting first until the {@apilink RemoteBrowserPoolOptions.maxOpenBrowsers|browser + * limit} allows it. 
A page can open immediately when either a new browser slot is free or an already + * active browser still has room for another page. + */ + async newPage(options?: NewPageOptions): Promise { + await this._waitForFreeSlot(); + return this.pool.newPage(options); + } + + async closePage(page: Page, options?: { error?: Error }): Promise { + return this.pool.closePage(page, options); + } + + async extractPageState(page: Page): Promise { + return this.pool.extractPageState(page); + } + + async injectPageState(page: Page, state: PageState): Promise { + return this.pool.injectPageState(page, state); + } + + /** Closes all browsers and tears down the wrapped pool. */ + async destroy(): Promise { + await this.browserPool.destroy(); + } + + /** + * Resolves once the wrapped pool can serve another page without exceeding `maxOpenBrowsers`. The check + * is best-effort: concurrent `newPage` calls may briefly overshoot the limit, mirroring the advisory + * nature of the crawler-level throttle this replaces. + */ + private async _waitForFreeSlot(): Promise { + while (!this.browserPool.hasFreeBrowserSlot() && !this.browserPool.hasActiveBrowserWithFreeCapacity()) { + await this._waitForCapacityChange(); + } + } + + /** Resolves on the next browser-retired / page-closed event, or after `slotPollIntervalMillis`. 
*/ + private async _waitForCapacityChange(): Promise { + await new Promise((resolve) => { + const done = () => { + clearTimeout(timer); + this.browserPool.off(BROWSER_POOL_EVENTS.BROWSER_RETIRED, done); + this.browserPool.off(BROWSER_POOL_EVENTS.PAGE_CLOSED, done); + resolve(); + }; + + const timer = setTimeout(done, this.slotPollIntervalMillis); + timer.unref?.(); + this.browserPool.once(BROWSER_POOL_EVENTS.BROWSER_RETIRED, done); + this.browserPool.once(BROWSER_POOL_EVENTS.PAGE_CLOSED, done); + }); + } +} diff --git a/packages/browser-pool/test/remote-browser-pool.test.ts b/packages/browser-pool/test/remote-browser-pool.test.ts new file mode 100644 index 000000000000..96901d2a2f40 --- /dev/null +++ b/packages/browser-pool/test/remote-browser-pool.test.ts @@ -0,0 +1,144 @@ +import { EventEmitter } from 'node:events'; + +import { vi } from 'vitest'; + +import { BROWSER_POOL_EVENTS } from '../src/events.js'; +import type { BrowserPool } from '../src/browser-pool.js'; +import { RemoteBrowserPool } from '../src/remote-browser-pool.js'; + +/** + * A minimal stand-in for {@link BrowserPool} exposing only the surface + * {@link RemoteBrowserPool} touches: the four `IBrowserPool` methods, `destroy`, + * `maxOpenBrowsers`, the two capacity helpers, and the event emitter. + */ +function createFakePool(overrides: Partial> = {}) { + const emitter = new EventEmitter(); + const pool = Object.assign(emitter, { + maxOpenBrowsers: Infinity, + hasFreeBrowserSlot: vi.fn(() => true), + hasActiveBrowserWithFreeCapacity: vi.fn(() => false), + newPage: vi.fn(async (options?: any) => ({ id: options?.id ?? 
'page' })), + closePage: vi.fn(async () => {}), + extractPageState: vi.fn(async () => ({ cookies: [] })), + injectPageState: vi.fn(async () => {}), + destroy: vi.fn(async () => {}), + ...overrides, + }); + return pool as unknown as BrowserPool; +} + +describe('RemoteBrowserPool', () => { + describe('construction', () => { + it('applies maxOpenBrowsers to the wrapped pool', () => { + const fake = createFakePool(); + const remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 3 }); + + expect(fake.maxOpenBrowsers).toBe(3); + expect(remote.maxOpenBrowsers).toBe(3); + }); + + it('leaves the wrapped pool default when maxOpenBrowsers is omitted', () => { + const fake = createFakePool(); + const remote = new RemoteBrowserPool({ browserPool: fake }); + + expect(remote.maxOpenBrowsers).toBe(Infinity); + }); + + it('proxies maxOpenBrowsers writes through to the wrapped pool', () => { + const fake = createFakePool(); + const remote = new RemoteBrowserPool({ browserPool: fake }); + + remote.maxOpenBrowsers = 5; + + expect(fake.maxOpenBrowsers).toBe(5); + }); + }); + + describe('delegation', () => { + it('forwards closePage / extractPageState / injectPageState / destroy', async () => { + const fake = createFakePool(); + const remote = new RemoteBrowserPool<{ id: string }>({ browserPool: fake }); + const page = { id: 'p1' }; + const error = new Error('boom'); + + await remote.closePage(page, { error }); + await remote.extractPageState(page); + await remote.injectPageState(page, { cookies: [] }); + await remote.destroy(); + + expect(fake.closePage).toHaveBeenCalledWith(page, { error }); + expect(fake.extractPageState).toHaveBeenCalledWith(page); + expect(fake.injectPageState).toHaveBeenCalledWith(page, { cookies: [] }); + expect(fake.destroy).toHaveBeenCalledOnce(); + }); + }); + + describe('newPage throttle', () => { + it('opens immediately when a browser slot is free', async () => { + const fake = createFakePool({ hasFreeBrowserSlot: vi.fn(() => true) }); + const 
remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 2 }); + + const page = await remote.newPage({ id: 'x' }); + + expect(page).toEqual({ id: 'x' }); + expect(fake.newPage).toHaveBeenCalledOnce(); + }); + + it('opens immediately when an active browser has free page capacity', async () => { + const fake = createFakePool({ + hasFreeBrowserSlot: vi.fn(() => false), + hasActiveBrowserWithFreeCapacity: vi.fn(() => true), + }); + const remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 1 }); + + await remote.newPage(); + + expect(fake.newPage).toHaveBeenCalledOnce(); + }); + + it('waits while at capacity, then opens once a browser is retired', async () => { + let atCapacity = true; + const fake = createFakePool({ + hasFreeBrowserSlot: vi.fn(() => !atCapacity), + hasActiveBrowserWithFreeCapacity: vi.fn(() => false), + }); + const remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 1, slotPollIntervalMillis: 50 }); + + const pagePromise = remote.newPage(); + let resolved = false; + void pagePromise.then(() => { + resolved = true; + }); + + // Still blocked while at capacity. + await new Promise((r) => setTimeout(r, 20)); + expect(resolved).toBe(false); + expect(fake.newPage).not.toHaveBeenCalled(); + + // Free a slot and signal it. 
+ atCapacity = false; + fake.emit(BROWSER_POOL_EVENTS.BROWSER_RETIRED); + + await pagePromise; + expect(resolved).toBe(true); + expect(fake.newPage).toHaveBeenCalledOnce(); + }); + + it('re-checks capacity via the poll fallback when no event fires', async () => { + let atCapacity = true; + const fake = createFakePool({ + hasFreeBrowserSlot: vi.fn(() => !atCapacity), + hasActiveBrowserWithFreeCapacity: vi.fn(() => false), + }); + const remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 1, slotPollIntervalMillis: 20 }); + + const pagePromise = remote.newPage(); + setTimeout(() => { + atCapacity = false; + }, 30); + + await pagePromise; + expect(fake.newPage).toHaveBeenCalledOnce(); + }); + }); +}); From c1ade92ce4bab1cd3d38fdebf5788a4096b4781f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 16 Jun 2026 19:54:21 +0000 Subject: [PATCH 43/45] refactor!: migrate remote browser support into RemoteBrowserPool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make RemoteBrowserPool the single owner of all remote-session concerns instead of weaving remote logic through the base plugin, controller, launch context, launchers and crawlers. RemoteBrowserPool (implements IBrowserPool) now owns: - endpoint resolution (static URL, per-launch function, or RemoteBrowserProvider) via an internal RemoteSessionRegistry; - the release lifecycle, with a release-at-most-once guarantee and a releaseAll() backstop on destroy() so no remote session leaks; - the maxOpenBrowsers throttle, enforced inside newPage(). Plugins keep only the library-specific connect() call, driven by a thin RemoteConnection bridge the pool injects (useRemoteConnection). The controller releases sessions by token via that bridge. This removes the two footguns from the previous design by construction: double-release (idempotent registry + token clear) and the teardown session leak (releaseAll backstop). 
BREAKING CHANGE: remote browsers are configured exclusively through RemoteBrowserPool passed as the crawler's browserPool option. The launchContext.remoteBrowser / connectOptions / connectOverCDPOptions options are removed; Playwright CDP-vs-WebSocket selection moves to the pool's connection.protocol option. Co-Authored-By: Claude Opus 4.8 (1M context) feat(browser-crawler): add remoteBrowser option so the crawler builds the pool Building a RemoteBrowserPool by hand and passing it as browserPool meant constructing the browser plugin twice (the crawler builds and discards its own) and allowed mismatching the pool with the crawler — e.g. a Puppeteer-backed pool in a PlaywrightCrawler type-checked but broke at runtime. Add a remoteBrowser option (CrawlerRemoteBrowserOptions: endpoint / release / maxOpenBrowsers / connection — no plugin) to the browser crawlers. The crawler builds a RemoteBrowserPool around its OWN plugin, so the connection is always for the matching browser; there is nothing to construct and no way to mismatch. The crawler owns and tears down this pool. browserPool stays for sharing one remote pool across crawlers (not owned by the crawler); the two options are mutually exclusive. 
Co-Authored-By: Claude Opus 4.8 (1M context) simplify --- docs/guides/remote_browser.mdx | 47 +- docs/guides/remote_browser_config.ts | 12 +- docs/guides/remote_browser_pool.ts | 32 - docs/guides/remote_browser_provider.ts | 5 +- docs/guides/remote_browser_puppeteer.ts | 8 +- .../src/internals/browser-crawler.ts | 74 +- .../src/internals/browser-launcher.ts | 1 - .../abstract-classes/browser-controller.ts | 17 +- .../src/abstract-classes/browser-plugin.ts | 196 +-- packages/browser-pool/src/index.ts | 11 +- packages/browser-pool/src/launch-context.ts | 8 + .../src/playwright/playwright-plugin.ts | 160 +-- .../src/puppeteer/puppeteer-plugin.ts | 112 +- .../browser-pool/src/remote-browser-pool.ts | 275 ++++- .../src/remote-browser-provider.ts | 17 +- packages/browser-pool/src/utils.ts | 14 + .../test/remote-browser-pool.test.ts | 308 +++-- .../browser-pool/test/remote-browser.test.ts | 1051 ++--------------- .../src/internals/playwright-crawler.ts | 12 +- .../src/internals/playwright-launcher.ts | 57 - .../src/internals/puppeteer-crawler.ts | 12 +- .../src/internals/puppeteer-launcher.ts | 41 - .../playwright_launcher.test.ts | 28 - .../puppeteer_launcher.test.ts | 19 - test/core/crawlers/browser_crawler.test.ts | 41 + .../remote-browser-incognito.test.ts | 15 +- 26 files changed, 767 insertions(+), 1806 deletions(-) delete mode 100644 docs/guides/remote_browser_pool.ts diff --git a/docs/guides/remote_browser.mdx b/docs/guides/remote_browser.mdx index 7d465142c364..f02d41be4b64 100644 --- a/docs/guides/remote_browser.mdx +++ b/docs/guides/remote_browser.mdx @@ -11,19 +11,24 @@ import CodeBlock from '@theme/CodeBlock'; import RemoteBrowserConfigSource from '!!raw-loader!./remote_browser_config.ts'; import RemoteBrowserProviderSource from '!!raw-loader!./remote_browser_provider.ts'; import RemoteBrowserPuppeteerSource from '!!raw-loader!./remote_browser_puppeteer.ts'; -import RemoteBrowserPoolSource from '!!raw-loader!./remote_browser_pool.ts'; -Instead of launching a 
local browser, Crawlee can connect to a remote browser service like [Browserbase](https://browserbase.com/), [Browserless](https://browserless.io/), [Steel](https://steel.dev/), or any service that exposes a WebSocket/CDP endpoint. The crawler manages the browser pool, session rotation, and proxy logic the same way it does locally — only the browser itself runs elsewhere. +Instead of launching a local browser, Crawlee can connect to a remote browser service like [Browserbase](https://browserbase.com/), [Browserless](https://browserless.io/), [Steel](https://steel.dev/), or any service that exposes a WebSocket/CDP endpoint. The crawler manages session rotation and the request lifecycle the same way it does locally — only the browser itself runs elsewhere. Use this when you need IPs in specific regions, want to offload CPU/memory from your runner, or need stealth features the service provides. -## Inline config +## How it works -The simplest form passes a connection URL on `launchContext.remoteBrowser`. Use this when the service exposes a single endpoint and doesn't need per-session setup. +Set the crawler's `remoteBrowser` option with the connection details. The crawler builds a `RemoteBrowserPool` around its own browser plugin, so the connection is always for the matching browser — there's no plugin to construct and no way to mismatch the pool with the crawler. The pool (an `IBrowserPool` wrapping the regular `BrowserPool`) owns everything remote: resolving the endpoint, releasing sessions when browsers close, and capping how many remote browsers run at once. + +## Basic usage + +The simplest form is a static connection URL. Use this when the service exposes a single endpoint and doesn't need per-session setup. {RemoteBrowserConfigSource} -`endpoint` can also be a function returning `{ url, context }`, called once per browser launch. 
The optional `release({ endpoint, context })` callback runs when the browser closes, crashes, or the pool is destroyed — use it to clean up sessions on the service side. +`endpoint` can also be a function returning `{ url, context }`, called once per browser launch. Pair it with a `release` callback (it receives the `context`) to clean up sessions on the service side when the browser closes, crashes, or the pool is destroyed. + +`maxOpenBrowsers` caps the number of concurrent remote browsers — set it to the service's concurrent-session limit to avoid 429 errors. The pool enforces it inside `newPage()`, which waits for a free slot rather than overshooting. ### Self-hosted @@ -33,45 +38,33 @@ Some services ship a Docker image you can run locally or on your own infrastruct docker run -p 3000:3000 -e CONCURRENT=4 ghcr.io/browserless/chromium ``` -Point the crawler at the local endpoint: - -```ts -remoteBrowser: { endpoint: 'ws://localhost:3000' } -``` +Point the pool at the local endpoint with `endpoint: 'ws://localhost:3000'`. ## Custom provider -For services with a session-create / session-release lifecycle, extend `RemoteBrowserProvider`. `connect()` runs once per browser launch and returns the connection URL plus an optional `context` object passed back to `release()`. +For services with a session-create / session-release lifecycle, extend `RemoteBrowserProvider` and pass the instance as the pool's `endpoint`. `connect()` runs once per browser launch and returns the connection URL plus an optional `context` object passed back to `release()`. `maxOpenBrowsers` set on the provider is adopted by the pool. {RemoteBrowserProviderSource} -`maxOpenBrowsers` caps the number of concurrent browsers — set it to the service's concurrent-session limit to avoid 429 errors. - ## Puppeteer -`PuppeteerCrawler` supports the same `remoteBrowser` option. 
For services that only expose a raw CDP endpoint without per-session setup, you can also use `connectOverCDPOptions` directly: +`PuppeteerCrawler` works the same way — set its `remoteBrowser` option; there is no plugin or pool to build. Puppeteer connects over CDP: {RemoteBrowserPuppeteerSource} -For Playwright, the analogous low-level options are `connectOptions` (Playwright's WebSocket protocol) and `connectOverCDPOptions` (CDP). `remoteBrowser`, `connectOptions`, and `connectOverCDPOptions` are mutually exclusive — set only one. - -## Pluggable pool (advanced) - -The options above configure remote browsers through the crawler's `launchContext`. For full control — sharing one remote pool across crawlers, or capping concurrent sessions independently of the crawler — construct a `RemoteBrowserPool` and pass it as the crawler's `browserPool` option. It implements `IBrowserPool` by wrapping a `BrowserPool` and enforcing `maxOpenBrowsers` at the pool level: `newPage()` waits for a free slot rather than relying on the crawler to gate new tasks. +For Playwright you can choose the protocol via the `remoteBrowser.connection.protocol` option: `'cdp'` (default, `connectOverCDP()`) or `'playwright'` (`connect()`, Playwright's own WebSocket protocol). -{RemoteBrowserPoolSource} +## Sharing a pool across crawlers -A pool passed this way is **not owned** by the crawler — the crawler never calls `destroy()` on it, so you control its lifecycle (and can reuse it across multiple crawlers). +`remoteBrowser` builds a pool the crawler owns and tears down. To share one remote pool across multiple crawlers, construct a `RemoteBrowserPool` yourself and pass it as the `browserPool` option instead — a pool supplied that way is never destroyed by the crawler, so you control its lifecycle. Use `remoteBrowser` *or* `browserPool`, not both. ## Limitations -- **`headless` is ignored.** The remote service controls headless mode; setting `headless` on the crawler or in `launchOptions` is dropped with a warning. 
-- **`launchOptions` cannot be combined with a remote browser.** Setting both throws — browser flags, executable path, viewport, and similar must be configured on the service side. -- **`useIncognitoPages` is forced to `true`** for Playwright remote connections. Playwright's `connect()` / `connectOverCDP()` don't accept persistent contexts. -- **`userDataDir` has no effect** — there's no local profile when the browser runs remotely. -- **`maxOpenBrowsers` enforcement only gates new task starts** when configured via `launchContext`. Direct `BrowserPool.newPage` calls can exceed it. Use the [pluggable pool](#pluggable-pool-advanced) to enforce the limit inside `newPage()` itself. +- **`headless` and `launchOptions` don't apply.** The remote service controls headless mode and browser flags; configure them on the service side. +- **`useIncognitoPages` is forced to `true`** for Playwright remote connections — `connect()` / `connectOverCDP()` don't accept persistent contexts. For state shared across requests, use the `SessionPool`. +- **`userDataDir` has no effect** — there's no local profile when the browser runs remotely. Use the service's persistence API (e.g. Browserbase Contexts, Steel Profiles). ## Further reading +- `RemoteBrowserPool` API reference - `RemoteBrowserProvider` API reference -- `RemoteBrowserConfig` API reference diff --git a/docs/guides/remote_browser_config.ts b/docs/guides/remote_browser_config.ts index cab24954009a..41f4e0542fe8 100644 --- a/docs/guides/remote_browser_config.ts +++ b/docs/guides/remote_browser_config.ts @@ -3,12 +3,12 @@ import { PlaywrightCrawler } from 'crawlee'; const token = process.env.BROWSERLESS_TOKEN!; const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: { - endpoint: `wss://production-sfo.browserless.io?token=${token}`, - // Optional — respect the service's concurrent session limit. - maxOpenBrowsers: 5, - }, + // Connect to a remote browser instead of launching locally. 
The crawler builds the right + // pool for its browser — you only supply the connection details. + remoteBrowser: { + endpoint: `wss://production-sfo.browserless.io?token=${token}`, + // Optional — respect the service's concurrent session limit. + maxOpenBrowsers: 5, }, async requestHandler({ page, request, log }) { const title = await page.title(); diff --git a/docs/guides/remote_browser_pool.ts b/docs/guides/remote_browser_pool.ts deleted file mode 100644 index 03605a55d380..000000000000 --- a/docs/guides/remote_browser_pool.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { BrowserPool, PlaywrightPlugin, RemoteBrowserPool } from '@crawlee/browser-pool'; -import { PlaywrightCrawler } from 'crawlee'; -import playwright, { type Page } from 'playwright'; - -const token = process.env.BROWSERLESS_TOKEN!; - -// Build a BrowserPool whose single plugin connects to the remote service… -// The generic is the page type the crawler works with (Playwright's `Page`). -const remoteBrowserPool = new RemoteBrowserPool({ - browserPool: new BrowserPool({ - browserPlugins: [ - new PlaywrightPlugin(playwright.chromium, { - remoteBrowser: { endpoint: `wss://production-sfo.browserless.io?token=${token}` }, - }), - ], - }), - // …and cap concurrent remote browsers. newPage() waits for a free slot instead - // of overshooting the service's session quota. - maxOpenBrowsers: 5, -}); - -// Pass the pool in directly. The crawler uses it instead of building its own and, -// because the pool is not owned by the crawler, never tears it down. 
-const crawler = new PlaywrightCrawler({ - browserPool: remoteBrowserPool, - async requestHandler({ page, request, log }) { - const title = await page.title(); - log.info(`${request.loadedUrl} — "${title}"`); - }, -}); - -await crawler.run(['https://crawlee.dev']); diff --git a/docs/guides/remote_browser_provider.ts b/docs/guides/remote_browser_provider.ts index b950093bcf33..45594d0fe4f4 100644 --- a/docs/guides/remote_browser_provider.ts +++ b/docs/guides/remote_browser_provider.ts @@ -33,8 +33,9 @@ class BrowserbaseProvider extends RemoteBrowserProvider<{ id: string }> { } const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new BrowserbaseProvider(), + // Pass the provider as the `endpoint`; the crawler's pool calls connect()/release() per browser. + remoteBrowser: { + endpoint: new BrowserbaseProvider(), }, async requestHandler({ page, request, log }) { const title = await page.title(); diff --git a/docs/guides/remote_browser_puppeteer.ts b/docs/guides/remote_browser_puppeteer.ts index 61952843cb24..2bfc14be3d65 100644 --- a/docs/guides/remote_browser_puppeteer.ts +++ b/docs/guides/remote_browser_puppeteer.ts @@ -3,11 +3,9 @@ import { PuppeteerCrawler } from 'crawlee'; const token = process.env.BROWSERLESS_TOKEN!; const crawler = new PuppeteerCrawler({ - launchContext: { - // Puppeteer connects to remote browsers via CDP. - connectOverCDPOptions: { - browserWSEndpoint: `wss://production-sfo.browserless.io?token=${token}`, - }, + // PuppeteerCrawler connects over CDP. Same `remoteBrowser` option, matching browser guaranteed. 
+ remoteBrowser: { + endpoint: `wss://production-sfo.browserless.io?token=${token}`, }, async requestHandler({ page, request, log }) { const title = await page.title(); diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index ad0968b2a714..3d4cc2ecb2aa 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -34,10 +34,11 @@ import type { BrowserPoolHooks, BrowserPoolOptions, CommonPage, + CrawlerRemoteBrowserOptions, InferBrowserPluginArray, LaunchContext, } from '@crawlee/browser-pool'; -import { BrowserPool } from '@crawlee/browser-pool'; +import { BrowserPool, RemoteBrowserPool } from '@crawlee/browser-pool'; import type { BatchAddRequestsResult, Cookie as CookieObject, IBrowserPool, ISession } from '@crawlee/types'; import type { RobotsTxtFile } from '@crawlee/utils'; import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils'; @@ -123,6 +124,19 @@ export interface BrowserCrawlerOptions< */ browserPool?: IBrowserPool; + /** + * Connect to a remote browser service (Browserbase, Browserless, Steel, …) instead of launching locally. + * + * The crawler builds a {@apilink RemoteBrowserPool} around its own browser plugin, so the connection is + * always for the right browser — there is no plugin to construct and no way to mismatch the pool with the + * crawler. Supply the connection details only: a static `endpoint` URL, a function returning one per launch, + * or a {@apilink RemoteBrowserProvider}. + * + * Mutually exclusive with `browserPool`. For sharing a remote pool across crawlers, construct a + * {@apilink RemoteBrowserPool} yourself and pass it as `browserPool` instead. + */ + remoteBrowser?: CrawlerRemoteBrowserOptions; + /** * Function that is called to process each request. 
* @@ -322,12 +336,11 @@ export abstract class BrowserCrawler< browserPool: IBrowserPool; /** - * Set when the crawler constructed its own {@apilink BrowserPool} (no `browserPool` option was provided). - * Holds the same instance as `browserPool`, but typed as the concrete class so the crawler can call - * lifecycle methods (`destroy`) that aren't part of {@apilink IBrowserPool}. A user-supplied pool is - * never owned and never torn down by the crawler. + * Set when the crawler constructed its own pool (a {@apilink BrowserPool}, or a {@apilink RemoteBrowserPool} + * built from the `remoteBrowser` option). Holds the same instance as `browserPool` but is the only reference + * the crawler tears down — a user-supplied `browserPool` is never owned and never destroyed by the crawler. */ - private ownedBrowserPool?: BrowserPool; + private ownedBrowserPool?: { destroy: () => Promise }; launchContext: BrowserLaunchContext; @@ -349,6 +362,7 @@ export abstract class BrowserCrawler< launchContext: ow.optional.object, headless: ow.optional.any(ow.boolean, ow.string), browserPool: ow.optional.object.validate(validators.browserPool), + remoteBrowser: ow.optional.object, browserPoolOptions: ow.optional.object, saveResponseCookies: ow.optional.boolean, proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration), @@ -368,6 +382,7 @@ export abstract class BrowserCrawler< saveResponseCookies = true, launchContext = {}, browserPool, + remoteBrowser, browserPoolOptions, preNavigationHooks = [], postNavigationHooks = [], @@ -422,6 +437,13 @@ export abstract class BrowserCrawler< this.saveResponseCookies = saveResponseCookies; + if (browserPool && remoteBrowser) { + throw new Error( + "Set at most one of 'browserPool' and 'remoteBrowser'. 
To share a remote pool across crawlers, " + + 'build a RemoteBrowserPool yourself and pass it as `browserPool`.', + ); + } + if (browserPool) { this.browserPool = browserPool; return; @@ -435,17 +457,25 @@ export abstract class BrowserCrawler< resolvedBrowserPoolOptions.useFingerprints = false; } - this.ownedBrowserPool = new BrowserPool({ - ...(resolvedBrowserPoolOptions as any), - }); - - // Read maxOpenBrowsers from the remote browser config and apply it to the pool. - const remoteMaxBrowsers = this.ownedBrowserPool.browserPlugins[0]?.remoteBrowser?.maxOpenBrowsers; - if (remoteMaxBrowsers) { - this.ownedBrowserPool.maxOpenBrowsers = remoteMaxBrowsers; + if (remoteBrowser) { + // The crawler already built the right plugin for its browser — hand it to a RemoteBrowserPool so the + // remote connection is always for the matching browser (no plugin to construct, no way to mismatch). + const { browserPlugins, ...remoteBrowserPoolOptions } = resolvedBrowserPoolOptions; + const remotePool = new RemoteBrowserPool({ + browserPlugins: browserPlugins as BrowserPlugin[], + ...remoteBrowser, + browserPoolOptions: remoteBrowserPoolOptions as any, + }); + this.ownedBrowserPool = remotePool; + this.browserPool = remotePool as IBrowserPool; + return; } - this.browserPool = this.ownedBrowserPool as IBrowserPool; + const ownedBrowserPool = new BrowserPool({ + ...(resolvedBrowserPoolOptions as any), + }); + this.ownedBrowserPool = ownedBrowserPool; + this.browserPool = ownedBrowserPool as IBrowserPool; } protected override buildContextPipeline(): ContextPipeline< @@ -724,20 +754,6 @@ export abstract class BrowserCrawler< * Function for cleaning up after all requests are processed. * @ignore */ - protected override async _isTaskReadyFunction(): Promise { - // Don't start new tasks if browser pool is at its limit and no active browser has capacity. - // AutoscaledPool will retry automatically when a browser closes and frees a slot. 
- if ( - this.ownedBrowserPool && - !this.ownedBrowserPool.hasFreeBrowserSlot() && - !this.ownedBrowserPool.hasActiveBrowserWithFreeCapacity() - ) { - return false; - } - - return super._isTaskReadyFunction(); - } - override async teardown(): Promise { await this.ownedBrowserPool?.destroy(); await super.teardown(); diff --git a/packages/browser-crawler/src/internals/browser-launcher.ts b/packages/browser-crawler/src/internals/browser-launcher.ts index 431a3e14d078..bfac152053e4 100644 --- a/packages/browser-crawler/src/internals/browser-launcher.ts +++ b/packages/browser-crawler/src/internals/browser-launcher.ts @@ -113,7 +113,6 @@ export abstract class BrowserLauncher< userDataDir: ow.optional.string, launchOptions: ow.optional.object, userAgent: ow.optional.string, - remoteBrowser: ow.optional.object, }; static requireLauncherOrThrow(launcher: string, apifyImageName: string): T { diff --git a/packages/browser-pool/src/abstract-classes/browser-controller.ts b/packages/browser-pool/src/abstract-classes/browser-controller.ts index 9544b30b40a0..7c5d407416b2 100644 --- a/packages/browser-pool/src/abstract-classes/browser-controller.ts +++ b/packages/browser-pool/src/abstract-classes/browser-controller.ts @@ -231,19 +231,18 @@ export abstract class BrowserController< } /** - * Calls `remoteBrowser.release()` if configured. Safe to call multiple times — - * clears the endpoint after the first call so release only fires once. + * Releases the remote browser session (if this controller serves a remote browser) via the plugin's + * {@apilink RemoteConnection}. Safe to call multiple times — the token is cleared after the first call + * and the pool's registry also dedupes, so `release()` fires at most once across close()/kill(). 
*/ private async _releaseRemoteBrowser(): Promise { - const endpoint = this.launchContext?._resolvedRemoteEndpoint as string | undefined; - if (!endpoint) return; + const token = this.launchContext?._remoteToken; + if (token === undefined) return; - const context = this.launchContext._remoteContext as Record | undefined; + // Clear so release only fires once (close() schedules kill() after a timeout). + this.launchContext._remoteToken = undefined; - // Clear so release only fires once (close() schedules kill() after timeout) - this.launchContext.extend({ _resolvedRemoteEndpoint: undefined, _remoteContext: undefined }); - - await this.browserPlugin._callRelease(endpoint, context); + await this.browserPlugin.remoteConnection?.release(token); } /** diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 6fd42f36e6cd..22d14a1dee4d 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -4,8 +4,8 @@ import merge from 'lodash.merge'; import type { LaunchContextOptions } from '../launch-context.js'; import { LaunchContext } from '../launch-context.js'; -import { RemoteBrowserProvider } from '../remote-browser-provider.js'; -import type { UnwrapPromise } from '../utils.js'; +import type { RemoteConnection, RemoteConnectionParameters } from '../remote-browser-pool.js'; +import { sanitizeEndpointForLog, type UnwrapPromise } from '../utils.js'; import type { BrowserController } from './browser-controller.js'; /** @@ -45,65 +45,6 @@ export interface CommonPage { url(): string | Promise; } -/** - * Return type for dynamic endpoint functions that need to pass session - * metadata to the `release()` callback. - */ -export interface RemoteBrowserEndpointResult { - /** The browser endpoint URL to connect to. */ - url: string; - /** Opaque metadata passed back to `release()` — e.g. session IDs, API tokens. 
*/ - context?: Record; -} - -/** - * Configuration for connecting to a remote browser service. - * - * **Static endpoint (e.g. Browserless):** - * ```typescript - * { endpoint: 'wss://browserless.io?token=xxx' } - * ``` - * - * **Dynamic endpoint with lifecycle (e.g. Browserbase):** - * ```typescript - * { - * endpoint: async () => { - * const session = await createSession(); - * return { url: session.connectUrl, context: { id: session.id } }; - * }, - * release: async ({ context }) => { - * await releaseSession(context.id); - * }, - * } - * ``` - */ -export interface RemoteBrowserConfig { - /** - * The browser endpoint URL, or an async function that returns one. - * When a function is provided, it is called once per browser launch (not per page). - * - * Can return a plain string or an object with `url` and optional `context` - * that will be forwarded to `release()`. - */ - endpoint: - | string - | ((options?: { - proxyUrl?: string; - }) => string | RemoteBrowserEndpointResult | Promise); - /** - * Optional cleanup function called when the browser closes, crashes, or the pool is destroyed. - * Receives the resolved endpoint URL and the `context` object returned by `endpoint()`. - * Errors are caught and logged as warnings — they never crash the crawler. - */ - release?: (info: { endpoint: string; context?: Record }) => void | Promise; - /** - * Maximum number of browsers that can be open at the same time. - * When the limit is reached, the crawler waits for a browser to close before launching a new one. - * Set this to your remote service's concurrent session limit to avoid 429 errors. - */ - maxOpenBrowsers?: number; -} - export interface BrowserPluginOptions { /** * Options that will be passed down to the automation library. E.g. @@ -141,15 +82,6 @@ export interface BrowserPluginOptions { * This is useful when using HTTPS proxies with self-signed certificates. 
*/ ignoreProxyCertificate?: boolean; - /** - * Configuration for connecting to a remote browser service. - * When set, the plugin connects to a remote browser instead of launching a local one. - * - * Accepts either a {@link RemoteBrowserConfig} object or a {@link RemoteBrowserProvider} instance. - * - * Mutually exclusive with `connectOverCDPOptions` / `connectOptions` — setting more than one throws. - */ - remoteBrowser?: RemoteBrowserConfig | RemoteBrowserProvider; } export interface CreateLaunchContextOptions< @@ -185,7 +117,18 @@ export abstract class BrowserPlugin< browserPerProxy?: boolean; ignoreProxyCertificate?: boolean; - remoteBrowser?: RemoteBrowserConfig; + + /** + * Set by {@apilink RemoteBrowserPool} when this plugin connects to a remote browser service instead of + * launching locally. Holds the bridge the plugin uses to resolve endpoints and release sessions; all + * remote-session policy lives in the pool, not here. + * + * @internal + */ + remoteConnection?: RemoteConnection; + + /** Static connect() parameters for a remote connection (protocol, headers, …). @internal */ + remoteConnectionParameters?: RemoteConnectionParameters; constructor(library: Library, options: BrowserPluginOptions = {}) { const { @@ -195,7 +138,6 @@ export abstract class BrowserPlugin< useIncognitoPages = false, browserPerProxy = false, ignoreProxyCertificate = false, - remoteBrowser, } = options; this.log = serviceLocator.getLogger().child({ prefix: 'BrowserPool' }); @@ -206,55 +148,53 @@ export abstract class BrowserPlugin< this.useIncognitoPages = useIncognitoPages; this.browserPerProxy = browserPerProxy; this.ignoreProxyCertificate = ignoreProxyCertificate; - - // Normalize RemoteBrowserProvider instances into a plain RemoteBrowserConfig - // so all downstream code only deals with the config shape. 
- if (remoteBrowser instanceof RemoteBrowserProvider) { - const provider = remoteBrowser; - this.remoteBrowser = { - endpoint: (options) => provider.connect(options), - release: ({ context }) => provider.release(context as any), - maxOpenBrowsers: provider.maxOpenBrowsers, - }; - } else { - this.remoteBrowser = remoteBrowser; - } } - /** Resolves the remote browser endpoint from a string or function. Returns { url, context }. */ - protected async _resolveRemoteEndpoint(options?: { proxyUrl?: string }): Promise { - const { endpoint } = this.remoteBrowser!; - const result = typeof endpoint === 'function' ? await endpoint(options) : endpoint; - if (typeof result === 'string') { - if (!result) throw new Error('remoteBrowser.endpoint resolved to an empty string.'); - return { url: result }; - } - if (!result?.url) { - throw new Error("remoteBrowser.endpoint() must return a URL string or an object with a non-empty 'url'."); - } - return result; + /** + * Configures this plugin to connect to a remote browser using the given {@apilink RemoteConnection}. + * Called by {@apilink RemoteBrowserPool}; subclasses may override to apply library-specific defaults + * (e.g. forcing incognito pages). + * + * @internal + */ + useRemoteConnection(connection: RemoteConnection, parameters: RemoteConnectionParameters = {}): void { + this.remoteConnection = connection; + this.remoteConnectionParameters = parameters; } - /** @internal Called by BrowserController on browser close/kill. */ - async _callRelease(endpoint: string, context?: Record): Promise { + /** + * Resolves a remote endpoint via the injected {@apilink RemoteConnection}, stores the session token on + * the launch context (so the controller can release it on close), and runs the library-specific `connect`. + * On failure the session is released and the error is wrapped in a {@apilink BrowserLaunchError}. 
+ * + * Subclasses implement only the `connect` callback — the resolve / token / release / error-wrap scaffolding + * lives here so it stays identical across plugins. + */ + protected async _connectToRemoteBrowser( + launchContext: LaunchContext, + connect: (url: string) => Promise, + ): Promise { + const connection = this.remoteConnection!; + + let url: string; + let token: number; try { - await this.remoteBrowser?.release?.({ endpoint, context }); - } catch (err) { - this.log.warning('remoteBrowser.release() failed.', { error: (err as Error)?.message }); + ({ url, token } = await connection.resolve({ proxyUrl: launchContext.proxyUrl })); + } catch (cause) { + throw new BrowserLaunchError('Failed to resolve the remote browser endpoint.​', { cause }); } - } - /** Strips credentials from a URL for safe logging. */ - protected _sanitizeEndpointForLog(endpoint: string): string { + launchContext._remoteToken = token; + try { - const url = new URL(endpoint); - if (url.username || url.password) { - url.username = '***'; - url.password = '***'; - } - return url.toString(); - } catch { - return ''; + return await connect(url); + } catch (cause) { + await connection.release(token); + throw new BrowserLaunchError( + `Failed to connect to remote browser at "${sanitizeEndpointForLog(url)}". 
` + + 'Check that the endpoint is reachable and accepts the configured protocol.​', + { cause }, + ); } } @@ -275,7 +215,7 @@ export abstract class BrowserPlugin< userDataDir = this.userDataDir, browserPerProxy = this.browserPerProxy, ignoreProxyCertificate = this.ignoreProxyCertificate, - isRemote, + isRemote = !!this.remoteConnection, } = options; return new LaunchContext({ @@ -316,34 +256,6 @@ export abstract class BrowserPlugin< const { proxyUrl, launchOptions } = launchContext; - if (proxyUrl && launchContext.isRemote) { - if (this.remoteBrowser) { - if (typeof this.remoteBrowser.endpoint === 'function') { - this.log.info( - 'proxyUrl is set and will be passed to the remoteBrowser.endpoint() function. ' + - "Make sure your endpoint() handles it (e.g. passes it to the service's proxy API).", - ); - } else { - this.log.warning( - 'proxyUrl is set but will be ignored because remoteBrowser.endpoint is a static string. ' + - 'Switch endpoint to a function `(opts) => …` to receive proxyUrl, or configure the proxy through the remote service.', - ); - } - } else { - this.log.warning( - 'proxyUrl is set but will be ignored when using connectOptions/connectOverCDPOptions. ' + - 'Configure the proxy through the remote service, or switch to `remoteBrowser` with an endpoint() that handles proxyUrl.', - ); - } - } - - if (launchContext.userDataDir && launchContext.isRemote) { - this.log.warning( - 'userDataDir is set but will be ignored for remote browser connections. ' + - "Use your remote browser service's persistence API instead (e.g. 
Browserbase Contexts, Steel Profiles).", - ); - } - if (proxyUrl && !launchContext.isRemote) { await this._addProxyToLaunchOptions(launchContext); } diff --git a/packages/browser-pool/src/index.ts b/packages/browser-pool/src/index.ts index 087d717a45e5..9f3b49455248 100644 --- a/packages/browser-pool/src/index.ts +++ b/packages/browser-pool/src/index.ts @@ -47,15 +47,20 @@ export type { CommonLibrary, BrowserPluginOptions, CreateLaunchContextOptions, - RemoteBrowserConfig, - RemoteBrowserEndpointResult, } from './abstract-classes/browser-plugin.js'; export { BrowserPlugin, BrowserLaunchError, DEFAULT_USER_AGENT } from './abstract-classes/browser-plugin.js'; export type { LaunchContextOptions } from './launch-context.js'; export { LaunchContext } from './launch-context.js'; export { RemoteBrowserProvider } from './remote-browser-provider.js'; export { RemoteBrowserPool } from './remote-browser-pool.js'; -export type { RemoteBrowserPoolOptions } from './remote-browser-pool.js'; +export type { + RemoteBrowserPoolOptions, + CrawlerRemoteBrowserOptions, + RemoteBrowserEndpoint, + ResolvedRemoteEndpoint, + RemoteConnection, + RemoteConnectionParameters, +} from './remote-browser-pool.js'; export type { InferBrowserPluginArray, UnwrapPromise } from './utils.js'; export { anonymizeProxySugar, type AnonymizeProxySugarOptions } from './anonymize-proxy.js'; export type { IBrowserPool, NewPageOptions } from '@crawlee/types'; diff --git a/packages/browser-pool/src/launch-context.ts b/packages/browser-pool/src/launch-context.ts index 19a26e86b5d1..4bbec3236835 100644 --- a/packages/browser-pool/src/launch-context.ts +++ b/packages/browser-pool/src/launch-context.ts @@ -84,6 +84,14 @@ export class LaunchContext< private readonly _reservedFieldNames = [...Reflect.ownKeys(this), 'extend']; fingerprint?: BrowserFingerprintWithHeaders; + + /** + * Token identifying the remote browser session this context connected to, set by the plugin and read by + * the {@apilink BrowserController} 
to release the session on close. Only present for remote connections. + * @internal + */ + _remoteToken?: number; + [K: PropertyKey]: unknown; constructor(options: LaunchContextOptions) { diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index 6aad20a0a681..bd71092fcf0f 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -1,44 +1,17 @@ import fs from 'node:fs'; -import type { Browser as PlaywrightBrowser, BrowserType, ConnectOverCDPOptions, ConnectOptions } from 'playwright'; +import type { Browser as PlaywrightBrowser, BrowserType } from 'playwright'; -import { - BrowserLaunchError, - BrowserPlugin, - type BrowserPluginOptions, - type CreateLaunchContextOptions, -} from '../abstract-classes/browser-plugin.js'; +import { BrowserPlugin } from '../abstract-classes/browser-plugin.js'; import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { createProxyServerForContainers } from '../container-proxy-server.js'; import type { LaunchContext } from '../launch-context.js'; import { getLocalProxyAddress } from '../proxy-server.js'; +import type { RemoteConnection, RemoteConnectionParameters } from '../remote-browser-pool.js'; import type { SafeParameters } from '../utils.js'; import { PlaywrightBrowser as PlaywrightBrowserWithPersistentContext } from './playwright-browser.js'; import { PlaywrightController } from './playwright-controller.js'; -/** - * Options for connecting to a remote browser via CDP. - * Mirrors `browserType.connectOverCDP(endpointURL, options?)`. - */ -export interface PlaywrightConnectOverCDPOptions extends ConnectOverCDPOptions { - /** The CDP endpoint URL to connect to (required). Overrides the deprecated optional `endpointURL` from Playwright. */ - endpointURL: string; -} - -/** - * Options for connecting to a remote browser via WebSocket. 
- * Mirrors `browserType.connect(wsEndpoint, options?)`. - */ -export interface PlaywrightConnectOptions extends ConnectOptions { - /** The WebSocket endpoint URL to connect to (required). */ - wsEndpoint: string; -} - -export interface PlaywrightPluginOptions extends BrowserPluginOptions[0]> { - connectOptions?: PlaywrightConnectOptions; - connectOverCDPOptions?: PlaywrightConnectOverCDPOptions; -} - export class PlaywrightPlugin extends BrowserPlugin< BrowserType, SafeParameters[0], @@ -47,119 +20,34 @@ export class PlaywrightPlugin extends BrowserPlugin< private _browserVersion?: string; _containerProxyServer?: Awaited>; - connectOptions?: PlaywrightConnectOptions; - connectOverCDPOptions?: PlaywrightConnectOverCDPOptions; - - constructor(library: BrowserType, options: PlaywrightPluginOptions = {}) { - const { connectOptions, connectOverCDPOptions, ...baseOptions } = options; - - const remoteSourceCount = [baseOptions.remoteBrowser, connectOptions, connectOverCDPOptions].filter( - (v) => v != null, - ).length; - if (remoteSourceCount > 1) { - throw new Error( - "Set at most one of 'remoteBrowser', 'connectOptions', 'connectOverCDPOptions' — " + - 'these options are mutually exclusive.', + /** + * Playwright remote connections only support incognito pages — `connect()` / `connectOverCDP()` don't + * accept persistent contexts. Force it on (and inform the user) when wired for a remote connection. + */ + override useRemoteConnection(connection: RemoteConnection, parameters: RemoteConnectionParameters = {}): void { + super.useRemoteConnection(connection, parameters); + + if (!this.useIncognitoPages) { + this.log.info( + 'Remote Playwright connection — useIncognitoPages forced to true. 
' + + 'Pages will not share cookies/storage between each other; use the SessionPool for shared state.', ); } - - if (connectOverCDPOptions && !connectOverCDPOptions.endpointURL) { - throw new Error("'connectOverCDPOptions.endpointURL' must be a non-empty string."); - } - - if (connectOptions && !connectOptions.wsEndpoint) { - throw new Error("'connectOptions.wsEndpoint' must be a non-empty string."); - } - - super(library, baseOptions); - this.connectOptions = connectOptions; - this.connectOverCDPOptions = connectOverCDPOptions; - - const isRemoteConnection = this.remoteBrowser || this.connectOptions || this.connectOverCDPOptions; - if (isRemoteConnection) { - if (options.useIncognitoPages === false) { - this.log.warning( - 'Remote Playwright connections only support useIncognitoPages: true. ' + - 'The setting has been overridden — pages will not share cookies/storage. ' + - 'For state sharing across requests, use the SessionPool.', - ); - } else if (options.useIncognitoPages === undefined) { - this.log.info( - 'Remote Playwright connection detected — useIncognitoPages forced to true. ' + - 'Pages will not share cookies/storage between each other.', - ); - } - this.useIncognitoPages = true; - } - } - - override createLaunchContext(options: CreateLaunchContextOptions = {}): LaunchContext { - return super.createLaunchContext({ - ...options, - isRemote: options.isRemote ?? 
!!(this.remoteBrowser || this.connectOptions || this.connectOverCDPOptions), - }); + this.useIncognitoPages = true; } protected async _launch(launchContext: LaunchContext): Promise { - if (this.remoteBrowser) { - let url: string; - let context: Record | undefined; - try { - const result = await this._resolveRemoteEndpoint({ proxyUrl: launchContext.proxyUrl }); - url = result.url; - context = result.context; - } catch (cause) { - throw new BrowserLaunchError( - 'Failed to resolve remote browser endpoint from remoteBrowser.endpoint() function.\u200b', - { cause }, - ); - } - - launchContext.extend({ _resolvedRemoteEndpoint: url, _remoteContext: context }); - - try { + if (this.remoteConnection) { + return this._connectToRemoteBrowser(launchContext, async (url) => { + const connectOptions = (this.remoteConnectionParameters?.connectOptions ?? {}) as any; + if (this.remoteConnectionParameters?.protocol === 'playwright') { + this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); + return this.library.connect(url, connectOptions); + } this.log.info('Connecting to remote browser via connectOverCDP.'); - return await this.library.connectOverCDP(url, {}); - } catch (cause) { - await this._callRelease(url, context); - throw new BrowserLaunchError( - `Failed to connect to remote browser at "${this._sanitizeEndpointForLog(url)}" via CDP. ` + - 'Check that the endpoint is reachable.\u200b', - { cause }, - ); - } + return this.library.connectOverCDP(url, connectOptions); + }); } - - // Remote CDP connection — skip all local launch/proxy logic - if (this.connectOverCDPOptions) { - const { endpointURL, ...options } = this.connectOverCDPOptions; - this.log.info('Connecting to remote browser via connectOverCDP.'); - try { - return await this.library.connectOverCDP(endpointURL, options); - } catch (cause) { - throw new BrowserLaunchError( - `Failed to connect to remote browser via CDP at "${this._sanitizeEndpointForLog(endpointURL)}". 
` + - 'Check that the endpoint is reachable and the browser is accepting CDP connections.\u200b', - { cause }, - ); - } - } - - // Remote Playwright WebSocket connection — skip all local launch/proxy logic - if (this.connectOptions) { - const { wsEndpoint, ...options } = this.connectOptions; - this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); - try { - return await this.library.connect(wsEndpoint, options); - } catch (cause) { - throw new BrowserLaunchError( - `Failed to connect to remote browser via WebSocket at "${this._sanitizeEndpointForLog(wsEndpoint)}". ` + - 'Check that the endpoint is reachable and the Playwright server is running.\u200b', - { cause }, - ); - } - } - const { launchOptions, useIncognitoPages, userDataDir, proxyUrl } = launchContext; let browser: PlaywrightBrowser; diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 308dab48a728..6e18b22b21cb 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -4,78 +4,34 @@ import type { Dictionary } from '@crawlee/types'; import type Puppeteer from 'puppeteer'; import type * as PuppeteerTypes from 'puppeteer'; -import { - BrowserLaunchError, - BrowserPlugin, - type BrowserPluginOptions, - type CreateLaunchContextOptions, -} from '../abstract-classes/browser-plugin.js'; +import { BrowserPlugin } from '../abstract-classes/browser-plugin.js'; import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { LaunchContext } from '../launch-context.js'; +import type { RemoteConnection, RemoteConnectionParameters } from '../remote-browser-pool.js'; import { noop } from '../utils.js'; import type { PuppeteerNewPageOptions } from './puppeteer-controller.js'; import { PuppeteerController } from './puppeteer-controller.js'; const PROXY_SERVER_ARG = '--proxy-server='; -/** - * Options for connecting to a remote 
browser via Puppeteer. - * Flat object matching Puppeteer's `ConnectOptions`. - */ -export type PuppeteerConnectOverCDPOptions = Parameters<(typeof Puppeteer)['connect']>[0]; - -export interface PuppeteerPluginOptions extends BrowserPluginOptions { - connectOverCDPOptions?: PuppeteerConnectOverCDPOptions; -} - export class PuppeteerPlugin extends BrowserPlugin< typeof Puppeteer, PuppeteerTypes.LaunchOptions, PuppeteerTypes.Browser, PuppeteerNewPageOptions > { - connectOverCDPOptions?: PuppeteerConnectOverCDPOptions; - - constructor(library: typeof Puppeteer, options: PuppeteerPluginOptions = {}) { - const { connectOverCDPOptions, ...baseOptions } = options; - - if (baseOptions.remoteBrowser && connectOverCDPOptions) { - throw new Error( - "Set at most one of 'remoteBrowser', 'connectOverCDPOptions' — these options are mutually exclusive. " + - 'Pick a single remote connection source.', - ); - } + /** Pages share cookies/storage on the remote browser (Puppeteer defaults to non-incognito). 
*/ + override useRemoteConnection(connection: RemoteConnection, parameters: RemoteConnectionParameters = {}): void { + super.useRemoteConnection(connection, parameters); - if (connectOverCDPOptions && !connectOverCDPOptions.browserWSEndpoint && !connectOverCDPOptions.browserURL) { - throw new Error("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'."); - } - - super(library, baseOptions); - this.connectOverCDPOptions = connectOverCDPOptions; - - const isRemoteConnection = this.remoteBrowser || this.connectOverCDPOptions; - if (isRemoteConnection && options.useIncognitoPages === undefined) { + if (!this.useIncognitoPages) { this.log.info( - 'Remote browser detected — pages will share cookies and storage ' + - 'on the remote browser instance (useIncognitoPages defaults to false).', + 'Remote Puppeteer connection — pages will share cookies and storage on the remote ' + + 'browser instance (useIncognitoPages defaults to false).', ); } } - override createLaunchContext( - options: CreateLaunchContextOptions< - typeof Puppeteer, - PuppeteerTypes.LaunchOptions, - PuppeteerTypes.Browser, - PuppeteerNewPageOptions - > = {}, - ): LaunchContext { - return super.createLaunchContext({ - ...options, - isRemote: options.isRemote ?? 
!!(this.remoteBrowser || this.connectOverCDPOptions), - }); - } - protected async _launch( launchContext: LaunchContext< typeof Puppeteer, @@ -99,47 +55,12 @@ export class PuppeteerPlugin extends BrowserPlugin< let browser: PuppeteerTypes.Browser; - if (this.remoteBrowser) { - let url: string; - let context: Record | undefined; - try { - const result = await this._resolveRemoteEndpoint({ proxyUrl: launchContext.proxyUrl }); - url = result.url; - context = result.context; - } catch (cause) { - throw new BrowserLaunchError( - 'Failed to resolve remote browser endpoint from remoteBrowser.endpoint() function.\u200b', - { cause }, - ); - } - - launchContext.extend({ _resolvedRemoteEndpoint: url, _remoteContext: context }); - - this.log.info('Connecting to remote browser via connect (CDP).'); - try { - browser = await this.library.connect({ browserWSEndpoint: url }); - } catch (cause) { - await this._callRelease(url, context); - throw new BrowserLaunchError( - `Failed to connect to remote browser at "${this._sanitizeEndpointForLog(url)}". ` + - 'Check that the endpoint is reachable and the browser is accepting CDP connections.\u200b', - { cause }, - ); - } - } else if (this.connectOverCDPOptions) { - // Remote CDP connection — skip local launch/proxy/headless logic - const endpoint = this.connectOverCDPOptions.browserWSEndpoint || this.connectOverCDPOptions.browserURL!; - this.log.info('Connecting to remote browser via connect (CDP).'); - try { - browser = await this.library.connect(this.connectOverCDPOptions); - } catch (cause) { - const safeEndpoint = this._sanitizeEndpointForLog(endpoint); - throw new BrowserLaunchError( - `Failed to connect to remote browser via CDP at "${safeEndpoint}". 
` + - 'Check that the endpoint is reachable and the browser is accepting CDP connections.\u200b', - { cause }, - ); - } + if (this.remoteConnection) { + browser = await this._connectToRemoteBrowser(launchContext, async (url) => { + const connectOptions = this.remoteConnectionParameters?.connectOptions ?? {}; + this.log.info('Connecting to remote browser via connect (CDP).'); + return this.library.connect({ ...connectOptions, browserWSEndpoint: url }); + }); } else { const { launchOptions, userDataDir, experimentalContainers } = launchContext; @@ -215,7 +136,7 @@ export class PuppeteerPlugin extends BrowserPlugin< browser.on('targetcreated', targetCreatedHandler); // Clean up the listener when a remote browser disconnects to prevent leaks - if (this.remoteBrowser || this.connectOverCDPOptions) { + if (this.remoteConnection) { browser.once('disconnected', () => { browser.off('targetcreated', targetCreatedHandler); }); @@ -247,8 +168,7 @@ export class PuppeteerPlugin extends BrowserPlugin< if (useIncognitoPages) { // Skip proxy setup for remote connections — proxy is managed by the remote service. - const effectiveProxyUrl = - this.remoteBrowser || this.connectOverCDPOptions ? undefined : proxyUrl; + const effectiveProxyUrl = this.remoteConnection ? undefined : proxyUrl; const [anonymizedProxyUrl, close] = effectiveProxyUrl ? 
await anonymizeProxySugar(effectiveProxyUrl, undefined, undefined, { ignoreProxyCertificate, diff --git a/packages/browser-pool/src/remote-browser-pool.ts b/packages/browser-pool/src/remote-browser-pool.ts index 1223672e77eb..1d27353593cc 100644 --- a/packages/browser-pool/src/remote-browser-pool.ts +++ b/packages/browser-pool/src/remote-browser-pool.ts @@ -1,60 +1,192 @@ +import { type CrawleeLogger, serviceLocator } from '@crawlee/core'; import type { IBrowserPool, NewPageOptions, PageState } from '@crawlee/types'; -import type { BrowserPool } from './browser-pool.js'; +import type { BrowserPlugin } from './abstract-classes/browser-plugin.js'; +import { BrowserPool } from './browser-pool.js'; +import type { BrowserPoolHooks, BrowserPoolOptions } from './browser-pool.js'; import { BROWSER_POOL_EVENTS } from './events.js'; +import { RemoteBrowserProvider } from './remote-browser-provider.js'; + +/** + * The result of resolving a remote browser endpoint: the URL to connect to plus an optional opaque + * `context` object that is handed back to `release`. + */ +export interface ResolvedRemoteEndpoint { + /** The browser endpoint URL to connect to. */ + url: string; + /** Opaque metadata passed back to `release()` — e.g. session IDs, API tokens. */ + context?: Record; +} + +/** + * A remote browser endpoint: either a static URL string, or a function called once per browser launch + * that returns a URL (optionally with a `context` for `release`). + * + * The function receives the `proxyUrl` resolved by Crawlee's proxy configuration for the launch, so it + * can forward it to the remote service's proxy API. + */ +export type RemoteBrowserEndpoint = + | string + | ((options?: { proxyUrl?: string }) => string | ResolvedRemoteEndpoint | Promise); + +/** + * The bridge a {@apilink RemoteBrowserPool} injects into a {@apilink BrowserPlugin} so the plugin can + * connect to a remote browser without owning any remote-session policy. 
+ * + * The plugin only knows how to make the library-specific `connect()` call; everything else — resolving + * the endpoint, calling the user's `release()`, and guaranteeing release fires at most once — lives in + * the pool. The plugin calls {@apilink RemoteConnection.resolve|resolve} before connecting, stores the + * returned `token` on its launch context, and the controller later calls + * {@apilink RemoteConnection.release|release} with that token when the browser closes. + * + * @internal + */ +export interface RemoteConnection { + /** Resolves the endpoint for a single browser launch. The `token` identifies the session for release. */ + resolve(options?: { proxyUrl?: string }): Promise<{ url: string; token: number }>; + /** Releases the remote session for `token`. Idempotent — safe to call from both `close()` and `kill()`. */ + release(token: number): Promise; +} + +/** + * Owns the lifecycle of remote browser sessions for a single {@apilink RemoteBrowserPool}: endpoint + * resolution, the user's `release()` callback, and a release-at-most-once guarantee. Implements + * {@apilink RemoteConnection} so it can be injected into a plugin. + */ +class RemoteSessionRegistry implements RemoteConnection { + private readonly sessions = new Map< + number, + { url: string; context?: Record; released: boolean } + >(); + private nextToken = 0; + + constructor( + private readonly endpoint: RemoteBrowserEndpoint, + private readonly onRelease: + | ((info: { endpoint: string; context?: Record }) => unknown) + | undefined, + private readonly log: CrawleeLogger, + ) {} + + async resolve(options?: { proxyUrl?: string }): Promise<{ url: string; token: number }> { + const resolved = typeof this.endpoint === 'function' ? 
await this.endpoint(options) : this.endpoint; + + let result: ResolvedRemoteEndpoint; + if (typeof resolved === 'string') { + if (!resolved) throw new Error('Remote browser endpoint resolved to an empty string.'); + result = { url: resolved }; + } else if (!resolved?.url) { + throw new Error("Remote browser endpoint() must return a URL string or an object with a non-empty 'url'."); + } else { + result = resolved; + } + + const token = this.nextToken++; + this.sessions.set(token, { url: result.url, context: result.context, released: false }); + return { url: result.url, token }; + } + + async release(token: number): Promise { + const session = this.sessions.get(token); + // Release at most once per session — guards a close()/teardown race (the `released` flag is set + // synchronously before the awaited onRelease, so releaseAll() can't double-fire an in-flight release). + if (!session || session.released) return; + session.released = true; + + try { + await this.onRelease?.({ endpoint: session.url, context: session.context }); + } catch (err) { + this.log.warning('Remote browser release() failed.', { error: (err as Error)?.message }); + } finally { + this.sessions.delete(token); + } + } + + /** Releases every session that is still open. Called on pool teardown so no remote session leaks. */ + async releaseAll(): Promise { + await Promise.all([...this.sessions.keys()].map(async (token) => this.release(token))); + } +} + +/** + * Per-plugin remote connection parameters, passed to {@apilink BrowserPlugin.useRemoteConnection}. + * The endpoint is supplied per-launch via {@apilink RemoteConnection}; these are the static connect() + * parameters (protocol, headers, timeouts, …). + */ +export interface RemoteConnectionParameters { + /** + * Playwright only: which protocol to connect with. `'cdp'` uses `connectOverCDP()` (the default), + * `'playwright'` uses `connect()` (Playwright's own WebSocket protocol). Ignored by Puppeteer. 
+ */ + protocol?: 'cdp' | 'playwright'; + /** Extra options forwarded to the library `connect()` / `connectOverCDP()` call (endpoint excluded). */ + connectOptions?: Record; +} export interface RemoteBrowserPoolOptions { /** - * The underlying {@apilink BrowserPool} that performs the actual remote connections. Configure it - * with a single plugin set up for a remote connection (`remoteBrowser`, `connectOptions`, or - * `connectOverCDPOptions`). + * The browser plugin(s) used to connect to the remote service — e.g. `new PlaywrightPlugin(playwright.chromium)` + * or `new PuppeteerPlugin(puppeteer)`. The pool configures them for remote connection; do not set a local + * `launchOptions` on them. */ - browserPool: BrowserPool; + browserPlugins: BrowserPlugin[]; /** - * Maximum number of remote browsers that may be open at the same time. When the limit is reached, - * {@apilink RemoteBrowserPool.newPage|newPage} waits until a browser closes (or an existing one frees - * a page slot) before opening a new page. Set this to your remote service's concurrent-session limit - * to avoid `429` errors. - * - * When omitted, the wrapped pool's own `maxOpenBrowsers` is used (defaults to `Infinity`, i.e. no limit). + * The remote browser endpoint: a static URL, a function returning one per launch, or a + * {@apilink RemoteBrowserProvider} instance encapsulating a session create/release lifecycle. */ - maxOpenBrowsers?: number; + endpoint: RemoteBrowserEndpoint | RemoteBrowserProvider; + /** + * Cleanup callback invoked when a browser closes, crashes, or the pool is destroyed. Receives the + * `context` returned by a function endpoint. Errors are caught and logged. Ignored when `endpoint` + * is a {@apilink RemoteBrowserProvider} (its own `release()` is used instead). + */ + release?: (info: { endpoint: string; context?: Record }) => unknown; /** - * Fallback poll interval, in milliseconds, used while waiting for a free browser slot. 
The wait is - * primarily event-driven (it wakes on browser/page close), so this only bounds how long it can sleep - * if no event fires. - * - * @default 500 + * Maximum number of remote browsers open at once. When reached, {@apilink RemoteBrowserPool.newPage|newPage} + * waits for a browser to close before connecting a new one. Set it to your service's concurrent-session limit + * to avoid `429` errors. Defaults to the {@apilink RemoteBrowserProvider.maxOpenBrowsers|provider's value}, or + * `Infinity`. */ + maxOpenBrowsers?: number; + /** Static connect() parameters (Playwright protocol selection, headers, timeouts, …). */ + connection?: RemoteConnectionParameters; + /** Extra {@apilink BrowserPool} options (lifecycle hooks, page limits, fingerprinting, …). */ + browserPoolOptions?: Omit & BrowserPoolHooks; + /** Fallback poll interval (ms) while waiting for a free browser slot. The wait is event-driven; this only bounds it. @default 500 */ slotPollIntervalMillis?: number; } +/** + * The remote-connection configuration a browser crawler accepts on its `remoteBrowser` option. It is the + * {@apilink RemoteBrowserPoolOptions} a user supplies *minus* the parts the crawler provides itself — the + * `browserPlugins` (the crawler builds the correct one for its browser) and `browserPoolOptions` (taken from + * the crawler's own `browserPoolOptions`). This is what makes the crawler path both terse and mismatch-proof. + */ +export type CrawlerRemoteBrowserOptions = Omit; + /** * An {@apilink IBrowserPool} implementation for remote browser services. * - * It wraps a {@apilink BrowserPool} configured for a remote connection and adds the one piece the plain - * pool cannot enforce on its own: a {@apilink RemoteBrowserPoolOptions.maxOpenBrowsers|concurrency limit} - * on open remote browsers. {@apilink RemoteBrowserPool.newPage|newPage} blocks until a slot is free instead - * of letting the crawler overshoot the remote service's session quota. 
+ * Unlike configuring a remote browser through a crawler's `launchContext`, this pool is the single owner + * of all remote-session concerns: + * - **endpoint resolution** — static URL, per-launch function, or {@apilink RemoteBrowserProvider}; + * - **release lifecycle** — `release()` fires exactly once per session on close/crash/teardown (no leaks, + * no double-release); + * - **concurrency** — {@apilink RemoteBrowserPoolOptions.maxOpenBrowsers|maxOpenBrowsers} is enforced inside + * {@apilink RemoteBrowserPool.newPage|newPage}, which waits for a free slot rather than overshooting. * - * The remote-session lifecycle (connecting via `endpoint()` and calling `release()` on close) is owned by - * the wrapped pool's plugin and its `remoteBrowser` configuration — this class only governs *when* new - * pages may open. + * The wrapped {@apilink BrowserPool} and its plugin only perform the library-specific `connect()` call. * - * Pass an instance as the `browserPool` option of a browser crawler: + * Pass an instance as the crawler's `browserPool` option: * * ```typescript - * import { BrowserPool, PlaywrightPlugin, RemoteBrowserPool } from '@crawlee/browser-pool'; + * import { PlaywrightPlugin, RemoteBrowserPool } from '@crawlee/browser-pool'; + * import { PlaywrightCrawler } from 'crawlee'; * import playwright from 'playwright'; * * const browserPool = new RemoteBrowserPool({ - * browserPool: new BrowserPool({ - * browserPlugins: [ - * new PlaywrightPlugin(playwright.chromium, { - * remoteBrowser: { endpoint: 'wss://production-sfo.browserless.io?token=xxx' }, - * }), - * ], - * }), + * browserPlugins: [new PlaywrightPlugin(playwright.chromium)], + * endpoint: 'wss://production-sfo.browserless.io?token=xxx', * maxOpenBrowsers: 2, * }); * @@ -67,23 +199,52 @@ export class RemoteBrowserPool implements IBrowserPool { /** The wrapped pool that performs the remote connections and serves pages. 
*/ readonly browserPool: BrowserPool; - /** - * The wrapped pool viewed through the {@apilink IBrowserPool} contract it implements. Used for - * page delegation because the bare `BrowserPool` type widens its page type to `never`. - */ + /** The wrapped pool viewed through the {@apilink IBrowserPool} contract (the bare type widens pages to `never`). */ private readonly pool: IBrowserPool; + private readonly registry: RemoteSessionRegistry; private readonly slotPollIntervalMillis: number; + private readonly log: CrawleeLogger; + + /** Shared by all `newPage` callers waiting for a free slot, so they don't each register their own listeners. */ + private _capacityChange?: Promise; constructor(options: RemoteBrowserPoolOptions) { - const { browserPool, maxOpenBrowsers, slotPollIntervalMillis = 500 } = options; + const { + browserPlugins, + endpoint, + release, + maxOpenBrowsers, + connection = {}, + browserPoolOptions = {}, + slotPollIntervalMillis = 500, + } = options; - this.browserPool = browserPool; - this.pool = browserPool as unknown as IBrowserPool; + this.log = serviceLocator.getLogger().child({ prefix: 'RemoteBrowserPool' }); this.slotPollIntervalMillis = slotPollIntervalMillis; - if (maxOpenBrowsers !== undefined) { - this.browserPool.maxOpenBrowsers = maxOpenBrowsers; + // A RemoteBrowserProvider carries its own endpoint, release, and maxOpenBrowsers. + const provider = endpoint instanceof RemoteBrowserProvider ? endpoint : undefined; + const resolvedEndpoint: RemoteBrowserEndpoint = provider + ? (opts) => provider.connect(opts) + : (endpoint as RemoteBrowserEndpoint); + const resolvedRelease = provider + ? ({ context }: { context?: Record }) => provider.release(context as any) + : release; + const resolvedMax = maxOpenBrowsers ?? provider?.maxOpenBrowsers; + + this.registry = new RemoteSessionRegistry(resolvedEndpoint, resolvedRelease, this.log); + + // Wire every plugin for remote connection. 
+ for (const plugin of browserPlugins) { + plugin.useRemoteConnection(this.registry, connection); + } + + this.browserPool = new BrowserPool({ ...browserPoolOptions, browserPlugins }) as unknown as BrowserPool; + this.pool = this.browserPool as unknown as IBrowserPool; + + if (resolvedMax !== undefined) { + this.browserPool.maxOpenBrowsers = resolvedMax; } } @@ -97,9 +258,8 @@ export class RemoteBrowserPool implements IBrowserPool { } /** - * Opens a new page, waiting first until the {@apilink RemoteBrowserPoolOptions.maxOpenBrowsers|browser - * limit} allows it. A page can open immediately when either a new browser slot is free or an already - * active browser still has room for another page. + * Opens a new page, waiting first until {@apilink RemoteBrowserPoolOptions.maxOpenBrowsers|maxOpenBrowsers} + * allows it (either a new browser slot is free, or an active browser still has page capacity). */ async newPage(options?: NewPageOptions): Promise { await this._waitForFreeSlot(); @@ -118,29 +278,32 @@ export class RemoteBrowserPool implements IBrowserPool { return this.pool.injectPageState(page, state); } - /** Closes all browsers and tears down the wrapped pool. */ + /** Closes all browsers, releases any still-open remote sessions, and tears down the wrapped pool. */ async destroy(): Promise { await this.browserPool.destroy(); + // Backstop: release any sessions whose browser never emitted a close (e.g. dropped on teardown). + await this.registry.releaseAll(); } - /** - * Resolves once the wrapped pool can serve another page without exceeding `maxOpenBrowsers`. The check - * is best-effort: concurrent `newPage` calls may briefly overshoot the limit, mirroring the advisory - * nature of the crawler-level throttle this replaces. - */ + /** Resolves once the wrapped pool can serve another page without exceeding `maxOpenBrowsers`. 
*/ private async _waitForFreeSlot(): Promise { while (!this.browserPool.hasFreeBrowserSlot() && !this.browserPool.hasActiveBrowserWithFreeCapacity()) { - await this._waitForCapacityChange(); + await this._nextCapacityChange(); } } - /** Resolves on the next browser-retired / page-closed event, or after `slotPollIntervalMillis`. */ - private async _waitForCapacityChange(): Promise { - await new Promise((resolve) => { + /** + * Resolves on the next browser-retired / page-closed event, or after `slotPollIntervalMillis`. All + * concurrently-waiting `newPage` calls share a single promise (and a single pair of event listeners) + * per tick, so a fleet of saturated callers doesn't fan out into N listener pairs on the pool. + */ + private _nextCapacityChange(): Promise { + this._capacityChange ??= new Promise((resolve) => { const done = () => { clearTimeout(timer); this.browserPool.off(BROWSER_POOL_EVENTS.BROWSER_RETIRED, done); this.browserPool.off(BROWSER_POOL_EVENTS.PAGE_CLOSED, done); + this._capacityChange = undefined; resolve(); }; @@ -149,5 +312,7 @@ export class RemoteBrowserPool implements IBrowserPool { this.browserPool.once(BROWSER_POOL_EVENTS.BROWSER_RETIRED, done); this.browserPool.once(BROWSER_POOL_EVENTS.PAGE_CLOSED, done); }); + + return this._capacityChange; } } diff --git a/packages/browser-pool/src/remote-browser-provider.ts b/packages/browser-pool/src/remote-browser-provider.ts index 85dd4b4cf851..425d61ca65b1 100644 --- a/packages/browser-pool/src/remote-browser-provider.ts +++ b/packages/browser-pool/src/remote-browser-provider.ts @@ -2,19 +2,20 @@ * Abstract base class for remote browser service providers. * * Implement this class to encapsulate the lifecycle of a remote browser session - * (creation, connection URL resolution, and cleanup). The framework calls - * {@link connect} once per browser launch and {@link release} when the browser + * (creation, connection URL resolution, and cleanup). 
{@apilink RemoteBrowserPool} + * calls {@link connect} once per browser launch and {@link release} when the browser * closes, crashes, the pool is destroyed, or the connection fails during launch. * - * Pass the provider instance as the `remoteBrowser` option on the crawler's - * `launchContext` or directly on the plugin constructor: + * Pass the provider instance as the `endpoint` of a {@apilink RemoteBrowserPool}, then + * hand the pool to a crawler via its `browserPool` option: * * ```typescript - * const crawler = new PlaywrightCrawler({ - * launchContext: { - * remoteBrowser: new MyProvider(), - * }, + * const browserPool = new RemoteBrowserPool({ + * browserPlugins: [new PlaywrightPlugin(playwright.chromium)], + * endpoint: new MyProvider(), * }); + * + * const crawler = new PlaywrightCrawler({ browserPool }); * ``` * * **Example — simple static endpoint (e.g. Browserless):** diff --git a/packages/browser-pool/src/utils.ts b/packages/browser-pool/src/utils.ts index ae224fee62e5..476e98f42f65 100644 --- a/packages/browser-pool/src/utils.ts +++ b/packages/browser-pool/src/utils.ts @@ -6,6 +6,20 @@ export type UnwrapPromise = T extends PromiseLike ? UnwrapPromise export function noop(..._args: unknown[]): void {} +/** Strips credentials from a URL so it can be safely included in logs and error messages. */ +export function sanitizeEndpointForLog(endpoint: string): string { + try { + const url = new URL(endpoint); + if (url.username || url.password) { + url.username = '***'; + url.password = '***'; + } + return url.toString(); + } catch { + return ''; + } +} + /** * This is required when using optional dependencies. 
* Importing a type gives `any`, but `Parameters` gives `unknown[]` instead of `any` diff --git a/packages/browser-pool/test/remote-browser-pool.test.ts b/packages/browser-pool/test/remote-browser-pool.test.ts index 96901d2a2f40..7e5b48df5a09 100644 --- a/packages/browser-pool/test/remote-browser-pool.test.ts +++ b/packages/browser-pool/test/remote-browser-pool.test.ts @@ -1,144 +1,230 @@ -import { EventEmitter } from 'node:events'; - import { vi } from 'vitest'; +import { serviceLocator } from '@crawlee/core'; +import type { CrawleeLogger } from '@crawlee/core'; + import { BROWSER_POOL_EVENTS } from '../src/events.js'; -import type { BrowserPool } from '../src/browser-pool.js'; +import type { RemoteConnection } from '../src/remote-browser-pool.js'; import { RemoteBrowserPool } from '../src/remote-browser-pool.js'; +import { RemoteBrowserProvider } from '../src/remote-browser-provider.js'; + +function createMockLogger(): CrawleeLogger { + const logger: any = { + child: vi.fn(() => logger), + error: vi.fn(), + exception: vi.fn(), + softFail: vi.fn(), + warning: vi.fn(), + warningOnce: vi.fn(), + info: vi.fn(), + debug: vi.fn(), + perf: vi.fn(), + deprecated: vi.fn(), + getOptions: vi.fn(() => ({})), + setOptions: vi.fn(), + setLevel: vi.fn(), + getLevel: vi.fn(), + }; + return logger; +} /** - * A minimal stand-in for {@link BrowserPool} exposing only the surface - * {@link RemoteBrowserPool} touches: the four `IBrowserPool` methods, `destroy`, - * `maxOpenBrowsers`, the two capacity helpers, and the event emitter. + * A stand-in plugin that captures the {@link RemoteConnection} the pool injects, so tests can drive + * endpoint resolution / release directly without launching a real browser. 
*/ -function createFakePool(overrides: Partial> = {}) { - const emitter = new EventEmitter(); - const pool = Object.assign(emitter, { - maxOpenBrowsers: Infinity, - hasFreeBrowserSlot: vi.fn(() => true), - hasActiveBrowserWithFreeCapacity: vi.fn(() => false), - newPage: vi.fn(async (options?: any) => ({ id: options?.id ?? 'page' })), - closePage: vi.fn(async () => {}), - extractPageState: vi.fn(async () => ({ cookies: [] })), - injectPageState: vi.fn(async () => {}), - destroy: vi.fn(async () => {}), - ...overrides, - }); - return pool as unknown as BrowserPool; +function createCapturingPlugin() { + let connection: RemoteConnection | undefined; + const plugin: any = { + useRemoteConnection: (conn: RemoteConnection) => { + connection = conn; + }, + }; + return { plugin, getConnection: () => connection! }; } -describe('RemoteBrowserPool', () => { - describe('construction', () => { - it('applies maxOpenBrowsers to the wrapped pool', () => { - const fake = createFakePool(); - const remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 3 }); +beforeEach(() => { + serviceLocator.setLogger(createMockLogger()); +}); - expect(fake.maxOpenBrowsers).toBe(3); - expect(remote.maxOpenBrowsers).toBe(3); - }); +describe('RemoteBrowserPool — endpoint resolution', () => { + it('resolves a static string endpoint', async () => { + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: 'wss://remote:9222' }); - it('leaves the wrapped pool default when maxOpenBrowsers is omitted', () => { - const fake = createFakePool(); - const remote = new RemoteBrowserPool({ browserPool: fake }); + const { url, token } = await getConnection().resolve(); - expect(remote.maxOpenBrowsers).toBe(Infinity); - }); + expect(url).toBe('wss://remote:9222'); + expect(typeof token).toBe('number'); + await pool.destroy(); + }); - it('proxies maxOpenBrowsers writes through to the wrapped pool', () => { - const fake = 
createFakePool(); - const remote = new RemoteBrowserPool({ browserPool: fake }); + it('resolves a function endpoint and forwards proxyUrl', async () => { + const endpoint = vi.fn(() => 'wss://dynamic:9222'); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint }); - remote.maxOpenBrowsers = 5; + const { url } = await getConnection().resolve({ proxyUrl: 'http://proxy:8080' }); - expect(fake.maxOpenBrowsers).toBe(5); - }); + expect(url).toBe('wss://dynamic:9222'); + expect(endpoint).toHaveBeenCalledWith({ proxyUrl: 'http://proxy:8080' }); + await pool.destroy(); }); - describe('delegation', () => { - it('forwards closePage / extractPageState / injectPageState / destroy', async () => { - const fake = createFakePool(); - const remote = new RemoteBrowserPool<{ id: string }>({ browserPool: fake }); - const page = { id: 'p1' }; - const error = new Error('boom'); - - await remote.closePage(page, { error }); - await remote.extractPageState(page); - await remote.injectPageState(page, { cookies: [] }); - await remote.destroy(); - - expect(fake.closePage).toHaveBeenCalledWith(page, { error }); - expect(fake.extractPageState).toHaveBeenCalledWith(page); - expect(fake.injectPageState).toHaveBeenCalledWith(page, { cookies: [] }); - expect(fake.destroy).toHaveBeenCalledOnce(); + it('throws when an endpoint resolves to an empty string', async () => { + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: () => '' }); + + await expect(getConnection().resolve()).rejects.toThrow(/empty string/); + await pool.destroy(); + }); + + it('throws when a function endpoint returns an object without a url', async () => { + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: () => ({}) as any }); + + await 
expect(getConnection().resolve()).rejects.toThrow(/non-empty 'url'/); + await pool.destroy(); + }); +}); + +describe('RemoteBrowserPool — release lifecycle', () => { + it('calls release with the context from a function endpoint, exactly once', async () => { + const release = vi.fn(); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ + browserPlugins: [plugin], + endpoint: () => ({ url: 'wss://remote:9222', context: { id: 'sess-1' } }), + release, }); + + const { token } = await getConnection().resolve(); + await getConnection().release(token); + await getConnection().release(token); // second call must be a no-op (close()+kill()) + + expect(release).toHaveBeenCalledTimes(1); + expect(release).toHaveBeenCalledWith({ endpoint: 'wss://remote:9222', context: { id: 'sess-1' } }); + await pool.destroy(); }); - describe('newPage throttle', () => { - it('opens immediately when a browser slot is free', async () => { - const fake = createFakePool({ hasFreeBrowserSlot: vi.fn(() => true) }); - const remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 2 }); + it('releases all still-open sessions on destroy()', async () => { + const release = vi.fn(); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: 'wss://remote:9222', release }); + + await getConnection().resolve(); + await getConnection().resolve(); - const page = await remote.newPage({ id: 'x' }); + await pool.destroy(); + + expect(release).toHaveBeenCalledTimes(2); + }); - expect(page).toEqual({ id: 'x' }); - expect(fake.newPage).toHaveBeenCalledOnce(); + it('swallows errors thrown by release()', async () => { + const release = vi.fn(() => { + throw new Error('release boom'); }); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: 'wss://remote:9222', release }); - it('opens immediately 
when an active browser has free page capacity', async () => { - const fake = createFakePool({ - hasFreeBrowserSlot: vi.fn(() => false), - hasActiveBrowserWithFreeCapacity: vi.fn(() => true), - }); - const remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 1 }); + const { token } = await getConnection().resolve(); + await expect(getConnection().release(token)).resolves.toBeUndefined(); + await pool.destroy(); + }); +}); + +describe('RemoteBrowserPool — RemoteBrowserProvider endpoint', () => { + class TestProvider extends RemoteBrowserProvider<{ id: string }> { + override maxOpenBrowsers = 3; + connect = vi.fn(async () => ({ url: 'wss://provider:9222', context: { id: 'sess-1' } })); + override release = vi.fn(async () => {}); + } - await remote.newPage(); + it('wires connect/release and adopts the provider maxOpenBrowsers', async () => { + const provider = new TestProvider(); + const { plugin, getConnection } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ browserPlugins: [plugin], endpoint: provider }); - expect(fake.newPage).toHaveBeenCalledOnce(); + expect(pool.maxOpenBrowsers).toBe(3); + + const { url, token } = await getConnection().resolve({ proxyUrl: 'http://proxy:8080' }); + expect(url).toBe('wss://provider:9222'); + expect(provider.connect).toHaveBeenCalledWith({ proxyUrl: 'http://proxy:8080' }); + + await getConnection().release(token); + expect(provider.release).toHaveBeenCalledWith({ id: 'sess-1' }); + await pool.destroy(); + }); + + it('an explicit maxOpenBrowsers overrides the provider value', async () => { + const { plugin } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ + browserPlugins: [plugin], + endpoint: new TestProvider(), + maxOpenBrowsers: 7, }); - it('waits while at capacity, then opens once a browser is retired', async () => { - let atCapacity = true; - const fake = createFakePool({ - hasFreeBrowserSlot: vi.fn(() => !atCapacity), - hasActiveBrowserWithFreeCapacity: vi.fn(() => false), - 
}); - const remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 1, slotPollIntervalMillis: 50 }); - - const pagePromise = remote.newPage(); - let resolved = false; - void pagePromise.then(() => { - resolved = true; - }); - - // Still blocked while at capacity. - await new Promise((r) => setTimeout(r, 20)); - expect(resolved).toBe(false); - expect(fake.newPage).not.toHaveBeenCalled(); - - // Free a slot and signal it. - atCapacity = false; - fake.emit(BROWSER_POOL_EVENTS.BROWSER_RETIRED); - - await pagePromise; - expect(resolved).toBe(true); - expect(fake.newPage).toHaveBeenCalledOnce(); + expect(pool.maxOpenBrowsers).toBe(7); + await pool.destroy(); + }); +}); + +describe('RemoteBrowserPool — maxOpenBrowsers throttle', () => { + it('proxies maxOpenBrowsers to the wrapped pool', async () => { + const { plugin } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ + browserPlugins: [plugin], + endpoint: 'wss://remote:9222', + maxOpenBrowsers: 2, }); - it('re-checks capacity via the poll fallback when no event fires', async () => { - let atCapacity = true; - const fake = createFakePool({ - hasFreeBrowserSlot: vi.fn(() => !atCapacity), - hasActiveBrowserWithFreeCapacity: vi.fn(() => false), - }); - const remote = new RemoteBrowserPool({ browserPool: fake, maxOpenBrowsers: 1, slotPollIntervalMillis: 20 }); - - const pagePromise = remote.newPage(); - setTimeout(() => { - atCapacity = false; - }, 30); - - await pagePromise; - expect(fake.newPage).toHaveBeenCalledOnce(); + expect(pool.browserPool.maxOpenBrowsers).toBe(2); + pool.maxOpenBrowsers = 5; + expect(pool.browserPool.maxOpenBrowsers).toBe(5); + await pool.destroy(); + }); + + it('opens immediately when a browser slot is free', async () => { + const { plugin } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ + browserPlugins: [plugin], + endpoint: 'wss://remote:9222', + maxOpenBrowsers: 2, + }); + + pool.browserPool.hasFreeBrowserSlot = vi.fn(() => true); + 
pool.browserPool.hasActiveBrowserWithFreeCapacity = vi.fn(() => false); + const newPage = vi.fn(async () => ({ id: 'p' })); + (pool.browserPool as any).newPage = newPage; + + await pool.newPage({ id: 'p' }); + expect(newPage).toHaveBeenCalledOnce(); + await pool.destroy(); + }); + + it('waits while at capacity, then opens once a browser is retired', async () => { + const { plugin } = createCapturingPlugin(); + const pool = new RemoteBrowserPool({ + browserPlugins: [plugin], + endpoint: 'wss://remote:9222', + maxOpenBrowsers: 1, + slotPollIntervalMillis: 50, }); + + let atCapacity = true; + pool.browserPool.hasFreeBrowserSlot = vi.fn(() => !atCapacity); + pool.browserPool.hasActiveBrowserWithFreeCapacity = vi.fn(() => false); + const newPage = vi.fn(async () => ({ id: 'p' })); + (pool.browserPool as any).newPage = newPage; + + const pagePromise = pool.newPage(); + await new Promise((r) => setTimeout(r, 20)); + expect(newPage).not.toHaveBeenCalled(); + + atCapacity = false; + pool.browserPool.emit(BROWSER_POOL_EVENTS.BROWSER_RETIRED, {} as any); + + await pagePromise; + expect(newPage).toHaveBeenCalledOnce(); + await pool.destroy(); }); }); diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts index 8b0343ea2fa7..d4d24a4fac46 100644 --- a/packages/browser-pool/test/remote-browser.test.ts +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -5,10 +5,10 @@ import type { CrawleeLogger } from '@crawlee/core'; import { PlaywrightPlugin } from '../src/playwright/playwright-plugin.js'; import { PuppeteerPlugin } from '../src/puppeteer/puppeteer-plugin.js'; -import { RemoteBrowserProvider } from '../src/remote-browser-provider.js'; +import type { RemoteConnection } from '../src/remote-browser-pool.js'; // --------------------------------------------------------------------------- -// Shared mock helpers +// Mock helpers // --------------------------------------------------------------------------- function 
createMockPage() { @@ -20,19 +20,14 @@ function createMockPage() { }; } -function createMockBrowserContext() { +function createMockBrowser() { const page = createMockPage(); - return { + const mockContext = { newPage: vi.fn().mockResolvedValue(page), close: vi.fn().mockResolvedValue(undefined), on: vi.fn(), once: vi.fn(), - _mockPage: page, }; -} - -function createMockBrowser() { - const mockContext = createMockBrowserContext(); return { newPage: vi.fn().mockResolvedValue(createMockPage()), close: vi.fn().mockResolvedValue(undefined), @@ -46,22 +41,16 @@ function createMockBrowser() { userAgent: vi.fn().mockResolvedValue('mock-ua'), createBrowserContext: vi.fn().mockResolvedValue(mockContext), createIncognitoBrowserContext: vi.fn().mockResolvedValue(mockContext), - _mockContext: mockContext, }; } function createMockPlaywrightLibrary(browser = createMockBrowser()) { - const mockContext = { - ...browser, - once: vi.fn(), - on: vi.fn(), - }; return { launch: vi.fn().mockResolvedValue(browser), connect: vi.fn().mockResolvedValue(browser), connectOverCDP: vi.fn().mockResolvedValue(browser), name: vi.fn(() => 'chromium'), - launchPersistentContext: vi.fn().mockResolvedValue(mockContext), + launchPersistentContext: vi.fn().mockResolvedValue(browser), }; } @@ -74,10 +63,8 @@ function createMockPuppeteerLibrary(browser = createMockBrowser()) { } function createMockLogger(): CrawleeLogger & { warning: ReturnType; info: ReturnType } { - const mockLogger: any = { - getOptions: vi.fn(() => ({})), - setOptions: vi.fn(), - child: vi.fn(() => mockLogger), + const logger: any = { + child: vi.fn(() => logger), error: vi.fn(), exception: vi.fn(), softFail: vi.fn(), @@ -87,1010 +74,138 @@ function createMockLogger(): CrawleeLogger & { warning: ReturnType debug: vi.fn(), perf: vi.fn(), deprecated: vi.fn(), - log: vi.fn(), + getOptions: vi.fn(() => ({})), + setOptions: vi.fn(), setLevel: vi.fn(), getLevel: vi.fn(), }; - return mockLogger; + return logger; +} + +/** A fake {@link 
RemoteConnection} that resolves to a fixed URL and records release() calls. */ +function createConnection(url = 'wss://remote:9222', context?: Record): RemoteConnection & { + resolve: ReturnType; + release: ReturnType; +} { + return { + resolve: vi.fn(async (_options?: { proxyUrl?: string }) => ({ url, token: 42, context })), + release: vi.fn(async () => {}), + } as any; } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- -describe('Remote browser — PlaywrightPlugin', () => { - let mockLogger: ReturnType; - - beforeEach(() => { - mockLogger = createMockLogger(); - serviceLocator.setLogger(mockLogger); - }); - - // --- Connection routing --------------------------------------------------- - - describe('connection routing', () => { - test('connectOverCDPOptions → calls connectOverCDP, not launch', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connectOverCDP).toHaveBeenCalledTimes(1); - expect(lib.connectOverCDP).toHaveBeenCalledWith('http://remote:9222', {}); - expect(lib.launch).not.toHaveBeenCalled(); - expect(lib.connect).not.toHaveBeenCalled(); - }); - - test('connectOptions → calls connect, not launch', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOptions: { wsEndpoint: 'ws://remote:3000' }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connect).toHaveBeenCalledTimes(1); - expect(lib.connect).toHaveBeenCalledWith('ws://remote:3000', {}); - expect(lib.launch).not.toHaveBeenCalled(); - expect(lib.connectOverCDP).not.toHaveBeenCalled(); - }); - - test('no connect options → calls launch', async () => { - const 
lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.launch).toHaveBeenCalledTimes(1); - expect(lib.connect).not.toHaveBeenCalled(); - expect(lib.connectOverCDP).not.toHaveBeenCalled(); - }); - - test('passes extra options through to connectOverCDP', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { - endpointURL: 'http://remote:9222', - timeout: 5000, - headers: { 'x-token': 'abc' }, - }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connectOverCDP).toHaveBeenCalledWith('http://remote:9222', { - timeout: 5000, - headers: { 'x-token': 'abc' }, - }); - }); - - test('passes extra options through to connect', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOptions: { - wsEndpoint: 'ws://remote:3000', - timeout: 3000, - headers: { Authorization: 'Bearer xyz' }, - }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connect).toHaveBeenCalledWith('ws://remote:3000', { - timeout: 3000, - headers: { Authorization: 'Bearer xyz' }, - }); - }); - }); - - // --- Validation ----------------------------------------------------------- - - describe('validation', () => { - test('throws when both connectOptions and connectOverCDPOptions are set', () => { - const lib = createMockPlaywrightLibrary(); - - expect( - () => - new PlaywrightPlugin(lib as any, { - connectOptions: { wsEndpoint: 'ws://remote:3000' }, - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - }), - ).toThrow('mutually exclusive'); - }); - - test('throws when connectOverCDPOptions has no endpointURL', () => { - const lib = createMockPlaywrightLibrary(); - - expect( - () => - new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: 
{ endpointURL: '' }, - }), - ).toThrow("'connectOverCDPOptions.endpointURL' must be a non-empty string"); - }); - - test('throws when connectOptions has no wsEndpoint', () => { - const lib = createMockPlaywrightLibrary(); - - expect( - () => - new PlaywrightPlugin(lib as any, { - connectOptions: { wsEndpoint: '' }, - }), - ).toThrow("'connectOptions.wsEndpoint' must be a non-empty string"); - }); - }); - - // --- isRemote correctness ------------------------------------------------- - - describe('isRemote', () => { - test('true when connectOverCDPOptions is present', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - }); - - const ctx = plugin.createLaunchContext(); - expect(ctx.isRemote).toBe(true); - }); - - test('true when connectOptions is present', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOptions: { wsEndpoint: 'ws://remote:3000' }, - }); - - const ctx = plugin.createLaunchContext(); - expect(ctx.isRemote).toBe(true); - }); - - test('false when no connect options', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any); - - const ctx = plugin.createLaunchContext(); - expect(ctx.isRemote).toBe(false); - }); - }); - - // --- Proxy/webdriver skipping --------------------------------------------- - - describe('proxy/webdriver skipping for remote', () => { - test('proxy is not applied for remote connections', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - proxyUrl: 'http://user:pass@proxy:8080', - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - // The browser was connected via CDP, not launched — proxy is not set on launchOptions - 
expect(lib.connectOverCDP).toHaveBeenCalledTimes(1); - expect(lib.launch).not.toHaveBeenCalled(); - }); - - test('webdriver hiding args are not added for remote connections', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - launchOptions: { args: ['--custom-flag'] }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - // The original args should be untouched — no webdriver stealth flag injected - expect(ctx.launchOptions?.args).toEqual(['--custom-flag']); - expect(ctx.launchOptions?.args).not.toContain('--disable-blink-features=AutomationControlled'); - }); - - test('webdriver hiding args ARE added for local chromium connections', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - launchOptions: { args: ['--custom-flag'] }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(ctx.launchOptions?.args).toContain('--disable-blink-features=AutomationControlled'); - expect(ctx.launchOptions?.args).toContain('--custom-flag'); - }); - - test('proxy is applied for local connections', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - proxyUrl: 'http://user:pass@proxy:8080', - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.launch).toHaveBeenCalledTimes(1); - // Launch options should have proxy configured - const launchOpts = lib.launch.mock.calls[0][0]; - expect(launchOpts.proxy).toBeDefined(); - expect(launchOpts.proxy.server).toBeDefined(); - }); - }); - - // --- useIncognitoPages default -------------------------------------------- - - describe('useIncognitoPages default', () => { - test('forced to true for remote (connectOverCDP)', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new 
PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - }); - - expect(plugin.useIncognitoPages).toBe(true); - }); - - test('forced to true for remote (connect / WebSocket)', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOptions: { wsEndpoint: 'ws://remote:3000' }, - }); - - expect(plugin.useIncognitoPages).toBe(true); - }); - - test('explicit false is overridden to true for remote (with warning)', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - useIncognitoPages: false, - }); - - expect(plugin.useIncognitoPages).toBe(true); - expect(mockLogger.warning).toHaveBeenCalledWith( - expect.stringContaining('only support useIncognitoPages: true'), - ); - }); - - test('explicit true preserved for remote', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - useIncognitoPages: true, - }); - - expect(plugin.useIncognitoPages).toBe(true); - }); - - test('default false for local', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any); - - expect(plugin.useIncognitoPages).toBe(false); - }); - }); - - // --- Info/Warnings -------------------------------------------------------- - - describe('info and warnings', () => { - test('proxyUrl + connectOverCDPOptions → warning that proxyUrl is ignored', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - proxyUrl: 'http://user:pass@proxy:8080', - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(mockLogger.warning).toHaveBeenCalledWith( - expect.stringContaining( - 'proxyUrl is set 
but will be ignored when using connectOptions/connectOverCDPOptions', - ), - ); - }); - - test('proxyUrl + remoteBrowser → info about forwarding to endpoint()', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: () => 'http://remote:9222' }, - proxyUrl: 'http://user:pass@proxy:8080', - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining('passed to the remoteBrowser.endpoint() function'), - ); - }); - - test('remote default → info about incognito-only', () => { - const lib = createMockPlaywrightLibrary(); - new PlaywrightPlugin(lib as any, { - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - }); - - expect(mockLogger.info).toHaveBeenCalledWith(expect.stringContaining('useIncognitoPages forced to true')); - }); - - test('no warnings for local browser usage', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(mockLogger.warning).not.toHaveBeenCalled(); - }); - }); -}); - -describe('Remote browser — PuppeteerPlugin', () => { - let mockLogger: ReturnType; - - beforeEach(() => { - mockLogger = createMockLogger(); - serviceLocator.setLogger(mockLogger); - }); - - // --- Connection routing --------------------------------------------------- - - describe('connection routing', () => { - test('connectOverCDPOptions → calls connect, not launch', async () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connect).toHaveBeenCalledTimes(1); - expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'ws://remote:9222' }); - 
expect(lib.launch).not.toHaveBeenCalled(); - }); - - test('no connect options → calls launch', async () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.launch).toHaveBeenCalledTimes(1); - expect(lib.connect).not.toHaveBeenCalled(); - }); - - test('passes all connect options through to connect', async () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { - browserWSEndpoint: 'ws://remote:9222', - defaultViewport: { width: 800, height: 600 }, - }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connect).toHaveBeenCalledWith({ - browserWSEndpoint: 'ws://remote:9222', - defaultViewport: { width: 800, height: 600 }, - }); - }); - }); - - // --- Validation ----------------------------------------------------------- - - describe('validation', () => { - test('throws when connectOverCDPOptions has no browserWSEndpoint or browserURL', () => { - const lib = createMockPuppeteerLibrary(); - - expect( - () => - new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: {} as any, - }), - ).toThrow("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'"); - }); - }); - - // --- isRemote correctness ------------------------------------------------- - - describe('isRemote', () => { - test('true when connectOverCDPOptions is present', () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - }); - - const ctx = plugin.createLaunchContext(); - expect(ctx.isRemote).toBe(true); - }); - - test('false when no connect options', () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any); - - const ctx = plugin.createLaunchContext(); - 
expect(ctx.isRemote).toBe(false); - }); - }); - - // --- Proxy/webdriver skipping --------------------------------------------- - - describe('proxy/webdriver skipping for remote', () => { - test('proxy is not applied for remote connections', async () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - proxyUrl: 'http://user:pass@proxy:8080', - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connect).toHaveBeenCalledTimes(1); - expect(lib.launch).not.toHaveBeenCalled(); - }); - - test('proxy is not leaked into createBrowserContext for remote newPage', async () => { - const browser = createMockBrowser(); - const lib = createMockPuppeteerLibrary(browser); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - proxyUrl: 'http://user:pass@proxy:8080', - useIncognitoPages: true, - }); - - const ctx = plugin.createLaunchContext(); - const wrappedBrowser = await plugin.launch(ctx); - - // Call newPage on the wrapped browser — useIncognitoPages: true creates new context - await (wrappedBrowser as any).newPage(); - - // createBrowserContext should be called with empty options (no proxyServer) - expect(browser.createBrowserContext).toHaveBeenCalledTimes(1); - expect(browser.createBrowserContext).toHaveBeenCalledWith({}); - }); - - test('webdriver hiding args are not added for remote connections', async () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - launchOptions: { args: ['--custom-flag'] }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - // The original args should be untouched — no webdriver stealth flag injected - expect(ctx.launchOptions?.args).toEqual(['--custom-flag']); - 
expect(ctx.launchOptions?.args).not.toContain('--disable-blink-features=AutomationControlled'); - }); - - test('webdriver hiding args ARE added for local connections', async () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - launchOptions: { args: ['--custom-flag'] }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(ctx.launchOptions?.args).toContain('--disable-blink-features=AutomationControlled'); - expect(ctx.launchOptions?.args).toContain('--custom-flag'); - }); - }); - - // --- useIncognitoPages default -------------------------------------------- - - describe('useIncognitoPages default', () => { - test('defaults to false for remote', () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - }); - - expect(plugin.useIncognitoPages).toBe(false); - }); - - test('explicit false preserved for remote', () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - useIncognitoPages: false, - }); - - expect(plugin.useIncognitoPages).toBe(false); - }); - - test('explicit true preserved for remote', () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - useIncognitoPages: true, - }); - - expect(plugin.useIncognitoPages).toBe(true); - }); - - test('default false for local', () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any); - - expect(plugin.useIncognitoPages).toBe(false); - }); - }); - - // --- Info/Warnings -------------------------------------------------------- - - describe('info and warnings', () => { - test('proxyUrl + connectOverCDPOptions → warning that proxyUrl is ignored', async 
() => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - proxyUrl: 'http://user:pass@proxy:8080', - }); +let mockLogger: ReturnType; - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(mockLogger.warning).toHaveBeenCalledWith( - expect.stringContaining( - 'proxyUrl is set but will be ignored when using connectOptions/connectOverCDPOptions', - ), - ); - }); - - test('proxyUrl + remoteBrowser → info about forwarding to endpoint()', async () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - remoteBrowser: { endpoint: () => 'ws://remote:9222' }, - proxyUrl: 'http://user:pass@proxy:8080', - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining('passed to the remoteBrowser.endpoint() function'), - ); - }); - - test('remote default → info about shared cookies', () => { - const lib = createMockPuppeteerLibrary(); - new PuppeteerPlugin(lib as any, { - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, - }); - - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining('pages will share cookies and storage'), - ); - }); - - test('no warnings for local browser usage', async () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(mockLogger.warning).not.toHaveBeenCalled(); - }); - }); +beforeEach(() => { + mockLogger = createMockLogger(); + serviceLocator.setLogger(mockLogger); }); -// --------------------------------------------------------------------------- -// remoteBrowser config tests -// --------------------------------------------------------------------------- - -describe('remoteBrowser config — PlaywrightPlugin', () 
=> { - let mockLogger: ReturnType; - - beforeEach(() => { - mockLogger = createMockLogger(); - serviceLocator.setLogger(mockLogger); - }); - - test('static string endpoint → calls connectOverCDP by default', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://browserless.io?token=xxx' }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connectOverCDP).toHaveBeenCalledWith('wss://browserless.io?token=xxx', {}); - expect(lib.launch).not.toHaveBeenCalled(); - expect(lib.connect).not.toHaveBeenCalled(); - }); - - test('function endpoint → called per launch', async () => { - const lib = createMockPlaywrightLibrary(); - const endpointFn = vi.fn().mockResolvedValue('wss://dynamic-endpoint.io'); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: endpointFn }, - }); - - const ctx1 = plugin.createLaunchContext(); - await plugin.launch(ctx1); - - const ctx2 = plugin.createLaunchContext(); - await plugin.launch(ctx2); - - expect(endpointFn).toHaveBeenCalledTimes(2); - expect(lib.connectOverCDP).toHaveBeenCalledTimes(2); - }); - - test('resolved endpoint stored on launchContext', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://test.io' }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect((ctx as any)._resolvedRemoteEndpoint).toBe('wss://test.io'); - }); - - test('isRemote is true when remoteBrowser is set', () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://test.io' }, - }); - - const ctx = plugin.createLaunchContext(); - expect(ctx.isRemote).toBe(true); - }); - - test('useIncognitoPages forced to true when remoteBrowser is set', () => { - const lib = createMockPlaywrightLibrary(); - 
const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://test.io' }, - }); +describe('Remote connection — PlaywrightPlugin', () => { + it('useRemoteConnection forces incognito pages on and marks the launch context remote', () => { + const plugin = new PlaywrightPlugin(createMockPlaywrightLibrary() as any, { useIncognitoPages: false }); + plugin.useRemoteConnection(createConnection()); expect(plugin.useIncognitoPages).toBe(true); + expect(plugin.createLaunchContext().isRemote).toBe(true); }); - test('release called on connection failure with context', async () => { - const lib = createMockPlaywrightLibrary(); - lib.connectOverCDP.mockRejectedValue(new Error('Connection refused')); - - const releaseFn = vi.fn(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { - endpoint: async () => ({ url: 'wss://fail.io', context: { id: 'sess-123' } }), - release: releaseFn, - }, - }); - - const ctx = plugin.createLaunchContext(); - await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); - - expect(releaseFn).toHaveBeenCalledWith({ endpoint: 'wss://fail.io', context: { id: 'sess-123' } }); - }); - - test('release receives context from endpoint function', async () => { + it('connects via connectOverCDP by default and skips a local launch', async () => { const lib = createMockPlaywrightLibrary(); - const releaseFn = vi.fn(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { - endpoint: async () => ({ url: 'wss://test.io', context: { sessionId: 'abc' } }), - release: releaseFn, - }, - }); + const plugin = new PlaywrightPlugin(lib as any); + const connection = createConnection('http://remote:9222'); + plugin.useRemoteConnection(connection, { connectOptions: { timeout: 5000 } }); const ctx = plugin.createLaunchContext(); await plugin.launch(ctx); - // Context stored on launchContext for later release - expect((ctx as any)._remoteContext).toEqual({ sessionId: 'abc' }); - }); - - 
test('release failure is swallowed and logged as warning', async () => { - const lib = createMockPlaywrightLibrary(); - lib.connectOverCDP.mockRejectedValue(new Error('Connection refused')); - - const releaseFn = vi.fn().mockRejectedValue(new Error('Release failed')); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://fail.io', release: releaseFn }, - }); - - const ctx = plugin.createLaunchContext(); - await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); - - expect(releaseFn).toHaveBeenCalled(); - expect(mockLogger.warning).toHaveBeenCalledWith( - 'remoteBrowser.release() failed.', - expect.objectContaining({ error: 'Release failed' }), - ); - }); - - test('endpoint function rejection throws BrowserLaunchError', async () => { - const lib = createMockPlaywrightLibrary(); - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: () => Promise.reject(new Error('API down')) }, - }); - - const ctx = plugin.createLaunchContext(); - await expect(plugin.launch(ctx)).rejects.toThrow('Failed to resolve remote browser endpoint'); - }); - - test('throws when both remoteBrowser and connectOverCDPOptions are set', () => { - const lib = createMockPlaywrightLibrary(); - expect( - () => - new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://a.io' }, - connectOverCDPOptions: { endpointURL: 'wss://b.io' }, - }), - ).toThrow('mutually exclusive'); - }); - - test('throws when both remoteBrowser and connectOptions are set', () => { - const lib = createMockPlaywrightLibrary(); - expect( - () => - new PlaywrightPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://a.io' }, - connectOptions: { wsEndpoint: 'wss://b.io' }, - }), - ).toThrow('mutually exclusive'); - }); -}); - -describe('remoteBrowser config — PuppeteerPlugin', () => { - let mockLogger: ReturnType; - - beforeEach(() => { - mockLogger = createMockLogger(); - serviceLocator.setLogger(mockLogger); - }); - - 
test('static string endpoint → calls connect with browserWSEndpoint', async () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://browserless.io?token=xxx' }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://browserless.io?token=xxx' }); - expect(lib.launch).not.toHaveBeenCalled(); - }); - - test('function endpoint → called per launch', async () => { - const lib = createMockPuppeteerLibrary(); - const endpointFn = vi.fn().mockResolvedValue('wss://dynamic.io'); - const plugin = new PuppeteerPlugin(lib as any, { - remoteBrowser: { endpoint: endpointFn }, - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(endpointFn).toHaveBeenCalledTimes(1); - expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://dynamic.io' }); - }); - - test('isRemote is true when remoteBrowser is set', () => { - const lib = createMockPuppeteerLibrary(); - const plugin = new PuppeteerPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://test.io' }, - }); - - const ctx = plugin.createLaunchContext(); - expect(ctx.isRemote).toBe(true); - }); - - test('release called on connection failure with context', async () => { - const lib = createMockPuppeteerLibrary(); - lib.connect.mockRejectedValue(new Error('Connection refused')); - - const releaseFn = vi.fn(); - const plugin = new PuppeteerPlugin(lib as any, { - remoteBrowser: { - endpoint: async () => ({ url: 'wss://fail.io', context: { id: 'sess-456' } }), - release: releaseFn, - }, - }); - - const ctx = plugin.createLaunchContext(); - await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); - - expect(releaseFn).toHaveBeenCalledWith({ endpoint: 'wss://fail.io', context: { id: 'sess-456' } }); - }); - - test('throws when both remoteBrowser and connectOverCDPOptions are set', () => 
{ - const lib = createMockPuppeteerLibrary(); - expect( - () => - new PuppeteerPlugin(lib as any, { - remoteBrowser: { endpoint: 'wss://a.io' }, - connectOverCDPOptions: { browserWSEndpoint: 'wss://b.io' }, - }), - ).toThrow('mutually exclusive'); - }); -}); - -// --------------------------------------------------------------------------- -// RemoteBrowserProvider tests -// --------------------------------------------------------------------------- - -describe('RemoteBrowserProvider — PlaywrightPlugin', () => { - let mockLogger: ReturnType; - - beforeEach(() => { - mockLogger = createMockLogger(); - serviceLocator.setLogger(mockLogger); - }); - - test('provider connect() → calls connectOverCDP by default', async () => { - const lib = createMockPlaywrightLibrary(); - - class SimpleProvider extends RemoteBrowserProvider { - async connect() { - return { url: 'wss://provider.io/cdp' }; - } - } - - const plugin = new PlaywrightPlugin(lib as any, { - remoteBrowser: new SimpleProvider(), - }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - expect(lib.connectOverCDP).toHaveBeenCalledWith('wss://provider.io/cdp', {}); + expect(connection.resolve).toHaveBeenCalledTimes(1); + expect(lib.connectOverCDP).toHaveBeenCalledWith('http://remote:9222', { timeout: 5000 }); expect(lib.connect).not.toHaveBeenCalled(); expect(lib.launch).not.toHaveBeenCalled(); + expect(ctx._remoteToken).toBe(42); }); - test('provider context flows to release', async () => { + it("connects via connect() when protocol is 'playwright'", async () => { const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); + plugin.useRemoteConnection(createConnection('ws://remote:3000'), { protocol: 'playwright' }); - interface Ctx { - sessionId: string; - } - - class SessionProvider extends RemoteBrowserProvider { - releasedContext?: Ctx; - async connect() { - return { url: 'wss://test.io', context: { sessionId: 'sess-42' } }; - } - async 
release(context: Ctx) { - this.releasedContext = context; - } - } + await plugin.launch(plugin.createLaunchContext()); - const provider = new SessionProvider(); - const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: provider }); - - const ctx = plugin.createLaunchContext(); - await plugin.launch(ctx); - - // Context stored on launchContext - expect((ctx as any)._remoteContext).toEqual({ sessionId: 'sess-42' }); + expect(lib.connect).toHaveBeenCalledWith('ws://remote:3000', {}); + expect(lib.connectOverCDP).not.toHaveBeenCalled(); }); - test('provider release called on connection failure', async () => { + it('releases the session and throws BrowserLaunchError when connect fails', async () => { const lib = createMockPlaywrightLibrary(); - lib.connectOverCDP.mockRejectedValue(new Error('Connection refused')); - - const releaseSpy = vi.fn(); - - class FailProvider extends RemoteBrowserProvider<{ id: string }> { - async connect() { - return { url: 'wss://fail.io', context: { id: 'sess-fail' } }; - } - async release(context: { id: string }) { - releaseSpy(context); - } - } + lib.connectOverCDP.mockRejectedValueOnce(new Error('ECONNREFUSED')); + const plugin = new PlaywrightPlugin(lib as any); + const connection = createConnection(); + plugin.useRemoteConnection(connection); - const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: new FailProvider() }); - const ctx = plugin.createLaunchContext(); - - await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); - expect(releaseSpy).toHaveBeenCalledWith({ id: 'sess-fail' }); + await expect(plugin.launch(plugin.createLaunchContext())).rejects.toThrow(/Failed to connect to remote browser/); + expect(connection.release).toHaveBeenCalledWith(42); }); - test('provider sets isRemote = true', () => { + it('throws BrowserLaunchError (without connecting) when endpoint resolution fails', async () => { const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib 
as any); + const connection = createConnection(); + connection.resolve.mockRejectedValueOnce(new Error('no session')); + plugin.useRemoteConnection(connection); - class P extends RemoteBrowserProvider { - async connect() { - return { url: 'wss://test.io' }; - } - } - - const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: new P() }); - const ctx = plugin.createLaunchContext(); - expect(ctx.isRemote).toBe(true); + await expect(plugin.launch(plugin.createLaunchContext())).rejects.toThrow(/resolve the remote browser endpoint/); + expect(lib.connectOverCDP).not.toHaveBeenCalled(); + expect(connection.release).not.toHaveBeenCalled(); }); - test('provider forces useIncognitoPages to true (CDP)', () => { + it('a plain plugin (no remote connection) launches locally', async () => { const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); - class P extends RemoteBrowserProvider { - async connect() { - return { url: 'wss://test.io' }; - } - } + await plugin.launch(plugin.createLaunchContext()); - const plugin = new PlaywrightPlugin(lib as any, { remoteBrowser: new P() }); - expect(plugin.useIncognitoPages).toBe(true); + expect(lib.launch).toHaveBeenCalledTimes(1); + expect(lib.connect).not.toHaveBeenCalled(); + expect(lib.connectOverCDP).not.toHaveBeenCalled(); }); }); -describe('RemoteBrowserProvider — PuppeteerPlugin', () => { - let mockLogger: ReturnType; - - beforeEach(() => { - mockLogger = createMockLogger(); - serviceLocator.setLogger(mockLogger); - }); - - test('provider connect() → calls connect with browserWSEndpoint', async () => { +describe('Remote connection — PuppeteerPlugin', () => { + it('connects via connect() with the resolved endpoint and skips a local launch', async () => { const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any); + const connection = createConnection('ws://remote:9222'); + plugin.useRemoteConnection(connection, { connectOptions: { protocolTimeout: 1000 } }); 
- class SimpleProvider extends RemoteBrowserProvider { - async connect() { - return { url: 'wss://provider.io/cdp' }; - } - } - - const plugin = new PuppeteerPlugin(lib as any, { remoteBrowser: new SimpleProvider() }); const ctx = plugin.createLaunchContext(); await plugin.launch(ctx); - expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'wss://provider.io/cdp' }); + expect(connection.resolve).toHaveBeenCalledTimes(1); + expect(lib.connect).toHaveBeenCalledWith({ protocolTimeout: 1000, browserWSEndpoint: 'ws://remote:9222' }); expect(lib.launch).not.toHaveBeenCalled(); + expect(ctx._remoteToken).toBe(42); }); - test('provider release called on connection failure', async () => { + it('releases the session and throws BrowserLaunchError when connect fails', async () => { const lib = createMockPuppeteerLibrary(); - lib.connect.mockRejectedValue(new Error('Connection refused')); - - const releaseSpy = vi.fn(); - - class FailProvider extends RemoteBrowserProvider<{ id: string }> { - async connect() { - return { url: 'wss://fail.io', context: { id: 'sess-pptr' } }; - } - async release(context: { id: string }) { - releaseSpy(context); - } - } - - const plugin = new PuppeteerPlugin(lib as any, { remoteBrowser: new FailProvider() }); - const ctx = plugin.createLaunchContext(); + lib.connect.mockRejectedValueOnce(new Error('ECONNREFUSED')); + const plugin = new PuppeteerPlugin(lib as any); + const connection = createConnection(); + plugin.useRemoteConnection(connection); - await expect(plugin.launch(ctx)).rejects.toThrow('Failed to connect to remote browser'); - expect(releaseSpy).toHaveBeenCalledWith({ id: 'sess-pptr' }); + await expect(plugin.launch(plugin.createLaunchContext())).rejects.toThrow(/Failed to connect to remote browser/); + expect(connection.release).toHaveBeenCalledWith(42); }); - test('provider isRemote = true', () => { - const lib = createMockPuppeteerLibrary(); - - class P extends RemoteBrowserProvider { - async connect() { - return { url: 
'wss://test.io' }; - } - } + it('marks the launch context remote', () => { + const plugin = new PuppeteerPlugin(createMockPuppeteerLibrary() as any); + plugin.useRemoteConnection(createConnection()); - const plugin = new PuppeteerPlugin(lib as any, { remoteBrowser: new P() }); - const ctx = plugin.createLaunchContext(); - expect(ctx.isRemote).toBe(true); + expect(plugin.createLaunchContext().isRemote).toBe(true); }); }); diff --git a/packages/playwright-crawler/src/internals/playwright-crawler.ts b/packages/playwright-crawler/src/internals/playwright-crawler.ts index 44adbc59cc7c..42431bbbc446 100644 --- a/packages/playwright-crawler/src/internals/playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/playwright-crawler.ts @@ -220,16 +220,8 @@ export class PlaywrightCrawler< } if (headless != null) { - if (launchContext.remoteBrowser || launchContext.connectOptions || launchContext.connectOverCDPOptions) { - const log = serviceLocator.getLogger().child({ prefix: 'PlaywrightCrawler' }); - log.warning( - "'headless' is ignored when connecting to a remote browser. 
" + - 'The remote service controls headless mode.', - ); - } else { - launchContext.launchOptions ??= {} as LaunchOptions; - launchContext.launchOptions.headless = headless as boolean; - } + launchContext.launchOptions ??= {} as LaunchOptions; + launchContext.launchOptions.headless = headless as boolean; } const playwrightLauncher = new PlaywrightLauncher(launchContext, options.configuration); diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index 9268b2937acc..bdb538c2b274 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -1,13 +1,6 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PlaywrightPlugin } from '@crawlee/browser-pool'; -import type { - PlaywrightConnectOptions, - PlaywrightConnectOverCDPOptions, - RemoteBrowserConfig, - RemoteBrowserProvider, -} from '@crawlee/browser-pool'; -import { serviceLocator } from '@crawlee/core'; import ow from 'ow'; import type { Browser, BrowserType, LaunchOptions } from 'playwright'; @@ -77,33 +70,6 @@ export interface PlaywrightLaunchContext extends BrowserLaunchContext; } /** @@ -115,9 +81,6 @@ export class PlaywrightLauncher extends BrowserLauncher { ...BrowserLauncher.optionsShape, launcher: ow.optional.object, launchContextOptions: ow.optional.object, - connectOptions: ow.optional.object, - connectOverCDPOptions: ow.optional.object, - remoteBrowser: ow.optional.object, }; /** @@ -129,18 +92,6 @@ export class PlaywrightLauncher extends BrowserLauncher { ) { ow(launchContext, 'PlaywrightLauncherOptions', ow.object.exactShape(PlaywrightLauncher.optionsShape)); - const hasRemote = !!( - launchContext.connectOptions || - launchContext.connectOverCDPOptions || - launchContext.remoteBrowser - ); - if (hasRemote && launchContext.launchOptions 
!== undefined) { - throw new Error( - "'launchOptions' is ignored when using a remote browser. Set at most one of " + - "'launchOptions', 'connectOptions', 'connectOverCDPOptions', 'remoteBrowser'.", - ); - } - const { launcher = BrowserLauncher.requireLauncherOrThrow( 'playwright', @@ -163,14 +114,6 @@ export class PlaywrightLauncher extends BrowserLauncher { ); this.Plugin = PlaywrightPlugin; - - if (hasRemote && launchContext.useChrome) { - const log = serviceLocator.getLogger().child({ prefix: 'PlaywrightLauncher' }); - log.warning( - 'useChrome is set but will be ignored for remote browser connections. ' + - 'The remote service controls which browser binary is used.', - ); - } } } diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts index f06b2e637cb1..cd6680bbab00 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts @@ -199,16 +199,8 @@ export class PuppeteerCrawler< } if (headless != null) { - if (launchContext.remoteBrowser || launchContext.connectOverCDPOptions) { - const log = serviceLocator.getLogger().child({ prefix: 'PuppeteerCrawler' }); - log.warning( - "'headless' is ignored when connecting to a remote browser. 
" + - 'The remote service controls headless mode.', - ); - } else { - launchContext.launchOptions ??= {} as LaunchOptions; - launchContext.launchOptions.headless = headless as boolean; - } + launchContext.launchOptions ??= {} as LaunchOptions; + launchContext.launchOptions.headless = headless as boolean; } const puppeteerLauncher = new PuppeteerLauncher(launchContext, options.configuration); diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts index 32b41d1ea7cf..3d46c30dd432 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts @@ -1,8 +1,6 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PuppeteerPlugin } from '@crawlee/browser-pool'; -import type { PuppeteerConnectOverCDPOptions, RemoteBrowserConfig, RemoteBrowserProvider } from '@crawlee/browser-pool'; -import { serviceLocator } from '@crawlee/core'; import ow from 'ow'; import type { Browser } from 'puppeteer'; @@ -67,27 +65,6 @@ export interface PuppeteerLaunchContext extends BrowserLaunchContext; } /** @@ -98,8 +75,6 @@ export class PuppeteerLauncher extends BrowserLauncher protected static override optionsShape = { ...BrowserLauncher.optionsShape, launcher: ow.optional.object, - connectOverCDPOptions: ow.optional.object, - remoteBrowser: ow.optional.object, }; /** @@ -111,14 +86,6 @@ export class PuppeteerLauncher extends BrowserLauncher ) { ow(launchContext, 'PuppeteerLauncher', ow.object.exactShape(PuppeteerLauncher.optionsShape)); - const hasRemote = !!(launchContext.connectOverCDPOptions || launchContext.remoteBrowser); - if (hasRemote && launchContext.launchOptions !== undefined) { - throw new Error( - "'launchOptions' is ignored when using a remote browser. 
Set at most one of " + - "'launchOptions', 'connectOverCDPOptions', 'remoteBrowser'.", - ); - } - const { launcher = BrowserLauncher.requireLauncherOrThrow('puppeteer', 'apify/actor-node-puppeteer-chrome'), ...browserLauncherOptions @@ -133,14 +100,6 @@ export class PuppeteerLauncher extends BrowserLauncher ); this.Plugin = PuppeteerPlugin; - - if (hasRemote && launchContext.useChrome) { - const log = serviceLocator.getLogger().child({ prefix: 'PuppeteerLauncher' }); - log.warning( - 'useChrome is set but will be ignored for remote browser connections. ' + - 'The remote service controls which browser binary is used.', - ); - } } protected override _getDefaultHeadlessOption(): boolean { diff --git a/test/core/browser_launchers/playwright_launcher.test.ts b/test/core/browser_launchers/playwright_launcher.test.ts index fa2cc35ce351..ba29d0408c28 100644 --- a/test/core/browser_launchers/playwright_launcher.test.ts +++ b/test/core/browser_launchers/playwright_launcher.test.ts @@ -289,32 +289,4 @@ describe('launchPlaywright()', () => { }); }); - describe('launchOptions + remote mutual exclusion', () => { - test('throws when launchOptions combined with connectOptions', async () => { - await expect( - launchPlaywright({ - launchOptions: { headless: true }, - connectOptions: { wsEndpoint: 'ws://remote:3000' }, - }), - ).rejects.toThrow("'launchOptions' is ignored when using a remote browser"); - }); - - test('throws when launchOptions combined with connectOverCDPOptions', async () => { - await expect( - launchPlaywright({ - launchOptions: { headless: true }, - connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, - }), - ).rejects.toThrow("'launchOptions' is ignored when using a remote browser"); - }); - - test('throws when launchOptions combined with remoteBrowser', async () => { - await expect( - launchPlaywright({ - launchOptions: { headless: true }, - remoteBrowser: { endpoint: 'wss://remote.io' }, - }), - ).rejects.toThrow("'launchOptions' is ignored when using 
a remote browser"); - }); - }); }); diff --git a/test/core/browser_launchers/puppeteer_launcher.test.ts b/test/core/browser_launchers/puppeteer_launcher.test.ts index 735f832b715e..4b36679a0ffe 100644 --- a/test/core/browser_launchers/puppeteer_launcher.test.ts +++ b/test/core/browser_launchers/puppeteer_launcher.test.ts @@ -309,23 +309,4 @@ describe('launchPuppeteer()', () => { }); }); - describe('launchOptions + remote mutual exclusion', () => { - test('throws when launchOptions combined with connectOverCDPOptions', async () => { - await expect( - launchPuppeteer({ - launchOptions: { headless: true }, - connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:3000' }, - }), - ).rejects.toThrow("'launchOptions' is ignored when using a remote browser"); - }); - - test('throws when launchOptions combined with remoteBrowser', async () => { - await expect( - launchPuppeteer({ - launchOptions: { headless: true }, - remoteBrowser: { endpoint: 'wss://remote.io' }, - }), - ).rejects.toThrow("'launchOptions' is ignored when using a remote browser"); - }); - }); }); diff --git a/test/core/crawlers/browser_crawler.test.ts b/test/core/crawlers/browser_crawler.test.ts index 388b45781111..c0ca30eeba07 100644 --- a/test/core/crawlers/browser_crawler.test.ts +++ b/test/core/crawlers/browser_crawler.test.ts @@ -6,6 +6,7 @@ import { BrowserPool as BrowserPoolClass, OperatingSystemsName, PuppeteerPlugin, + RemoteBrowserPool, } from '@crawlee/browser-pool'; import { bindMethodsToServiceLocator, BLOCKED_STATUS_CODES, ServiceLocator, SessionPool } from '@crawlee/core'; import type { PuppeteerGoToOptions } from '@crawlee/puppeteer'; @@ -176,6 +177,46 @@ describe('BrowserCrawler', () => { } }); + test.concurrent('builds and owns a RemoteBrowserPool from the remoteBrowser option', async () => { + const localStorageEmulator = new MemoryStorageEmulator(); + await localStorageEmulator.init(); + + try { + const crawler = new BrowserCrawlerTest({ + remoteBrowser: { endpoint: 
'ws://remote:9222', maxOpenBrowsers: 2 }, + browserPoolOptions: { browserPlugins: [new PuppeteerPlugin(puppeteer)] }, + requestHandler: async () => {}, + }); + + expect(crawler.browserPool).toBeInstanceOf(RemoteBrowserPool); + expect((crawler.browserPool as RemoteBrowserPool).maxOpenBrowsers).toBe(2); + + await (crawler.browserPool as RemoteBrowserPool).destroy(); + } finally { + await localStorageEmulator.destroy(); + } + }); + + test.concurrent('throws when both browserPool and remoteBrowser are set', async () => { + const localStorageEmulator = new MemoryStorageEmulator(); + await localStorageEmulator.init(); + const externalPool = new BrowserPoolClass({ browserPlugins: [new PuppeteerPlugin(puppeteer)] }); + + try { + expect( + () => + new BrowserCrawlerTest({ + browserPool: externalPool, + remoteBrowser: { endpoint: 'ws://remote:9222' }, + requestHandler: async () => {}, + }), + ).toThrow(/at most one of 'browserPool' and 'remoteBrowser'/); + } finally { + await externalPool.destroy(); + await localStorageEmulator.destroy(); + } + }); + test.concurrent('should retire session after TimeoutError', async () => { const localStorageEmulator = new MemoryStorageEmulator(); await localStorageEmulator.init(); diff --git a/test/integration/remote-browser-incognito.test.ts b/test/integration/remote-browser-incognito.test.ts index 766b4716216e..4c176870df99 100644 --- a/test/integration/remote-browser-incognito.test.ts +++ b/test/integration/remote-browser-incognito.test.ts @@ -11,19 +11,11 @@ * * With the wrapper removed, request 2's body should report no cookies. 
*/ -import { RemoteBrowserProvider } from '@crawlee/browser-pool'; import { PlaywrightCrawler } from 'crawlee'; import { expect, test } from 'vitest'; import { BROWSERLESS_URL, httpbin } from './helpers.js'; -class BrowserlessCDPProvider extends RemoteBrowserProvider { - override maxOpenBrowsers = 1; - async connect() { - return { url: BROWSERLESS_URL }; - } -} - // Gate on CRAWLEE_DIFFICULT_TESTS so plain `pnpm test` skips integration tests // (no Docker required); `pnpm test:integration` and `pnpm test:full` set the flag. test.skipIf(!process.env.CRAWLEE_DIFFICULT_TESTS)( @@ -33,14 +25,15 @@ test.skipIf(!process.env.CRAWLEE_DIFFICULT_TESTS)( const controllerIdByPage = new WeakMap(); const crawler = new PlaywrightCrawler({ - launchContext: { - remoteBrowser: new BrowserlessCDPProvider(), + remoteBrowser: { + endpoint: BROWSERLESS_URL, + maxOpenBrowsers: 1, }, browserPoolOptions: { retireBrowserAfterPageCount: 10, // keep the same browser across both requests maxOpenPagesPerBrowser: 2, postPageCreateHooks: [ - (page, browserController) => { + (page: object, browserController: { id: string }) => { controllerIdByPage.set(page, browserController.id); }, ], From 08ea16b04d3e7c50702dfdd18badcc870f0a5d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 16 Jun 2026 22:26:07 +0000 Subject: [PATCH 44/45] fix: remove zero-width spaces from remote BrowserLaunchError messages Two BrowserLaunchError strings ended with an invisible U+200B zero-width space, which shipped in user-facing error messages and broke log grepping / string matching. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/browser-pool/src/abstract-classes/browser-plugin.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 22d14a1dee4d..b08e3b55c38e 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -181,7 +181,7 @@ export abstract class BrowserPlugin< try { ({ url, token } = await connection.resolve({ proxyUrl: launchContext.proxyUrl })); } catch (cause) { - throw new BrowserLaunchError('Failed to resolve the remote browser endpoint.​', { cause }); + throw new BrowserLaunchError('Failed to resolve the remote browser endpoint.', { cause }); } launchContext._remoteToken = token; @@ -192,7 +192,7 @@ export abstract class BrowserPlugin< await connection.release(token); throw new BrowserLaunchError( `Failed to connect to remote browser at "${sanitizeEndpointForLog(url)}". ` + - 'Check that the endpoint is reachable and accepts the configured protocol.​', + 'Check that the endpoint is reachable and accepts the configured protocol.', { cause }, ); } From 91558cf007e9a5732bd56bca5ace298d27e6069c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Sol=C3=A1r?= Date: Tue, 16 Jun 2026 22:42:31 +0000 Subject: [PATCH 45/45] fix: strip query secrets from sanitized remote endpoint logs sanitizeEndpointForLog only masked userinfo credentials, so tokens passed in the query string (e.g. Browserless '?token=...') still leaked into connect-failure error messages. Drop the query and fragment entirely, keeping protocol/host/port/path for diagnostics. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/browser-pool/src/utils.ts | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/packages/browser-pool/src/utils.ts b/packages/browser-pool/src/utils.ts index 476e98f42f65..ca59455d72c5 100644 --- a/packages/browser-pool/src/utils.ts +++ b/packages/browser-pool/src/utils.ts @@ -6,14 +6,19 @@ export type UnwrapPromise = T extends PromiseLike ? UnwrapPromise export function noop(..._args: unknown[]): void {} -/** Strips credentials from a URL so it can be safely included in logs and error messages. */ +/** + * Strips secrets from a URL so it can be safely included in logs and error messages. Removes userinfo + * credentials and the entire query string and fragment — remote browser services routinely carry tokens + * there (e.g. Browserless `?token=…`), and we can't tell which params are sensitive. Keeps the + * protocol, host, port, and path, which are enough to diagnose connection failures. + */ export function sanitizeEndpointForLog(endpoint: string): string { try { const url = new URL(endpoint); - if (url.username || url.password) { - url.username = '***'; - url.password = '***'; - } + url.username = ''; + url.password = ''; + url.search = ''; + url.hash = ''; return url.toString(); } catch { return '';