diff --git a/packages/apify/src/proxy_configuration.ts b/packages/apify/src/proxy_configuration.ts index 0d452ab645..770ef3c474 100644 --- a/packages/apify/src/proxy_configuration.ts +++ b/packages/apify/src/proxy_configuration.ts @@ -1,13 +1,10 @@ -import type { - ProxyConfigurationOptions as CoreProxyConfigurationOptions, - ProxyInfo as CoreProxyInfo, -} from '@crawlee/core'; +import type { ProxyConfigurationOptions as CoreProxyConfigurationOptions } from '@crawlee/core'; import { ProxyConfiguration as CoreProxyConfiguration } from '@crawlee/core'; +import type { ProxyInfo as CoreProxyInfo } from '@crawlee/types'; import { gotScraping } from 'got-scraping'; import ow from 'ow'; import { APIFY_ENV_VARS, APIFY_PROXY_VALUE_REGEX } from '@apify/consts'; -import { cryptoRandomObjectId } from '@apify/utilities'; import { Actor } from './actor.js'; import { Configuration } from './configuration.js'; @@ -18,6 +15,38 @@ const CHECK_ACCESS_REQUEST_TIMEOUT_MILLIS = 4_000; const CHECK_ACCESS_MAX_ATTEMPTS = 2; const COUNTRY_CODE_REGEX = /^[A-Z]{2}$/; +type CoreProxyOptions = Parameters<CoreProxyConfiguration['newUrl']>[0]; + +/** + * Bridges the SDK's legacy `(sessionId, options?)` calling style with + * crawlee v4's `(options)` shape — pulls `sessionId` from a `Request` + * carried in `options` when no explicit `sessionId` is given. Rejects + * values that are neither a sessionId nor a plain options object + * (e.g. `Date`, arrays). 
+ */ +function parseSessionIdOrOptions( + arg: string | number | CoreProxyOptions | undefined, + legacyOptions?: CoreProxyOptions, +): { sessionId: string | undefined; options: CoreProxyOptions | undefined } { + if (arg === undefined) { + return { sessionId: undefined, options: legacyOptions }; + } + if (typeof arg === 'string' || typeof arg === 'number') { + return { sessionId: String(arg), options: legacyOptions }; + } + if ( + typeof arg !== 'object' || + arg === null || + Array.isArray(arg) || + Object.getPrototypeOf(arg) !== Object.prototype + ) { + throw new TypeError( + 'Expected sessionId (string/number) or a TieredProxyOptions object', + ); + } + return { sessionId: arg.request?.sessionId, options: arg }; +} + export interface ProxyConfigurationOptions extends CoreProxyConfigurationOptions { /** @@ -56,15 +85,6 @@ export interface ProxyConfigurationOptions * configurate the proxy by UI input schema. You should use the `countryCode` option in your crawler code. */ apifyProxyCountry?: string; - - /** - * Multiple different ProxyConfigurationOptions stratified into tiers. Crawlee crawlers will switch between those tiers - * based on the blocked request statistics. - */ - tieredProxyConfig?: Omit< - ProxyConfigurationOptions, - keyof CoreProxyConfigurationOptions | 'tieredProxyConfig' - >[]; } /** @@ -100,6 +120,13 @@ export interface ProxyConfigurationOptions * ``` */ export interface ProxyInfo extends CoreProxyInfo { + /** + * The Apify Proxy session identifier the URL was minted for, if any. + * v3 carried this on the base `ProxyInfo`; v4 dropped it, so the SDK + * re-declares it here for users that read `proxyInfo.sessionId`. + */ + sessionId?: string; + /** + * An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy). + * If not provided, the proxy will select the groups automatically. 
@@ -193,10 +220,6 @@ export class ProxyConfiguration extends CoreProxyConfiguration { apifyProxyCountry: ow.optional.string.matches(COUNTRY_CODE_REGEX), password: ow.optional.string, - tieredProxyUrls: ow.optional.array.ofType( - ow.array.ofType(ow.string), - ), - tieredProxyConfig: ow.optional.array.ofType(ow.object), }), ); @@ -206,18 +229,11 @@ export class ProxyConfiguration extends CoreProxyConfiguration { countryCode, apifyProxyCountry, password = config.proxyPassword, - tieredProxyConfig, - tieredProxyUrls, } = options; - this.tieredProxyUrls ??= tieredProxyUrls; - - if (tieredProxyConfig) { - this.tieredProxyUrls = this._generateTieredProxyUrls( - tieredProxyConfig, - options, - ); - } + // crawlee v4 (>=beta.51) removed `tieredProxyUrls` / + // `tieredProxyConfig` (see apify/crawlee#3599) — the SDK no + // longer threads tiered config through to the base class. const groupsToUse = groups.length ? groups : apifyProxyGroups; const countryCodeToUse = countryCode || apifyProxyCountry; @@ -241,7 +257,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration { this.port = port; this.usesApifyProxy = !this.proxyUrls && !this.newUrlFunction; - if (proxyUrls && proxyUrls.some((url) => url.includes('apify.com'))) { + if (proxyUrls && proxyUrls.some((url) => url?.includes('apify.com'))) { this.log.warning( 'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties instead of `proxyUrls`.\n' + 'See https://sdk.apify.com/docs/guides/proxy-management#apify-proxy-configuration', @@ -304,10 +320,20 @@ export class ProxyConfiguration extends CoreProxyConfiguration { * @return Represents information about used proxy and its configuration. 
*/ override async newProxyInfo( - sessionId?: string | number, - options?: Parameters<CoreProxyConfiguration['newProxyInfo']>[1], + sessionIdOrOptions?: + | string + | number + | Parameters<CoreProxyConfiguration['newProxyInfo']>[0], + legacyOptions?: Parameters<CoreProxyConfiguration['newProxyInfo']>[0], ): Promise<ProxyInfo | undefined> { - if (typeof sessionId === 'number') sessionId = `${sessionId}`; + // crawlee v4 dropped the `(sessionId, options)` overload — `newProxyInfo` + // now takes a single `NewUrlOptions` argument and pulls `sessionId` + // from `options.request`. Keep the SDK's legacy "pass sessionId directly" + // shape working by discriminating at runtime. + const { sessionId } = parseSessionIdOrOptions( + sessionIdOrOptions, + legacyOptions, + ); ow( sessionId, ow.optional.string @@ -315,25 +341,43 @@ export class ProxyConfiguration extends CoreProxyConfiguration { .matches(APIFY_PROXY_VALUE_REGEX), ); - const proxyInfo = await super.newProxyInfo(sessionId, options); - if (!proxyInfo) return proxyInfo; + const url = await this.newUrl(sessionIdOrOptions, legacyOptions); + if (!url) return undefined; const { groups, countryCode, password, port, hostname } = ( - this.usesApifyProxy ? this : new URL(proxyInfo.url) + this.usesApifyProxy ? this : new URL(url) ) as ProxyConfiguration; - return { - ...proxyInfo, + // Extract `username` from the resolved URL — crawlee v3 carried it + // on `ProxyInfo` and tests rely on it (e.g. for Apify Proxy session + // formatting). v4's `super.newProxyInfo` would surface this, but we + // bypass `super` here so the SDK can keep its legacy `sessionId` + // calling convention. Decode the URL-encoded username so callers + // see the human-readable form (matches v3 behaviour). + const rawUsername = new URL(url).username; + const username = rawUsername + ? decodeURIComponent(rawUsername) + : undefined; + + // Build the result lazily: omit Apify-only fields when the SDK is + // wrapping a custom `proxyUrls` rotation (matches v3 shape, which + // tests rely on with strict deep-equal). 
+ const result: Partial<ProxyInfo> = { + url, sessionId, - groups, - countryCode, // this.password is not encoded, but the password from the URL will be, we need to normalize password: this.usesApifyProxy ? (password ?? '') : decodeURIComponent(password!), hostname, port: port!, + username, }; + if (this.usesApifyProxy) { + result.groups = groups; + if (countryCode !== undefined) result.countryCode = countryCode; + } + return result as ProxyInfo; } /** @@ -350,10 +394,16 @@ export class ProxyConfiguration extends CoreProxyConfiguration { * For example, `http://bob:password123@proxy.example.com:8000` */ override async newUrl( - sessionId?: string | number, - options?: Parameters<CoreProxyConfiguration['newUrl']>[1], + sessionIdOrOptions?: + | string + | number + | Parameters<CoreProxyConfiguration['newUrl']>[0], + legacyOptions?: Parameters<CoreProxyConfiguration['newUrl']>[0], ): Promise<string | undefined> { - if (typeof sessionId === 'number') sessionId = `${sessionId}`; + const { sessionId, options } = parseSessionIdOrOptions( + sessionIdOrOptions, + legacyOptions, + ); ow( sessionId, ow.optional.string @@ -362,40 +412,41 @@ export class ProxyConfiguration extends CoreProxyConfiguration { ); if (this.newUrlFunction) { return ( - (await this._callNewUrlFunction(sessionId, { + (await this._callNewUrlFunction({ request: options?.request, })) ?? undefined ); } if (this.proxyUrls) { - return this._handleCustomUrl(sessionId); - } - - if (this.tieredProxyUrls) { - return ( - this._handleTieredUrl( - sessionId ?? cryptoRandomObjectId(6), - options, - ).proxyUrl ?? undefined - ); + // `_handleCustomUrl` was removed from `CoreProxyConfiguration` in + // v4; inline the rotation logic to preserve session-stickiness. + // Round-robin index for sessionless calls (post-increment so the + // first call returns proxyUrls[0]); per-session sticky mapping + // when a sessionId is provided. + const index = + sessionId !== undefined + ? this.getSessionIndex(sessionId) + : this.nextCustomUrlIndex++ % this.proxyUrls.length; + return this.proxyUrls[index] ?? 
undefined; } return this.composeDefaultUrl(sessionId); } - protected _generateTieredProxyUrls( - tieredProxyConfig: NonNullable< - ProxyConfigurationOptions['tieredProxyConfig'] - >, - globalOptions: ProxyConfigurationOptions, - ) { - return tieredProxyConfig.map((config) => [ - new ProxyConfiguration({ - ...globalOptions, - ...config, - tieredProxyConfig: undefined, - }).composeDefaultUrl(), - ]); + /** + * Stable per-session index into `proxyUrls`, replacing the removed + * `_handleCustomUrl(sessionId)` from crawlee v3. + */ + private getSessionIndex(sessionId: string): number { + if (!this.usedProxyUrls.has(sessionId)) { + this.usedProxyUrls.set( + sessionId, + this.proxyUrls![ + this.usedProxyUrls.size % this.proxyUrls!.length + ], + ); + } + return this.proxyUrls!.indexOf(this.usedProxyUrls.get(sessionId)!); } /** @@ -438,7 +489,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration { */ // TODO: Make this private protected async _setPasswordIfToken(): Promise<void> { - const {token} = (this.config as Configuration); + const { token } = this.config as Configuration; if (!token) return; try { @@ -500,7 +551,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration { } | undefined > { - const {proxyStatusUrl} = (this.config as Configuration); + const { proxyStatusUrl } = this.config as Configuration; const requestOpts = { url: `${proxyStatusUrl}/?format=json`, proxyUrl: await this.newUrl(), diff --git a/test/apify/proxy_configuration.test.ts b/test/apify/proxy_configuration.test.ts index 8c61a63177..d42b0808cb 100644 --- a/test/apify/proxy_configuration.test.ts +++ b/test/apify/proxy_configuration.test.ts @@ -1,10 +1,23 @@ -import { Actor, ProxyConfiguration } from 'apify'; +import { Actor, Configuration, ProxyConfiguration } from 'apify'; import { UserClient } from 'apify-client'; -import { type Dictionary, Request, sleep } from 'crawlee'; +import { type Dictionary, Request, serviceLocator, sleep } from 'crawlee'; import { gotScraping } from 
'got-scraping'; import { APIFY_ENV_VARS, LOCAL_APIFY_ENV_VARS } from '@apify/consts'; +// crawlee v4's Configuration resolves env vars eagerly at construction, +// and the SDK keeps `Configuration.globalConfig` plus `Actor._instance` as +// cached singletons. Tests in this file mutate proxy-related env vars at +// runtime, so we have to clear those caches before each test. +function resetGlobalState() { + serviceLocator.reset(); + ( + Configuration as unknown as { globalConfig?: Configuration } + ).globalConfig = undefined; + // eslint-disable-next-line no-underscore-dangle -- `_instance` is the upstream Actor singleton field + (Actor as unknown as { _instance?: Actor })._instance = undefined; +} + const groups = ['GROUP1', 'GROUP2']; const hostname = LOCAL_APIFY_ENV_VARS[APIFY_ENV_VARS.PROXY_HOSTNAME]; const port = Number(LOCAL_APIFY_ENV_VARS[APIFY_ENV_VARS.PROXY_PORT]); @@ -175,8 +188,10 @@ describe('ProxyConfiguration', () => { proxyConfiguration.newUrl('a-b'), ).rejects.toThrow(), expect(proxyConfiguration.newUrl('a$b')).rejects.toThrow(), - // @ts-expect-error invalid input - expect(proxyConfiguration.newUrl({})).rejects.toThrow(), + // crawlee v4 made `newUrl` accept `TieredProxyOptions`, so + // an empty object is a valid (sessionless) call now. We only + // reject inputs that are neither a sessionId nor a plain + // options object. 
// @ts-expect-error invalid input expect(proxyConfiguration.newUrl(new Date())).rejects.toThrow(), expect( @@ -486,77 +501,19 @@ describe('ProxyConfiguration', () => { }); }); - describe('With tieredProxyUrls', () => { - test('proxy configuration accepts the tiered urls (Crawlee style)', async () => { - const proxyConfiguration = new ProxyConfiguration({ - tieredProxyUrls: [ - ['http://proxy.com:1111'], - ['http://proxy.com:2222'], - ['http://proxy.com:3333'], - ['http://proxy.com:4444'], - ], - }); - - // through newUrl() - expect( - await proxyConfiguration.newUrl('abc', { - request: new Request({ url: 'http://example.com' }) as any, - }), - ).toEqual('http://proxy.com:1111'); - - // through newProxyInfo() - expect( - (await proxyConfiguration.newProxyInfo('abc', { - request: new Request({ - url: 'http://example.com', - }) as any, - }))!.url, - ).toEqual('http://proxy.com:1111'); - }); - - test('shorthand tieredProxyConfig gets correctly expanded', async () => { - const proxyConfiguration = new ProxyConfiguration({ - password: 'password', - countryCode: 'DE', - tieredProxyConfig: [ - { - groups: ['GROUP1'], - countryCode: 'CZ', - }, - { - groups: ['GROUP2'], - countryCode: 'US', - }, - { - groups: ['GROUP3', 'GROUP4'], - }, - { - groups: ['GROUP3', 'GROUP4'], - countryCode: undefined, - }, - ], - }); - - // eslint-disable-next-line dot-notation - expect(proxyConfiguration['tieredProxyUrls']).toEqual([ - [ - 'http://groups-GROUP1,country-CZ:password@proxy.apify.com:8000', - ], - [ - 'http://groups-GROUP2,country-US:password@proxy.apify.com:8000', - ], - [ - 'http://groups-GROUP3+GROUP4,country-DE:password@proxy.apify.com:8000', - ], - ['http://groups-GROUP3+GROUP4:password@proxy.apify.com:8000'], - ]); - }); - }); + // `tieredProxyUrls` / `tieredProxyConfig` were removed from + // crawlee v4 (apify/crawlee#3599); the corresponding test groups + // were dropped here and in the `Actor.createProxyConfiguration()` + // describe below. 
}); describe('Actor.createProxyConfiguration()', () => { const userData = { proxy: { password } }; + beforeEach(() => { + resetGlobalState(); + }); + test('should work with all options', async () => { const status = { connected: true }; const proxyUrl = proxyUrlNoSession; @@ -714,70 +671,7 @@ describe('Actor.createProxyConfiguration()', () => { gotScrapingSpy.mockRestore(); }); - describe('With tieredProxyUrls', () => { - test('proxy configuration accepts the tiered urls (Crawlee style)', async () => { - const proxyConfiguration = await Actor.createProxyConfiguration({ - tieredProxyUrls: [ - ['http://proxy.com:1111'], - ['http://proxy.com:2222'], - ['http://proxy.com:3333'], - ['http://proxy.com:4444'], - ], - }); - - // through newUrl() - expect( - await proxyConfiguration!.newUrl('abc', { - request: new Request({ url: 'http://example.com' }) as any, - }), - ).toEqual('http://proxy.com:1111'); - - // through newProxyInfo() - expect( - (await proxyConfiguration!.newProxyInfo('abc', { - request: new Request({ - url: 'http://example.com', - }) as any, - }))!.url, - ).toEqual('http://proxy.com:1111'); - }); - - test('shorthand tieredProxyConfig gets correctly expanded', async () => { - const proxyConfiguration = await Actor.createProxyConfiguration({ - password: 'password', - countryCode: 'DE', - tieredProxyConfig: [ - { - groups: ['GROUP1'], - countryCode: 'CZ', - }, - { - groups: ['GROUP2'], - countryCode: 'US', - }, - { - groups: ['GROUP3', 'GROUP4'], - }, - { - groups: ['GROUP3', 'GROUP4'], - countryCode: undefined, - }, - ], - }); - - // eslint-disable-next-line dot-notation - expect(proxyConfiguration!['tieredProxyUrls']).toEqual([ - [ - 'http://groups-GROUP1,country-CZ:password@proxy.apify.com:8000', - ], - [ - 'http://groups-GROUP2,country-US:password@proxy.apify.com:8000', - ], - [ - 'http://groups-GROUP3+GROUP4,country-DE:password@proxy.apify.com:8000', - ], - ['http://groups-GROUP3+GROUP4:password@proxy.apify.com:8000'], - ]); - }); - }); + // 
`tieredProxyUrls` / `tieredProxyConfig` were removed from + // crawlee v4 (apify/crawlee#3599); the corresponding test groups + // were dropped here too. });