Skip to content
187 changes: 119 additions & 68 deletions packages/apify/src/proxy_configuration.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
import type {
ProxyConfigurationOptions as CoreProxyConfigurationOptions,
ProxyInfo as CoreProxyInfo,
} from '@crawlee/core';
import type { ProxyConfigurationOptions as CoreProxyConfigurationOptions } from '@crawlee/core';
import { ProxyConfiguration as CoreProxyConfiguration } from '@crawlee/core';
import type { ProxyInfo as CoreProxyInfo } from '@crawlee/types';
import { gotScraping } from 'got-scraping';
import ow from 'ow';

import { APIFY_ENV_VARS, APIFY_PROXY_VALUE_REGEX } from '@apify/consts';
import { cryptoRandomObjectId } from '@apify/utilities';

import { Actor } from './actor.js';
import { Configuration } from './configuration.js';
Expand All @@ -18,6 +15,38 @@ const CHECK_ACCESS_REQUEST_TIMEOUT_MILLIS = 4_000;
const CHECK_ACCESS_MAX_ATTEMPTS = 2;
const COUNTRY_CODE_REGEX = /^[A-Z]{2}$/;

type CoreProxyOptions = Parameters<CoreProxyConfiguration['newProxyInfo']>[0];

/**
* Bridges the SDK's legacy `(sessionId, options?)` calling style with
* crawlee v4's `(options)` shape — pulls `sessionId` from a `Request`
* carried in `options` when no explicit `sessionId` is given. Rejects
* values that are neither a sessionId nor a plain options object
* (e.g. `Date`, arrays).
*/
function parseSessionIdOrOptions(
arg: string | number | CoreProxyOptions | undefined,
legacyOptions?: CoreProxyOptions,
): { sessionId: string | undefined; options: CoreProxyOptions } {
if (arg === undefined) {
return { sessionId: undefined, options: legacyOptions };
}
if (typeof arg === 'string' || typeof arg === 'number') {
return { sessionId: String(arg), options: legacyOptions };
}
if (
typeof arg !== 'object' ||
arg === null ||
Array.isArray(arg) ||
Object.getPrototypeOf(arg) !== Object.prototype
) {
throw new TypeError(
'Expected sessionId (string/number) or a TieredProxyOptions object',
);
}
return { sessionId: arg.request?.sessionId, options: arg };
}

export interface ProxyConfigurationOptions
extends CoreProxyConfigurationOptions {
/**
Expand Down Expand Up @@ -56,15 +85,6 @@ export interface ProxyConfigurationOptions
* configurate the proxy by UI input schema. You should use the `countryCode` option in your crawler code.
*/
apifyProxyCountry?: string;

/**
* Multiple different ProxyConfigurationOptions stratified into tiers. Crawlee crawlers will switch between those tiers
* based on the blocked request statistics.
*/
tieredProxyConfig?: Omit<
ProxyConfigurationOptions,
keyof CoreProxyConfigurationOptions | 'tieredProxyConfig'
>[];
}

/**
Expand Down Expand Up @@ -100,6 +120,13 @@ export interface ProxyConfigurationOptions
* ```
*/
export interface ProxyInfo extends CoreProxyInfo {
/**
* The Apify Proxy session identifier the URL was minted for, if any.
* v3 carried this on the base `ProxyInfo`; v4 dropped it, so the SDK
* re-declares it here for users that read `proxyInfo.sessionId`.
*/
sessionId?: string;

/**
* An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy).
* If not provided, the proxy will select the groups automatically.
Expand Down Expand Up @@ -193,10 +220,6 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
apifyProxyCountry:
ow.optional.string.matches(COUNTRY_CODE_REGEX),
password: ow.optional.string,
tieredProxyUrls: ow.optional.array.ofType(
ow.array.ofType(ow.string),
),
tieredProxyConfig: ow.optional.array.ofType(ow.object),
}),
);

Expand All @@ -206,18 +229,11 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
countryCode,
apifyProxyCountry,
password = config.proxyPassword,
tieredProxyConfig,
tieredProxyUrls,
} = options;

this.tieredProxyUrls ??= tieredProxyUrls;

if (tieredProxyConfig) {
this.tieredProxyUrls = this._generateTieredProxyUrls(
tieredProxyConfig,
options,
);
}
// crawlee v4 (>=beta.51) removed `tieredProxyUrls` /
// `tieredProxyConfig` (see apify/crawlee#3599) — the SDK no
// longer threads tiered config through to the base class.

const groupsToUse = groups.length ? groups : apifyProxyGroups;
const countryCodeToUse = countryCode || apifyProxyCountry;
Expand All @@ -241,7 +257,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
this.port = port;
this.usesApifyProxy = !this.proxyUrls && !this.newUrlFunction;

if (proxyUrls && proxyUrls.some((url) => url.includes('apify.com'))) {
if (proxyUrls && proxyUrls.some((url) => url?.includes('apify.com'))) {
this.log.warning(
'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties instead of `proxyUrls`.\n' +
'See https://sdk.apify.com/docs/guides/proxy-management#apify-proxy-configuration',
Expand Down Expand Up @@ -304,36 +320,64 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
* @return Represents information about used proxy and its configuration.
*/
override async newProxyInfo(
sessionId?: string | number,
options?: Parameters<CoreProxyConfiguration['newProxyInfo']>[1],
sessionIdOrOptions?:
| string
| number
| Parameters<CoreProxyConfiguration['newProxyInfo']>[0],
legacyOptions?: Parameters<CoreProxyConfiguration['newProxyInfo']>[0],
Comment on lines +323 to +327
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The removal of the sessionId parameter from Crawlee v4 was intentional, see this comment .

According to the new "UserPool" design, ProxyConfiguration shouldn't care for Session details (the resolved proxy URL is stored in a Session after it's retrieved and the ProxyConfiguration is not queried again).

Imo, each call to newProxyInfo in SDK should just return a random, valid proxy URL - that is, e.g., with random session IDs. This way, the SDK shields the user from the Apify Proxy session implementation.

proxyConfig.newProxyInfo() // { url: "http://session-131231@proxy.apify.com" }
proxyConfig.newProxyInfo() // { url: "http://session-234244@proxy.apify.com" }
proxyConfig.newProxyInfo() // { url: "http://session-342434@proxy.apify.com" }
// ...

As far as the consumer is concerned, these are just opaque URLs.

): Promise<ProxyInfo | undefined> {
if (typeof sessionId === 'number') sessionId = `${sessionId}`;
// crawlee v4 dropped the `(sessionId, options)` overload — `newProxyInfo`
// now takes a single `NewUrlOptions` argument and pulls `sessionId`
// from `options.request`. Keep the SDK's legacy "pass sessionId directly"
// shape working by discriminating at runtime.
const { sessionId } = parseSessionIdOrOptions(
sessionIdOrOptions,
legacyOptions,
);
ow(
sessionId,
ow.optional.string
.maxLength(MAX_SESSION_ID_LENGTH)
.matches(APIFY_PROXY_VALUE_REGEX),
);

const proxyInfo = await super.newProxyInfo(sessionId, options);
if (!proxyInfo) return proxyInfo;
const url = await this.newUrl(sessionIdOrOptions, legacyOptions);
if (!url) return undefined;

const { groups, countryCode, password, port, hostname } = (
this.usesApifyProxy ? this : new URL(proxyInfo.url)
this.usesApifyProxy ? this : new URL(url)
) as ProxyConfiguration;

return {
...proxyInfo,
// Extract `username` from the resolved URL — crawlee v3 carried it
// on `ProxyInfo` and tests rely on it (e.g. for Apify Proxy session
// formatting). v4's `super.newProxyInfo` would surface this, but we
// bypass `super` here so the SDK can keep its legacy `sessionId`
// calling convention. Decode the URL-encoded username so callers
// see the human-readable form (matches v3 behaviour).
const rawUsername = new URL(url).username;
const username = rawUsername
? decodeURIComponent(rawUsername)
: undefined;

// Build the result lazily: omit Apify-only fields when the SDK is
// wrapping a custom `proxyUrls` rotation (matches v3 shape, which
// tests rely on with strict deep-equal).
const result: Partial<ProxyInfo> = {
url,
sessionId,
groups,
countryCode,
// this.password is not encoded, but the password from the URL will be, we need to normalize
password: this.usesApifyProxy
? (password ?? '')
: decodeURIComponent(password!),
hostname,
port: port!,
username,
};
if (this.usesApifyProxy) {
result.groups = groups;
if (countryCode !== undefined) result.countryCode = countryCode;
}
return result as ProxyInfo;
}

/**
Expand All @@ -350,10 +394,16 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
* For example, `http://bob:password123@proxy.example.com:8000`
*/
override async newUrl(
sessionId?: string | number,
options?: Parameters<CoreProxyConfiguration['newUrl']>[1],
sessionIdOrOptions?:
| string
| number
| Parameters<CoreProxyConfiguration['newUrl']>[0],
legacyOptions?: Parameters<CoreProxyConfiguration['newUrl']>[0],
): Promise<string | undefined> {
if (typeof sessionId === 'number') sessionId = `${sessionId}`;
const { sessionId, options } = parseSessionIdOrOptions(
sessionIdOrOptions,
legacyOptions,
);
ow(
sessionId,
ow.optional.string
Expand All @@ -362,40 +412,41 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
);
if (this.newUrlFunction) {
return (
(await this._callNewUrlFunction(sessionId, {
(await this._callNewUrlFunction({
request: options?.request,
})) ?? undefined
);
}
if (this.proxyUrls) {
return this._handleCustomUrl(sessionId);
}

if (this.tieredProxyUrls) {
return (
this._handleTieredUrl(
sessionId ?? cryptoRandomObjectId(6),
options,
).proxyUrl ?? undefined
);
// `_handleCustomUrl` was removed from `CoreProxyConfiguration` in
// v4; inline the rotation logic to preserve session-stickiness.
// Round-robin index for sessionless calls (post-increment so the
// first call returns proxyUrls[0]); per-session sticky mapping
// when a sessionId is provided.
const index =
sessionId !== undefined
? this.getSessionIndex(sessionId)
: this.nextCustomUrlIndex++ % this.proxyUrls.length;
return this.proxyUrls[index] ?? undefined;
}

return this.composeDefaultUrl(sessionId);
}

protected _generateTieredProxyUrls(
tieredProxyConfig: NonNullable<
ProxyConfigurationOptions['tieredProxyConfig']
>,
globalOptions: ProxyConfigurationOptions,
) {
return tieredProxyConfig.map((config) => [
new ProxyConfiguration({
...globalOptions,
...config,
tieredProxyConfig: undefined,
}).composeDefaultUrl(),
]);
/**
* Stable per-session index into `proxyUrls`, replacing the removed
* `_handleCustomUrl(sessionId)` from crawlee v3.
*/
private getSessionIndex(sessionId: string): number {
if (!this.usedProxyUrls.has(sessionId)) {
this.usedProxyUrls.set(
sessionId,
this.proxyUrls![
this.usedProxyUrls.size % this.proxyUrls!.length
],
);
}
return this.proxyUrls!.indexOf(this.usedProxyUrls.get(sessionId)!);
}
Comment on lines +440 to 450
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the reason behind this? Perhaps we can remove this, given the returned urls should be independent?


/**
Expand Down Expand Up @@ -438,7 +489,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
*/
// TODO: Make this private
protected async _setPasswordIfToken(): Promise<void> {
const {token} = (this.config as Configuration);
const { token } = this.config as Configuration;

if (!token) return;
try {
Expand Down Expand Up @@ -500,7 +551,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
}
| undefined
> {
const {proxyStatusUrl} = (this.config as Configuration);
const { proxyStatusUrl } = this.config as Configuration;
const requestOpts = {
url: `${proxyStatusUrl}/?format=json`,
proxyUrl: await this.newUrl(),
Expand Down
Loading
Loading