Skip to content
18 changes: 9 additions & 9 deletions packages/apify/src/actor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import { decryptInputSecrets } from '@apify/input_secrets';
import log from '@apify/log';
import { addTimeoutToPromise } from '@apify/timeout';

import { ApifyStorageClient } from './apify_storage_client.js';
import type { ChargeOptions, ChargeResult } from './charging.js';
import { ChargingManager } from './charging.js';
import type { ConfigurationOptions } from './configuration.js';
Expand Down Expand Up @@ -495,7 +496,9 @@ export class Actor<Data extends Dictionary = Dictionary> {
if (this.isAtHome()) {
// availableMemoryRatio and disableBrowserSandbox are now set via
// conditional defaults in the Configuration constructor (isAtHome check)
serviceLocator.setStorageClient(this.apifyClient);
serviceLocator.setStorageClient(
new ApifyStorageClient(this.apifyClient),
);
serviceLocator.setEventManager(this.eventManager);
} else if (options.storage) {
serviceLocator.setStorageClient(options.storage);
Expand Down Expand Up @@ -1310,7 +1313,7 @@ export class Actor<Data extends Dictionary = Dictionary> {

// eslint-disable-next-line dot-notation
queue['initialCount'] =
(await queue.client.get())?.totalRequestCount ?? 0;
(await queue.client.getMetadata())?.totalRequestCount ?? 0;

return queue;
}
Expand Down Expand Up @@ -2235,13 +2238,10 @@ export class Actor<Data extends Dictionary = Dictionary> {
id?: string,
options: OpenStorageOptions = {},
) {
const client = options.forceCloud ? this.apifyClient : undefined;
return StorageManager.openStorage<T>(
storageClass,
id,
client,
this.config,
);
const client = options.forceCloud
? new ApifyStorageClient(this.apifyClient)
: undefined;
return StorageManager.openStorage<T>(storageClass, id, client);
}

private _ensureActorInit(methodCalled: string) {
Expand Down
68 changes: 68 additions & 0 deletions packages/apify/src/apify_storage_client.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import type {
CreateDatasetClientOptions,
CreateKeyValueStoreClientOptions,
CreateRequestQueueClientOptions,
DatasetClient,
KeyValueStoreClient,
RequestQueueClient,
StorageClient,
} from '@crawlee/types';
import type { ApifyClient } from 'apify-client';

/**
* Bridges `apify-client`'s synchronous resource accessors (`dataset(id)`,
* `keyValueStore(id)`, `requestQueue(id, options?)`) to crawlee v4's
* `StorageClient` interface (async factory methods accepting either an `id`
* or a `name`).
*
* When only a `name` is provided, we resolve it to a concrete ID via the
* collection client's `getOrCreate(name)` — matching the behaviour the SDK
* relied on in v3 when storages were opened by name.
*/
export class ApifyStorageClient implements StorageClient {
constructor(private readonly client: ApifyClient) {}

/**
 * Creates a dataset client for the storage identified by `options`.
 *
 * An explicit `options.id` wins. Otherwise a truthy `options.name` is
 * resolved to an ID through the dataset collection's `getOrCreate(name)`.
 * When neither is supplied, an empty string is handed to
 * `client.dataset()`.
 *
 * @param options Optional `id` / `name` selecting the dataset.
 * @returns The `apify-client` dataset client, cast to `DatasetClient`.
 */
async createDatasetClient(
options?: CreateDatasetClientOptions,
): Promise<DatasetClient> {
// `id` takes precedence; `getOrCreate` is only awaited when no id was
// given and a non-empty name is present.
const id =
options?.id ??
(options?.name
? (await this.client.datasets().getOrCreate(options.name)).id
: undefined);
Comment on lines +28 to +32
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that if we implement storageExists, Crawlee should be able to resolve the identifiers on its own(?)

// apify-client's resource clients overlap with `@crawlee/types`' shapes
// but don't yet implement the v4-added members (`getMetadata`,
// `getRecordPublicUrl`). Cast through for now; a follow-up should
// bring apify-client into structural alignment.
// NOTE(review): with neither id nor name this passes '' to `dataset()` —
// confirm how apify-client treats an empty ID.
return this.client.dataset(id ?? '') as unknown as DatasetClient;
}

/**
 * Creates a key-value store client for the storage identified by `options`.
 *
 * An explicit `options.id` is used as-is. Failing that, a truthy
 * `options.name` is resolved to an ID via the key-value store collection's
 * `getOrCreate(name)`. If neither yields an ID, an empty string is passed
 * to `client.keyValueStore()`.
 *
 * @param options Optional `id` / `name` selecting the store.
 * @returns The `apify-client` store client, cast to `KeyValueStoreClient`.
 */
async createKeyValueStoreClient(
    options?: CreateKeyValueStoreClientOptions,
): Promise<KeyValueStoreClient> {
    let storeId = options?.id;

    // Only hit the collection endpoint when no id was supplied and there is
    // a non-empty name to resolve.
    if (storeId == null && options?.name) {
        const storeInfo = await this.client
            .keyValueStores()
            .getOrCreate(options.name);
        storeId = storeInfo.id;
    }

    const resourceClient = this.client.keyValueStore(storeId ?? '');
    return resourceClient as unknown as KeyValueStoreClient;
}

/**
 * Creates a request queue client for the storage identified by `options`.
 *
 * An explicit `options.id` is used as-is. Failing that, a truthy
 * `options.name` is resolved to an ID via the request queue collection's
 * `getOrCreate(name)`. If neither yields an ID, an empty string is passed
 * to `client.requestQueue()`. A truthy `options.clientKey` is forwarded as
 * the second argument; otherwise that argument is `undefined`.
 *
 * @param options Optional `id` / `name` / `clientKey` for the queue.
 * @returns The `apify-client` queue client, cast to `RequestQueueClient`.
 */
async createRequestQueueClient(
    options?: CreateRequestQueueClientOptions,
): Promise<RequestQueueClient> {
    let queueId = options?.id;

    // Only hit the collection endpoint when no id was supplied and there is
    // a non-empty name to resolve.
    if (queueId == null && options?.name) {
        const queueInfo = await this.client
            .requestQueues()
            .getOrCreate(options.name);
        queueId = queueInfo.id;
    }

    const queueOptions = options?.clientKey
        ? { clientKey: options.clientKey }
        : undefined;

    const resourceClient = this.client.requestQueue(
        queueId ?? '',
        queueOptions,
    );
    return resourceClient as unknown as RequestQueueClient;
}
}
40 changes: 27 additions & 13 deletions packages/apify/src/key_value_store.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import type { StorageManagerOptions } from '@crawlee/core';
import { KeyValueStore as CoreKeyValueStore } from '@crawlee/core';
import type { KeyValueStoreInfo } from '@crawlee/types';

import { createHmacSignature } from '@apify/utilities';

import type { Configuration } from './configuration.js';

// @ts-ignore newer crawlee versions already declare this method in core
const { getPublicUrl } = CoreKeyValueStore.prototype;
/**
 * `KeyValueStoreInfo` widened with the platform-only `urlSigningSecretKey`.
 *
 * crawlee v4 dropped the `storageObject` cache from `KeyValueStore`, so the
 * per-store `urlSigningSecretKey` (which is part of the platform's metadata
 * response but not declared on `@crawlee/types`' `KeyValueStoreInfo`) has to
 * be fetched on demand and accessed through a structural-typed augmentation.
 */
type ApifyKeyValueStoreInfo = KeyValueStoreInfo & {
// Optional: callers must check for presence before using it.
urlSigningSecretKey?: string;
};

/**
* @inheritDoc
Expand All @@ -15,24 +21,35 @@ export class KeyValueStore extends CoreKeyValueStore {
/**
* Returns a URL for the given key that may be used to publicly
* access the value in the remote key-value store.
*
* On the Apify platform the URL is signed with the store's
* `urlSigningSecretKey` so that anyone with the URL can read the record
* without authentication. Locally we delegate to crawlee's default
* implementation (which produces a `file://` URL or returns `undefined`).
*/
override getPublicUrl(key: string): string {
override async getPublicUrl(key: string): Promise<string | undefined> {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this change closes this issue - I'm not sure if the renaming was a requirement, or a way to make this BC.

const config = this.config as Configuration;
if (!config.isAtHome && getPublicUrl) {
return getPublicUrl.call(this, key);
if (!config.isAtHome) {
return super.getPublicUrl(key);
}

const publicUrl = new URL(
`${config.apiPublicBaseUrl}/v2/key-value-stores/${this.id}/records/${key}`,
);

if (this.storageObject?.urlSigningSecretKey) {
// `client` is `private` on `CoreKeyValueStore`; bypass the visibility
// check to fetch the per-store secret. There is no public crawlee API
// surface for this yet — track upstream exposure as a follow-up.
Comment on lines +40 to +42
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, please create the issue in Crawlee (I see that Dataset and RequestProvider already both have client public, so it's weirdly asymmetrical now).

const metadata = (await (
this as unknown as {
client: { getMetadata(): Promise<KeyValueStoreInfo> };
}
).client.getMetadata()) as ApifyKeyValueStoreInfo;

if (metadata?.urlSigningSecretKey) {
publicUrl.searchParams.append(
'signature',
createHmacSignature(
this.storageObject.urlSigningSecretKey as string,
key,
),
createHmacSignature(metadata.urlSigningSecretKey, key),
);
}

Expand All @@ -49,6 +66,3 @@ export class KeyValueStore extends CoreKeyValueStore {
return super.open(storeIdOrName, options) as unknown as KeyValueStore;
}
}

// @ts-ignore newer crawlee versions already declare this method in core
CoreKeyValueStore.prototype.getPublicUrl = KeyValueStore.prototype.getPublicUrl;
38 changes: 33 additions & 5 deletions test/MemoryStorageEmulator.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,39 @@
import { AsyncLocalStorage } from 'node:async_hooks';
import { rm } from 'node:fs/promises';
import { resolve } from 'node:path';

import { StorageManager } from '@crawlee/core';
import { serviceLocator } from '@crawlee/core';
import { MemoryStorage } from '@crawlee/memory-storage';
import { Configuration } from 'apify';
import { Actor, Configuration } from 'apify';
import { ensureDir } from 'fs-extra';

import log from '@apify/log';
import { cryptoRandomObjectId } from '@apify/utilities';

/**
 * Clears every piece of process-wide state the emulator relies on being
 * fresh: crawlee's service locator, the SDK `Configuration` singleton, the
 * cached default `Actor` instance, and the `Configuration` async-local store.
 */
function resetGlobalState() {
    // Views onto the otherwise inaccessible static fields being reset.
    const configStatics = Configuration as unknown as {
        globalConfig?: Configuration;
        storage: AsyncLocalStorage<Configuration>;
    };
    const actorStatics = Actor as unknown as { _instance?: Actor };

    serviceLocator.reset();

    // The SDK's `Configuration` holds a static singleton of its own, apart
    // from crawlee's serviceLocator, and `Actor` caches a default instance
    // carrying the resolved config. Dropping both lets each test build a
    // fresh config from the env vars it has just set.
    configStatics.globalConfig = undefined;
    // eslint-disable-next-line no-underscore-dangle -- `_instance` is the upstream Actor singleton field
    actorStatics._instance = undefined;

    // `Actor.init()` calls `Configuration.storage.enterWith(this.config)`,
    // which pins the resolved config to the *outer* async context (vitest's
    // test runner). Calling `enterWith(undefined)` from a child context
    // (this beforeEach) does not propagate upwards, so on Node 22 the next
    // test would still observe the stale store. Swapping in a brand-new
    // AsyncLocalStorage guarantees `getStore()` yields `undefined` everywhere.
    configStatics.storage = new AsyncLocalStorage<Configuration>();
}

const LOCAL_EMULATION_DIR = resolve(
__dirname,
'..',
Expand All @@ -20,15 +45,18 @@ export class MemoryStorageEmulator {
protected localStorageDirectories: string[] = [];

async init(dirName = cryptoRandomObjectId(10)) {
StorageManager.clearCache();
// crawlee v4 dropped `StorageManager.clearCache()` and
// `Configuration.useStorageClient()`; reset the service locator
// and re-register the in-memory client instead.
resetGlobalState();
const localStorageDir = resolve(LOCAL_EMULATION_DIR, dirName);
this.localStorageDirectories.push(localStorageDir);
await ensureDir(localStorageDir);

const storage = new MemoryStorage({
localDataDirectory: localStorageDir,
});
Configuration.getGlobalConfig().useStorageClient(storage);
serviceLocator.setStorageClient(storage);
log.debug(
`Initialized emulated memory storage in folder ${localStorageDir}`,
);
Expand All @@ -40,7 +68,7 @@ export class MemoryStorageEmulator {
});

await Promise.all(promises);
StorageManager.clearCache();
resetGlobalState();
}

static toString() {
Expand Down
23 changes: 20 additions & 3 deletions test/apify/actor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -764,13 +764,26 @@ describe('Actor', () => {
'openStorage',
);

// crawlee v4's `RequestQueueClient` exposes metadata via
// `getMetadata()` (the v3 `get()` was dropped).
const mockRQ = {
client: { get: () => ({ totalRequestCount: 10 }) },
client: {
getMetadata: async () => ({ totalRequestCount: 10 }),
},
};

openStorageSpy.mockImplementationOnce(async () => mockRQ);
const queue = await sdk.openRequestQueue(queueId, options);
expect(openStorageSpy).toBeCalledWith(queueId, sdk.apifyClient);
// The SDK now wraps `apifyClient` in an `ApifyStorageClient`
// adapter to satisfy crawlee v4's `StorageClient` interface.
expect(openStorageSpy).toBeCalledWith(
queueId,
expect.objectContaining({
createDatasetClient: expect.any(Function),
createKeyValueStoreClient: expect.any(Function),
createRequestQueueClient: expect.any(Function),
}),
);
expect(openStorageSpy).toBeCalledTimes(1);

// @ts-expect-error private prop
Expand All @@ -789,7 +802,11 @@ describe('Actor', () => {
expect(mockOpenStorage).toBeCalledTimes(1);
expect(mockOpenStorage).toBeCalledWith(
datasetName,
sdk.apifyClient,
expect.objectContaining({
createDatasetClient: expect.any(Function),
createKeyValueStoreClient: expect.any(Function),
createRequestQueueClient: expect.any(Function),
}),
);
});
});
Expand Down