diff --git a/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/jni/PageCacheAware.java b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/jni/PageCacheAware.java
new file mode 100644
index 0000000000000..315258e30a8a4
--- /dev/null
+++ b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/jni/PageCacheAware.java
@@ -0,0 +1,32 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.vectorized.execution.jni;
+
+/**
+ * Marker interface for plugins that can receive a {@link PageCacheProvider}.
+ *
+ * Implemented by {@code TieredStoragePlugin} so that {@code Node.java} (in the
+ * {@code server} module) can inject a page cache provider without needing
+ * a compile-time dependency on the {@code modules/tiered-storage} module.
+ *
+ * {@code Node.java} discovers plugins implementing this interface and calls
+ * {@link #setPageCacheProvider(PageCacheProvider)} once the plugin that implements
+ * {@link PageCacheProvider} (e.g., {@code DataFusionPlugin}) has been discovered.
+ */
+public interface PageCacheAware {
+
+ /**
+ * Inject the page cache provider.
+ * Called by {@code Node.java} during node construction, after the plugin implementing
+ * {@link PageCacheProvider} (e.g. {@code DataFusionPlugin}) has been initialized.
+ *
+ * @param provider the page cache provider, never null when this method is called
+ */
+ void setPageCacheProvider(PageCacheProvider provider);
+}
diff --git a/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/jni/PageCacheProvider.java b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/jni/PageCacheProvider.java
new file mode 100644
index 0000000000000..60ec86d126e40
--- /dev/null
+++ b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/jni/PageCacheProvider.java
@@ -0,0 +1,53 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.vectorized.execution.jni;
+
+/**
+ * Provider for a byte-range page cache used to serve Parquet file reads on warm indices.
+ *
+ * Implemented by search-engine plugins (e.g. {@code DataFusionPlugin}) that own an
+ * in-process page cache and want to expose it to the tiered-storage module so that
+ * {@code CachedParquetCacheStrategy} can serve Parquet column-chunk byte ranges from the
+ * cache instead of re-fetching from object storage (S3/GCS/Azure) on every
+ * {@code openIndexInput()} call.
+ *
+ * The cache key is the local filesystem path of the Parquet file (without leading slash)
+ * combined with the byte range, e.g. {@code "data/nodes/0/.../parquet/_parquet_0.parquet:4096-8192"}.
+ * The exact key format is an implementation detail of the provider.
+ */
+public interface PageCacheProvider {
+
+ /**
+ * Look up a cached byte range for a Parquet file.
+ *
+ * @param path the local file path used as cache key (e.g. "data/nodes/0/.../parquet/_parquet_0.parquet")
+ * @param start byte range start (inclusive); NOTE(review): int offsets cap addressable files at 2 GiB — consider long
+ * @param end byte range end (exclusive)
+ * @return the cached bytes, or {@code null} on cache miss
+ */
+ byte[] getPageRange(String path, int start, int end);
+
+ /**
+ * Store a byte range for a Parquet file in the cache.
+ *
+ * @param path the local file path used as cache key
+ * @param start byte range start (inclusive)
+ * @param end byte range end (exclusive)
+ * @param data the bytes to cache (must have length == end - start)
+ */
+ void putPageRange(String path, int start, int end, byte[] data);
+
+ /**
+ * Evict all cached byte ranges for a given Parquet file.
+ * Called when a file is deleted (merged, compacted, or tiered out).
+ *
+ * @param path the local file path whose cached ranges should be removed
+ */
+ void evictFile(String path);
+}
diff --git a/modules/tiered-storage/src/main/java/org/opensearch/storage/TieredStoragePlugin.java b/modules/tiered-storage/src/main/java/org/opensearch/storage/TieredStoragePlugin.java
index de466c1e70bfc..a898c6b4320fb 100644
--- a/modules/tiered-storage/src/main/java/org/opensearch/storage/TieredStoragePlugin.java
+++ b/modules/tiered-storage/src/main/java/org/opensearch/storage/TieredStoragePlugin.java
@@ -66,6 +66,8 @@
import org.opensearch.telemetry.metrics.MetricsRegistry;
import org.opensearch.telemetry.tracing.Tracer;
import org.opensearch.index.store.CompositeStoreDirectoryFactory;
+import org.opensearch.vectorized.execution.jni.PageCacheAware;
+import org.opensearch.vectorized.execution.jni.PageCacheProvider;
import org.opensearch.vectorized.execution.jni.NativeObjectStoreProvider;
import java.util.ArrayList;
@@ -88,7 +90,7 @@
* Per-repository remote stores are added to the shared {@code FileRegistry} as new
* repositories are encountered. Different indices can point to different repositories.
*/
-public class TieredStoragePlugin extends Plugin implements IndexStorePlugin, ActionPlugin, TelemetryAwarePlugin, NativeObjectStoreProvider {
+public class TieredStoragePlugin extends Plugin implements IndexStorePlugin, ActionPlugin, TelemetryAwarePlugin, NativeObjectStoreProvider, PageCacheAware {
private static final org.apache.logging.log4j.Logger logger = org.apache.logging.log4j.LogManager.getLogger(TieredStoragePlugin.class);
@@ -101,6 +103,14 @@ public class TieredStoragePlugin extends Plugin implements IndexStorePlugin, Act
private volatile Supplier<RepositoriesService> repositoriesServiceSupplier;
+ /**
+ * Page cache provider — received from DataFusionPlugin via Node.java.
+ * Passed into TieredCompositeStoreDirectoryFactory so that CachedParquetCacheStrategy
+ * can be used for parquet format files instead of PassthroughCacheStrategy.
+ * May be null if DataFusionPlugin is not loaded or page cache is disabled.
+ */
+ private volatile PageCacheProvider pageCacheProvider;
+
// Global native ObjectStore — created lazily on first warm shard creation
private volatile long globalObjStoreDataPtr;
private volatile long globalObjStoreVtablePtr;
@@ -139,6 +149,17 @@ public Map getCompositeStoreDirectoryFac
return Collections.emptyMap();
}
+ /**
+ * Set the page cache provider (e.g. from DataFusionPlugin).
+ * Called by Node.java after discovering a plugin implementing {@link PageCacheProvider},
+ * so TieredCompositeStoreDirectoryFactory can use CachedParquetCacheStrategy.
+ */
+ @Override
+ public void setPageCacheProvider(PageCacheProvider provider) {
+ this.pageCacheProvider = provider;
+ logger.info("[TieredStoragePlugin] PageCacheProvider set — parquet reads will use page cache");
+ }
+
@Override
public Map getCachedCompositeStoreDirectoryFactories() {
return Map.of(TIERED_COMPOSITE_INDEX_TYPE, new TieredCompositeStoreDirectoryFactory(
@@ -147,7 +168,12 @@ public Map this.pageCacheProvider
));
}
diff --git a/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/CachedParquetCacheStrategy.java b/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/CachedParquetCacheStrategy.java
new file mode 100644
index 0000000000000..d144207fe55dd
--- /dev/null
+++ b/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/CachedParquetCacheStrategy.java
@@ -0,0 +1,304 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.storage.directory;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.opensearch.index.engine.exec.FileMetadata;
+import org.opensearch.index.store.CompositeRemoteSegmentStoreDirectory;
+import org.opensearch.index.store.FormatCacheStrategy;
+import org.opensearch.index.store.FormatStoreDirectory;
+import org.opensearch.storage.jni.TieredStoreNative;
+import org.opensearch.vectorized.execution.jni.PageCacheProvider;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.zip.CRC32;
+
+/**
+ * Foyer-backed cache strategy for the Parquet format on warm indices.
+ * Replaces {@link PassthroughCacheStrategy} for the {@code "parquet"} format name.
+ * The key difference is in {@link #openInput}: for REMOTE files, instead of opening a full
+ * streaming {@link IndexInput} from {@link CompositeRemoteSegmentStoreDirectory} (which
+ * re-fetches the entire file from S3/GCS/Azure on every call), it returns a
+ * {@link CachedParquetIndexInput} that serves byte ranges from Foyer (cache miss fetches from
+ * remote and populates Foyer). All other operations (FileRegistry routing, ref-counting,
+ * file registration, deletion, checksums) are identical to {@link PassthroughCacheStrategy}.
+ * For Lucene format files, {@link PassthroughCacheStrategy} is still used unchanged.
+ */
+public class CachedParquetCacheStrategy implements FormatCacheStrategy {
+
+ private static final Logger logger = LogManager.getLogger(CachedParquetCacheStrategy.class);
+
+ private final String formatName;
+ private final CompositeRemoteSegmentStoreDirectory remoteDirectory;
+ private final long registryPtr;
+ private final String dirPathPrefix;
+ /** Page cache — provided by DataFusionPlugin via the PageCacheProvider interface */
+ private final PageCacheProvider foyerCache;
+ /** Reference to the owning directory for deferred local file deletion */
+ private volatile TieredCompositeStoreDirectory owningDirectory;
+
+ public CachedParquetCacheStrategy(
+ String formatName,
+ CompositeRemoteSegmentStoreDirectory remoteDirectory,
+ long registryPtr,
+ String dirPathPrefix,
+ PageCacheProvider foyerCache
+ ) {
+ this.formatName = formatName;
+ this.remoteDirectory = remoteDirectory;
+ this.registryPtr = registryPtr;
+ this.dirPathPrefix = dirPathPrefix;
+ this.foyerCache = foyerCache;
+ }
+
+ /**
+ * Set the owning directory reference. Called after construction by
+ * {@link TieredCompositeStoreDirectory} so that deferred local file
+ * deletion can be triggered when the last reader closes.
+ */
+ public void setOwningDirectory(TieredCompositeStoreDirectory directory) {
+ this.owningDirectory = directory;
+ }
+
+ /** Build the full registry key for a file (matches DataFusion/object_store key format). */
+ private String registryKey(String fileName) {
+ return dirPathPrefix + "/" + fileName;
+ }
+
+ /**
+ * Build the format-qualified remote file name.
+ * {@link CompositeRemoteSegmentStoreDirectory#openInput} uses the FileMetadata
+ * delimiter to route to the correct format container.
+ */
+ private String remoteFileName(String fileName) {
+ if (formatName == null || formatName.isEmpty() || "lucene".equals(formatName)) {
+ return fileName;
+ }
+ return fileName + FileMetadata.DELIMITER + formatName;
+ }
+
+ @Override
+ public String name() {
+ return "foyer-parquet(" + formatName + ")";
+ }
+
+ /**
+ * Opens an IndexInput for reading a Parquet file.
+ * LOCAL files are served from disk; REMOTE files return a {@link CachedParquetIndexInput}
+ * that serves byte ranges from Foyer (cache miss fetches from remote and populates cache).
+ * FileRegistry ref-counting is maintained in both paths (same as PassthroughCacheStrategy).
+ */
+ @Override
+ public IndexInput openInput(String fileName, IOContext context, FormatStoreDirectory<?> delegate)
+ throws IOException {
+
+ String key = registryKey(fileName);
+ // Acquire read reference in the FileRegistry — prevents local eviction while reading
+ int location = TieredStoreNative.registryAcquireRead(registryPtr, key);
+
+ // --- LOCAL or UNREGISTERED: read directly from disk ---
+ if (location == TieredStoreNative.LOCATION_LOCAL
+ || location == TieredStoreNative.LOCATION_NOT_FOUND) {
+ logger.debug("[CachedParquetCacheStrategy] openInput LOCAL: format={}, file={}, key={}, loc={}",
+ formatName, fileName, key, locationName(location));
+ try {
+ IndexInput localInput = delegate.openIndexInput(fileName, context);
+ // Wrap in RefCountedIndexInput for safe eviction (same as PassthroughCacheStrategy)
+ return new PassthroughCacheStrategy.RefCountedIndexInputPublic(
+ localInput, key, registryPtr, owningDirectory
+ );
+ } catch (IOException e) {
+ TieredStoreNative.registryReleaseRead(registryPtr, key);
+ throw e;
+ }
+ }
+
+ // --- REMOTE or BOTH: serve via Foyer page cache ---
+ if (remoteDirectory != null) {
+ logger.debug("[CachedParquetCacheStrategy] openInput REMOTE (Foyer): format={}, file={}, key={}",
+ formatName, fileName, key);
+ try {
+ // Resolve file length: try registry first (O(1)), fall back to remote metadata
+ long fileLen = TieredStoreNative.registryGetSize(registryPtr, key);
+ if (fileLen < 0) {
+ fileLen = remoteDirectory.fileLength(remoteFileName(fileName));
+ }
+
+ return new CachedParquetIndexInput(
+ "CachedParquet(" + fileName + ")",
+ remoteFileName(fileName),
+ key,
+ fileLen,
+ foyerCache,
+ remoteDirectory,
+ registryPtr,
+ owningDirectory
+ );
+ } catch (IOException e) {
+ // Remote failed — fall back to local
+ logger.warn("[CachedParquetCacheStrategy] remote open failed for {}, trying local: {}",
+ fileName, e.getMessage());
+ try {
+ IndexInput localInput = delegate.openIndexInput(fileName, context);
+ return new PassthroughCacheStrategy.RefCountedIndexInputPublic(
+ localInput, key, registryPtr, owningDirectory
+ );
+ } catch (IOException localEx) {
+ TieredStoreNative.registryReleaseRead(registryPtr, key);
+ throw e;
+ }
+ }
+ }
+
+ // No remote directory — fall back to local
+ logger.debug("[CachedParquetCacheStrategy] openInput LOCAL fallback (no remote dir): format={}, file={}",
+ formatName, fileName);
+ try {
+ IndexInput localInput = delegate.openIndexInput(fileName, context);
+ return new PassthroughCacheStrategy.RefCountedIndexInputPublic(
+ localInput, key, registryPtr, owningDirectory
+ );
+ } catch (IOException e) {
+ TieredStoreNative.registryReleaseRead(registryPtr, key);
+ throw e;
+ }
+ }
+
+ @Override
+ public void onFileWritten(String fileName, FormatStoreDirectory<?> delegate) throws IOException {
+ String key = registryKey(fileName);
+ long size = 0;
+ try {
+ size = delegate.fileLength(fileName);
+ } catch (IOException e) {
+ logger.warn("[CachedParquetCacheStrategy] could not get size for {}: {}", fileName, e.getMessage());
+ }
+ TieredStoreNative.registryRegisterLocal(registryPtr, key, size);
+ logger.debug("[CachedParquetCacheStrategy] onFileWritten: format={}, file={}, key={}, size={}",
+ formatName, fileName, key, size);
+ }
+
+ @Override
+ public void onFileDeleted(String fileName) throws IOException {
+ String key = registryKey(fileName);
+ int location = TieredStoreNative.registryGetLocation(registryPtr, key);
+ if (location == TieredStoreNative.LOCATION_BOTH) {
+ TieredStoreNative.registryMarkLocalDeleted(registryPtr, key);
+ logger.debug("[CachedParquetCacheStrategy] onFileDeleted (mark local deleted): key={}", key);
+ } else if (location == TieredStoreNative.LOCATION_REMOTE) {
+ logger.debug("[CachedParquetCacheStrategy] onFileDeleted (already remote-only): key={}", key);
+ } else {
+ TieredStoreNative.registryRemove(registryPtr, key);
+ logger.debug("[CachedParquetCacheStrategy] onFileDeleted (removed from registry): key={}", key);
+ }
+ // Also evict from Foyer page cache — stale bytes must not be served
+ foyerCache.evictFile(key);
+ }
+
+ @Override
+ public long fileLength(String fileName, FormatStoreDirectory<?> delegate) throws IOException {
+ String key = registryKey(fileName);
+ long size = TieredStoreNative.registryGetSize(registryPtr, key);
+ if (size >= 0) return size;
+
+ try {
+ return delegate.fileLength(fileName);
+ } catch (IOException e) {
+ // fall through to remote
+ }
+
+ if (remoteDirectory != null) {
+ return remoteDirectory.fileLength(remoteFileName(fileName));
+ }
+ throw new IOException("fileLength failed for " + fileName + " — not in registry, local, or remote");
+ }
+
+ @Override
+ public long calculateChecksum(String fileName, FormatStoreDirectory<?> delegate) throws IOException {
+ String key = registryKey(fileName);
+ int location = TieredStoreNative.registryGetLocation(registryPtr, key);
+
+ // For LOCAL or BOTH: try local first ONLY if the file actually exists on disk.
+ // On a warm node receiving shards via peer recovery, the registry may report BOTH
+ // (populated from remote metadata) but the file is not yet present locally —
+ // in that case, calling delegate.calculateChecksum() would log a misleading ERROR.
+ if (location == TieredStoreNative.LOCATION_LOCAL || location == TieredStoreNative.LOCATION_BOTH) {
+ if (Files.exists(delegate.getDirectoryPath().resolve(fileName))) {
+ try { return delegate.calculateChecksum(fileName); } catch (IOException ignored) {}
+ } else {
+ logger.debug("[CachedParquetCacheStrategy] calculateChecksum: skipping local (file not on disk): key={}, loc={}",
+ key, locationName(location));
+ }
+ }
+ // REMOTE, BOTH-with-local-failure/missing, or NOT_FOUND: try remote
+ if (remoteDirectory != null) {
+ try (IndexInput input = remoteDirectory.openInput(remoteFileName(fileName), IOContext.READONCE)) {
+ return computeCrc32(input);
+ }
+ }
+ // Last resort: try local (handles NOT_FOUND case where file may still exist)
+ return delegate.calculateChecksum(fileName);
+ }
+
+ @Override
+ public String calculateUploadChecksum(String fileName, FormatStoreDirectory<?> delegate)
+ throws IOException {
+ String key = registryKey(fileName);
+ int location = TieredStoreNative.registryGetLocation(registryPtr, key);
+
+ // For LOCAL or BOTH: try local first ONLY if the file actually exists on disk.
+ if (location == TieredStoreNative.LOCATION_LOCAL || location == TieredStoreNative.LOCATION_BOTH) {
+ if (Files.exists(delegate.getDirectoryPath().resolve(fileName))) {
+ try { return delegate.calculateUploadChecksum(fileName); } catch (IOException ignored) {}
+ } else {
+ logger.debug("[CachedParquetCacheStrategy] calculateUploadChecksum: skipping local (file not on disk): key={}, loc={}",
+ key, locationName(location));
+ }
+ }
+ // REMOTE, BOTH-with-local-failure/missing, or NOT_FOUND: try remote
+ if (remoteDirectory != null) {
+ try (IndexInput input = remoteDirectory.openInput(remoteFileName(fileName), IOContext.READONCE)) {
+ return Long.toString(computeCrc32(input));
+ }
+ }
+ // Last resort: try local
+ return delegate.calculateUploadChecksum(fileName);
+ }
+
+ @Override
+ public void close() throws IOException {
+ // no-op — registry is owned by TieredCompositeStoreDirectory
+ }
+
+ private static long computeCrc32(IndexInput input) throws IOException {
+ CRC32 crc32 = new CRC32();
+ byte[] buffer = new byte[8192];
+ long remaining = input.length();
+ while (remaining > 0) {
+ int toRead = (int) Math.min(buffer.length, remaining);
+ input.readBytes(buffer, 0, toRead);
+ crc32.update(buffer, 0, toRead);
+ remaining -= toRead;
+ }
+ return crc32.getValue();
+ }
+
+ private static String locationName(int loc) {
+ switch (loc) {
+ case TieredStoreNative.LOCATION_LOCAL: return "LOCAL";
+ case TieredStoreNative.LOCATION_REMOTE: return "REMOTE";
+ case TieredStoreNative.LOCATION_BOTH: return "BOTH";
+ default: return "UNREGISTERED";
+ }
+ }
+}
diff --git a/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/CachedParquetIndexInput.java b/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/CachedParquetIndexInput.java
new file mode 100644
index 0000000000000..9677b247ecfd4
--- /dev/null
+++ b/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/CachedParquetIndexInput.java
@@ -0,0 +1,257 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.storage.directory;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.opensearch.index.store.CompositeRemoteSegmentStoreDirectory;
+import org.opensearch.storage.jni.TieredStoreNative;
+import org.opensearch.vectorized.execution.jni.PageCacheProvider;
+
+import java.io.IOException;
+
+/**
+ * Lucene {@link IndexInput} backed by the Foyer in-memory page cache.
+ *
+ * Used by {@link CachedParquetCacheStrategy} for REMOTE Parquet files.
+ * On every {@code readBytes()} call it translates the current file pointer
+ * and length into a byte range and serves it from Foyer (fetching from
+ * {@code remoteDirectory} on cache miss).
+ *
+ * On {@code close()}, releases the FileRegistry read reference — same safety
+ * contract as {@link PassthroughCacheStrategy.RefCountedIndexInput}.
+ */
+public class CachedParquetIndexInput extends IndexInput {
+
+ private static final Logger logger = LogManager.getLogger(CachedParquetIndexInput.class);
+
+ /** Remote file name (format-qualified, e.g. "_parquet_0.parquet:::parquet") */
+ private final String remoteFileName;
+ /** FileRegistry key (local path without leading "/") */
+ private final String registryKey;
+ /** Total file length in bytes */
+ private final long fileLength;
+ /** Page cache provider — calls back into DataFusionPlugin via JNI */
+ private final PageCacheProvider foyerCache;
+ /** Remote directory for fetching bytes on cache miss */
+ private final CompositeRemoteSegmentStoreDirectory remoteDirectory;
+ /** FileRegistry pointer for ref-counting */
+ private final long registryPtr;
+ /** Owning directory for deferred eviction after last reader closes */
+ private final TieredCompositeStoreDirectory owningDirectory;
+
+ /** Current virtual file pointer */
+ private long filePointer = 0L;
+ /** Whether this input has been closed */
+ private boolean closed = false;
+
+ public CachedParquetIndexInput(
+ String resourceDescription,
+ String remoteFileName,
+ String registryKey,
+ long fileLength,
+ PageCacheProvider foyerCache,
+ CompositeRemoteSegmentStoreDirectory remoteDirectory,
+ long registryPtr,
+ TieredCompositeStoreDirectory owningDirectory
+ ) {
+ super(resourceDescription);
+ this.remoteFileName = remoteFileName;
+ this.registryKey = registryKey;
+ this.fileLength = fileLength;
+ this.foyerCache = foyerCache;
+ this.remoteDirectory = remoteDirectory;
+ this.registryPtr = registryPtr;
+ this.owningDirectory = owningDirectory;
+ }
+
+ @Override
+ public byte readByte() throws IOException {
+ byte[] buf = new byte[1];
+ readBytes(buf, 0, 1);
+ return buf[0];
+ }
+
+ @Override
+ public void readBytes(byte[] b, int offset, int len) throws IOException {
+ if (closed) throw new IOException("IndexInput is closed: " + toString());
+ if (filePointer + len > fileLength) {
+ throw new IOException(
+ "Read past EOF: filePointer=" + filePointer + ", len=" + len +
+ ", fileLength=" + fileLength + ", file=" + remoteFileName
+ );
+ }
+
+ int start = (int) filePointer;
+ int end = start + len;
+
+ // 1. Try Foyer cache first
+ byte[] cached = foyerCache.getPageRange(registryKey, start, end);
+ if (cached != null) {
+ logger.debug("[CachedParquetIndexInput] cache HIT: key={}, range={}..{}", registryKey, start, end);
+ System.arraycopy(cached, 0, b, offset, len);
+ filePointer += len;
+ return;
+ }
+
+ // 2. Cache miss — fetch from remote directory
+ logger.debug("[CachedParquetIndexInput] cache MISS: key={}, range={}..{} — fetching from remote",
+ registryKey, start, end);
+
+ byte[] fetched = fetchRangeFromRemote(start, len);
+
+ // 3. Populate Foyer for future reads
+ foyerCache.putPageRange(registryKey, start, end, fetched);
+
+ System.arraycopy(fetched, 0, b, offset, len);
+ filePointer += len;
+ }
+
+ /**
+ * Fetch a byte range from the remote directory.
+ * Opens a temporary IndexInput at the remote path, seeks to {@code start}, reads {@code len} bytes.
+ */
+ private byte[] fetchRangeFromRemote(int start, int len) throws IOException {
+ try (IndexInput remoteInput = remoteDirectory.openInput(remoteFileName, IOContext.READONCE)) {
+ remoteInput.seek(start);
+ byte[] buf = new byte[len];
+ remoteInput.readBytes(buf, 0, len);
+ return buf;
+ }
+ }
+
+ @Override
+ public void seek(long pos) throws IOException {
+ if (pos < 0 || pos > fileLength) {
+ throw new IOException("Seek out of bounds: pos=" + pos + ", fileLength=" + fileLength);
+ }
+ filePointer = pos;
+ }
+
+ @Override
+ public long getFilePointer() {
+ return filePointer;
+ }
+
+ @Override
+ public long length() {
+ return fileLength;
+ }
+
+ @Override
+ public IndexInput slice(String sliceDescription, long offset, long length) throws IOException {
+ // Create a sliced view — delegates reads through this input
+ return new SlicedFoyerIndexInput(sliceDescription, this, offset, length);
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (!closed) {
+ closed = true;
+ // Release FileRegistry read reference
+ long remaining = TieredStoreNative.registryReleaseRead(registryPtr, registryKey);
+ // If last reader and file is REMOTE-only, trigger deferred local delete
+ if (remaining == 0 && owningDirectory != null) {
+ owningDirectory.tryDeleteLocalFileAfterLastRead(registryKey);
+ }
+ }
+ }
+
+ @Override
+ public IndexInput clone() {
+ CachedParquetIndexInput cloned = (CachedParquetIndexInput) super.clone();
+ // Each clone has its own file pointer (already handled by super.clone())
+ // NOTE(review): clone acquires a registry read ref, but Lucene clones are conventionally never closed — this ref may never be released; confirm the release path before merging
+ TieredStoreNative.registryAcquireRead(registryPtr, registryKey);
+ return cloned;
+ }
+
+ // -------------------------------------------------------------------------
+ // Inner class: slice support
+ // -------------------------------------------------------------------------
+
+ /**
+ * Sliced view of a {@link CachedParquetIndexInput}.
+ * Reads are delegated to the parent input with offset adjustment.
+ */
+ static class SlicedFoyerIndexInput extends IndexInput {
+
+ private final CachedParquetIndexInput parent;
+ private final long sliceOffset;
+ private final long sliceLength;
+ private long localPointer = 0L;
+
+ SlicedFoyerIndexInput(
+ String resourceDescription,
+ CachedParquetIndexInput parent,
+ long offset,
+ long length
+ ) {
+ super(resourceDescription);
+ this.parent = parent;
+ this.sliceOffset = offset;
+ this.sliceLength = length;
+ }
+
+ @Override
+ public byte readByte() throws IOException {
+ byte[] buf = new byte[1];
+ readBytes(buf, 0, 1);
+ return buf[0];
+ }
+
+ @Override
+ public void readBytes(byte[] b, int offset, int len) throws IOException {
+ if (localPointer + len > sliceLength) {
+ throw new IOException("Read past slice end");
+ }
+ long absoluteStart = sliceOffset + localPointer;
+ int start = (int) absoluteStart;
+ int end = start + len;
+
+ // Try Foyer cache via parent
+ byte[] cached = parent.foyerCache.getPageRange(parent.registryKey, start, end);
+ if (cached != null) {
+ System.arraycopy(cached, 0, b, offset, len);
+ localPointer += len;
+ return;
+ }
+
+ // Fetch from remote
+ byte[] fetched = parent.fetchRangeFromRemote(start, len);
+ parent.foyerCache.putPageRange(parent.registryKey, start, end, fetched);
+ System.arraycopy(fetched, 0, b, offset, len);
+ localPointer += len;
+ }
+
+ @Override
+ public void seek(long pos) throws IOException {
+ if (pos < 0 || pos > sliceLength) throw new IOException("Seek out of slice bounds");
+ localPointer = pos;
+ }
+
+ @Override
+ public long getFilePointer() { return localPointer; }
+
+ @Override
+ public long length() { return sliceLength; }
+
+ @Override
+ public IndexInput slice(String desc, long offset, long length) throws IOException {
+ return new SlicedFoyerIndexInput(desc, parent, sliceOffset + offset, length);
+ }
+
+ @Override
+ public void close() throws IOException {
+ // Slice does not own the ref-count; parent does
+ }
+ }
+}
diff --git a/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/PassthroughCacheStrategy.java b/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/PassthroughCacheStrategy.java
index c5b155c49771f..ef61a3bcb92bc 100644
--- a/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/PassthroughCacheStrategy.java
+++ b/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/PassthroughCacheStrategy.java
@@ -341,12 +341,23 @@ private static String locationName(int loc) {
}
}
+ /**
+ * Package-visible alias so that {@link CachedParquetCacheStrategy} can reuse the same
+ * ref-counting wrapper for LOCAL reads without duplicating the logic.
+ */
+ static class RefCountedIndexInputPublic extends RefCountedIndexInput {
+ RefCountedIndexInputPublic(IndexInput delegate, String fileName, long registryPtr,
+ TieredCompositeStoreDirectory owningDirectory) {
+ super(delegate, fileName, registryPtr, owningDirectory);
+ }
+ }
+
/**
* Wrapper that releases the read reference in the Rust FileRegistry when closed.
* When the last reader closes (active_reads → 0) and the file is REMOTE-only,
* triggers deferred local file deletion via the owning directory.
*/
- private static class RefCountedIndexInput extends IndexInput {
+ static class RefCountedIndexInput extends IndexInput {
private IndexInput delegate;
private final String fileName;
diff --git a/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/TieredCompositeStoreDirectoryFactory.java b/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/TieredCompositeStoreDirectoryFactory.java
index 2f00798610ae0..146a042abc5df 100644
--- a/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/TieredCompositeStoreDirectoryFactory.java
+++ b/modules/tiered-storage/src/main/java/org/opensearch/storage/directory/TieredCompositeStoreDirectoryFactory.java
@@ -24,6 +24,7 @@
import org.opensearch.repositories.RepositoriesService;
import org.opensearch.repositories.Repository;
import org.opensearch.repositories.blobstore.BlobStoreRepository;
+import org.opensearch.vectorized.execution.jni.PageCacheProvider;
import java.io.IOException;
import java.util.function.Supplier;
@@ -50,13 +51,44 @@ public class TieredCompositeStoreDirectoryFactory implements CachedCompositeStor
private final Supplier<RepositoriesService> repositoriesService;
private final java.util.function.Function globalRegistryPtrResolver;
+ /**
+ * Supplier for the page cache provider.
+ *
+ * Using a Supplier (rather than holding the provider directly) is critical: in Node.java,
+ * {@code getCachedCompositeStoreDirectoryFactories()} is called at line ~973, BEFORE
+ * {@code setPageCacheProvider()} is called at line ~1191. By capturing it as a Supplier,
+ * the actual provider is resolved lazily at shard creation time (the first call to
+ * {@code newDirectory()}), by which point the provider has already been set.
+ */
+ private final Supplier<PageCacheProvider> pageCacheProviderSupplier;
+ /**
+ * Constructor without Foyer cache (backward compatible).
+ * Uses PassthroughCacheStrategy for all formats.
+ */
public TieredCompositeStoreDirectoryFactory(
Supplier<RepositoriesService> repositoriesService,
java.util.function.Function globalRegistryPtrResolver
+ ) {
+ this(repositoriesService, globalRegistryPtrResolver, () -> null);
+ }
+
+ /**
+ * Constructor with lazy page cache provider supplier.
+ *
+ * The supplier is called per-shard at {@code newDirectory()} time (not at factory construction
+ * time), so it correctly observes the provider value that was set after factory creation.
+ * When the supplier returns non-null for a shard's parquet format, {@link CachedParquetCacheStrategy}
+ * is used; otherwise {@link PassthroughCacheStrategy} is used.
+ */
+ public TieredCompositeStoreDirectoryFactory(
+ Supplier<RepositoriesService> repositoriesService,
+ java.util.function.Function globalRegistryPtrResolver,
+ Supplier<PageCacheProvider> pageCacheProviderSupplier
) {
this.repositoriesService = repositoriesService;
this.globalRegistryPtrResolver = globalRegistryPtrResolver;
+ this.pageCacheProviderSupplier = pageCacheProviderSupplier;
}
@Override
@@ -82,12 +114,26 @@ public CompositeStoreDirectory newDirectory(
fileCache != null ? "present" : "null",
remoteDirectory != null ? "present" : "null");
+ // Cache strategy factory:
+ // "parquet" + Foyer available → CachedParquetCacheStrategy (byte-range caching via Foyer)
+ // "parquet" + no Foyer → PassthroughCacheStrategy (full remote read each time)
+ // "lucene" / "metadata" / etc → PassthroughCacheStrategy (FieldCache will replace later)
+ // Resolved HERE (at shard creation time), not at factory construction time.
+ // This is why pageCacheProviderSupplier is a Supplier — the provider is set in Node.java
+ // AFTER getCachedCompositeStoreDirectoryFactories() is called.
+ final PageCacheProvider pageCache = this.pageCacheProviderSupplier.get();
TieredCompositeStoreDirectory directory = new TieredCompositeStoreDirectory(
indexSettings,
pluginsService,
shardId,
shardPath,
- (formatName, dirPathPrefix) -> new PassthroughCacheStrategy(formatName, remoteDirectory, registryPtr, dirPathPrefix),
+ (formatName, dirPathPrefix) -> {
+ if ("parquet".equals(formatName) && pageCache != null) {
+ logger.debug("[TieredCompositeStoreDirectoryFactory] using CachedParquetCacheStrategy for format=parquet, shard={}", shardId);
+ return new CachedParquetCacheStrategy(formatName, remoteDirectory, registryPtr, dirPathPrefix, pageCache);
+ }
+ return new PassthroughCacheStrategy(formatName, remoteDirectory, registryPtr, dirPathPrefix);
+ },
registryPtr,
remoteDataBlobPath,
repositoryName,
diff --git a/plugins/engine-datafusion/Cargo.toml b/plugins/engine-datafusion/Cargo.toml
index 9889595c669c2..346a7cacecc1f 100644
--- a/plugins/engine-datafusion/Cargo.toml
+++ b/plugins/engine-datafusion/Cargo.toml
@@ -19,6 +19,8 @@ arrow-array = "57.3.0"
arrow-schema = "57.3.0"
arrow-buffer = "57.3.0"
downcast-rs = "1.2"
+foyer = { version = "=0.11.5" }
+bytes = "1.9"
# JNI dependencies
diff --git a/plugins/engine-datafusion/jni/Cargo.toml b/plugins/engine-datafusion/jni/Cargo.toml
index a2538d76b13d9..6144aef22ada1 100644
--- a/plugins/engine-datafusion/jni/Cargo.toml
+++ b/plugins/engine-datafusion/jni/Cargo.toml
@@ -68,6 +68,12 @@ url = { workspace = true }
# Liquid Cache for byte-level caching
liquid-cache-datafusion-local = { workspace = true }
+# Foyer hybrid in-memory+disk cache for Parquet page caching
+foyer = { workspace = true }
+
+# serde_bytes: efficient Bytes serialization needed for Foyer's StorageValue bound
+serde_bytes = "0.11"
+
# Substrait support
substrait = { workspace = true }
diff --git a/plugins/engine-datafusion/jni/src/cache.rs b/plugins/engine-datafusion/jni/src/cache.rs
index 4172c0926d724..26ad695a72998 100644
--- a/plugins/engine-datafusion/jni/src/cache.rs
+++ b/plugins/engine-datafusion/jni/src/cache.rs
@@ -7,11 +7,13 @@ use datafusion::execution::cache::CacheAccessor;
use object_store::ObjectMeta;
use vectorized_exec_spi::log_error;
-pub const ALL_CACHE_TYPES: &[&str] = &[CACHE_TYPE_METADATA, CACHE_TYPE_STATS];
+pub const ALL_CACHE_TYPES: &[&str] = &[CACHE_TYPE_METADATA, CACHE_TYPE_STATS, CACHE_TYPE_PAGES];
// Cache type constants
pub const CACHE_TYPE_METADATA: &str = "METADATA";
pub const CACHE_TYPE_STATS: &str = "STATISTICS";
+/// Foyer-backed byte-level page cache for Parquet column chunk data (Cache Layer 3)
+pub const CACHE_TYPE_PAGES: &str = "PAGES";
// Helper function to handle cache errors
#[allow(dead_code)]
diff --git a/plugins/engine-datafusion/jni/src/cache_jni.rs b/plugins/engine-datafusion/jni/src/cache_jni.rs
index db45f8adc7cb4..3863e8c2c808d 100644
--- a/plugins/engine-datafusion/jni/src/cache_jni.rs
+++ b/plugins/engine-datafusion/jni/src/cache_jni.rs
@@ -1,5 +1,5 @@
-use jni::objects::{JClass, JObjectArray, JString};
-use jni::sys::jlong;
+use jni::objects::{JByteArray, JClass, JObjectArray, JString};
+use jni::sys::{jbyteArray, jint, jlong};
use jni::{JNIEnv};
use crate::custom_cache_manager::CustomCacheManager;
use crate::util::{parse_string_arr};
@@ -7,8 +7,35 @@ use crate::cache;
use crate::DataFusionRuntime;
use datafusion::execution::cache::cache_unit::DefaultFilesMetadataCache;
use std::sync::Arc;
+use bytes::Bytes;
use vectorized_exec_spi::{log_info, log_error, log_debug};
+// Default page cache budgets — overridden by Java settings via createCache()
+const DEFAULT_PAGE_CACHE_MEMORY_BYTES: usize = 256 * 1024 * 1024; // 256 MB L1 memory
+const DEFAULT_PAGE_CACHE_DISK_BYTES: usize = 10 * 1024 * 1024 * 1024; // 10 GB L2 disk
+const DEFAULT_PAGE_CACHE_DIR: &str = "/tmp/foyer-page-cache";
+
+/// Parse the eviction_type string for PAGES cache type.
+/// Expected format: "|"
+/// Falls back to defaults if the string is malformed (e.g. plain "LRU" from old Java code).
+fn parse_page_cache_params(eviction_str: &str) -> (usize, String) {
+ if let Some(sep) = eviction_str.find('|') {
+ let disk_bytes_str = &eviction_str[..sep];
+ let disk_dir = eviction_str[sep + 1..].to_string();
+ if let Ok(disk_bytes) = disk_bytes_str.parse::() {
+ let dir = if disk_dir.is_empty() { DEFAULT_PAGE_CACHE_DIR.to_string() } else { disk_dir };
+ return (disk_bytes, dir);
+ }
+ }
+ // Fallback: plain eviction type like "LRU" from legacy config
+ log_info!(
+ "[FOYER-PAGE-CACHE] eviction_type '{}' is not in '|' format; \
+ using defaults: disk={}B, dir={}",
+ eviction_str, DEFAULT_PAGE_CACHE_DISK_BYTES, DEFAULT_PAGE_CACHE_DIR
+ );
+ (DEFAULT_PAGE_CACHE_DISK_BYTES, DEFAULT_PAGE_CACHE_DIR.to_string())
+}
+
/// Create a CustomCacheManager instance
#[no_mangle]
pub extern "system" fn Java_org_opensearch_datafusion_jni_NativeBridge_createCustomCacheManager(
@@ -89,6 +116,29 @@ pub extern "system" fn Java_org_opensearch_datafusion_jni_NativeBridge_createCac
manager.set_statistics_cache(stats_cache);
log_info!("[CACHE INFO] Successfully created {} cache in CustomCacheManager", cache_type_str);
}
+ cache::CACHE_TYPE_PAGES => {
+ // Create Foyer hybrid (memory + disk) page cache — Cache Layer 3.
+ // `size_limit` is the L1 memory budget in bytes (e.g. 256 MB).
+ // The L2 disk budget and disk directory come from the Java settings
+ // PAGE_CACHE_DISK_CAPACITY and PAGE_CACHE_DIR; for this cache creation
+ // call they are passed via the eviction_type string as
+ // "|".
+ // Format: eviction_type_str = "|"
+ let (disk_bytes, disk_dir) = parse_page_cache_params(&eviction_type_str);
+ log_info!(
+ "[FOYER-PAGE-CACHE] creating hybrid page cache: L1-mem={}B, L2-disk={}B, dir={}",
+ size_limit, disk_bytes, disk_dir
+ );
+ let page_cache = Arc::new(crate::foyer_cache::FoyerDiskPageCache::new(
+ size_limit as usize,
+ disk_bytes,
+ disk_dir,
+ ));
+ manager.set_page_cache(page_cache);
+ log_info!(
+ "[FOYER-PAGE-CACHE] successfully created Foyer hybrid page cache in CustomCacheManager"
+ );
+ }
_ => {
let msg = format!("Invalid cache type: {}", cache_type_str);
log_error!("[CACHE ERROR] {}", msg);
@@ -444,3 +494,128 @@ pub extern "system" fn Java_org_opensearch_datafusion_jni_NativeBridge_cacheMana
}
}
}
+
+// ============================================================================
+// Foyer page cache JNI operations (Layer 3: Parquet byte range cache)
+// Called by DataFusionPlugin.FoyerCacheProvider implementation to serve
+// PassthroughCacheStrategy → FoyerParquetCacheStrategy in the tiered-storage module.
+// ============================================================================
+
/// Look up a cached byte range for a Parquet file.
///
/// JNI entry point called from Java (`NativeBridge.foyerPageCacheGet`).
/// Returns the cached bytes as a Java byte[], or null on cache miss
/// (and also on any conversion failure or missing cache — callers treat
/// null uniformly as "not cached").
///
/// NOTE(review): `start`/`end` are `jint` (32-bit signed), which limits
/// cacheable range offsets to < 2 GiB — confirm the Java caller never
/// exceeds this for large Parquet files.
#[no_mangle]
pub extern "system" fn Java_org_opensearch_datafusion_jni_NativeBridge_foyerPageCacheGet(
    mut env: JNIEnv,
    _class: JClass,
    runtime_ptr: jlong,
    path: JString,
    start: jint,
    end: jint,
) -> jbyteArray {
    // Null runtime handle: nothing to consult, report a miss.
    if runtime_ptr == 0 {
        return std::ptr::null_mut();
    }

    // SAFETY: runtime_ptr is assumed to be a valid pointer to a live
    // DataFusionRuntime previously handed to Java — TODO confirm the Java
    // side cannot call in after the runtime is destroyed.
    let runtime = unsafe { &*(runtime_ptr as *const DataFusionRuntime) };
    let path_str: String = match env.get_string(&path) {
        Ok(s) => s.into(),
        Err(_) => return std::ptr::null_mut(),
    };

    // No cache manager or no page cache configured → miss.
    let page_cache = match runtime.custom_cache_manager.as_ref().and_then(|m| m.get_page_cache()) {
        Some(c) => c,
        None => return std::ptr::null_mut(),
    };

    // FoyerDiskPageCache.get() is async (disk I/O). Use get_blocking() since JNI is synchronous.
    match page_cache.get_blocking(&path_str, start as usize, end as usize) {
        Some(bytes) => {
            log_debug!(
                "[FOYER-PAGE-CACHE] JNI get HIT: path={}, range={}..{}, size={}B",
                path_str, start, end, bytes.len()
            );
            // Copy the cached bytes into a newly allocated Java byte[].
            match env.byte_array_from_slice(&bytes) {
                Ok(arr) => arr.into_raw(),
                Err(e) => {
                    log_debug!("[FOYER-PAGE-CACHE] JNI get: failed to create Java byte[]: {}", e);
                    std::ptr::null_mut()
                }
            }
        }
        None => {
            log_debug!(
                "[FOYER-PAGE-CACHE] JNI get MISS: path={}, range={}..{}",
                path_str, start, end
            );
            std::ptr::null_mut()
        }
    }
}
+
+/// Store a byte range for a Parquet file in the Foyer page cache.
+#[no_mangle]
+pub extern "system" fn Java_org_opensearch_datafusion_jni_NativeBridge_foyerPageCachePut(
+ mut env: JNIEnv,
+ _class: JClass,
+ runtime_ptr: jlong,
+ path: JString,
+ start: jint,
+ end: jint,
+ data: JByteArray,
+) {
+ if runtime_ptr == 0 {
+ return;
+ }
+
+ let runtime = unsafe { &*(runtime_ptr as *const DataFusionRuntime) };
+ let path_str: String = match env.get_string(&path) {
+ Ok(s) => s.into(),
+ Err(e) => {
+ log_debug!("[FoyerCache] foyerPageCachePut: failed to convert path: {}", e);
+ return;
+ }
+ };
+
+ let page_cache = match runtime.custom_cache_manager.as_ref().and_then(|m| m.get_page_cache()) {
+ Some(c) => c,
+ None => return,
+ };
+
+ let bytes_vec: Vec = match env.convert_byte_array(data) {
+ Ok(v) => v,
+ Err(e) => {
+ log_debug!("[FoyerCache] foyerPageCachePut: failed to convert byte array: {}", e);
+ return;
+ }
+ };
+
+ page_cache.put(path_str, start as usize, end as usize, Bytes::from(bytes_vec));
+}
+
+/// Evict all cached byte ranges for a given Parquet file.
+/// Called when a file is deleted (merged/compacted/tiered out).
+#[no_mangle]
+pub extern "system" fn Java_org_opensearch_datafusion_jni_NativeBridge_foyerPageCacheEvictFile(
+ mut env: JNIEnv,
+ _class: JClass,
+ runtime_ptr: jlong,
+ path: JString,
+) {
+ if runtime_ptr == 0 {
+ return;
+ }
+
+ let runtime = unsafe { &*(runtime_ptr as *const DataFusionRuntime) };
+ let path_str: String = match env.get_string(&path) {
+ Ok(s) => s.into(),
+ Err(e) => {
+ log_debug!("[FoyerCache] foyerPageCacheEvictFile: failed to convert path: {}", e);
+ return;
+ }
+ };
+
+ if let Some(page_cache) = runtime.custom_cache_manager.as_ref().and_then(|m| m.get_page_cache()) {
+ page_cache.evict_file(&path_str);
+ log_debug!("[FoyerCache] evicted file from page cache: {}", path_str);
+ }
+}
diff --git a/plugins/engine-datafusion/jni/src/caching_object_store.rs b/plugins/engine-datafusion/jni/src/caching_object_store.rs
new file mode 100644
index 0000000000000..83f203459479e
--- /dev/null
+++ b/plugins/engine-datafusion/jni/src/caching_object_store.rs
@@ -0,0 +1,234 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Foyer-backed caching wrapper around any [`ObjectStore`].
+//!
+//! [`CachingObjectStore`] intercepts `get_range()` and `get_ranges()` calls —
+//! the two methods DataFusion uses to fetch Parquet column chunk byte ranges.
+//! All other methods are delegated transparently to the inner store.
+//!
+//! ## Two-tier read path
+//!
+//! ```text
+//! DataFusion.get_range(file, 4096..8192)
+//! └── CachingObjectStore.get_range()
+//! ├── [FOYER-PAGE-CACHE] check L1-memory → HIT: return bytes (0 I/O)
+//! ├── [FOYER-PAGE-CACHE] check L2-disk → HIT: return bytes (local NVMe)
+//! └── MISS: inner.get_range() → S3/local read
+//! └── [FOYER-PAGE-CACHE] PUT → L1-memory (async spill to L2-disk)
+//! ```
+//!
+//! ## Log prefix
+//!
+//! All log lines produced by this module use `[FOYER-PAGE-CACHE]` for easy grepping.
+
+use std::fmt;
+use std::ops::Range;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use futures::stream::BoxStream;
+use object_store::{
+ path::Path, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
+ PutMultipartOpts, PutOptions, PutPayload, PutResult,
+};
+use vectorized_exec_spi::{log_debug, log_info};
+
+use crate::foyer_cache::FoyerDiskPageCache;
+
+/// An [`ObjectStore`] wrapper that caches `get_range` / `get_ranges` results
+/// in the Foyer hybrid (memory + disk) page cache.
+pub struct CachingObjectStore {
+ inner: Arc,
+ page_cache: Arc,
+}
+
+impl fmt::Debug for CachingObjectStore {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "CachingObjectStore(inner={}, {})",
+ self.inner, self.page_cache.disk_dir().display())
+ }
+}
+
+impl fmt::Display for CachingObjectStore {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "CachingObjectStore({})", self.inner)
+ }
+}
+
+impl CachingObjectStore {
+ /// Wrap `inner` with Foyer-backed page caching.
+ pub fn new(inner: Arc, page_cache: Arc) -> Self {
+ log_info!(
+ "[FOYER-PAGE-CACHE] CachingObjectStore created: inner={}, disk_dir={}, \
+ mem_capacity={}B, disk_capacity={}B",
+ inner,
+ page_cache.disk_dir().display(),
+ page_cache.memory_capacity_bytes(),
+ page_cache.disk_capacity_bytes()
+ );
+ Self { inner, page_cache }
+ }
+
+ /// Strip the leading `/` so the cache key matches the FileRegistry key format.
+ fn cache_path(location: &Path) -> String {
+ let s = location.as_ref();
+ if s.starts_with('/') { s[1..].to_string() } else { s.to_string() }
+ }
+}
+
+#[async_trait]
+impl ObjectStore for CachingObjectStore {
+ // ── Write passthrough ──────────────────────────────────────────
+
+ async fn put(&self, location: &Path, payload: PutPayload) -> object_store::Result {
+ self.inner.put(location, payload).await
+ }
+ async fn put_opts(&self, location: &Path, payload: PutPayload, opts: PutOptions) -> object_store::Result {
+ self.inner.put_opts(location, payload, opts).await
+ }
+ async fn put_multipart(&self, location: &Path) -> object_store::Result> {
+ self.inner.put_multipart(location).await
+ }
+ async fn put_multipart_opts(&self, location: &Path, opts: PutMultipartOpts) -> object_store::Result> {
+ self.inner.put_multipart_opts(location, opts).await
+ }
+
+ // ── Read passthrough (non-range) ───────────────────────────────
+
+ async fn get(&self, location: &Path) -> object_store::Result {
+ self.inner.get(location).await
+ }
+ async fn get_opts(&self, location: &Path, options: GetOptions) -> object_store::Result {
+ self.inner.get_opts(location, options).await
+ }
+ async fn head(&self, location: &Path) -> object_store::Result {
+ self.inner.head(location).await
+ }
+
+ // ── Range reads: intercepted by Foyer page cache ───────────────
+
+ /// Fetch a single byte range.
+ /// Checks Foyer L1 (memory) then L2 (disk) before falling through to the inner store.
+ async fn get_range(&self, location: &Path, range: Range) -> object_store::Result {
+ let path_str = Self::cache_path(location);
+ let start = range.start as usize;
+ let end = range.end as usize;
+
+ // L1+L2 lookup (async — disk I/O is async in Foyer)
+ if let Some(cached) = self.page_cache.get(&path_str, start, end).await {
+ log_info!(
+ "[FOYER-PAGE-CACHE] get_range HIT: path={}, range={}..{}, size={}B",
+ path_str, start, end, cached.len()
+ );
+ return Ok(cached);
+ }
+
+ // L1+L2 miss — fetch from inner store (local NVMe or S3/GCS/Azure)
+ log_info!(
+ "[FOYER-PAGE-CACHE] get_range MISS → inner store: path={}, range={}..{}",
+ path_str, start, end
+ );
+ let bytes = self.inner.get_range(location, range).await?;
+
+ // Populate cache (insert to L1; Foyer spills to L2 asynchronously)
+ log_info!(
+ "[FOYER-PAGE-CACHE] get_range PUT: path={}, range={}..{}, size={}B",
+ path_str, start, end, bytes.len()
+ );
+ self.page_cache.put(path_str, start, end, bytes.clone());
+
+ Ok(bytes)
+ }
+
+ /// Fetch multiple byte ranges in one call.
+ /// Each range is looked up individually so partial cache hits are exploited.
+ async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> object_store::Result> {
+ let path_str = Self::cache_path(location);
+
+ let mut results: Vec